From 700bebb958e93f4d472c383de62ced9db8e64bec Mon Sep 17 00:00:00 2001
From: Jason Ekstrand <jason.ekstrand@intel.com>
Date: Tue, 28 Feb 2017 09:10:43 -0800
Subject: i965: Move the back-end compiler to src/intel/compiler

Mostly a dummy git mv with a couple of noticeable parts:
 - With the earlier header cleanups, nothing in src/intel depends on
files from src/mesa/drivers/dri/i965/
 - Both Autoconf and Android builds are addressed. Thanks to Mauro and
Tapani for the fixups in the latter
 - brw_util.[ch] is not really compiler specific, so it's moved to i965.

v2:
 - move brw_eu_defines.h instead of brw_defines.h
 - remove no-longer applicable includes
 - add missing vulkan/ prefix in the Android build (thanks Tapani)

v3:
 - don't list brw_defines.h in src/intel/Makefile.sources (Jason)
 - rebase on top of the oa patches

[Emil Velikov: commit message, various small fixes throughout]
Signed-off-by: Emil Velikov <emil.velikov@collabora.com>
Reviewed-by: Jason Ekstrand <jason@jlekstrand.net>
---
 src/intel/compiler/.gitignore                      |   10 +
 src/intel/compiler/brw_cfg.cpp                     |  531 ++
 src/intel/compiler/brw_cfg.h                       |  358 +
 src/intel/compiler/brw_compiler.c                  |  160 +
 src/intel/compiler/brw_compiler.h                  | 1057 +++
 src/intel/compiler/brw_dead_control_flow.cpp       |  119 +
 src/intel/compiler/brw_dead_control_flow.h         |   26 +
 src/intel/compiler/brw_disasm.c                    | 1646 +++++
 src/intel/compiler/brw_eu.c                        |  719 +++
 src/intel/compiler/brw_eu.h                        |  612 ++
 src/intel/compiler/brw_eu_compact.c                | 1579 +++++
 src/intel/compiler/brw_eu_defines.h                | 1246 ++++
 src/intel/compiler/brw_eu_emit.c                   | 3675 +++++++++++
 src/intel/compiler/brw_eu_util.c                   |  123 +
 src/intel/compiler/brw_eu_validate.c               | 1051 +++
 src/intel/compiler/brw_fs.cpp                      | 6805 ++++++++++++++++++++
 src/intel/compiler/brw_fs.h                        |  500 ++
 src/intel/compiler/brw_fs_builder.h                |  662 ++
 src/intel/compiler/brw_fs_cmod_propagation.cpp     |  183 +
 src/intel/compiler/brw_fs_combine_constants.cpp    |  329 +
 src/intel/compiler/brw_fs_copy_propagation.cpp     |  869 +++
 src/intel/compiler/brw_fs_cse.cpp                  |  380 ++
 src/intel/compiler/brw_fs_dead_code_eliminate.cpp  |  148 +
 src/intel/compiler/brw_fs_generator.cpp            | 2126 ++++++
 src/intel/compiler/brw_fs_live_variables.cpp       |  334 +
 src/intel/compiler/brw_fs_live_variables.h         |  115 +
 src/intel/compiler/brw_fs_lower_d2x.cpp            |   78 +
 src/intel/compiler/brw_fs_lower_pack.cpp           |   55 +
 src/intel/compiler/brw_fs_nir.cpp                  | 4679 ++++++++++++++
 src/intel/compiler/brw_fs_reg_allocate.cpp         |  992 +++
 src/intel/compiler/brw_fs_register_coalesce.cpp    |  295 +
 src/intel/compiler/brw_fs_saturate_propagation.cpp |  156 +
 src/intel/compiler/brw_fs_sel_peephole.cpp         |  220 +
 src/intel/compiler/brw_fs_surface_builder.cpp      | 1194 ++++
 src/intel/compiler/brw_fs_surface_builder.h        |   88 +
 src/intel/compiler/brw_fs_validate.cpp             |   57 +
 src/intel/compiler/brw_fs_visitor.cpp              |  953 +++
 src/intel/compiler/brw_inst.h                      |  866 +++
 src/intel/compiler/brw_interpolation_map.c         |  109 +
 src/intel/compiler/brw_ir_allocator.h              |   87 +
 src/intel/compiler/brw_ir_fs.h                     |  451 ++
 src/intel/compiler/brw_ir_vec4.h                   |  409 ++
 src/intel/compiler/brw_nir.c                       |  764 +++
 src/intel/compiler/brw_nir.h                       |  154 +
 .../compiler/brw_nir_analyze_boolean_resolves.c    |  269 +
 src/intel/compiler/brw_nir_attribute_workarounds.c |  176 +
 src/intel/compiler/brw_nir_intrinsics.c            |  186 +
 src/intel/compiler/brw_nir_opt_peephole_ffma.c     |  297 +
 src/intel/compiler/brw_nir_tcs_workarounds.c       |  152 +
 src/intel/compiler/brw_nir_trig_workarounds.py     |   43 +
 src/intel/compiler/brw_packed_float.c              |   75 +
 src/intel/compiler/brw_predicated_break.cpp        |  148 +
 src/intel/compiler/brw_reg.h                       | 1135 ++++
 src/intel/compiler/brw_schedule_instructions.cpp   | 1753 +++++
 src/intel/compiler/brw_shader.cpp                  | 1273 ++++
 src/intel/compiler/brw_shader.h                    |  295 +
 src/intel/compiler/brw_vec4.cpp                    | 2851 ++++++++
 src/intel/compiler/brw_vec4.h                      |  399 ++
 src/intel/compiler/brw_vec4_builder.h              |  634 ++
 src/intel/compiler/brw_vec4_cmod_propagation.cpp   |  172 +
 src/intel/compiler/brw_vec4_copy_propagation.cpp   |  558 ++
 src/intel/compiler/brw_vec4_cse.cpp                |  296 +
 .../compiler/brw_vec4_dead_code_eliminate.cpp      |  160 +
 src/intel/compiler/brw_vec4_generator.cpp          | 2217 +++++++
 src/intel/compiler/brw_vec4_gs_nir.cpp             |  145 +
 src/intel/compiler/brw_vec4_gs_visitor.cpp         |  933 +++
 src/intel/compiler/brw_vec4_gs_visitor.h           |   81 +
 src/intel/compiler/brw_vec4_live_variables.cpp     |  343 +
 src/intel/compiler/brw_vec4_live_variables.h       |  112 +
 src/intel/compiler/brw_vec4_nir.cpp                | 2407 +++++++
 src/intel/compiler/brw_vec4_reg_allocate.cpp       |  558 ++
 src/intel/compiler/brw_vec4_surface_builder.cpp    |  332 +
 src/intel/compiler/brw_vec4_surface_builder.h      |   69 +
 src/intel/compiler/brw_vec4_tcs.cpp                |  516 ++
 src/intel/compiler/brw_vec4_tcs.h                  |   88 +
 src/intel/compiler/brw_vec4_tes.cpp                |  296 +
 src/intel/compiler/brw_vec4_tes.h                  |   68 +
 src/intel/compiler/brw_vec4_visitor.cpp            | 1917 ++++++
 src/intel/compiler/brw_vec4_vs.h                   |   68 +
 src/intel/compiler/brw_vec4_vs_visitor.cpp         |  221 +
 src/intel/compiler/brw_vue_map.c                   |  307 +
 src/intel/compiler/brw_wm_iz.cpp                   |  169 +
 src/intel/compiler/gen6_gs_visitor.cpp             |  753 +++
 src/intel/compiler/gen6_gs_visitor.h               |   91 +
 src/intel/compiler/intel_asm_annotation.c          |  198 +
 src/intel/compiler/intel_asm_annotation.h          |   80 +
 src/intel/compiler/test_eu_compact.c               |  300 +
 src/intel/compiler/test_eu_validate.cpp            |  847 +++
 src/intel/compiler/test_fs_cmod_propagation.cpp    |  556 ++
 src/intel/compiler/test_fs_copy_propagation.cpp    |  213 +
 .../compiler/test_fs_saturate_propagation.cpp      |  600 ++
 src/intel/compiler/test_vec4_cmod_propagation.cpp  |  823 +++
 src/intel/compiler/test_vec4_copy_propagation.cpp  |  181 +
 src/intel/compiler/test_vec4_register_coalesce.cpp |  242 +
 src/intel/compiler/test_vf_float_conversions.cpp   |  110 +
 95 files changed, 63683 insertions(+)
 create mode 100644 src/intel/compiler/.gitignore
 create mode 100644 src/intel/compiler/brw_cfg.cpp
 create mode 100644 src/intel/compiler/brw_cfg.h
 create mode 100644 src/intel/compiler/brw_compiler.c
 create mode 100644 src/intel/compiler/brw_compiler.h
 create mode 100644 src/intel/compiler/brw_dead_control_flow.cpp
 create mode 100644 src/intel/compiler/brw_dead_control_flow.h
 create mode 100644 src/intel/compiler/brw_disasm.c
 create mode 100644 src/intel/compiler/brw_eu.c
 create mode 100644 src/intel/compiler/brw_eu.h
 create mode 100644 src/intel/compiler/brw_eu_compact.c
 create mode 100644 src/intel/compiler/brw_eu_defines.h
 create mode 100644 src/intel/compiler/brw_eu_emit.c
 create mode 100644 src/intel/compiler/brw_eu_util.c
 create mode 100644 src/intel/compiler/brw_eu_validate.c
 create mode 100644 src/intel/compiler/brw_fs.cpp
 create mode 100644 src/intel/compiler/brw_fs.h
 create mode 100644 src/intel/compiler/brw_fs_builder.h
 create mode 100644 src/intel/compiler/brw_fs_cmod_propagation.cpp
 create mode 100644 src/intel/compiler/brw_fs_combine_constants.cpp
 create mode 100644 src/intel/compiler/brw_fs_copy_propagation.cpp
 create mode 100644 src/intel/compiler/brw_fs_cse.cpp
 create mode 100644 src/intel/compiler/brw_fs_dead_code_eliminate.cpp
 create mode 100644 src/intel/compiler/brw_fs_generator.cpp
 create mode 100644 src/intel/compiler/brw_fs_live_variables.cpp
 create mode 100644 src/intel/compiler/brw_fs_live_variables.h
 create mode 100644 src/intel/compiler/brw_fs_lower_d2x.cpp
 create mode 100644 src/intel/compiler/brw_fs_lower_pack.cpp
 create mode 100644 src/intel/compiler/brw_fs_nir.cpp
 create mode 100644 src/intel/compiler/brw_fs_reg_allocate.cpp
 create mode 100644 src/intel/compiler/brw_fs_register_coalesce.cpp
 create mode 100644 src/intel/compiler/brw_fs_saturate_propagation.cpp
 create mode 100644 src/intel/compiler/brw_fs_sel_peephole.cpp
 create mode 100644 src/intel/compiler/brw_fs_surface_builder.cpp
 create mode 100644 src/intel/compiler/brw_fs_surface_builder.h
 create mode 100644 src/intel/compiler/brw_fs_validate.cpp
 create mode 100644 src/intel/compiler/brw_fs_visitor.cpp
 create mode 100644 src/intel/compiler/brw_inst.h
 create mode 100644 src/intel/compiler/brw_interpolation_map.c
 create mode 100644 src/intel/compiler/brw_ir_allocator.h
 create mode 100644 src/intel/compiler/brw_ir_fs.h
 create mode 100644 src/intel/compiler/brw_ir_vec4.h
 create mode 100644 src/intel/compiler/brw_nir.c
 create mode 100644 src/intel/compiler/brw_nir.h
 create mode 100644 src/intel/compiler/brw_nir_analyze_boolean_resolves.c
 create mode 100644 src/intel/compiler/brw_nir_attribute_workarounds.c
 create mode 100644 src/intel/compiler/brw_nir_intrinsics.c
 create mode 100644 src/intel/compiler/brw_nir_opt_peephole_ffma.c
 create mode 100644 src/intel/compiler/brw_nir_tcs_workarounds.c
 create mode 100644 src/intel/compiler/brw_nir_trig_workarounds.py
 create mode 100644 src/intel/compiler/brw_packed_float.c
 create mode 100644 src/intel/compiler/brw_predicated_break.cpp
 create mode 100644 src/intel/compiler/brw_reg.h
 create mode 100644 src/intel/compiler/brw_schedule_instructions.cpp
 create mode 100644 src/intel/compiler/brw_shader.cpp
 create mode 100644 src/intel/compiler/brw_shader.h
 create mode 100644 src/intel/compiler/brw_vec4.cpp
 create mode 100644 src/intel/compiler/brw_vec4.h
 create mode 100644 src/intel/compiler/brw_vec4_builder.h
 create mode 100644 src/intel/compiler/brw_vec4_cmod_propagation.cpp
 create mode 100644 src/intel/compiler/brw_vec4_copy_propagation.cpp
 create mode 100644 src/intel/compiler/brw_vec4_cse.cpp
 create mode 100644 src/intel/compiler/brw_vec4_dead_code_eliminate.cpp
 create mode 100644 src/intel/compiler/brw_vec4_generator.cpp
 create mode 100644 src/intel/compiler/brw_vec4_gs_nir.cpp
 create mode 100644 src/intel/compiler/brw_vec4_gs_visitor.cpp
 create mode 100644 src/intel/compiler/brw_vec4_gs_visitor.h
 create mode 100644 src/intel/compiler/brw_vec4_live_variables.cpp
 create mode 100644 src/intel/compiler/brw_vec4_live_variables.h
 create mode 100644 src/intel/compiler/brw_vec4_nir.cpp
 create mode 100644 src/intel/compiler/brw_vec4_reg_allocate.cpp
 create mode 100644 src/intel/compiler/brw_vec4_surface_builder.cpp
 create mode 100644 src/intel/compiler/brw_vec4_surface_builder.h
 create mode 100644 src/intel/compiler/brw_vec4_tcs.cpp
 create mode 100644 src/intel/compiler/brw_vec4_tcs.h
 create mode 100644 src/intel/compiler/brw_vec4_tes.cpp
 create mode 100644 src/intel/compiler/brw_vec4_tes.h
 create mode 100644 src/intel/compiler/brw_vec4_visitor.cpp
 create mode 100644 src/intel/compiler/brw_vec4_vs.h
 create mode 100644 src/intel/compiler/brw_vec4_vs_visitor.cpp
 create mode 100644 src/intel/compiler/brw_vue_map.c
 create mode 100644 src/intel/compiler/brw_wm_iz.cpp
 create mode 100644 src/intel/compiler/gen6_gs_visitor.cpp
 create mode 100644 src/intel/compiler/gen6_gs_visitor.h
 create mode 100644 src/intel/compiler/intel_asm_annotation.c
 create mode 100644 src/intel/compiler/intel_asm_annotation.h
 create mode 100644 src/intel/compiler/test_eu_compact.c
 create mode 100644 src/intel/compiler/test_eu_validate.cpp
 create mode 100644 src/intel/compiler/test_fs_cmod_propagation.cpp
 create mode 100644 src/intel/compiler/test_fs_copy_propagation.cpp
 create mode 100644 src/intel/compiler/test_fs_saturate_propagation.cpp
 create mode 100644 src/intel/compiler/test_vec4_cmod_propagation.cpp
 create mode 100644 src/intel/compiler/test_vec4_copy_propagation.cpp
 create mode 100644 src/intel/compiler/test_vec4_register_coalesce.cpp
 create mode 100644 src/intel/compiler/test_vf_float_conversions.cpp

(limited to 'src/intel/compiler')

diff --git a/src/intel/compiler/.gitignore b/src/intel/compiler/.gitignore
new file mode 100644
index 00000000000..e844421b336
--- /dev/null
+++ b/src/intel/compiler/.gitignore
@@ -0,0 +1,10 @@
+brw_nir_trig_workarounds.c
+test_eu_compact
+test_eu_validate
+test_fs_cmod_propagation
+test_fs_copy_propagation
+test_fs_saturate_propagation
+test_vec4_cmod_propagation
+test_vec4_copy_propagation
+test_vec4_register_coalesce
+test_vf_float_conversions
diff --git a/src/intel/compiler/brw_cfg.cpp b/src/intel/compiler/brw_cfg.cpp
new file mode 100644
index 00000000000..fad12eec588
--- /dev/null
+++ b/src/intel/compiler/brw_cfg.cpp
@@ -0,0 +1,531 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_cfg.h"
+
+/** @file brw_cfg.cpp
+ *
+ * Walks the shader instructions generated and creates a set of basic
+ * blocks with successor/predecessor edges connecting them.
+ */
+/* Detaches the tail bblock_link from @list and returns the block it referenced. */
+static bblock_t *
+pop_stack(exec_list *list)
+{
+   bblock_link *link = (bblock_link *)list->get_tail();
+   bblock_t *block = link->block; /* NOTE(review): no empty-list check; presumably callers always push first */
+   link->link.remove();
+
+   return block;
+}
+/* Creates a bblock_link for @block with new(mem_ctx) and returns its embedded list node. */
+static exec_node *
+link(void *mem_ctx, bblock_t *block)
+{
+   bblock_link *l = new(mem_ctx) bblock_link(block);
+   return &l->link; /* callers push this node onto an exec_list */
+}
+/* Creates an empty block tied to @cfg: no idom, zeroed ips/num/cycle_count, empty lists. */
+bblock_t::bblock_t(cfg_t *cfg) :
+   cfg(cfg), idom(NULL), start_ip(0), end_ip(0), num(0), cycle_count(0)
+{
+   instructions.make_empty();
+   parents.make_empty();
+   children.make_empty();
+}
+/* Records the edge this -> @successor on both ends; the link nodes come from @mem_ctx. */
+void
+bblock_t::add_successor(void *mem_ctx, bblock_t *successor)
+{
+   successor->parents.push_tail(::link(mem_ctx, this)); /* this becomes a parent of @successor */
+   children.push_tail(::link(mem_ctx, successor)); /* @successor becomes a child of this */
+}
+/* Returns true if this block is listed among the parents of @block. */
+bool
+bblock_t::is_predecessor_of(const bblock_t *block) const
+{
+   foreach_list_typed_safe (bblock_link, parent, link, &block->parents) {
+      if (parent->block == this) {
+         return true;
+      }
+   }
+
+   return false; /* no parent entry of @block refers to this block */
+}
+/* Returns true if this block is listed among the children of @block. */
+bool
+bblock_t::is_successor_of(const bblock_t *block) const
+{
+   foreach_list_typed_safe (bblock_link, child, link, &block->children) {
+      if (child->block == this) {
+         return true;
+      }
+   }
+
+   return false; /* no child entry of @block refers to this block */
+}
+/* True if @inst's opcode is one that closes a block: IF, ELSE, CONTINUE, BREAK or WHILE. */
+static bool
+ends_block(const backend_instruction *inst)
+{
+   enum opcode op = inst->opcode;
+
+   return op == BRW_OPCODE_IF ||
+          op == BRW_OPCODE_ELSE ||
+          op == BRW_OPCODE_CONTINUE ||
+          op == BRW_OPCODE_BREAK ||
+          op == BRW_OPCODE_WHILE;
+}
+/* True if @inst's opcode is one that opens a block: DO or ENDIF. */
+static bool
+starts_block(const backend_instruction *inst)
+{
+   enum opcode op = inst->opcode;
+
+   return op == BRW_OPCODE_DO ||
+          op == BRW_OPCODE_ENDIF;
+}
+/* Returns true if @that directly follows this block and no control flow separates them. */
+bool
+bblock_t::can_combine_with(const bblock_t *that) const
+{
+   if ((const bblock_t *)this->link.next != that) /* @that must be the next node after this block's link */
+      return false;
+
+   if (ends_block(this->end()) || /* our last instruction closes a block, or ... */
+       starts_block(that->start())) /* ... @that's first instruction opens one */
+      return false;
+
+   return true;
+}
+/* Merges @that into this block (requires can_combine_with(that)), then drops @that from the CFG. */
+void
+bblock_t::combine_with(bblock_t *that)
+{
+   assert(this->can_combine_with(that));
+   foreach_list_typed (bblock_link, link, link, &this->children) {
+      assert(link->block == that); /* @that must be our only successor */
+   }
+   foreach_list_typed (bblock_link, link, link, &that->parents) {
+      assert(link->block == this); /* and we must be its only predecessor */
+   }
+
+   this->end_ip = that->end_ip; /* our range now extends to the end of @that */
+   this->instructions.append_list(&that->instructions);
+
+   this->cfg->remove_block(that); /* remove_block() also rewires @that's edges */
+}
+/* For each instruction: prints its ip to stderr, then calls @s->dump_instruction() on it. */
+void
+bblock_t::dump(backend_shader *s) const
+{
+   int ip = this->start_ip; /* instructions are numbered upwards from start_ip */
+   foreach_inst_in_block(backend_instruction, inst, this) {
+      fprintf(stderr, "%5d: ", ip);
+      s->dump_instruction(inst);
+      ip++;
+   }
+}
+/* Builds the CFG: moves every instruction of @instructions into a chain of basic blocks. */
+cfg_t::cfg_t(exec_list *instructions)
+{
+   mem_ctx = ralloc_context(NULL); /* released again in ~cfg_t() */
+   block_list.make_empty();
+   blocks = NULL;
+   num_blocks = 0;
+   idom_dirty = true; /* idoms are only computed on demand by calculate_idom() */
+   cycle_count = 0;
+
+   bblock_t *cur = NULL;
+   int ip = 0;
+
+   bblock_t *entry = new_block();
+   bblock_t *cur_if = NULL;    /**< BB ending with IF. */
+   bblock_t *cur_else = NULL;  /**< BB ending with ELSE. */
+   bblock_t *cur_endif = NULL; /**< BB starting with ENDIF. */
+   bblock_t *cur_do = NULL;    /**< BB starting with DO. */
+   bblock_t *cur_while = NULL; /**< BB immediately following WHILE. */
+   exec_list if_stack, else_stack, do_stack, while_stack;
+   bblock_t *next;
+
+   set_next_block(&cur, entry, ip); /* the entry block becomes block 0 at ip 0 */
+
+   foreach_in_list_safe(backend_instruction, inst, instructions) {
+      /* set_next_block wants the post-incremented ip */
+      ip++;
+
+      inst->exec_node::remove(); /* detach from the input list; a block takes it below */
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_IF:
+         cur->instructions.push_tail(inst);
+
+	 /* Push our information onto a stack so we can recover from
+	  * nested ifs.
+	  */
+	 if_stack.push_tail(link(mem_ctx, cur_if));
+	 else_stack.push_tail(link(mem_ctx, cur_else));
+
+	 cur_if = cur;
+	 cur_else = NULL;
+         cur_endif = NULL;
+
+	 /* Set up our immediately following block, full of "then"
+	  * instructions.
+	  */
+	 next = new_block();
+	 cur_if->add_successor(mem_ctx, next);
+
+	 set_next_block(&cur, next, ip);
+	 break;
+
+      case BRW_OPCODE_ELSE:
+         cur->instructions.push_tail(inst);
+
+         cur_else = cur; /* the block that ends with this ELSE */
+
+	 next = new_block();
+         assert(cur_if != NULL);
+	 cur_if->add_successor(mem_ctx, next); /* the IF block also branches to the "else" block */
+
+	 set_next_block(&cur, next, ip);
+	 break;
+
+      case BRW_OPCODE_ENDIF: {
+         if (cur->instructions.is_empty()) {
+            /* New block was just created; use it. */
+            cur_endif = cur;
+         } else {
+            cur_endif = new_block();
+
+            cur->add_successor(mem_ctx, cur_endif);
+
+            set_next_block(&cur, cur_endif, ip - 1); /* ENDIF itself opens the new block, hence ip - 1 */
+         }
+
+         cur->instructions.push_tail(inst);
+
+         if (cur_else) {
+            cur_else->add_successor(mem_ctx, cur_endif);
+         } else {
+            assert(cur_if != NULL);
+            cur_if->add_successor(mem_ctx, cur_endif); /* without an ELSE the IF block branches straight to ENDIF */
+         }
+
+         assert(cur_if->end()->opcode == BRW_OPCODE_IF);
+         assert(!cur_else || cur_else->end()->opcode == BRW_OPCODE_ELSE);
+
+	 /* Pop the stack so we're in the previous if/else/endif */
+	 cur_if = pop_stack(&if_stack);
+	 cur_else = pop_stack(&else_stack);
+	 break;
+      }
+      case BRW_OPCODE_DO:
+	 /* Push our information onto a stack so we can recover from
+	  * nested loops.
+	  */
+	 do_stack.push_tail(link(mem_ctx, cur_do));
+	 while_stack.push_tail(link(mem_ctx, cur_while));
+
+	 /* Set up the block just after the while.  Don't know when exactly
+	  * it will start, yet.
+	  */
+	 cur_while = new_block();
+
+         if (cur->instructions.is_empty()) {
+            /* New block was just created; use it. */
+            cur_do = cur;
+         } else {
+            cur_do = new_block();
+
+            cur->add_successor(mem_ctx, cur_do);
+
+            set_next_block(&cur, cur_do, ip - 1); /* DO itself opens the new block, hence ip - 1 */
+         }
+
+         cur->instructions.push_tail(inst);
+	 break;
+
+      case BRW_OPCODE_CONTINUE:
+         cur->instructions.push_tail(inst);
+
+         assert(cur_do != NULL);
+	 cur->add_successor(mem_ctx, cur_do); /* edge back to the block that starts with DO */
+
+	 next = new_block();
+	 if (inst->predicate) /* only a predicated CONTINUE gets an edge to the following block */
+	    cur->add_successor(mem_ctx, next);
+
+	 set_next_block(&cur, next, ip);
+	 break;
+
+      case BRW_OPCODE_BREAK:
+         cur->instructions.push_tail(inst);
+
+         assert(cur_while != NULL);
+	 cur->add_successor(mem_ctx, cur_while); /* edge to the block following the WHILE */
+
+	 next = new_block();
+	 if (inst->predicate) /* only a predicated BREAK gets an edge to the following block */
+	    cur->add_successor(mem_ctx, next);
+
+	 set_next_block(&cur, next, ip);
+	 break;
+
+      case BRW_OPCODE_WHILE:
+         cur->instructions.push_tail(inst);
+
+         assert(cur_do != NULL && cur_while != NULL);
+	 cur->add_successor(mem_ctx, cur_do); /* loop back-edge */
+
+         if (inst->predicate) /* only a predicated WHILE gets an edge to the block after the loop */
+            cur->add_successor(mem_ctx, cur_while);
+
+	 set_next_block(&cur, cur_while, ip); /* the block reserved at DO time starts here */
+
+	 /* Pop the stack so we're in the previous loop */
+	 cur_do = pop_stack(&do_stack);
+	 cur_while = pop_stack(&while_stack);
+	 break;
+
+      default:
+         cur->instructions.push_tail(inst); /* any other opcode just extends the current block */
+	 break;
+      }
+   }
+
+   cur->end_ip = ip - 1; /* close the last block; no further set_next_block() call will do it */
+
+   make_block_array();
+}
+/* Releases the ralloc context that the constructor created. */
+cfg_t::~cfg_t()
+{
+   ralloc_free(mem_ctx);
+}
+/* Takes @block out of the CFG, connecting its predecessors directly to its successors. */
+void
+cfg_t::remove_block(bblock_t *block)
+{
+   foreach_list_typed_safe (bblock_link, predecessor, link, &block->parents) {
+      /* Remove block from all of its predecessors' successor lists. */
+      foreach_list_typed_safe (bblock_link, successor, link,
+                               &predecessor->block->children) {
+         if (block == successor->block) {
+            successor->link.remove();
+            ralloc_free(successor);
+         }
+      }
+
+      /* Add removed-block's successors to its predecessors' successor lists. */
+      foreach_list_typed (bblock_link, successor, link, &block->children) {
+         if (!successor->block->is_successor_of(predecessor->block)) { /* avoid duplicate edges */
+            predecessor->block->children.push_tail(link(mem_ctx,
+                                                        successor->block));
+         }
+      }
+   }
+
+   foreach_list_typed_safe (bblock_link, successor, link, &block->children) {
+      /* Remove block from all of its children's parents lists. */
+      foreach_list_typed_safe (bblock_link, predecessor, link,
+                               &successor->block->parents) {
+         if (block == predecessor->block) {
+            predecessor->link.remove();
+            ralloc_free(predecessor);
+         }
+      }
+
+      /* Add removed-block's predecessors to its successors' predecessor lists. */
+      foreach_list_typed (bblock_link, predecessor, link, &block->parents) {
+         if (!predecessor->block->is_predecessor_of(successor->block)) { /* avoid duplicate edges */
+            successor->block->parents.push_tail(link(mem_ctx,
+                                                     predecessor->block));
+         }
+      }
+   }
+
+   block->link.remove(); /* drop the block from the list it is linked into */
+   /* Close the gap in blocks[] and renumber every block that moved down. */
+   for (int b = block->num; b < this->num_blocks - 1; b++) {
+      this->blocks[b] = this->blocks[b + 1];
+      this->blocks[b]->num = b;
+   }
+   /* NOTE(review): this slot still holds the old last block (or @block if it was last); store looks redundant - confirm. */
+   this->blocks[this->num_blocks - 1]->num = this->num_blocks - 2;
+   this->num_blocks--;
+   idom_dirty = true; /* the graph changed, so the idom pointers must be recomputed */
+}
+/* Creates an empty block from mem_ctx; it is neither numbered nor added to block_list here. */
+bblock_t *
+cfg_t::new_block()
+{
+   bblock_t *block = new(mem_ctx) bblock_t(this);
+
+   return block;
+}
+/* Ends *cur at @ip - 1 (if set), then numbers @block, appends it to block_list and makes it *cur. */
+void
+cfg_t::set_next_block(bblock_t **cur, bblock_t *block, int ip)
+{
+   if (*cur) {
+      (*cur)->end_ip = ip - 1; /* the previous block stops right before @ip */
+   }
+
+   block->start_ip = ip;
+   block->num = num_blocks++; /* blocks are numbered in the order they are appended */
+   block_list.push_tail(&block->link);
+   *cur = block;
+}
+/* Allocates blocks[] with num_blocks entries from mem_ctx and fills it in foreach_block order. */
+void
+cfg_t::make_block_array()
+{
+   blocks = ralloc_array(mem_ctx, bblock_t *, num_blocks);
+
+   int i = 0;
+   foreach_block (block, this) {
+      blocks[i++] = block;
+   }
+   assert(i == num_blocks); /* the iteration must have visited exactly num_blocks blocks */
+}
+/* Prints each block to stderr with its idom, parents and children; @s may be NULL. */
+void
+cfg_t::dump(backend_shader *s)
+{
+   if (idom_dirty)
+      calculate_idom(); /* make sure the idom pointers printed below are current */
+
+   foreach_block (block, this) {
+      if (block->idom)
+         fprintf(stderr, "START B%d IDOM(B%d)", block->num, block->idom->num);
+      else
+         fprintf(stderr, "START B%d IDOM(none)", block->num);
+
+      foreach_list_typed(bblock_link, link, link, &block->parents) {
+         fprintf(stderr, " <-B%d",
+                 link->block->num);
+      }
+      fprintf(stderr, "\n");
+      if (s != NULL) /* instructions are only printed when a shader is supplied */
+         block->dump(s);
+      fprintf(stderr, "END B%d", block->num);
+      foreach_list_typed(bblock_link, link, link, &block->children) {
+         fprintf(stderr, " ->B%d",
+                 link->block->num);
+      }
+      fprintf(stderr, "\n");
+   }
+}
+
+/* Calculates the immediate dominator of each block, according to "A Simple,
+ * Fast Dominance Algorithm" by Keith D. Cooper, Timothy J. Harvey, and Ken
+ * Kennedy.
+ *
+ * The authors claim that for control flow graphs of sizes normally encountered
+ * (less than 1000 nodes) that this algorithm is significantly faster than
+ * others like Lengauer-Tarjan.
+ */
+void
+cfg_t::calculate_idom()
+{
+   foreach_block(block, this) {
+      block->idom = NULL; /* NULL means "not computed yet" in the loop below */
+   }
+   blocks[0]->idom = blocks[0]; /* block 0 is treated as the entry and is its own idom */
+
+   bool changed;
+   do {
+      changed = false;
+
+      foreach_block(block, this) {
+         if (block->num == 0) /* the entry block keeps the idom assigned above */
+            continue;
+
+         bblock_t *new_idom = NULL;
+         foreach_list_typed(bblock_link, parent, link, &block->parents) {
+            if (parent->block->idom) { /* skip parents that have no idom yet */
+               if (new_idom == NULL) {
+                  new_idom = parent->block;
+               } else if (parent->block->idom != NULL) { /* NOTE(review): always true inside the enclosing if */
+                  new_idom = intersect(parent->block, new_idom);
+               }
+            }
+         }
+
+         if (block->idom != new_idom) {
+            block->idom = new_idom;
+            changed = true;
+         }
+      }
+   } while (changed); /* repeat until a full pass changes no idom */
+
+   idom_dirty = false;
+}
+/* Follows the idom pointers of @b1 and @b2 until both reach the same block number; returns it. */
+bblock_t *
+cfg_t::intersect(bblock_t *b1, bblock_t *b2)
+{
+   /* Note, the comparisons here are the opposite of what the paper says
+    * because we index blocks from beginning -> end (i.e. reverse post-order)
+    * instead of post-order like they assume.
+    */
+   while (b1->num != b2->num) {
+      while (b1->num > b2->num)
+         b1 = b1->idom;
+      while (b2->num > b1->num)
+         b2 = b2->idom;
+   }
+   assert(b1); /* NOTE(review): b1 was already dereferenced above, so this cannot catch a NULL idom */
+   return b1;
+}
+/* Prints the CFG to stdout as a "digraph CFG" with one "parent -> child" line per edge. */
+void
+cfg_t::dump_cfg()
+{
+   printf("digraph CFG {\n");
+   for (int b = 0; b < num_blocks; b++) {
+      bblock_t *block = this->blocks[b];
+
+      foreach_list_typed_safe (bblock_link, child, link, &block->children) {
+         printf("\t%d -> %d\n", b, child->block->num);
+      }
+   }
+   printf("}\n");
+}
+/* Prints "idom -> block" edges to stdout as a digraph; does not call calculate_idom() first. */
+void
+cfg_t::dump_domtree()
+{
+   printf("digraph DominanceTree {\n");
+   foreach_block(block, this) {
+      if (block->idom) { /* blocks whose idom is NULL are left out */
+         printf("\t%d -> %d\n", block->idom->num, block->num);
+      }
+   }
+   printf("}\n");
+}
diff --git a/src/intel/compiler/brw_cfg.h b/src/intel/compiler/brw_cfg.h
new file mode 100644
index 00000000000..b8af40f725f
--- /dev/null
+++ b/src/intel/compiler/brw_cfg.h
@@ -0,0 +1,358 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#pragma once
+#ifndef BRW_CFG_H
+#define BRW_CFG_H
+
+#include "brw_shader.h"
+
+struct bblock_t;
+
+struct bblock_link {   /* one entry of a block's parents or children list */
+#ifdef __cplusplus
+   DECLARE_RALLOC_CXX_OPERATORS(bblock_link)
+
+   bblock_link(bblock_t *block)
+      : block(block)
+   {
+   }
+#endif
+
+   struct exec_node link;     /* list node; the lists are walked via this field */
+   struct bblock_t *block;    /* the parent or child block this entry names */
+};
+
+struct backend_instruction;
+
+struct bblock_t {   /* one basic block of a cfg_t */
+#ifdef __cplusplus
+   DECLARE_RALLOC_CXX_OPERATORS(bblock_t)
+
+   explicit bblock_t(cfg_t *cfg);
+
+   void add_successor(void *mem_ctx, bblock_t *successor);
+   bool is_predecessor_of(const bblock_t *block) const;
+   bool is_successor_of(const bblock_t *block) const;
+   bool can_combine_with(const bblock_t *that) const;
+   void combine_with(bblock_t *that);
+   void dump(backend_shader *s) const;
+
+   backend_instruction *start();
+   const backend_instruction *start() const;
+   backend_instruction *end();
+   const backend_instruction *end() const;
+
+   bblock_t *next();
+   const bblock_t *next() const;
+   bblock_t *prev();
+   const bblock_t *prev() const;
+
+   bool starts_with_control_flow() const;
+   bool ends_with_control_flow() const;
+   /* start()/end(), stepping inward past one control-flow instruction. */
+   backend_instruction *first_non_control_flow_inst();
+   backend_instruction *last_non_control_flow_inst();
+#endif /* __cplusplus */
+
+   struct exec_node link;     /* node in cfg_t::block_list (see foreach_block) */
+   struct cfg_t *cfg;         /* NOTE(review): presumably the owning CFG -- confirm */
+   struct bblock_t *idom;     /* immediate dominator; NULL if none is recorded */
+
+   int start_ip;              /* NOTE(review): presumably ip of the first instruction */
+   int end_ip;                /* NOTE(review): presumably ip of the last instruction */
+
+   struct exec_list instructions;   /* elements are cast to backend_instruction */
+   struct exec_list parents;        /* list of bblock_link */
+   struct exec_list children;       /* list of bblock_link */
+   int num;                         /* block number; block 0 is its own idom */
+
+   unsigned cycle_count;
+};
+
+static inline struct backend_instruction *
+bblock_start(struct bblock_t *block)
+{  /* Head of block->instructions, cast to a backend_instruction. */
+   return (struct backend_instruction *)exec_list_get_head(&block->instructions);
+}
+
+static inline const struct backend_instruction *
+bblock_start_const(const struct bblock_t *block)
+{  /* Const-preserving variant of bblock_start(). */
+   return (const struct backend_instruction *)exec_list_get_head_const(&block->instructions);
+}
+
+static inline struct backend_instruction *
+bblock_end(struct bblock_t *block)
+{  /* Tail of block->instructions, cast to a backend_instruction. */
+   return (struct backend_instruction *)exec_list_get_tail(&block->instructions);
+}
+
+static inline const struct backend_instruction *
+bblock_end_const(const struct bblock_t *block)
+{  /* Const-preserving variant of bblock_end(). */
+   return (const struct backend_instruction *)exec_list_get_tail_const(&block->instructions);
+}
+
+static inline struct bblock_t *
+bblock_next(struct bblock_t *block)
+{
+   /* Block linked after this one, or NULL if only the tail sentinel follows. */
+   struct exec_node *const succ = block->link.next;
+
+   return exec_node_is_tail_sentinel(succ) ? NULL : (struct bblock_t *)succ;
+}
+
+static inline const struct bblock_t *
+bblock_next_const(const struct bblock_t *block)
+{
+   /* Const variant: NULL when only the tail sentinel follows this block. */
+   const struct exec_node *const succ = block->link.next;
+
+   return exec_node_is_tail_sentinel(succ) ? NULL : (const struct bblock_t *)succ;
+}
+
+static inline struct bblock_t *
+bblock_prev(struct bblock_t *block)
+{
+   /* Block linked before this one, or NULL if only the head sentinel precedes. */
+   struct exec_node *const pred = block->link.prev;
+
+   return exec_node_is_head_sentinel(pred) ? NULL : (struct bblock_t *)pred;
+}
+
+static inline const struct bblock_t *
+bblock_prev_const(const struct bblock_t *block)
+{
+   /* Const variant: NULL when only the head sentinel precedes this block. */
+   const struct exec_node *const pred = block->link.prev;
+
+   return exec_node_is_head_sentinel(pred) ? NULL : (const struct bblock_t *)pred;
+}
+
+static inline bool
+bblock_starts_with_control_flow(const struct bblock_t *block)
+{
+   const enum opcode first = bblock_start_const(block)->opcode;
+   return first == BRW_OPCODE_ENDIF || first == BRW_OPCODE_DO; /* DO/ENDIF only */
+}
+
+static inline bool
+bblock_ends_with_control_flow(const struct bblock_t *block)
+{
+   /* IF, ELSE, WHILE, BREAK and CONTINUE count as trailing control flow. */
+   const enum opcode last = bblock_end_const(block)->opcode;
+   const bool if_else = last == BRW_OPCODE_IF || last == BRW_OPCODE_ELSE;
+   const bool loop_cf = last == BRW_OPCODE_WHILE || last == BRW_OPCODE_BREAK ||
+                        last == BRW_OPCODE_CONTINUE;
+   return if_else || loop_cf;
+}
+
+static inline struct backend_instruction *
+bblock_first_non_control_flow_inst(struct bblock_t *block)
+{
+   struct backend_instruction *inst = bblock_start(block);
+   if (bblock_starts_with_control_flow(block))   /* leading DO/ENDIF: step past it */
+#ifdef __cplusplus   /* the C++ view reaches the list pointers directly */
+      inst = (struct backend_instruction *)inst->next;
+#else                /* the C view reaches them through the link member */
+      inst = (struct backend_instruction *)inst->link.next;
+#endif
+   return inst;
+}
+
+static inline struct backend_instruction *
+bblock_last_non_control_flow_inst(struct bblock_t *block)
+{
+   struct backend_instruction *inst = bblock_end(block);
+   if (bblock_ends_with_control_flow(block))   /* trailing control flow: step back */
+#ifdef __cplusplus   /* the C++ view reaches the list pointers directly */
+      inst = (struct backend_instruction *)inst->prev;
+#else                /* the C view reaches them through the link member */
+      inst = (struct backend_instruction *)inst->link.prev;
+#endif
+   return inst;
+}
+
+#ifdef __cplusplus   /* C++ members: each forwards to the bblock_*() helper above */
+inline backend_instruction *
+bblock_t::start()
+{
+   return bblock_start(this);
+}
+
+inline const backend_instruction *
+bblock_t::start() const
+{
+   return bblock_start_const(this);
+}
+
+inline backend_instruction *
+bblock_t::end()
+{
+   return bblock_end(this);
+}
+
+inline const backend_instruction *
+bblock_t::end() const
+{
+   return bblock_end_const(this);
+}
+
+inline bblock_t *
+bblock_t::next()
+{
+   return bblock_next(this);
+}
+
+inline const bblock_t *
+bblock_t::next() const
+{
+   return bblock_next_const(this);
+}
+
+inline bblock_t *
+bblock_t::prev()
+{
+   return bblock_prev(this);
+}
+
+inline const bblock_t *
+bblock_t::prev() const
+{
+   return bblock_prev_const(this);
+}
+
+inline bool
+bblock_t::starts_with_control_flow() const
+{
+   return bblock_starts_with_control_flow(this);
+}
+
+inline bool
+bblock_t::ends_with_control_flow() const
+{
+   return bblock_ends_with_control_flow(this);
+}
+
+inline backend_instruction *
+bblock_t::first_non_control_flow_inst()
+{
+   return bblock_first_non_control_flow_inst(this);
+}
+
+inline backend_instruction *
+bblock_t::last_non_control_flow_inst()
+{
+   return bblock_last_non_control_flow_inst(this);
+}
+#endif /* __cplusplus */
+
+struct cfg_t {   /* control flow graph: a list of bblock_t plus an indexable array */
+#ifdef __cplusplus
+   DECLARE_RALLOC_CXX_OPERATORS(cfg_t)
+
+   cfg_t(exec_list *instructions);
+   ~cfg_t();
+
+   void remove_block(bblock_t *block);
+
+   bblock_t *new_block();
+   void set_next_block(bblock_t **cur, bblock_t *block, int ip);
+   void make_block_array();
+   void calculate_idom();
+   static bblock_t *intersect(bblock_t *b1, bblock_t *b2); /* meet of idom chains */
+
+   void dump(backend_shader *s);
+   void dump_cfg();       /* printf()s the child edges as a Graphviz digraph */
+   void dump_domtree();   /* printf()s the idom -> block edges as a digraph */
+#endif /* __cplusplus */
+   void *mem_ctx;
+
+   /** Ordered list (by ip) of basic blocks */
+   struct exec_list block_list;
+   struct bblock_t **blocks;   /* array of num_blocks block pointers */
+   int num_blocks;
+
+   bool idom_dirty;   /* cleared once the idom fields have been recomputed */
+
+   unsigned cycle_count;
+};
+
+/* Visits every instruction of every block in __cfg.  Note that this is a
+ * double for loop -- break will break from the inner loop only!
+ */
+#define foreach_block_and_inst(__block, __type, __inst, __cfg) \
+   foreach_block (__block, __cfg)                              \
+      foreach_inst_in_block (__type, __inst, __block)
+
+/* As foreach_block_and_inst, built from the _safe iterators.  It is a
+ * double for loop -- break will break from the inner loop only!
+ */
+#define foreach_block_and_inst_safe(__block, __type, __inst, __cfg) \
+   foreach_block_safe (__block, __cfg)                              \
+      foreach_inst_in_block_safe (__type, __inst, __block)
+
+#define foreach_block(__block, __cfg)   /* declares bblock_t *__block */ \
+   foreach_list_typed (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_block_reverse(__block, __cfg)   /* reverse walk */ \
+   foreach_list_typed_reverse (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_block_safe(__block, __cfg)   /* _safe list walk */ \
+   foreach_list_typed_safe (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_block_reverse_safe(__block, __cfg)   /* both */ \
+   foreach_list_typed_reverse_safe (bblock_t, __block, link, &(__cfg)->block_list)
+
+#define foreach_inst_in_block(__type, __inst, __block)   /* one block */ \
+   foreach_in_list(__type, __inst, &(__block)->instructions)
+
+#define foreach_inst_in_block_safe(__type, __inst, __block)    \
+   for (__type *__inst = (__type *)(__block)->instructions.head_sentinel.next, \
+               *__next = (__type *)__inst->next; /* saved before the body */ \
+        __next != NULL;                                        \
+        __inst = __next,                                       \
+        __next = (__type *)__next->next)
+
+#define foreach_inst_in_block_reverse(__type, __inst, __block) /* one block */ \
+   foreach_in_list_reverse(__type, __inst, &(__block)->instructions)
+
+#define foreach_inst_in_block_reverse_safe(__type, __inst, __block) /* ditto */ \
+   foreach_in_list_reverse_safe(__type, __inst, &(__block)->instructions)
+
+#define foreach_inst_in_block_starting_from(__type, __scan_inst, __inst) \
+   for (__type *__scan_inst = (__type *)(__inst)->next; /* skips __inst */ \
+        !__scan_inst->is_tail_sentinel();                      \
+        __scan_inst = (__type *)__scan_inst->next)
+
+#define foreach_inst_in_block_reverse_starting_from(__type, __scan_inst, __inst) \
+   for (__type *__scan_inst = (__type *)(__inst)->prev; /* skips __inst */ \
+        !__scan_inst->is_head_sentinel();                      \
+        __scan_inst = (__type *)__scan_inst->prev)
+
+#endif /* BRW_CFG_H */
diff --git a/src/intel/compiler/brw_compiler.c b/src/intel/compiler/brw_compiler.c
new file mode 100644
index 00000000000..cd9473f9a3b
--- /dev/null
+++ b/src/intel/compiler/brw_compiler.c
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2015-2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_compiler.h"
+#include "brw_shader.h"
+#include "brw_eu.h"
+#include "common/gen_debug.h"
+#include "compiler/nir/nir.h"
+#include "main/errors.h"
+#include "util/debug.h"
+
+#define COMMON_OPTIONS                                                        \
+   /* NIR options shared by the scalar and both vec4 option tables. */        \
+   .lower_sub = true,                                                         \
+   .lower_fdiv = true,                                                        \
+   .lower_scmp = true,                                                        \
+   .lower_fmod32 = true,                                                      \
+   .lower_fmod64 = false,                                                     \
+   .lower_bitfield_extract = true,                                            \
+   .lower_bitfield_insert = true,                                             \
+   .lower_uadd_carry = true,                                                  \
+   .lower_usub_borrow = true,                                                 \
+   .lower_flrp64 = true,                                                      \
+   .native_integers = true,                                                   \
+   .use_interpolated_input_intrinsics = true,                                 \
+   .vertex_id_zero_based = true
+
+static const struct nir_shader_compiler_options scalar_nir_options = {
+   COMMON_OPTIONS,   /* table installed for stages with scalar_stage[] set */
+   .lower_pack_half_2x16 = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_unpack_half_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .max_unroll_iterations = 32,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options = {
+   COMMON_OPTIONS,
+   /* Installed for non-scalar (vec4) stages when devinfo->gen < 6. */
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
+   /* Prior to Gen6, there are no three source operations for SIMD4x2. */
+   .lower_flrp32 = true,
+
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .max_unroll_iterations = 32,
+};
+
+static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
+   COMMON_OPTIONS,
+   /* Non-scalar stages on gen >= 6: vector_nir_options minus lower_flrp32. */
+   /* In the vec4 backend, our dpN instruction replicates its result to all the
+    * components of a vec4.  We would like NIR to give us replicated fdot
+    * instructions because it can optimize better for us.
+    */
+   .fdot_replicates = true,
+
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_extract_byte = true,
+   .lower_extract_word = true,
+   .max_unroll_iterations = 32,
+};
+
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
+{
+   /* Allocate a brw_compiler on mem_ctx, build its register sets and choose
+    * the scalar or vec4 back-end plus GLSL/NIR options for every stage. */
+   struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
+   const bool gen8_plus = devinfo->gen >= 8;
+   bool *const scalar = compiler->scalar_stage;
+
+   compiler->devinfo = devinfo;
+
+   brw_fs_alloc_reg_sets(compiler);
+   brw_vec4_alloc_reg_set(compiler);
+   brw_init_compaction_tables(devinfo);
+
+   compiler->precise_trig = env_var_as_boolean("INTEL_PRECISE_TRIG", false);
+
+   /* Before Gen8 only fragment and compute shaders use the scalar back-end. */
+   scalar[MESA_SHADER_VERTEX] = gen8_plus && !(INTEL_DEBUG & DEBUG_VEC4VS);
+   scalar[MESA_SHADER_TESS_CTRL] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_TCS", true);
+   scalar[MESA_SHADER_TESS_EVAL] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_TES", true);
+   scalar[MESA_SHADER_GEOMETRY] =
+      gen8_plus && env_var_as_boolean("INTEL_SCALAR_GS", true);
+   scalar[MESA_SHADER_FRAGMENT] = true;
+   scalar[MESA_SHADER_COMPUTE] = true;
+
+   /* We want the GLSL compiler to emit code that uses condition codes */
+   for (int i = 0; i < MESA_SHADER_STAGES; i++) {
+      struct gl_shader_compiler_options *opts =
+         &compiler->glsl_compiler_options[i];
+      const bool is_scalar = scalar[i];
+
+      opts->MaxUnrollIterations = 0;
+      opts->MaxIfDepth = devinfo->gen < 6 ? 16 : UINT_MAX;
+
+      opts->EmitNoIndirectInput = true;
+      opts->EmitNoIndirectUniform = false;
+      opts->EmitNoIndirectOutput = is_scalar;
+      opts->EmitNoIndirectTemp = is_scalar;
+      opts->OptimizeForAOS = !is_scalar;
+
+      opts->NirOptions = is_scalar ? &scalar_nir_options :
+         devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6;
+
+      opts->LowerBufferInterfaceBlocks = true;
+      opts->ClampBlockIndicesToArrayBounds = true;
+   }
+
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_EVAL].EmitNoIndirectInput = false;
+   compiler->glsl_compiler_options[MESA_SHADER_TESS_CTRL].EmitNoIndirectOutput = false;
+
+   if (scalar[MESA_SHADER_GEOMETRY])
+      compiler->glsl_compiler_options[MESA_SHADER_GEOMETRY].EmitNoIndirectInput = false;
+
+   return compiler;
+}
diff --git a/src/intel/compiler/brw_compiler.h b/src/intel/compiler/brw_compiler.h
new file mode 100644
index 00000000000..85257d494af
--- /dev/null
+++ b/src/intel/compiler/brw_compiler.h
@@ -0,0 +1,1057 @@
+/*
+ * Copyright © 2010 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdio.h>
+#include "common/gen_device_info.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ra_regs;
+struct nir_shader;
+struct brw_program;
+union gl_constant_value;
+
+struct brw_compiler {
+   const struct gen_device_info *devinfo;   /* stored by brw_compiler_create() */
+
+   struct {
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used.
+       */
+      int *classes;
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+       */
+      uint8_t *ra_reg_to_grf;
+   } vec4_reg_set;
+
+   struct {
+      struct ra_regs *regs;
+
+      /**
+       * Array of the ra classes for the unaligned contiguous register
+       * block sizes used, indexed by register size.
+       */
+      int classes[16];
+
+      /**
+       * Mapping from classes to ra_reg ranges.  Each of the per-size
+       * classes corresponds to a range of ra_reg nodes.  This array stores
+       * those ranges in the form of first ra_reg in each class and the
+       * total number of ra_reg elements in the last array element.  This
+       * way the range of the i'th class is given by:
+       * [ class_to_ra_reg_range[i], class_to_ra_reg_range[i+1] )
+       */
+      int class_to_ra_reg_range[17];
+
+      /**
+       * Mapping for register-allocated objects in *regs to the first
+       * GRF for that object.
+       */
+      uint8_t *ra_reg_to_grf;
+
+      /**
+       * ra class for the aligned pairs we use for PLN, which doesn't
+       * appear in *classes.
+       */
+      int aligned_pairs_class;
+   } fs_reg_sets[3];   /* NOTE(review): presumably one per dispatch width -- confirm */
+   /* Logging callbacks taking a printf-style format (see PRINTFLIKE). */
+   void (*shader_debug_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+   void (*shader_perf_log)(void *, const char *str, ...) PRINTFLIKE(2, 3);
+
+   bool scalar_stage[MESA_SHADER_STAGES];   /* per stage: scalar (true) or vec4 */
+   struct gl_shader_compiler_options glsl_compiler_options[MESA_SHADER_STAGES];
+
+   /**
+    * Apply workarounds for SIN and COS output range problems.
+    * This can negatively impact performance.
+    */
+   bool precise_trig;
+};
+
+
+/**
+ * Program key structures.
+ *
+ * When drawing, we look for the currently bound shaders in the program
+ * cache.  This is essentially a hash table lookup, and these are the keys.
+ *
+ * Sometimes OpenGL features specified as state need to be simulated via
+ * shader code, due to a mismatch between the API and the hardware.  This
+ * is often referred to as "non-orthogonal state" or "NOS".  We store NOS
+ * in the program key so it's considered when searching for a program.  If
+ * we haven't seen a particular combination before, we have to recompile a
+ * new specialized version.
+ *
+ * Shader compilation should not look up state in gl_context directly, but
+ * instead use the copy in the program key.  This guarantees recompiles will
+ * happen correctly.
+ *
+ *  @{
+ */
+
+enum PACKED gen6_gather_sampler_wa {
+   WA_SIGN = 1,      /* whether we need to sign extend */
+   WA_8BIT = 2,      /* if we have an 8bit format needing wa */
+   WA_16BIT = 4,     /* if we have a 16bit format needing wa */
+};
+
+/**
+ * Sampler information embedded in the VS, TCS, TES, GS, WM and CS program keys.
+ */
+struct brw_sampler_prog_key_data {
+   /**
+    * EXT_texture_swizzle and DEPTH_TEXTURE_MODE swizzles.
+    */
+   uint16_t swizzles[MAX_SAMPLERS];
+
+   uint32_t gl_clamp_mask[3];
+
+   /**
+    * For RG32F, gather4's channel select is broken.
+    */
+   uint32_t gather_channel_quirk_mask;
+
+   /**
+    * Whether this sampler uses the compressed multisample surface layout.
+    */
+   uint32_t compressed_multisample_layout_mask;
+
+   /**
+    * Whether this sampler is using 16x multisampling. If so fetching from
+    * this sampler will be handled with a different instruction, ld2dms_w
+    * instead of ld2dms.
+    */
+   uint32_t msaa_16;
+
+   /**
+    * For Sandybridge, which shader w/a we need for gather quirks.
+    */
+   enum gen6_gather_sampler_wa gen6_gather_wa[MAX_SAMPLERS];
+
+   /**
+    * Texture units that have a YUV image bound.
+    */
+   uint32_t y_u_v_image_mask;
+   uint32_t y_uv_image_mask;
+   uint32_t yx_xuxv_image_mask;
+};
+
+/**
+ * The VF can't natively handle certain types of attributes, such as GL_FIXED
+ * or most 10_10_10_2 types.  These flags enable various VS workarounds to
+ * "fix" attributes at the beginning of shaders.
+ */
+#define BRW_ATTRIB_WA_COMPONENT_MASK    7  /* mask for GL_FIXED scale channel count */
+#define BRW_ATTRIB_WA_NORMALIZE     8   /* normalize in shader */
+#define BRW_ATTRIB_WA_BGRA          16  /* swap r/b channels in shader */
+#define BRW_ATTRIB_WA_SIGN          32  /* interpret as signed in shader */
+#define BRW_ATTRIB_WA_SCALE         64  /* interpret as scaled in shader */
+
+/** The program key for Vertex Shaders. */
+struct brw_vs_prog_key {
+   unsigned program_string_id;
+
+   /**
+    * Per-attribute workaround flags
+    *
+    * For each attribute, a combination of BRW_ATTRIB_WA_*.
+    */
+   uint8_t gl_attrib_wa_flags[VERT_ATTRIB_MAX];
+
+   bool copy_edgeflag:1;
+
+   bool clamp_vertex_color:1;
+
+   /**
+    * How many user clipping planes are being uploaded to the vertex shader as
+    * push constants.
+    *
+    * These are used for lowering legacy gl_ClipVertex/gl_Position clipping to
+    * clip distances.
+    */
+   unsigned nr_userclip_plane_consts:4;
+
+   /**
+    * For pre-Gen6 hardware, a bitfield indicating which texture coordinates
+    * are going to be replaced with point coordinates (as a consequence of a
+    * call to glTexEnvi(GL_POINT_SPRITE, GL_COORD_REPLACE, GL_TRUE)).  Because
+    * our SF thread requires exact matching between VS outputs and FS inputs,
+    * these texture coordinates will need to be unconditionally included in
+    * the VUE, even if they aren't written by the vertex shader.
+    */
+   uint8_t point_coord_replace;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Tessellation Control Shaders. */
+struct brw_tcs_prog_key
+{
+   unsigned program_string_id;
+
+   GLenum tes_primitive_mode;
+
+   unsigned input_vertices;
+
+   /** A bitfield of per-patch outputs written. */
+   uint32_t patch_outputs_written;
+
+   /** A bitfield of per-vertex outputs written. */
+   uint64_t outputs_written;
+
+   bool quads_workaround;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Tessellation Evaluation Shaders. */
+struct brw_tes_prog_key
+{
+   unsigned program_string_id;
+
+   /** A bitfield of per-patch inputs read. */
+   uint32_t patch_inputs_read;
+
+   /** A bitfield of per-vertex inputs read. */
+   uint64_t inputs_read;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/** The program key for Geometry Shaders. */
+struct brw_gs_prog_key
+{
+   unsigned program_string_id;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+/* A big lookup table is used to figure out which and how many
+ * additional regs will be inserted before the main payload in the WM
+ * program execution.  These mainly relate to depth and stencil
+ * processing and the early-depth-test optimization.
+ */
+enum brw_wm_iz_bits {
+   BRW_WM_IZ_PS_KILL_ALPHATEST_BIT     = 0x1,
+   BRW_WM_IZ_PS_COMPUTES_DEPTH_BIT     = 0x2,
+   BRW_WM_IZ_DEPTH_WRITE_ENABLE_BIT    = 0x4,
+   BRW_WM_IZ_DEPTH_TEST_ENABLE_BIT     = 0x8,
+   BRW_WM_IZ_STENCIL_WRITE_ENABLE_BIT  = 0x10,
+   BRW_WM_IZ_STENCIL_TEST_ENABLE_BIT   = 0x20,
+   BRW_WM_IZ_BIT_MAX                   = 0x40
+};
+
+enum brw_wm_aa_enable {
+   BRW_WM_AA_NEVER,
+   BRW_WM_AA_SOMETIMES,
+   BRW_WM_AA_ALWAYS
+};
+
+/** The program key for Fragment/Pixel Shaders. */
+struct brw_wm_prog_key {
+   /* Some collection of BRW_WM_IZ_* */
+   uint8_t iz_lookup;
+   bool stats_wm:1;
+   bool flat_shade:1;
+   unsigned nr_color_regions:5;
+   bool replicate_alpha:1;
+   bool clamp_fragment_color:1;
+   bool persample_interp:1;
+   bool multisample_fbo:1;
+   enum brw_wm_aa_enable line_aa:2;   /* one of BRW_WM_AA_* */
+   bool high_quality_derivatives:1;
+   bool force_dual_color_blend:1;
+   bool coherent_fb_fetch:1;
+
+   uint16_t drawable_height;
+   uint64_t input_slots_valid;
+   unsigned program_string_id;
+   GLenum alpha_test_func;          /**< For Gen4/5 MRT alpha test */
+   float alpha_test_ref;
+
+   struct brw_sampler_prog_key_data tex;
+};
+
+struct brw_cs_prog_key {
+   uint32_t program_string_id;
+   struct brw_sampler_prog_key_data tex;
+};
+
+/*
+ * Image metadata structure as laid out in the shader parameter
+ * buffer.  Entries have to be 16B-aligned for the vec4 back-end to be
+ * able to use them.  That's okay because the padding and any unused
+ * entries [most of them except when we're doing untyped surface
+ * access] will be removed by the uniform packing pass.
+ */
+#define BRW_IMAGE_PARAM_SURFACE_IDX_OFFSET      0
+#define BRW_IMAGE_PARAM_OFFSET_OFFSET           4
+#define BRW_IMAGE_PARAM_SIZE_OFFSET             8
+#define BRW_IMAGE_PARAM_STRIDE_OFFSET           12
+#define BRW_IMAGE_PARAM_TILING_OFFSET           16
+#define BRW_IMAGE_PARAM_SWIZZLING_OFFSET        20
+#define BRW_IMAGE_PARAM_SIZE                    24
+
+struct brw_image_param {
+   /** Surface binding table index. */
+   uint32_t surface_idx;
+
+   /** Offset applied to the X and Y surface coordinates. */
+   uint32_t offset[2];
+
+   /** Surface X, Y and Z dimensions. */
+   uint32_t size[3];
+
+   /** X-stride in bytes, Y-stride in pixels, horizontal slice stride in
+    * pixels, vertical slice stride in pixels.
+    */
+   uint32_t stride[4];
+
+   /** Log2 of the tiling modulus in the X, Y and Z dimension. */
+   uint32_t tiling[3];
+
+   /**
+    * Right shift to apply for bit 6 address swizzling.  Two different
+    * swizzles can be specified and will be applied one after the other.  The
+    * resulting address will be:
+    *
+    *  addr' = addr ^ ((1 << 6) & ((addr >> swizzling[0]) ^
+    *                              (addr >> swizzling[1])))
+    *
+    * Use \c 0xff if any of the swizzles is not required.
+    */
+   uint32_t swizzling[2];
+};
+
+/** Max number of render targets in a shader */
+#define BRW_MAX_DRAW_BUFFERS 8
+
+/**
+ * Max number of binding table entries used for stream output.
+ *
+ * From the OpenGL 3.0 spec, table 6.44 (Transform Feedback State), the
+ * minimum value of MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS is 64.
+ *
+ * On Gen6, the size of transform feedback data is limited not by the number
+ * of components but by the number of binding table entries we set aside.  We
+ * use one binding table entry for a float, one entry for a vector, and one
+ * entry per matrix column.  Since the only way we can communicate our
+ * transform feedback capabilities to the client is via
+ * MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS, we need to plan for the
+ * worst case, in which all the varyings are floats, so we use up one binding
+ * table entry per component.  Therefore we need to set aside at least 64
+ * binding table entries for use by transform feedback.
+ *
+ * Note: since we don't currently pack varyings, it is currently impossible
+ * for the client to actually use up all of these binding table entries--if
+ * all of their varyings were floats, they would run out of varying slots and
+ * fail to link.  But that's a bug, so it seems prudent to go ahead and
+ * allocate the number of binding table entries we will need once the bug is
+ * fixed.
+ */
+#define BRW_MAX_SOL_BINDINGS 64
+
+/**
+ * Binding table index for the first gen6 SOL binding.
+ */
+#define BRW_GEN6_SOL_BINDING_START 0
+
+/**
+ * Stride in bytes between shader_time entries.
+ *
+ * We separate entries by a cacheline to reduce traffic between EUs writing to
+ * different entries.
+ */
+#define BRW_SHADER_TIME_STRIDE 64
+
+struct brw_stage_prog_data {
+   struct {
+      /** size of our binding table; grown by brw_mark_surface_used(). */
+      uint32_t size_bytes;
+
+      /** @{
+       * surface indices for the various groups of surfaces
+       */
+      uint32_t pull_constants_start;
+      uint32_t texture_start;
+      uint32_t gather_texture_start;
+      uint32_t ubo_start;
+      uint32_t ssbo_start;
+      uint32_t abo_start;
+      uint32_t image_start;
+      uint32_t shader_time_start;
+      uint32_t plane_start[3];
+      /** @} */
+   } binding_table;
+
+   GLuint nr_params;       /**< number of float params/constants */
+   GLuint nr_pull_params;  /* NOTE(review): presumably length of pull_param */
+   unsigned nr_image_params; /* NOTE(review): presumably length of image_param */
+
+   unsigned curb_read_length;
+   unsigned total_scratch;
+   unsigned total_shared;
+
+   /**
+    * Register where the thread expects to find input data from the URB
+    * (typically uniforms, followed by vertex or fragment attributes).
+    */
+   unsigned dispatch_grf_start_reg;
+
+   bool use_alt_mode; /**< Use ALT floating point mode?  Otherwise, IEEE. */
+
+   /* Pointers to tracked values (only valid once
+    * _mesa_load_state_parameters has been called at runtime).
+    */
+   const union gl_constant_value **param;
+   const union gl_constant_value **pull_param;
+
+   /** Image metadata passed to the shader as uniforms. */
+   struct brw_image_param *image_param;
+};
+
+static inline void
+brw_mark_surface_used(struct brw_stage_prog_data *prog_data,
+                      unsigned surf_index)
+{
+   /* Grow the binding table so it covers surf_index.  An index is 8 bits and
+    * the top 3 values are reserved for special things (stateless and SLM).
+    */
+   assert(surf_index <= 252);
+   const uint32_t needed_bytes = (surf_index + 1) * 4;
+   if (prog_data->binding_table.size_bytes < needed_bytes)
+      prog_data->binding_table.size_bytes = needed_bytes;
+}
+
+/* Data about a particular attempt to compile a program.  Note that
+ * there can be many of these, each in a different GL state
+ * corresponding to a different brw_wm_prog_key struct, with different
+ * compiled programs.
+ */
+struct brw_wm_prog_data {
+   struct brw_stage_prog_data base;
+
+   GLuint num_varying_inputs;
+
+   uint8_t reg_blocks_0;
+   uint8_t reg_blocks_2;
+
+   uint8_t dispatch_grf_start_reg_2;
+   uint32_t prog_offset_2;
+
+   struct {
+      /** @{
+       * surface indices for the WM-specific surfaces
+       */
+      uint32_t render_target_start;
+      uint32_t render_target_read_start;
+      /** @} */
+   } binding_table;
+
+   uint8_t computed_depth_mode;
+   bool computed_stencil;
+
+   bool early_fragment_tests;
+   bool post_depth_coverage;
+   bool inner_coverage;
+   bool dispatch_8;
+   bool dispatch_16;
+   bool dual_src_blend;
+   bool persample_dispatch;
+   bool uses_pos_offset;
+   bool uses_omask;
+   bool uses_kill;
+   bool uses_src_depth;
+   bool uses_src_w;
+   bool uses_sample_mask;
+   bool has_side_effects;
+   bool pulls_bary;
+
+   bool contains_flat_varying;
+   bool contains_noperspective_varying;
+
+   /**
+    * Mask of which interpolation modes are required by the fragment shader.
+    * Used in hardware setup on gen6+.
+    */
+   uint32_t barycentric_interp_modes;
+
+   /**
+    * Mask of which FS inputs are marked flat by the shader source.  This is
+    * needed for setting up 3DSTATE_SF/SBE.
+    */
+   uint32_t flat_inputs;
+
+   /* Mapping of VUE slots to interpolation modes.
+    * Used by the Gen4-5 clip/sf/wm stages.
+    */
+   unsigned char interp_mode[65]; /* BRW_VARYING_SLOT_COUNT */
+
+   /**
+    * Map from gl_varying_slot to the position within the FS setup data
+    * payload where the varying's attribute vertex deltas should be delivered.
+    * For varying slots that are not used by the FS, the value is -1.
+    */
+   int urb_setup[VARYING_SLOT_MAX];
+};
+
+struct brw_push_const_block {
+   unsigned dwords;     /* Dword count, not reg aligned */
+   unsigned regs;
+   unsigned size;       /* Bytes, register aligned */
+};
+
+struct brw_cs_prog_data {
+   struct brw_stage_prog_data base;
+
+   GLuint dispatch_grf_start_reg_16;
+   unsigned local_size[3];
+   unsigned simd_size;
+   unsigned threads;
+   bool uses_barrier;
+   bool uses_num_work_groups;
+   int thread_local_id_index;
+
+   struct {
+      struct brw_push_const_block cross_thread;
+      struct brw_push_const_block per_thread;
+      struct brw_push_const_block total;
+   } push;
+
+   struct {
+      /** @{
+       * surface indices for the CS-specific surfaces
+       */
+      uint32_t work_groups_start;
+      /** @} */
+   } binding_table;
+};
+
+/**
+ * Enum representing the i965-specific vertex results that don't correspond
+ * exactly to any element of gl_varying_slot.  The values of this enum are
+ * assigned such that they don't conflict with gl_varying_slot.
+ */
+typedef enum
+{
+   BRW_VARYING_SLOT_NDC = VARYING_SLOT_MAX,
+   BRW_VARYING_SLOT_PAD,
+   /**
+    * Technically this is not a varying but just a placeholder that
+    * compile_sf_prog() inserts into its VUE map to cause the gl_PointCoord
+    * builtin variable to be compiled correctly. see compile_sf_prog() for
+    * more info.
+    */
+   BRW_VARYING_SLOT_PNTC,
+   BRW_VARYING_SLOT_COUNT
+} brw_varying_slot;
+
+/**
+ * We always program SF to start reading at an offset of 1 (2 varying slots)
+ * from the start of the vertex URB entry.  This causes it to skip:
+ * - VARYING_SLOT_PSIZ and BRW_VARYING_SLOT_NDC on gen4-5
+ * - VARYING_SLOT_PSIZ and VARYING_SLOT_POS on gen6+
+ */
+#define BRW_SF_URB_ENTRY_READ_OFFSET 1
+
+/**
+ * Bitmask indicating which fragment shader inputs represent varyings (and
+ * hence have to be delivered to the fragment shader by the SF/SBE stage).
+ */
+#define BRW_FS_VARYING_INPUT_MASK \
+   (BITFIELD64_RANGE(0, VARYING_SLOT_MAX) & \
+    ~VARYING_BIT_POS & ~VARYING_BIT_FACE)
+
+/**
+ * Data structure recording the relationship between the gl_varying_slot enum
+ * and "slots" within the vertex URB entry (VUE).  A "slot" is defined as a
+ * single octaword within the VUE (128 bits).
+ *
+ * Note that each BRW register contains 256 bits (2 octawords), so when
+ * accessing the VUE in URB_NOSWIZZLE mode, each register corresponds to two
+ * consecutive VUE slots.  When accessing the VUE in URB_INTERLEAVED mode (as
+ * in a vertex shader), each register corresponds to a single VUE slot, since
+ * it contains data for two separate vertices.
+ */
+struct brw_vue_map {
+   /**
+    * Bitfield representing all varying slots that are (a) stored in this VUE
+    * map, and (b) actually written by the shader.  Does not include any of
+    * the additional varying slots defined in brw_varying_slot.
+    */
+   uint64_t slots_valid;
+
+   /**
+    * Is this VUE map for a separate shader pipeline?
+    *
+    * Separable programs (GL_ARB_separate_shader_objects) can be mixed and matched
+    * without the linker having a chance to dead code eliminate unused varyings.
+    *
+    * This means that we have to use a fixed slot layout, based on the output's
+    * location field, rather than assigning slots in a compact contiguous block.
+    */
+   bool separate;
+
+   /**
+    * Map from gl_varying_slot value to VUE slot.  For gl_varying_slots that are
+    * not stored in a slot (because they are not written, or because
+    * additional processing is applied before storing them in the VUE), the
+    * value is -1.
+    */
+   signed char varying_to_slot[VARYING_SLOT_TESS_MAX];
+
+   /**
+    * Map from VUE slot to gl_varying_slot value.  For slots that do not
+    * directly correspond to a gl_varying_slot, the value comes from
+    * brw_varying_slot.
+    *
+    * For slots that are not in use, the value is BRW_VARYING_SLOT_PAD.
+    */
+   signed char slot_to_varying[VARYING_SLOT_TESS_MAX];
+
+   /**
+    * Total number of VUE slots in use
+    */
+   int num_slots;
+
+   /**
+    * Number of per-patch VUE slots. Only valid for tessellation control
+    * shader outputs and tessellation evaluation shader inputs.
+    */
+   int num_per_patch_slots;
+
+   /**
+    * Number of per-vertex VUE slots. Only valid for tessellation control
+    * shader outputs and tessellation evaluation shader inputs.
+    */
+   int num_per_vertex_slots;
+};
+
+void brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map);
+
+/**
+ * Byte offset within the VUE of the given slot (16 bytes per VUE slot).
+ */
+static inline GLuint brw_vue_slot_to_offset(GLuint slot)
+{
+   return slot << 4;
+}
+
+/**
+ * Byte offset within the VUE of a vertex output (brw_varying_slot).
+ * NOTE(review): unstored varyings map to slot -1; confirm callers check.
+ */
+static inline
+GLuint brw_varying_to_offset(const struct brw_vue_map *vue_map, GLuint varying)
+{
+   return brw_vue_slot_to_offset(vue_map->varying_to_slot[varying]);
+}
+
+void brw_compute_vue_map(const struct gen_device_info *devinfo,
+                         struct brw_vue_map *vue_map,
+                         uint64_t slots_valid,
+                         bool separate_shader);
+
+void brw_compute_tess_vue_map(struct brw_vue_map *const vue_map,
+                              uint64_t slots_valid,
+                              uint32_t is_patch);
+
+/* brw_interpolation_map.c */
+void brw_setup_vue_interpolation(struct brw_vue_map *vue_map,
+                                 struct nir_shader *nir,
+                                 struct brw_wm_prog_data *prog_data,
+                                 const struct gen_device_info *devinfo);
+
+enum shader_dispatch_mode {
+   DISPATCH_MODE_4X1_SINGLE = 0,
+   DISPATCH_MODE_4X2_DUAL_INSTANCE = 1,
+   DISPATCH_MODE_4X2_DUAL_OBJECT = 2,
+   DISPATCH_MODE_SIMD8 = 3,
+};
+
+/**
+ * @defgroup Tessellator parameter enumerations.
+ *
+ * These correspond to the hardware values in 3DSTATE_TE, and are provided
+ * as part of the tessellation evaluation shader.
+ *
+ * @{
+ */
+enum brw_tess_partitioning {
+   BRW_TESS_PARTITIONING_INTEGER         = 0,
+   BRW_TESS_PARTITIONING_ODD_FRACTIONAL  = 1,
+   BRW_TESS_PARTITIONING_EVEN_FRACTIONAL = 2,
+};
+
+enum brw_tess_output_topology {
+   BRW_TESS_OUTPUT_TOPOLOGY_POINT   = 0,
+   BRW_TESS_OUTPUT_TOPOLOGY_LINE    = 1,
+   BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW  = 2,
+   BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW = 3,
+};
+
+enum brw_tess_domain {
+   BRW_TESS_DOMAIN_QUAD    = 0,
+   BRW_TESS_DOMAIN_TRI     = 1,
+   BRW_TESS_DOMAIN_ISOLINE = 2,
+};
+/** @} */
+
+struct brw_vue_prog_data {
+   struct brw_stage_prog_data base;
+   struct brw_vue_map vue_map;
+
+   /** Should the hardware deliver input VUE handles for URB pull loads? */
+   bool include_vue_handles;
+
+   GLuint urb_read_length;
+   GLuint total_grf;
+
+   uint32_t clip_distance_mask;
+   uint32_t cull_distance_mask;
+
+   /* Used for calculating urb partitions.  In the VS, this is the size of the
+    * URB entry used for both input and output to the thread.  In the GS, this
+    * is the size of the URB entry used for output.
+    */
+   GLuint urb_entry_size;
+
+   enum shader_dispatch_mode dispatch_mode;
+};
+
+struct brw_vs_prog_data {
+   struct brw_vue_prog_data base;
+
+   GLbitfield64 inputs_read;
+   GLbitfield64 double_inputs_read;
+
+   unsigned nr_attributes;
+   unsigned nr_attribute_slots;
+
+   bool uses_vertexid;
+   bool uses_instanceid;
+   bool uses_basevertex;
+   bool uses_baseinstance;
+   bool uses_drawid;
+};
+
+struct brw_tcs_prog_data
+{
+   struct brw_vue_prog_data base;
+
+   /** Number of vertices in the output patch */
+   int instances;
+};
+
+
+struct brw_tes_prog_data
+{
+   struct brw_vue_prog_data base;
+
+   enum brw_tess_partitioning partitioning;
+   enum brw_tess_output_topology output_topology;
+   enum brw_tess_domain domain;
+};
+
+struct brw_gs_prog_data
+{
+   struct brw_vue_prog_data base;
+
+   unsigned vertices_in;
+
+   /**
+    * Size of an output vertex, measured in HWORDS (32 bytes).
+    */
+   unsigned output_vertex_size_hwords;
+
+   unsigned output_topology;
+
+   /**
+    * Size of the control data (cut bits or StreamID bits), in hwords (32
+    * bytes).  0 if there is no control data.
+    */
+   unsigned control_data_header_size_hwords;
+
+   /**
+    * Format of the control data (either GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID
+    * if the control data is StreamID bits, or
+    * GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT if the control data is cut bits).
+    * Ignored if control_data_header_size_hwords is 0.
+    */
+   unsigned control_data_format;
+
+   bool include_primitive_id;
+
+   /**
+    * The number of vertices emitted, if constant - otherwise -1.
+    */
+   int static_vertex_count;
+
+   int invocations;
+
+   /**
+    * Gen6: Provoking vertex convention for odd-numbered triangles
+    * in tristrips.
+    */
+   GLuint pv_first:1;
+
+   /**
+    * Gen6: Number of varyings that are output to transform feedback.
+    */
+   GLuint num_transform_feedback_bindings:7; /* 0-BRW_MAX_SOL_BINDINGS */
+
+   /**
+    * Gen6: Map from the index of a transform feedback binding table entry to the
+    * gl_varying_slot that should be streamed out through that binding table
+    * entry.
+    */
+   unsigned char transform_feedback_bindings[64 /* BRW_MAX_SOL_BINDINGS */];
+
+   /**
+    * Gen6: Map from the index of a transform feedback binding table entry to the
+    * swizzles that should be used when streaming out data through that
+    * binding table entry.
+    */
+   unsigned char transform_feedback_swizzles[64 /* BRW_MAX_SOL_BINDINGS */];
+};
+
+#define DEFINE_PROG_DATA_DOWNCAST(stage)                       \
+static inline struct brw_##stage##_prog_data *                 \
+brw_##stage##_prog_data(struct brw_stage_prog_data *prog_data) \
+{                                                              \
+   return (struct brw_##stage##_prog_data *) prog_data;        \
+}
+DEFINE_PROG_DATA_DOWNCAST(vue)
+DEFINE_PROG_DATA_DOWNCAST(vs)
+DEFINE_PROG_DATA_DOWNCAST(tcs)
+DEFINE_PROG_DATA_DOWNCAST(tes)
+DEFINE_PROG_DATA_DOWNCAST(gs)
+DEFINE_PROG_DATA_DOWNCAST(wm)
+DEFINE_PROG_DATA_DOWNCAST(cs)
+DEFINE_PROG_DATA_DOWNCAST(ff_gs)
+DEFINE_PROG_DATA_DOWNCAST(clip)
+DEFINE_PROG_DATA_DOWNCAST(sf)
+#undef DEFINE_PROG_DATA_DOWNCAST
+
+/** @} */
+
+struct brw_compiler *
+brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);
+
+/**
+ * Compile a vertex shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_vs_prog_key *key,
+               struct brw_vs_prog_data *prog_data,
+               const struct nir_shader *shader,
+               gl_clip_plane *clip_planes,
+               bool use_legacy_snorm_formula,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+/**
+ * Compile a tessellation control shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const struct nir_shader *nir,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
+/**
+ * Compile a tessellation evaluation shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler, void *log_data,
+                void *mem_ctx,
+                const struct brw_tes_prog_key *key,
+                const struct brw_vue_map *input_vue_map,
+                struct brw_tes_prog_data *prog_data,
+                const struct nir_shader *shader,
+                struct gl_program *prog,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str);
+
+/**
+ * Compile a geometry shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_gs_prog_key *key,
+               struct brw_gs_prog_data *prog_data,
+               const struct nir_shader *shader,
+               struct gl_program *prog,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+/**
+ * Compile a fragment shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_wm_prog_key *key,
+               struct brw_wm_prog_data *prog_data,
+               const struct nir_shader *shader,
+               struct gl_program *prog,
+               int shader_time_index8,
+               int shader_time_index16,
+               bool allow_spilling,
+               bool use_rep_send, struct brw_vue_map *vue_map,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+/**
+ * Compile a compute shader.
+ *
+ * Returns the final assembly and the program's size.
+ */
+const unsigned *
+brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_cs_prog_key *key,
+               struct brw_cs_prog_data *prog_data,
+               const struct nir_shader *shader,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str);
+
+static inline uint32_t
+encode_slm_size(unsigned gen, uint32_t bytes)
+{
+   uint32_t slm_size = 0;
+
+   /* Shared Local Memory is specified as powers of two, and encoded in
+    * INTERFACE_DESCRIPTOR_DATA with the following representations:
+    *
+    * Size   | 0 kB | 1 kB | 2 kB | 4 kB | 8 kB | 16 kB | 32 kB | 64 kB |
+    * -------------------------------------------------------------------
+    * Gen7-8 |    0 | none | none |    1 |    2 |     4 |     8 |    16 |
+    * -------------------------------------------------------------------
+    * Gen9+  |    0 |    1 |    2 |    3 |    4 |     5 |     6 |     7 |
+    */
+   assert(bytes <= 64 * 1024);
+
+   if (bytes > 0) {
+      /* Shared Local Memory Size is specified as powers of two. */
+      slm_size = util_next_power_of_two(bytes);
+
+      if (gen >= 9) {
+         /* Use a minimum of 1kB; ffs(1024) is 11, so 1 kB encodes as 1. */
+         slm_size = ffs(MAX2(slm_size, 1024)) - 10;
+      } else {
+         /* Use a minimum of 4kB; convert to the pre-Gen9 representation. */
+         slm_size = MAX2(slm_size, 4096) / 4096;
+      }
+   }
+
+   return slm_size;
+}
+
+/**
+ * Return true if the given shader stage is dispatched contiguously by the
+ * relevant fixed function starting from channel 0 of the SIMD thread, which
+ * implies that the dispatch mask of a thread can be assumed to have the form
+ * '2^n - 1' for some n.
+ */
+static inline bool
+brw_stage_has_packed_dispatch(const struct gen_device_info *devinfo,
+                              gl_shader_stage stage,
+                              const struct brw_stage_prog_data *prog_data)
+{
+   /* The code below makes assumptions about the hardware's thread dispatch
+    * behavior that could be proven wrong in future generations -- Make sure
+    * to do a full test run with brw_fs_test_dispatch_packing() hooked up to
+    * the NIR front-end before changing this assertion.
+    */
+   assert(devinfo->gen <= 9);
+
+   switch (stage) {
+   case MESA_SHADER_FRAGMENT: {
+      /* The PSD discards subspans coming in with no lit samples, which in the
+       * per-pixel shading case implies that each subspan will either be fully
+       * lit (due to the VMask being used to allow derivative computations),
+       * or not dispatched at all.  In per-sample dispatch mode individual
+       * samples from the same subspan have a fixed relative location within
+       * the SIMD thread, so dispatch of unlit samples cannot be avoided in
+       * general and we should return false.
+       */
+      const struct brw_wm_prog_data *wm_prog_data =
+         (const struct brw_wm_prog_data *)prog_data;
+      return !wm_prog_data->persample_dispatch;
+   }
+   case MESA_SHADER_COMPUTE:
+      /* Compute shaders will be spawned with either a fully enabled dispatch
+       * mask or with whatever bottom/right execution mask was given to the
+       * GPGPU walker command to be used along the workgroup edges -- In both
+       * cases the dispatch mask is required to be tightly packed for our
+       * invocation index calculations to work.
+       */
+      return true;
+   default:
+      /* Most remaining fixed functions are limited to use a packed dispatch
+       * mask due to the hardware representation of the dispatch mask as a
+       * single counter representing the number of enabled channels.
+       */
+      return true;
+   }
+}
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
diff --git a/src/intel/compiler/brw_dead_control_flow.cpp b/src/intel/compiler/brw_dead_control_flow.cpp
new file mode 100644
index 00000000000..114dc6cb212
--- /dev/null
+++ b/src/intel/compiler/brw_dead_control_flow.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_dead_control_flow.cpp
+ *
+ * This file implements the dead control flow elimination optimization pass.
+ */
+
+#include "brw_shader.h"
+#include "brw_cfg.h"
+
+/* Look for and eliminate dead (empty-bodied) control flow; returns whether
+ * anything was changed:
+ *   - if/endif: both instructions are removed
+ *   - else in else/endif: the ELSE is removed
+ *   - then in if/else/endif: the ELSE is removed, the IF predicate inverted
+ */
+bool
+dead_control_flow_eliminate(backend_shader *s)
+{
+   bool progress = false;
+
+   foreach_block_safe (block, s->cfg) {
+      bblock_t *prev_block = block->prev();
+
+      if (!prev_block)
+         continue;
+
+      backend_instruction *const inst = block->start();
+      backend_instruction *const prev_inst = prev_block->end();
+
+      /* ENDIF instructions, by definition, can only be found at the start of
+       * basic blocks.
+       */
+      if (inst->opcode == BRW_OPCODE_ENDIF &&
+          prev_inst->opcode == BRW_OPCODE_ELSE) {
+         bblock_t *const else_block = prev_block;
+         backend_instruction *const else_inst = prev_inst;
+
+         else_inst->remove(else_block);
+         progress = true;
+      } else if (inst->opcode == BRW_OPCODE_ENDIF &&
+                 prev_inst->opcode == BRW_OPCODE_IF) {
+         bblock_t *const endif_block = block;
+         bblock_t *const if_block = prev_block;
+         backend_instruction *const endif_inst = inst;
+         backend_instruction *const if_inst = prev_inst;
+
+         bblock_t *earlier_block = NULL, *later_block = NULL;
+
+         if (if_block->start_ip == if_block->end_ip) {
+            earlier_block = if_block->prev();
+         } else {
+            earlier_block = if_block;
+         }
+         if_inst->remove(if_block);
+
+         if (endif_block->start_ip == endif_block->end_ip) {
+            later_block = endif_block->next();
+         } else {
+            later_block = endif_block;
+         }
+         endif_inst->remove(endif_block);
+
+         assert((earlier_block == NULL) == (later_block == NULL));
+         if (earlier_block && earlier_block->can_combine_with(later_block)) {
+            earlier_block->combine_with(later_block);
+
+            /* If ENDIF was in its own block, then we've now deleted it and
+             * merged the two surrounding blocks, the latter of which the
+             * __next block pointer was pointing to.
+             */
+            if (endif_block != later_block) {
+               __next = earlier_block->next();
+            }
+         }
+
+         progress = true;
+      } else if (inst->opcode == BRW_OPCODE_ELSE &&
+                 prev_inst->opcode == BRW_OPCODE_IF) {
+         bblock_t *const else_block = block;
+         backend_instruction *const if_inst = prev_inst;
+         backend_instruction *const else_inst = inst;
+
+         /* Since the else-branch is becoming the new then-branch, the
+          * condition has to be inverted.
+          */
+         if_inst->predicate_inverse = !if_inst->predicate_inverse;
+         else_inst->remove(else_block);
+
+         progress = true;
+      }
+   }
+
+   if (progress)
+      s->invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_dead_control_flow.h b/src/intel/compiler/brw_dead_control_flow.h
new file mode 100644
index 00000000000..83fd9b1e79e
--- /dev/null
+++ b/src/intel/compiler/brw_dead_control_flow.h
@@ -0,0 +1,26 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_shader.h"
+
+bool dead_control_flow_eliminate(backend_shader *s);
diff --git a/src/intel/compiler/brw_disasm.c b/src/intel/compiler/brw_disasm.c
new file mode 100644
index 00000000000..536a003dcbe
--- /dev/null
+++ b/src/intel/compiler/brw_disasm.c
@@ -0,0 +1,1646 @@
+/*
+ * Copyright © 2008 Keith Packard
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that copyright
+ * notice and this permission notice appear in supporting documentation, and
+ * that the name of the copyright holders not be used in advertising or
+ * publicity pertaining to distribution of the software without specific,
+ * written prior permission.  The copyright holders make no representations
+ * about the suitability of this software for any purpose.  It is provided "as
+ * is" without express or implied warranty.
+ *
+ * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO
+ * EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <stdarg.h>
+
+#include "brw_eu_defines.h"
+#include "brw_inst.h"
+#include "brw_shader.h"
+#include "brw_reg.h"
+#include "brw_inst.h"
+#include "brw_eu.h"
+
+static bool
+has_jip(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   switch (opcode) {
+   case BRW_OPCODE_IF:
+   case BRW_OPCODE_ELSE:
+   case BRW_OPCODE_ENDIF:
+   case BRW_OPCODE_WHILE:
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+   case BRW_OPCODE_HALT: return devinfo->gen >= 6;
+   default:              return false;
+   }
+}
+
+static bool
+has_uip(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   switch (opcode) {
+   case BRW_OPCODE_IF:   return devinfo->gen >= 7;
+   case BRW_OPCODE_ELSE: return devinfo->gen >= 8;
+   case BRW_OPCODE_BREAK:
+   case BRW_OPCODE_CONTINUE:
+   case BRW_OPCODE_HALT: return devinfo->gen >= 6;
+   default:              return false;
+   }
+}
+
+static bool
+has_branch_ctrl(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   /* Only IF and ELSE qualify; BRW_OPCODE_GOTO is not checked here. */
+   const bool is_if_or_else = opcode == BRW_OPCODE_IF ||
+                              opcode == BRW_OPCODE_ELSE;
+
+   /* Never reported before gen8. */
+   return devinfo->gen >= 8 && is_if_or_else;
+}
+
+static bool
+is_logic_instruction(unsigned opcode)
+{
+   /* True only for the AND, NOT, OR and XOR opcodes. */
+   const bool and_not = opcode == BRW_OPCODE_AND || opcode == BRW_OPCODE_NOT;
+   const bool or_xor = opcode == BRW_OPCODE_OR || opcode == BRW_OPCODE_XOR;
+   return and_not || or_xor;
+}
+
+const char *const conditional_modifier[16] = {
+   [BRW_CONDITIONAL_NONE] = "",
+   [BRW_CONDITIONAL_Z]    = ".z",
+   [BRW_CONDITIONAL_NZ]   = ".nz",
+   [BRW_CONDITIONAL_G]    = ".g",
+   [BRW_CONDITIONAL_GE]   = ".ge",
+   [BRW_CONDITIONAL_L]    = ".l",
+   [BRW_CONDITIONAL_LE]   = ".le",
+   [BRW_CONDITIONAL_R]    = ".r",
+   [BRW_CONDITIONAL_O]    = ".o",
+   [BRW_CONDITIONAL_U]    = ".u",
+};
+
+static const char *const m_negate[2] = {
+   [0] = "",
+   [1] = "-",
+};
+
+static const char *const _abs[2] = {
+   [0] = "",
+   [1] = "(abs)",
+};
+
+static const char *const m_bitnot[2] = { "", "~" };
+
+static const char *const vert_stride[16] = {
+   [0] = "0",
+   [1] = "1",
+   [2] = "2",
+   [3] = "4",
+   [4] = "8",
+   [5] = "16",
+   [6] = "32",
+   [15] = "VxH",
+};
+
+static const char *const width[8] = {
+   [0] = "1",
+   [1] = "2",
+   [2] = "4",
+   [3] = "8",
+   [4] = "16",
+};
+
+static const char *const horiz_stride[4] = {
+   [0] = "0",
+   [1] = "1",
+   [2] = "2",
+   [3] = "4"
+};
+
+static const char *const chan_sel[4] = {
+   [0] = "x",
+   [1] = "y",
+   [2] = "z",
+   [3] = "w",
+};
+
+static const char *const debug_ctrl[2] = {
+   [0] = "",
+   [1] = ".breakpoint"
+};
+
+static const char *const saturate[2] = {
+   [0] = "",
+   [1] = ".sat"
+};
+
+static const char *const cmpt_ctrl[2] = {
+   [0] = "",
+   [1] = "compacted"
+};
+
+static const char *const accwr[2] = {
+   [0] = "",
+   [1] = "AccWrEnable"
+};
+
+static const char *const branch_ctrl[2] = {
+   [0] = "",
+   [1] = "BranchCtrl"
+};
+
+static const char *const wectrl[2] = {
+   [0] = "",
+   [1] = "WE_all"
+};
+
+static const char *const exec_size[8] = {
+   [0] = "1",
+   [1] = "2",
+   [2] = "4",
+   [3] = "8",
+   [4] = "16",
+   [5] = "32"
+};
+
+static const char *const pred_inv[2] = {
+   [0] = "+",
+   [1] = "-"
+};
+
+const char *const pred_ctrl_align16[16] = {
+   [1] = "",
+   [2] = ".x",
+   [3] = ".y",
+   [4] = ".z",
+   [5] = ".w",
+   [6] = ".any4h",
+   [7] = ".all4h",
+};
+
+static const char *const pred_ctrl_align1[16] = {
+   [BRW_PREDICATE_NORMAL]        = "",
+   [BRW_PREDICATE_ALIGN1_ANYV]   = ".anyv",
+   [BRW_PREDICATE_ALIGN1_ALLV]   = ".allv",
+   [BRW_PREDICATE_ALIGN1_ANY2H]  = ".any2h",
+   [BRW_PREDICATE_ALIGN1_ALL2H]  = ".all2h",
+   [BRW_PREDICATE_ALIGN1_ANY4H]  = ".any4h",
+   [BRW_PREDICATE_ALIGN1_ALL4H]  = ".all4h",
+   [BRW_PREDICATE_ALIGN1_ANY8H]  = ".any8h",
+   [BRW_PREDICATE_ALIGN1_ALL8H]  = ".all8h",
+   [BRW_PREDICATE_ALIGN1_ANY16H] = ".any16h",
+   [BRW_PREDICATE_ALIGN1_ALL16H] = ".all16h",
+   [BRW_PREDICATE_ALIGN1_ANY32H] = ".any32h",
+   [BRW_PREDICATE_ALIGN1_ALL32H] = ".all32h",
+};
+
+static const char *const thread_ctrl[4] = {
+   [BRW_THREAD_NORMAL] = "",
+   [BRW_THREAD_ATOMIC] = "atomic",
+   [BRW_THREAD_SWITCH] = "switch",
+};
+
+static const char *const compr_ctrl[4] = {
+   [0] = "",
+   [1] = "sechalf",
+   [2] = "compr",
+   [3] = "compr4",
+};
+
+static const char *const dep_ctrl[4] = {
+   [0] = "",
+   [1] = "NoDDClr",
+   [2] = "NoDDChk",
+   [3] = "NoDDClr,NoDDChk",
+};
+
+static const char *const mask_ctrl[4] = {
+   [0] = "",
+   [1] = "nomask",
+};
+
+static const char *const access_mode[2] = {
+   [0] = "align1",
+   [1] = "align16",
+};
+
+static const char * const reg_encoding[] = { /* hardware register-type code -> type suffix; unsized, so it ends at the largest index named here */
+   [BRW_HW_REG_TYPE_UD]          = "UD",
+   [BRW_HW_REG_TYPE_D]           = "D",
+   [BRW_HW_REG_TYPE_UW]          = "UW",
+   [BRW_HW_REG_TYPE_W]           = "W",
+   [BRW_HW_REG_NON_IMM_TYPE_UB]  = "UB",
+   [BRW_HW_REG_NON_IMM_TYPE_B]   = "B",
+   [GEN7_HW_REG_NON_IMM_TYPE_DF] = "DF",
+   [BRW_HW_REG_TYPE_F]           = "F",
+   [GEN8_HW_REG_TYPE_UQ]         = "UQ",
+   [GEN8_HW_REG_TYPE_Q]          = "Q",
+   [GEN8_HW_REG_NON_IMM_TYPE_HF] = "HF",
+};
+/*
+ * Type suffixes for three-source instructions.  Also unsized; control()
+ * performs no bounds check, so indices must not exceed the largest
+ * BRW_3SRC_TYPE_* value listed.
+ * NOTE(review): confirm the encoded field cannot hold a larger value.
+ */
+static const char *const three_source_reg_encoding[] = {
+   [BRW_3SRC_TYPE_F]  = "F",
+   [BRW_3SRC_TYPE_D]  = "D",
+   [BRW_3SRC_TYPE_UD] = "UD",
+   [BRW_3SRC_TYPE_DF] = "DF",
+};
+/*
+ * Prefix printed before a register number for each register-file value;
+ * reg() uses it for everything except the architecture register file.
+ */
+static const char *const reg_file[4] = {
+   [0] = "A",
+   [1] = "g",
+   [2] = "m",
+   [3] = "imm",
+};
+/*
+ * Write-mask suffix for each 4-bit mask, one bit per x/y/z/w channel.  The
+ * full mask (0xf) prints nothing and the empty mask prints a bare ".".
+ */
+static const char *const writemask[16] = {
+   [0x0] = ".",
+   [0x1] = ".x",
+   [0x2] = ".y",
+   [0x3] = ".xy",
+   [0x4] = ".z",
+   [0x5] = ".xz",
+   [0x6] = ".yz",
+   [0x7] = ".xyz",
+   [0x8] = ".w",
+   [0x9] = ".xw",
+   [0xa] = ".yw",
+   [0xb] = ".xyw",
+   [0xc] = ".zw",
+   [0xd] = ".xzw",
+   [0xe] = ".yzw",
+   [0xf] = "",
+};
+/*
+ * Keyword for the end-of-thread flag.
+ */
+static const char *const end_of_thread[2] = {
+   [0] = "",
+   [1] = "EOT"
+};
+
+/* Shared-function ID names on Gen4-5; slots not named here stay NULL. */
+static const char *const gen4_sfid[16] = {
+   [BRW_SFID_NULL]            = "null",
+   [BRW_SFID_MATH]            = "math",
+   [BRW_SFID_SAMPLER]         = "sampler",
+   [BRW_SFID_MESSAGE_GATEWAY] = "gateway",
+   [BRW_SFID_DATAPORT_READ]   = "read",
+   [BRW_SFID_DATAPORT_WRITE]  = "write",
+   [BRW_SFID_URB]             = "urb",
+   [BRW_SFID_THREAD_SPAWNER]  = "thread_spawner",
+   [BRW_SFID_VME]             = "vme",
+};
+/*
+ * Shared-function ID names for the GEN6/GEN7/HSW constants.  Note that the
+ * sampler and the sampler-cache dataport both print as "sampler".
+ */
+static const char *const gen6_sfid[16] = {
+   [BRW_SFID_NULL]                     = "null",
+   [BRW_SFID_MATH]                     = "math",
+   [BRW_SFID_SAMPLER]                  = "sampler",
+   [BRW_SFID_MESSAGE_GATEWAY]          = "gateway",
+   [BRW_SFID_URB]                      = "urb",
+   [BRW_SFID_THREAD_SPAWNER]           = "thread_spawner",
+   [GEN6_SFID_DATAPORT_SAMPLER_CACHE]  = "sampler",
+   [GEN6_SFID_DATAPORT_RENDER_CACHE]   = "render",
+   [GEN6_SFID_DATAPORT_CONSTANT_CACHE] = "const",
+   [GEN7_SFID_DATAPORT_DATA_CACHE]     = "data",
+   [GEN7_SFID_PIXEL_INTERPOLATOR]      = "pixel interp",
+   [HSW_SFID_DATAPORT_DATA_CACHE_1]    = "dp data 1",
+   [HSW_SFID_CRE]                      = "cre",
+};
+/*
+ * Message-gateway sub-function names, keyed by BRW_MESSAGE_GATEWAY_SFID_*.
+ */
+static const char *const gen7_gateway_subfuncid[8] = {
+   [BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY] = "open",
+   [BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY] = "close",
+   [BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG] = "forward msg",
+   [BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP] = "get timestamp",
+   [BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG] = "barrier msg",
+   [BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE] = "update state",
+   [BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE] = "mmio read/write",
+};
+
+static const char *const gen4_dp_read_port_msg_type[4] = { /* 2-bit dataport read message type -> name */
+   [0b00] = "OWord Block Read",
+   [0b01] = "OWord Dual Block Read",
+   [0b10] = "Media Block Read",
+   [0b11] = "DWord Scattered Read",
+};
+/*
+ * 3-bit dataport read message types (G45, per the name).  Codes 0b101 and
+ * 0b111 have no entry and stay NULL.
+ */
+static const char *const g45_dp_read_port_msg_type[8] = {
+   [0b000] = "OWord Block Read",
+   [0b010] = "OWord Dual Block Read",
+   [0b100] = "Media Block Read",
+   [0b110] = "DWord Scattered Read",
+   [0b001] = "Render Target UNORM Read",
+   [0b011] = "AVC Loop Filter Read",
+};
+/*
+ * 3-bit dataport write message types.  dp_rc_msg_type() also returns this
+ * table for hardware older than Gen6.
+ */
+static const char *const dp_write_port_msg_type[8] = {
+   [0b000] = "OWord block write",
+   [0b001] = "OWord dual block write",
+   [0b010] = "media block write",
+   [0b011] = "DWord scattered write",
+   [0b100] = "RT write",
+   [0b101] = "streamed VB write",
+   [0b110] = "RT UNORM write", /* G45+ */
+   [0b111] = "flush render cache",
+};
+/*
+ * Render-cache dataport message names selected by dp_rc_msg_type() for
+ * Gen6; read and write message codes share the one table.
+ */
+static const char *const dp_rc_msg_type_gen6[16] = {
+   [BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ] = "OWORD block read",
+   [GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ] = "RT UNORM read",
+   [GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ] = "OWORD dual block read",
+   [GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ] = "media block read",
+   [GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ] =
+      "OWORD unaligned block read",
+   [GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ] = "DWORD scattered read",
+   [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE] = "DWORD atomic write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE] = "OWORD block write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE] =
+      "OWORD dual block write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE] = "media block write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE] =
+      "DWORD scattered write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE] = "RT write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE] = "streamed VB write",
+   [GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE] = "RT UNORM write",
+};
+/*
+ * Render-cache dataport message names selected by dp_rc_msg_type() for
+ * Gen7 and Gen8.
+ */
+static const char *const dp_rc_msg_type_gen7[16] = {
+   [GEN7_DATAPORT_RC_MEDIA_BLOCK_READ] = "media block read",
+   [GEN7_DATAPORT_RC_TYPED_SURFACE_READ] = "typed surface read",
+   [GEN7_DATAPORT_RC_TYPED_ATOMIC_OP] = "typed atomic op",
+   [GEN7_DATAPORT_RC_MEMORY_FENCE] = "memory fence",
+   [GEN7_DATAPORT_RC_MEDIA_BLOCK_WRITE] = "media block write",
+   [GEN7_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write",
+   [GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE] = "typed surface write"
+};
+/*
+ * Render-cache dataport message names selected by dp_rc_msg_type() for
+ * Gen9 and later; only the two render-target messages are named.
+ */
+static const char *const dp_rc_msg_type_gen9[16] = {
+   [GEN9_DATAPORT_RC_RENDER_TARGET_WRITE] = "RT write",
+   [GEN9_DATAPORT_RC_RENDER_TARGET_READ] = "RT read"
+};
+
+/* Pick the render-cache dataport message-name table that matches the
+ * hardware generation in devinfo.  Hardware older than Gen6 uses the plain
+ * dataport write-port table.
+ */
+static const char *const *
+dp_rc_msg_type(const struct gen_device_info *devinfo)
+{
+   if (devinfo->gen >= 9)
+      return dp_rc_msg_type_gen9;
+   if (devinfo->gen >= 7)
+      return dp_rc_msg_type_gen7;
+   if (devinfo->gen >= 6)
+      return dp_rc_msg_type_gen6;
+   return dp_write_port_msg_type;
+}
+
+static const char *const m_rt_write_subtype[] = { /* 3-bit render-target write subtype -> name; 0b110 has no entry and stays NULL */
+   [0b000] = "SIMD16",
+   [0b001] = "SIMD16/RepData",
+   [0b010] = "SIMD8/DualSrcLow",
+   [0b011] = "SIMD8/DualSrcHigh",
+   [0b100] = "SIMD8",
+   [0b101] = "SIMD8/ImageWrite",   /* Gen6+ */
+   [0b111] = "SIMD16/RepData-111", /* no idea how this is different than 1 */
+};
+
+/* Data-cache dataport message names, keyed by GEN7_DATAPORT_DC_*.  Slots
+ * not named here stay NULL, which control() reports as an invalid value.
+ */
+static const char *const dp_dc0_msg_type_gen7[16] = {
+   [GEN7_DATAPORT_DC_OWORD_BLOCK_READ] = "DC OWORD block read",
+   [GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ] =
+      "DC unaligned OWORD block read",
+   [GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ] = "DC OWORD dual block read",
+   [GEN7_DATAPORT_DC_DWORD_SCATTERED_READ] = "DC DWORD scattered read",
+   [GEN7_DATAPORT_DC_BYTE_SCATTERED_READ] = "DC byte scattered read",
+   [GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ] = "DC untyped surface read",
+   [GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP] = "DC untyped atomic",
+   [GEN7_DATAPORT_DC_MEMORY_FENCE] = "DC mfence",
+   [GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE] = "DC OWORD block write",
+   [GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE] = "DC OWORD dual block write",
+   /* Spelling corrected from "scatterd" to match the other entries. */
+   [GEN7_DATAPORT_DC_DWORD_SCATTERED_WRITE] = "DC DWORD scattered write",
+   [GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE] = "DC byte scattered write",
+   [GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE] = "DC untyped surface write",
+};
+
+static const char *const dp_dc1_msg_type_hsw[16] = { /* HSW_DATAPORT_DC_PORT1_* message type -> name; the first entry lacks the "DC " prefix the others carry */
+   [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ] = "untyped surface read",
+   [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP] = "DC untyped atomic op",
+   [HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2] =
+      "DC untyped 4x2 atomic op",
+   [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ] = "DC media block read",
+   [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ] = "DC typed surface read",
+   [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP] = "DC typed atomic",
+   [HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2] = "DC typed 4x2 atomic op",
+   [HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE] = "DC untyped surface write",
+   [HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE] = "DC media block write",
+   [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP] = "DC atomic counter op",
+   [HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2] =
+      "DC 4x2 atomic counter op",
+   [HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE] = "DC typed surface write",
+};
+
+static const char *const aop[16] = { /* BRW_AOP_* atomic operation code -> mnemonic; unlisted slots stay NULL */
+   [BRW_AOP_AND]    = "and",
+   [BRW_AOP_OR]     = "or",
+   [BRW_AOP_XOR]    = "xor",
+   [BRW_AOP_MOV]    = "mov",
+   [BRW_AOP_INC]    = "inc",
+   [BRW_AOP_DEC]    = "dec",
+   [BRW_AOP_ADD]    = "add",
+   [BRW_AOP_SUB]    = "sub",
+   [BRW_AOP_REVSUB] = "revsub",
+   [BRW_AOP_IMAX]   = "imax",
+   [BRW_AOP_IMIN]   = "imin",
+   [BRW_AOP_UMAX]   = "umax",
+   [BRW_AOP_UMIN]   = "umin",
+   [BRW_AOP_CMPWR]  = "cmpwr",
+   [BRW_AOP_PREDEC] = "predec",
+};
+/*
+ * Pixel-interpolator message names, keyed by GEN7_PIXEL_INTERPOLATOR_LOC_*.
+ */
+static const char * const pixel_interpolator_msg_types[4] = {
+    [GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET] = "per_message_offset",
+    [GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE] = "sample_position",
+    [GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID] = "centroid",
+    [GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET] = "per_slot_offset",
+};
+
+static const char *const math_function[16] = { /* math function code -> mnemonic; unlisted slots stay NULL */
+   [BRW_MATH_FUNCTION_INV]    = "inv",
+   [BRW_MATH_FUNCTION_LOG]    = "log",
+   [BRW_MATH_FUNCTION_EXP]    = "exp",
+   [BRW_MATH_FUNCTION_SQRT]   = "sqrt",
+   [BRW_MATH_FUNCTION_RSQ]    = "rsq",
+   [BRW_MATH_FUNCTION_SIN]    = "sin",
+   [BRW_MATH_FUNCTION_COS]    = "cos",
+   [BRW_MATH_FUNCTION_SINCOS] = "sincos",
+   [BRW_MATH_FUNCTION_FDIV]   = "fdiv",
+   [BRW_MATH_FUNCTION_POW]    = "pow",
+   [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER] = "intdivmod",
+   [BRW_MATH_FUNCTION_INT_DIV_QUOTIENT]  = "intdiv",
+   [BRW_MATH_FUNCTION_INT_DIV_REMAINDER] = "intmod",
+   [GEN8_MATH_FUNCTION_INVM]  = "invm",
+   [GEN8_MATH_FUNCTION_RSQRTM] = "rsqrtm",
+};
+/*
+ * Keyword for the math saturate flag.
+ */
+static const char *const math_saturate[2] = {
+   [0] = "",
+   [1] = "sat"
+};
+/*
+ * Keyword for the math signed flag.
+ */
+static const char *const math_signed[2] = {
+   [0] = "",
+   [1] = "signed"
+};
+/*
+ * Keyword for the math scalar flag.
+ */
+static const char *const math_scalar[2] = {
+   [0] = "",
+   [1] = "scalar"
+};
+/*
+ * Keyword for the math precision flag.
+ */
+static const char *const math_precision[2] = {
+   [0] = "",
+   [1] = "partial_precision"
+};
+
+static const char *const gen5_urb_opcode[] = { /* URB opcode -> name; unsized, so only indices 0 and 1 exist */
+   [0] = "urb_write",
+   [1] = "ff_sync",
+};
+/*
+ * URB opcode names keyed by the BRW/GEN7/GEN8 URB opcode constants.  The
+ * array is unsized, so it stops at the largest opcode named here and
+ * control() does not bounds-check its index.
+ * NOTE(review): confirm callers never pass one of the reserved opcodes.
+ */
+static const char *const gen7_urb_opcode[] = {
+   [BRW_URB_OPCODE_WRITE_HWORD] = "write HWord",
+   [BRW_URB_OPCODE_WRITE_OWORD] = "write OWord",
+   [BRW_URB_OPCODE_READ_HWORD] = "read HWord",
+   [BRW_URB_OPCODE_READ_OWORD] = "read OWord",
+   [GEN7_URB_OPCODE_ATOMIC_MOV] = "atomic mov",  /* Gen7+ */
+   [GEN7_URB_OPCODE_ATOMIC_INC] = "atomic inc",  /* Gen7+ */
+   [GEN8_URB_OPCODE_ATOMIC_ADD] = "atomic add",  /* Gen8+ */
+   [GEN8_URB_OPCODE_SIMD8_WRITE] = "SIMD8 write", /* Gen8+ */
+   [GEN8_URB_OPCODE_SIMD8_READ] = "SIMD8 read",  /* Gen8+ */
+   /* [9-15] - reserved */
+};
+/*
+ * URB swizzle-control keywords, keyed by BRW_URB_SWIZZLE_*.
+ */
+static const char *const urb_swizzle[4] = {
+   [BRW_URB_SWIZZLE_NONE]       = "",
+   [BRW_URB_SWIZZLE_INTERLEAVE] = "interleave",
+   [BRW_URB_SWIZZLE_TRANSPOSE]  = "transpose",
+};
+/*
+ * Keyword for the URB allocate flag.
+ */
+static const char *const urb_allocate[2] = {
+   [0] = "",
+   [1] = "allocate"
+};
+/*
+ * Keyword for the URB used flag.
+ */
+static const char *const urb_used[2] = {
+   [0] = "",
+   [1] = "used"
+};
+/*
+ * Keyword for the URB complete flag.
+ */
+static const char *const urb_complete[2] = {
+   [0] = "",
+   [1] = "complete"
+};
+
+static const char *const gen5_sampler_msg_type[] = { /* sampler message type -> mnemonic; unsized, ends at the largest constant named here */
+   [GEN5_SAMPLER_MESSAGE_SAMPLE]              = "sample",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS]         = "sample_b",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_LOD]          = "sample_l",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE]      = "sample_c",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS]       = "sample_d",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE] = "sample_b_c",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE]  = "sample_l_c",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_LD]           = "ld",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4]      = "gather4",
+   [GEN5_SAMPLER_MESSAGE_LOD]                 = "lod",
+   [GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO]      = "resinfo",
+   [GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO]   = "sampleinfo",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C]    = "gather4_c",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO]   = "gather4_po",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C] = "gather4_po_c",
+   [HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE] = "sample_d_c",
+   [GEN9_SAMPLER_MESSAGE_SAMPLE_LZ]           = "sample_lz",
+   [GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ]         = "sample_c_lz",
+   [GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ]        = "ld_lz",
+   [GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W]     = "ld2dms_w",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS]       = "ld_mcs",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS]       = "ld2dms",
+   [GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS]       = "ld2dss",
+};
+/*
+ * Sampler SIMD mode names, keyed by BRW_SAMPLER_SIMD_MODE_*.
+ */
+static const char *const gen5_sampler_simd_mode[4] = {
+   [BRW_SAMPLER_SIMD_MODE_SIMD4X2]   = "SIMD4x2",
+   [BRW_SAMPLER_SIMD_MODE_SIMD8]     = "SIMD8",
+   [BRW_SAMPLER_SIMD_MODE_SIMD16]    = "SIMD16",
+   [BRW_SAMPLER_SIMD_MODE_SIMD32_64] = "SIMD32/64",
+};
+/*
+ * Sampler return-format suffixes.  Index 1 has no entry and stays NULL.
+ */
+static const char *const sampler_target_format[4] = {
+   [0] = "F",
+   [2] = "UD",
+   [3] = "D"
+};
+
+
+static int column; /* characters written on the current output line: advanced by string(), reset by newline(), read by pad() */
+
+/* Write a NUL-terminated string to file and advance the running column
+ * count by its length.  Always returns 0.
+ */
+static int
+string(FILE *file, const char *string)
+{
+   const size_t len = strlen(string);
+
+   fputs(string, file);
+   column += len;
+   return 0;
+}
+
+/* printf-style front end to string(): the text is rendered into a fixed
+ * 1024-byte stack buffer (longer output is truncated) and then emitted so
+ * that the column count stays up to date.  Always returns 0.
+ */
+static int
+format(FILE *f, const char *format, ...) PRINTFLIKE(2, 3);
+
+static int
+format(FILE *f, const char *format, ...)
+{
+   va_list ap;
+   char text[1024];
+
+   va_start(ap, format);
+   vsnprintf(text, sizeof(text) - 1, format, ap);
+   va_end(ap);
+
+   string(f, text);
+   return 0;
+}
+
+/* Finish the current output line and restart column counting.  Always
+ * returns 0.
+ */
+static int
+newline(FILE *f)
+{
+   column = 0;
+   fputc('\n', f);
+   return 0;
+}
+
+/* Emit spaces until the output reaches column c.  At least one space is
+ * always written, even when the line is already at or past c.  Always
+ * returns 0.
+ */
+static int
+pad(FILE *f, int c)
+{
+   string(f, " ");
+   while (column < c)
+      string(f, " ");
+   return 0;
+}
+
+/* Print the name that table ctrl associates with field value id.
+ *
+ * file  - output stream
+ * name  - human-readable field name, used only in the error message
+ * ctrl  - lookup table of strings; a NULL slot marks an invalid value
+ * id    - index into ctrl (not bounds-checked here; callers must pass a
+ *         value that lies inside the table)
+ * space - optional separator state: when non-NULL and already set, a space
+ *         is printed before the name, and it is set once a non-empty name
+ *         has been printed
+ *
+ * Returns 0 on success, 1 when ctrl[id] is NULL.
+ */
+static int
+control(FILE *file, const char *name, const char *const ctrl[],
+        unsigned id, int *space)
+{
+   const char *text = ctrl[id];
+
+   if (!text) {
+      /* Go through format() rather than fprintf() so the column count that
+       * pad() relies on also covers the diagnostic text.
+       */
+      format(file, "*** invalid %s value %u ", name, id);
+      return 1;
+   }
+
+   /* An empty string means "valid, but nothing to print". */
+   if (text[0]) {
+      if (space && *space)
+         string(file, " ");
+      string(file, text);
+      if (space)
+         *space = 1;
+   }
+   return 0;
+}
+
+/* Print the mnemonic of opcode id for the given device.  Returns 0 on
+ * success; when brw_opcode_desc() has no description for the opcode an
+ * error marker is printed instead and 1 is returned.
+ */
+static int
+print_opcode(FILE *file, const struct gen_device_info *devinfo,
+             enum opcode id)
+{
+   const struct opcode_desc *info = brw_opcode_desc(devinfo, id);
+
+   if (info) {
+      string(file, info->name);
+      return 0;
+   }
+
+   format(file, "*** invalid opcode value %d ", id);
+   return 1;
+}
+
+/* Print a register name for the given register file and number.
+ *
+ * Architecture registers are identified by the high nibble of the number
+ * and, for most of them, indexed by the low nibble.  Registers in any other
+ * file are printed as the reg_file prefix followed by the number.
+ *
+ * Returns -1 after printing "ip" or "tdr0"; callers in this file stop
+ * printing the operand when they see that.  Otherwise returns 0 for an
+ * architecture register, or the result of the register-file lookup.
+ */
+static int
+reg(FILE *file, unsigned _reg_file, unsigned _reg_nr)
+{
+   /* Clear the Compr4 instruction compression bit. */
+   if (_reg_file == BRW_MESSAGE_REGISTER_FILE)
+      _reg_nr &= ~BRW_MRF_COMPR4;
+
+   if (_reg_file != BRW_ARCHITECTURE_REGISTER_FILE) {
+      const int err = control(file, "src reg file", reg_file, _reg_file, NULL);
+      format(file, "%d", _reg_nr);
+      return err;
+   }
+
+   const unsigned idx = _reg_nr & 0x0f;
+
+   switch (_reg_nr & 0xf0) {
+   case BRW_ARF_NULL:
+      string(file, "null");
+      break;
+   case BRW_ARF_ADDRESS:
+      format(file, "a%d", idx);
+      break;
+   case BRW_ARF_ACCUMULATOR:
+      format(file, "acc%d", idx);
+      break;
+   case BRW_ARF_FLAG:
+      format(file, "f%d", idx);
+      break;
+   case BRW_ARF_MASK:
+      format(file, "mask%d", idx);
+      break;
+   case BRW_ARF_MASK_STACK:
+      format(file, "msd%d", idx);
+      break;
+   case BRW_ARF_STATE:
+      format(file, "sr%d", idx);
+      break;
+   case BRW_ARF_CONTROL:
+      format(file, "cr%d", idx);
+      break;
+   case BRW_ARF_NOTIFICATION_COUNT:
+      format(file, "n%d", idx);
+      break;
+   case BRW_ARF_IP:
+      string(file, "ip");
+      return -1;
+   case BRW_ARF_TDR:
+      format(file, "tdr0");
+      return -1;
+   case BRW_ARF_TIMESTAMP:
+      format(file, "tm%d", idx);
+      break;
+   default:
+      /* Unknown architecture register: print the raw number. */
+      format(file, "ARF%d", _reg_nr);
+      break;
+   }
+
+   return 0;
+}
+
+/* Print the destination operand of an instruction in the regular
+ * (non-three-source) encoding.
+ *
+ * Handles align1 direct, align1 register-indirect and align16 direct
+ * addressing; align16 indirect is reported as unsupported.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry or the
+ * addressing mode is unsupported.  When reg() returns -1 (after "ip" or
+ * "tdr0") nothing more is printed and 0 is returned.
+ */
+static int
+dest(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   unsigned elem_size = brw_element_size(devinfo, inst, dst);
+   int err = 0;
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+      if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
+         err |= reg(file, brw_inst_dst_reg_file(devinfo, inst),
+                    brw_inst_dst_da_reg_nr(devinfo, inst));
+         if (err == -1)
+            return 0;
+         /* The sub-register is printed in units of the element size. */
+         if (brw_inst_dst_da1_subreg_nr(devinfo, inst))
+            format(file, ".%"PRIu64, brw_inst_dst_da1_subreg_nr(devinfo, inst) /
+                   elem_size);
+         string(file, "<");
+         err |= control(file, "horiz stride", horiz_stride,
+                        brw_inst_dst_hstride(devinfo, inst), NULL);
+         string(file, ">");
+         err |= control(file, "dest reg encoding", reg_encoding,
+                        brw_inst_dst_reg_type(devinfo, inst), NULL);
+      } else {
+         string(file, "g[a0");
+         if (brw_inst_dst_ia_subreg_nr(devinfo, inst))
+            format(file, ".%"PRIu64, brw_inst_dst_ia_subreg_nr(devinfo, inst) /
+                   elem_size);
+         if (brw_inst_dst_ia1_addr_imm(devinfo, inst))
+            format(file, " %d", brw_inst_dst_ia1_addr_imm(devinfo, inst));
+         string(file, "]<");
+         err |= control(file, "horiz stride", horiz_stride,
+                        brw_inst_dst_hstride(devinfo, inst), NULL);
+         string(file, ">");
+         err |= control(file, "dest reg encoding", reg_encoding,
+                        brw_inst_dst_reg_type(devinfo, inst), NULL);
+      }
+   } else {
+      if (brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
+         err |= reg(file, brw_inst_dst_reg_file(devinfo, inst),
+                    brw_inst_dst_da_reg_nr(devinfo, inst));
+         if (err == -1)
+            return 0;
+         /* A set sub-register field is shown as the element index of byte
+          * offset 16.
+          */
+         if (brw_inst_dst_da16_subreg_nr(devinfo, inst))
+            format(file, ".%u", 16 / elem_size);
+         string(file, "<1>");
+         err |= control(file, "writemask", writemask,
+                        brw_inst_da16_writemask(devinfo, inst), NULL);
+         err |= control(file, "dest reg encoding", reg_encoding,
+                        brw_inst_dst_reg_type(devinfo, inst), NULL);
+      } else {
+         err = 1;
+         string(file, "Indirect align16 address mode not supported");
+      }
+   }
+
+   /* Report accumulated decode errors instead of discarding them, as the
+    * source-operand printers do.
+    */
+   return err;
+}
+
+/* Print the destination operand of a three-source instruction.
+ *
+ * The destination is a general register, except on Gen6 where a set
+ * destination register-file bit selects the message register file.
+ *
+ * Returns 0 on success and non-zero when the writemask or type field had no
+ * table entry.
+ */
+static int
+dest_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   int err = 0;
+   uint32_t reg_file;
+
+   if (devinfo->gen == 6 && brw_inst_3src_dst_reg_file(devinfo, inst))
+      reg_file = BRW_MESSAGE_REGISTER_FILE;
+   else
+      reg_file = BRW_GENERAL_REGISTER_FILE;
+
+   err |= reg(file, reg_file, brw_inst_3src_dst_reg_nr(devinfo, inst));
+   if (err == -1)
+      return 0;
+   if (brw_inst_3src_dst_subreg_nr(devinfo, inst))
+      format(file, ".%"PRIu64, brw_inst_3src_dst_subreg_nr(devinfo, inst));
+   string(file, "<1>");
+   err |= control(file, "writemask", writemask,
+                  brw_inst_3src_dst_writemask(devinfo, inst), NULL);
+   err |= control(file, "dest reg encoding", three_source_reg_encoding,
+                  brw_inst_3src_dst_type(devinfo, inst), NULL);
+
+   /* Report accumulated decode errors instead of discarding them. */
+   return err;
+}
+
+/* Print an align1 source region as "<vstride,width,hstride>".  Returns
+ * non-zero when any of the three fields has no table entry.
+ */
+static int
+src_align1_region(FILE *file,
+                  unsigned _vert_stride, unsigned _width,
+                  unsigned _horiz_stride)
+{
+   int err;
+
+   string(file, "<");
+   err = control(file, "vert stride", vert_stride, _vert_stride, NULL);
+
+   string(file, ",");
+   err |= control(file, "width", width, _width, NULL);
+
+   string(file, ",");
+   err |= control(file, "horiz_stride", horiz_stride, _horiz_stride, NULL);
+
+   string(file, ">");
+   return err;
+}
+
+/* Print an align1, directly addressed source operand: optional negate (or
+ * bitwise-not for logic instructions on Gen8+) and abs modifiers, the
+ * register, the sub-register as an element index, the region and the type.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry.  When
+ * reg() returns -1 (after "ip" or "tdr0") nothing more is printed and 0 is
+ * returned.
+ */
+static int
+src_da1(FILE *file,
+        const struct gen_device_info *devinfo,
+        unsigned opcode,
+        unsigned type, unsigned _reg_file,
+        unsigned _vert_stride, unsigned _width, unsigned _horiz_stride,
+        unsigned reg_num, unsigned sub_reg_num, unsigned __abs,
+        unsigned _negate)
+{
+   int err = 0;
+
+   if (devinfo->gen >= 8 && is_logic_instruction(opcode))
+      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
+   else
+      err |= control(file, "negate", m_negate, _negate, NULL);
+
+   err |= control(file, "abs", _abs, __abs, NULL);
+
+   err |= reg(file, _reg_file, reg_num);
+   if (err == -1)
+      return 0;
+   if (sub_reg_num) {
+      /* NOTE(review): assumes brw_hw_reg_type_to_size() never returns 0
+       * for a type that reaches this point - confirm.
+       */
+      unsigned elem_size = brw_hw_reg_type_to_size(devinfo, type, _reg_file);
+      format(file, ".%d", sub_reg_num / elem_size);   /* use formal style like spec */
+   }
+   /* Keep any invalid-region error rather than dropping it. */
+   err |= src_align1_region(file, _vert_stride, _width, _horiz_stride);
+   err |= control(file, "src reg encoding", reg_encoding, type, NULL);
+   return err;
+}
+
+/* Print an align1, register-indirect source operand as "g[a0.N IMM]"
+ * followed by the region and the type, preceded by the negate (or
+ * bitwise-not for logic instructions on Gen8+) and abs modifiers.
+ *
+ * Note the region parameters arrive in hstride, width, vstride order.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry.
+ */
+static int
+src_ia1(FILE *file,
+        const struct gen_device_info *devinfo,
+        unsigned opcode,
+        unsigned type,
+        unsigned _reg_file,
+        int _addr_imm,
+        unsigned _addr_subreg_nr,
+        unsigned _negate,
+        unsigned __abs,
+        unsigned _horiz_stride, unsigned _width, unsigned _vert_stride)
+{
+   int err = 0;
+
+   if (devinfo->gen >= 8 && is_logic_instruction(opcode))
+      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
+   else
+      err |= control(file, "negate", m_negate, _negate, NULL);
+
+   err |= control(file, "abs", _abs, __abs, NULL);
+
+   string(file, "g[a0");
+   if (_addr_subreg_nr)
+      format(file, ".%d", _addr_subreg_nr);
+   if (_addr_imm)
+      format(file, " %d", _addr_imm);
+   string(file, "]");
+   /* Keep any invalid-region error rather than dropping it. */
+   err |= src_align1_region(file, _vert_stride, _width, _horiz_stride);
+   err |= control(file, "src reg encoding", reg_encoding, type, NULL);
+   return err;
+}
+
+/* Print a source swizzle suffix.  A replicated channel prints as a single
+ * selector, the identity swizzle prints nothing, and anything else prints
+ * all four selectors.  Returns non-zero when a selector has no table entry.
+ */
+static int
+src_swizzle(FILE *file, unsigned swiz)
+{
+   const unsigned sel[4] = {
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_X),
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_Y),
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_Z),
+      BRW_GET_SWZ(swiz, BRW_CHANNEL_W),
+   };
+   int err = 0;
+
+   if (sel[0] == sel[1] && sel[0] == sel[2] && sel[0] == sel[3]) {
+      string(file, ".");
+      err |= control(file, "channel select", chan_sel, sel[0], NULL);
+   } else if (swiz != BRW_SWIZZLE_XYZW) {
+      string(file, ".");
+      for (unsigned i = 0; i < 4; i++)
+         err |= control(file, "channel select", chan_sel, sel[i], NULL);
+   }
+   return err;
+}
+
+/* Print an align16, directly addressed source operand: modifiers, register,
+ * sub-register, vertical stride, swizzle and type.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry.  When
+ * reg() returns -1 (after "ip" or "tdr0") nothing more is printed and 0 is
+ * returned.
+ */
+static int
+src_da16(FILE *file,
+         const struct gen_device_info *devinfo,
+         unsigned opcode,
+         unsigned _reg_type,
+         unsigned _reg_file,
+         unsigned _vert_stride,
+         unsigned _reg_nr,
+         unsigned _subreg_nr,
+         unsigned __abs,
+         unsigned _negate,
+         unsigned swz_x, unsigned swz_y, unsigned swz_z, unsigned swz_w)
+{
+   int err = 0;
+
+   /* On Gen8+ the negate bit of a logic instruction means bitwise-not. */
+   if (devinfo->gen >= 8 && is_logic_instruction(opcode))
+      err |= control(file, "bitnot", m_bitnot, _negate, NULL);
+   else
+      err |= control(file, "negate", m_negate, _negate, NULL);
+
+   err |= control(file, "abs", _abs, __abs, NULL);
+
+   err |= reg(file, _reg_file, _reg_nr);
+   if (err == -1)
+      return 0;
+
+   if (_subreg_nr) {
+      const unsigned type_sz =
+         brw_hw_reg_type_to_size(devinfo, _reg_type, _reg_file);
+
+      /* The sub-register field is bit 4 of the byte offset; print it as an
+       * element index so the output matches the da1 case.
+       */
+      format(file, ".%d", 16 / type_sz);
+   }
+
+   string(file, "<");
+   err |= control(file, "vert stride", vert_stride, _vert_stride, NULL);
+   string(file, ">");
+
+   err |= src_swizzle(file, BRW_SWIZZLE4(swz_x, swz_y, swz_z, swz_w));
+   err |= control(file, "src da16 reg type", reg_encoding, _reg_type, NULL);
+   return err;
+}
+
+/* Print source 0 of a three-source instruction: modifiers, general
+ * register, optional sub-register, then either the replicate region
+ * "<0,1,0>" or "<4,4,1>" plus swizzle, and finally the type.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry.
+ */
+static int
+src0_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   int err = 0;
+   unsigned subnr = brw_inst_3src_src0_subreg_nr(devinfo, inst);
+
+   err |= control(file, "negate", m_negate,
+                  brw_inst_3src_src0_negate(devinfo, inst), NULL);
+   err |= control(file, "abs", _abs, brw_inst_3src_src0_abs(devinfo, inst), NULL);
+
+   err |= reg(file, BRW_GENERAL_REGISTER_FILE,
+              brw_inst_3src_src0_reg_nr(devinfo, inst));
+   if (err == -1)
+      return 0;
+
+   const int replicate = brw_inst_3src_src0_rep_ctrl(devinfo, inst) != 0;
+
+   /* With replicate set the sub-register is printed even when it is 0. */
+   if (subnr || replicate)
+      format(file, ".%d", subnr);
+
+   if (!replicate) {
+      string(file, "<4,4,1>");
+      err |= src_swizzle(file, brw_inst_3src_src0_swizzle(devinfo, inst));
+   } else {
+      string(file, "<0,1,0>");
+   }
+
+   err |= control(file, "src da16 reg type", three_source_reg_encoding,
+                  brw_inst_3src_src_type(devinfo, inst), NULL);
+   return err;
+}
+
+/* Print source 1 of a three-source instruction: modifiers, general
+ * register, optional sub-register, then either the replicate region
+ * "<0,1,0>" or "<4,4,1>" plus swizzle, and finally the type.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry.
+ */
+static int
+src1_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   int err = 0;
+   unsigned subnr = brw_inst_3src_src1_subreg_nr(devinfo, inst);
+
+   err |= control(file, "negate", m_negate,
+                  brw_inst_3src_src1_negate(devinfo, inst), NULL);
+   err |= control(file, "abs", _abs, brw_inst_3src_src1_abs(devinfo, inst), NULL);
+
+   err |= reg(file, BRW_GENERAL_REGISTER_FILE,
+              brw_inst_3src_src1_reg_nr(devinfo, inst));
+   if (err == -1)
+      return 0;
+
+   const int replicate = brw_inst_3src_src1_rep_ctrl(devinfo, inst) != 0;
+
+   /* With replicate set the sub-register is printed even when it is 0. */
+   if (subnr || replicate)
+      format(file, ".%d", subnr);
+
+   if (!replicate) {
+      string(file, "<4,4,1>");
+      err |= src_swizzle(file, brw_inst_3src_src1_swizzle(devinfo, inst));
+   } else {
+      string(file, "<0,1,0>");
+   }
+
+   err |= control(file, "src da16 reg type", three_source_reg_encoding,
+                  brw_inst_3src_src_type(devinfo, inst), NULL);
+   return err;
+}
+
+
+/* Print source 2 of a three-source instruction: modifiers, general
+ * register, optional sub-register, then either the replicate region
+ * "<0,1,0>" or "<4,4,1>" plus swizzle, and finally the type.
+ *
+ * Returns 0 on success and non-zero when a field had no table entry.
+ */
+static int
+src2_3src(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   int err = 0;
+   unsigned subnr = brw_inst_3src_src2_subreg_nr(devinfo, inst);
+
+   err |= control(file, "negate", m_negate,
+                  brw_inst_3src_src2_negate(devinfo, inst), NULL);
+   err |= control(file, "abs", _abs, brw_inst_3src_src2_abs(devinfo, inst), NULL);
+
+   err |= reg(file, BRW_GENERAL_REGISTER_FILE,
+              brw_inst_3src_src2_reg_nr(devinfo, inst));
+   if (err == -1)
+      return 0;
+
+   const int replicate = brw_inst_3src_src2_rep_ctrl(devinfo, inst) != 0;
+
+   /* With replicate set the sub-register is printed even when it is 0. */
+   if (subnr || replicate)
+      format(file, ".%d", subnr);
+
+   if (!replicate) {
+      string(file, "<4,4,1>");
+      err |= src_swizzle(file, brw_inst_3src_src2_swizzle(devinfo, inst));
+   } else {
+      string(file, "<0,1,0>");
+   }
+
+   err |= control(file, "src da16 reg type", three_source_reg_encoding,
+                  brw_inst_3src_src_type(devinfo, inst), NULL);
+   return err;
+}
+
+/* Print the immediate operand of inst, formatted according to the hardware
+ * register type code.  Half-float immediates are printed as a fixed
+ * placeholder string, and a type with no case below prints nothing.
+ * Always returns 0.
+ */
+static int
+imm(FILE *file, const struct gen_device_info *devinfo, unsigned type, brw_inst *inst)
+{
+   switch (type) {
+   case BRW_HW_REG_TYPE_UD:
+      format(file, "0x%08xUD", brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_D:
+      format(file, "%dD", brw_inst_imm_d(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_UW:
+      /* Word types only use the low 16 bits of the immediate. */
+      format(file, "0x%04xUW", (uint16_t) brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_W:
+      format(file, "%dW", (int16_t) brw_inst_imm_d(devinfo, inst));
+      break;
+   case BRW_HW_REG_IMM_TYPE_UV:
+      format(file, "0x%08xUV", brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_IMM_TYPE_VF: {
+      /* Four packed 8-bit values, lowest byte first. */
+      const unsigned packed = brw_inst_imm_ud(devinfo, inst);
+
+      format(file, "[%-gF, %-gF, %-gF, %-gF]VF",
+             brw_vf_to_float(packed),
+             brw_vf_to_float(packed >> 8),
+             brw_vf_to_float(packed >> 16),
+             brw_vf_to_float(packed >> 24));
+      break;
+   }
+   case BRW_HW_REG_IMM_TYPE_V:
+      format(file, "0x%08xV", brw_inst_imm_ud(devinfo, inst));
+      break;
+   case BRW_HW_REG_TYPE_F:
+      format(file, "%-gF", brw_inst_imm_f(devinfo, inst));
+      break;
+   case GEN8_HW_REG_IMM_TYPE_DF:
+      format(file, "%-gDF", brw_inst_imm_df(devinfo, inst));
+      break;
+   case GEN8_HW_REG_IMM_TYPE_HF:
+      string(file, "Half Float IMM");
+      break;
+   }
+   return 0;
+}
+
+/* Print source operand 0 of inst, dispatching on operand kind: immediate,
+ * align1 direct, align1 indirect or align16 direct.  Align16 indirect is
+ * not supported and yields an error message.
+ *
+ * Returns the result of the chosen printer, or 1 for the unsupported case.
+ */
+static int
+src0(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   if (brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE)
+      return imm(file, devinfo, brw_inst_src0_reg_type(devinfo, inst), inst);
+
+   const int align1 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1;
+   const int direct =
+      brw_inst_src0_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT;
+
+   if (!direct) {
+      if (!align1) {
+         string(file, "Indirect align16 address mode not supported");
+         return 1;
+      }
+
+      /* src_ia1() takes the region as hstride, width, vstride. */
+      return src_ia1(file,
+                     devinfo,
+                     brw_inst_opcode(devinfo, inst),
+                     brw_inst_src0_reg_type(devinfo, inst),
+                     brw_inst_src0_reg_file(devinfo, inst),
+                     brw_inst_src0_ia1_addr_imm(devinfo, inst),
+                     brw_inst_src0_ia_subreg_nr(devinfo, inst),
+                     brw_inst_src0_negate(devinfo, inst),
+                     brw_inst_src0_abs(devinfo, inst),
+                     brw_inst_src0_hstride(devinfo, inst),
+                     brw_inst_src0_width(devinfo, inst),
+                     brw_inst_src0_vstride(devinfo, inst));
+   }
+
+   if (align1)
+      return src_da1(file,
+                     devinfo,
+                     brw_inst_opcode(devinfo, inst),
+                     brw_inst_src0_reg_type(devinfo, inst),
+                     brw_inst_src0_reg_file(devinfo, inst),
+                     brw_inst_src0_vstride(devinfo, inst),
+                     brw_inst_src0_width(devinfo, inst),
+                     brw_inst_src0_hstride(devinfo, inst),
+                     brw_inst_src0_da_reg_nr(devinfo, inst),
+                     brw_inst_src0_da1_subreg_nr(devinfo, inst),
+                     brw_inst_src0_abs(devinfo, inst),
+                     brw_inst_src0_negate(devinfo, inst));
+
+   return src_da16(file,
+                   devinfo,
+                   brw_inst_opcode(devinfo, inst),
+                   brw_inst_src0_reg_type(devinfo, inst),
+                   brw_inst_src0_reg_file(devinfo, inst),
+                   brw_inst_src0_vstride(devinfo, inst),
+                   brw_inst_src0_da_reg_nr(devinfo, inst),
+                   brw_inst_src0_da16_subreg_nr(devinfo, inst),
+                   brw_inst_src0_abs(devinfo, inst),
+                   brw_inst_src0_negate(devinfo, inst),
+                   brw_inst_src0_da16_swiz_x(devinfo, inst),
+                   brw_inst_src0_da16_swiz_y(devinfo, inst),
+                   brw_inst_src0_da16_swiz_z(devinfo, inst),
+                   brw_inst_src0_da16_swiz_w(devinfo, inst));
+}
+
+/* Print source operand 1 of inst, dispatching on operand kind: immediate,
+ * align1 direct, align1 indirect or align16 direct.  Align16 indirect is
+ * not supported and yields an error message.
+ *
+ * Returns the result of the chosen printer, or 1 for the unsupported case.
+ */
+static int
+src1(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   if (brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE)
+      return imm(file, devinfo, brw_inst_src1_reg_type(devinfo, inst), inst);
+
+   const int align1 = brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1;
+   const int direct =
+      brw_inst_src1_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT;
+
+   if (!direct) {
+      if (!align1) {
+         string(file, "Indirect align16 address mode not supported");
+         return 1;
+      }
+
+      /* src_ia1() takes the region as hstride, width, vstride. */
+      return src_ia1(file,
+                     devinfo,
+                     brw_inst_opcode(devinfo, inst),
+                     brw_inst_src1_reg_type(devinfo, inst),
+                     brw_inst_src1_reg_file(devinfo, inst),
+                     brw_inst_src1_ia1_addr_imm(devinfo, inst),
+                     brw_inst_src1_ia_subreg_nr(devinfo, inst),
+                     brw_inst_src1_negate(devinfo, inst),
+                     brw_inst_src1_abs(devinfo, inst),
+                     brw_inst_src1_hstride(devinfo, inst),
+                     brw_inst_src1_width(devinfo, inst),
+                     brw_inst_src1_vstride(devinfo, inst));
+   }
+
+   if (align1)
+      return src_da1(file,
+                     devinfo,
+                     brw_inst_opcode(devinfo, inst),
+                     brw_inst_src1_reg_type(devinfo, inst),
+                     brw_inst_src1_reg_file(devinfo, inst),
+                     brw_inst_src1_vstride(devinfo, inst),
+                     brw_inst_src1_width(devinfo, inst),
+                     brw_inst_src1_hstride(devinfo, inst),
+                     brw_inst_src1_da_reg_nr(devinfo, inst),
+                     brw_inst_src1_da1_subreg_nr(devinfo, inst),
+                     brw_inst_src1_abs(devinfo, inst),
+                     brw_inst_src1_negate(devinfo, inst));
+
+   return src_da16(file,
+                   devinfo,
+                   brw_inst_opcode(devinfo, inst),
+                   brw_inst_src1_reg_type(devinfo, inst),
+                   brw_inst_src1_reg_file(devinfo, inst),
+                   brw_inst_src1_vstride(devinfo, inst),
+                   brw_inst_src1_da_reg_nr(devinfo, inst),
+                   brw_inst_src1_da16_subreg_nr(devinfo, inst),
+                   brw_inst_src1_abs(devinfo, inst),
+                   brw_inst_src1_negate(devinfo, inst),
+                   brw_inst_src1_da16_swiz_x(devinfo, inst),
+                   brw_inst_src1_da16_swiz_y(devinfo, inst),
+                   brw_inst_src1_da16_swiz_z(devinfo, inst),
+                   brw_inst_src1_da16_swiz_w(devinfo, inst));
+}
+
+/**
+ * Print the channel-group suffix of \p inst to \p file.
+ *
+ * The suffix depends on the decoded execution size (1 << the encoded
+ * exec_size field):
+ *  - below 8 channels: " <n>N", computed from the quarter control and, on
+ *    gen7 and later, the nibble control;
+ *  - exactly 8 channels: " 1Q" through " 4Q" for quarter control 0..3;
+ *  - exactly 16 channels: " 1H" for quarter control below 2, else " 2H".
+ * Nothing is printed for any other execution size.
+ *
+ * \return always 0.
+ */
+static int
+qtr_ctrl(FILE *file, const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   const int quarter = brw_inst_qtr_control(devinfo, inst);
+   const int channels = 1 << brw_inst_exec_size(devinfo, inst);
+
+   if (channels == 16) {
+      string(file, quarter < 2 ? " 1H" : " 2H");
+   } else if (channels == 8) {
+      if (quarter == 0)
+         string(file, " 1Q");
+      else if (quarter == 1)
+         string(file, " 2Q");
+      else if (quarter == 2)
+         string(file, " 3Q");
+      else if (quarter == 3)
+         string(file, " 4Q");
+   } else if (channels < 8) {
+      /* The nibble control is only read on gen7 and later. */
+      unsigned nibble = 0;
+
+      if (devinfo->gen >= 7)
+         nibble = brw_inst_nib_control(devinfo, inst);
+
+      format(file, " %dN", quarter * 2 + nibble + 1);
+   }
+
+   return 0;
+}
+
+#ifdef DEBUG
+/**
+ * Debugging helper: build an instruction from four raw dwords and
+ * disassemble it to stderr as a non-compacted instruction.
+ *
+ * \p dw0 and \p dw1 form the low and high halves of the first 64-bit word of
+ * the instruction, \p dw2 and \p dw3 those of the second one.
+ *
+ * \return the value returned by brw_disassemble_inst().
+ */
+static __attribute__((__unused__)) int
+brw_disassemble_imm(const struct gen_device_info *devinfo,
+                    uint32_t dw3, uint32_t dw2, uint32_t dw1, uint32_t dw0)
+{
+   const uint64_t first_qword = ((uint64_t) dw1 << 32) | (uint64_t) dw0;
+   const uint64_t second_qword = ((uint64_t) dw3 << 32) | (uint64_t) dw2;
+   brw_inst inst;
+
+   inst.data[0] = first_qword;
+   inst.data[1] = second_qword;
+
+   return brw_disassemble_inst(stderr, devinfo, &inst, false);
+}
+#endif
+
+/**
+ * Disassemble one instruction, given in its brw_inst form, to \p file.
+ *
+ * The text is produced in this order: optional predicate, opcode with its
+ * modifiers (saturate, debug control, math function or conditional modifier
+ * and flag register), execution size, the operands or jump fields that apply
+ * to the opcode, the decoded message descriptor for SEND/SENDC, and finally
+ * the "{ ... }" block of instruction options.  The line is terminated with
+ * ";" and a newline.
+ *
+ * \param file          stream the text is written to
+ * \param devinfo       device description; devinfo->gen and devinfo->is_g4x
+ *                      select between generation-specific decodings
+ * \param inst          instruction to decode
+ * \param is_compacted  only used to print the compaction control option
+ *
+ * \return the bitwise OR of the status values returned by the helpers that
+ *         were called (plus 1 for an out-of-table SIMD mode, see below);
+ *         zero when every helper returned zero.
+ */
+int
+brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
+                     brw_inst *inst, bool is_compacted)
+{
+   int err = 0;
+   int space = 0;
+
+   const enum opcode opcode = brw_inst_opcode(devinfo, inst);
+   const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode);
+
+   /* Predication prefix, wrapped in parentheses. */
+   if (brw_inst_pred_control(devinfo, inst)) {
+      string(file, "(");
+      err |= control(file, "predicate inverse", pred_inv,
+                     brw_inst_pred_inv(devinfo, inst), NULL);
+      /* The flag register number is only read on gen7+; older parts always
+       * print f0.
+       */
+      format(file, "f%"PRIu64,
+             devinfo->gen >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0);
+      if (brw_inst_flag_subreg_nr(devinfo, inst))
+         format(file, ".%"PRIu64, brw_inst_flag_subreg_nr(devinfo, inst));
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+         err |= control(file, "predicate control align1", pred_ctrl_align1,
+                        brw_inst_pred_control(devinfo, inst), NULL);
+      } else {
+         err |= control(file, "predicate control align16", pred_ctrl_align16,
+                        brw_inst_pred_control(devinfo, inst), NULL);
+      }
+      string(file, ") ");
+   }
+
+   err |= print_opcode(file, devinfo, opcode);
+   err |= control(file, "saturate", saturate, brw_inst_saturate(devinfo, inst),
+                  NULL);
+
+   err |= control(file, "debug control", debug_ctrl,
+                  brw_inst_debug_control(devinfo, inst), NULL);
+
+   if (opcode == BRW_OPCODE_MATH) {
+      string(file, " ");
+      err |= control(file, "function", math_function,
+                     brw_inst_math_function(devinfo, inst), NULL);
+   } else if (opcode != BRW_OPCODE_SEND && opcode != BRW_OPCODE_SENDC) {
+      err |= control(file, "conditional modifier", conditional_modifier,
+                     brw_inst_cond_modifier(devinfo, inst), NULL);
+
+      /* If we're using the conditional modifier, print which flags reg is
+       * used for it.  Note that on gen6+, the embedded-condition SEL and
+       * control flow doesn't update flags.
+       */
+      if (brw_inst_cond_modifier(devinfo, inst) &&
+          (devinfo->gen < 6 || (opcode != BRW_OPCODE_SEL &&
+                                opcode != BRW_OPCODE_IF &&
+                                opcode != BRW_OPCODE_WHILE))) {
+         format(file, ".f%"PRIu64,
+                devinfo->gen >= 7 ? brw_inst_flag_reg_nr(devinfo, inst) : 0);
+         if (brw_inst_flag_subreg_nr(devinfo, inst))
+            format(file, ".%"PRIu64, brw_inst_flag_subreg_nr(devinfo, inst));
+      }
+   }
+
+   if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) {
+      string(file, "(");
+      err |= control(file, "execution size", exec_size,
+                     brw_inst_exec_size(devinfo, inst), NULL);
+      string(file, ")");
+   }
+
+   if (opcode == BRW_OPCODE_SEND && devinfo->gen < 6)
+      format(file, " %"PRIu64, brw_inst_base_mrf(devinfo, inst));
+
+   /* Operands: jump fields for control flow, otherwise destination and
+    * sources as described by the opcode descriptor.
+    */
+   if (has_uip(devinfo, opcode)) {
+      /* Instructions that have UIP also have JIP. */
+      pad(file, 16);
+      format(file, "JIP: %d", brw_inst_jip(devinfo, inst));
+      pad(file, 32);
+      format(file, "UIP: %d", brw_inst_uip(devinfo, inst));
+   } else if (has_jip(devinfo, opcode)) {
+      pad(file, 16);
+      if (devinfo->gen >= 7) {
+         format(file, "JIP: %d", brw_inst_jip(devinfo, inst));
+      } else {
+         format(file, "JIP: %d", brw_inst_gen6_jump_count(devinfo, inst));
+      }
+   } else if (devinfo->gen < 6 && (opcode == BRW_OPCODE_BREAK ||
+                                   opcode == BRW_OPCODE_CONTINUE ||
+                                   opcode == BRW_OPCODE_ELSE)) {
+      pad(file, 16);
+      format(file, "Jump: %d", brw_inst_gen4_jump_count(devinfo, inst));
+      pad(file, 32);
+      format(file, "Pop: %"PRIu64, brw_inst_gen4_pop_count(devinfo, inst));
+   } else if (devinfo->gen < 6 && (opcode == BRW_OPCODE_IF ||
+                                   opcode == BRW_OPCODE_IFF ||
+                                   opcode == BRW_OPCODE_HALT)) {
+      pad(file, 16);
+      format(file, "Jump: %d", brw_inst_gen4_jump_count(devinfo, inst));
+   } else if (devinfo->gen < 6 && opcode == BRW_OPCODE_ENDIF) {
+      pad(file, 16);
+      format(file, "Pop: %"PRIu64, brw_inst_gen4_pop_count(devinfo, inst));
+   } else if (opcode == BRW_OPCODE_JMPI) {
+      pad(file, 16);
+      err |= src1(file, devinfo, inst);
+   } else if (desc && desc->nsrc == 3) {
+      pad(file, 16);
+      err |= dest_3src(file, devinfo, inst);
+
+      pad(file, 32);
+      err |= src0_3src(file, devinfo, inst);
+
+      pad(file, 48);
+      err |= src1_3src(file, devinfo, inst);
+
+      pad(file, 64);
+      err |= src2_3src(file, devinfo, inst);
+   } else if (desc) {
+      if (desc->ndst > 0) {
+         pad(file, 16);
+         err |= dest(file, devinfo, inst);
+      }
+
+      if (desc->nsrc > 0) {
+         pad(file, 32);
+         err |= src0(file, devinfo, inst);
+      }
+
+      if (desc->nsrc > 1) {
+         pad(file, 48);
+         err |= src1(file, devinfo, inst);
+      }
+   }
+
+   /* Message descriptor of SEND/SENDC, printed on a line of its own. */
+   if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC) {
+      enum brw_message_target sfid = brw_inst_sfid(devinfo, inst);
+
+      if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) {
+         /* show the indirect descriptor source */
+         pad(file, 48);
+         err |= src1(file, devinfo, inst);
+      }
+
+      newline(file);
+      pad(file, 16);
+      space = 0;
+
+      fprintf(file, "            ");
+      err |= control(file, "SFID", devinfo->gen >= 6 ? gen6_sfid : gen4_sfid,
+                     sfid, &space);
+
+      if (brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) {
+         /* The descriptor lives in a register, so there are no immediate
+          * fields to decode.
+          */
+         format(file, " indirect");
+      } else {
+         switch (sfid) {
+         case BRW_SFID_MATH:
+            err |= control(file, "math function", math_function,
+                           brw_inst_math_msg_function(devinfo, inst), &space);
+            err |= control(file, "math saturate", math_saturate,
+                           brw_inst_math_msg_saturate(devinfo, inst), &space);
+            err |= control(file, "math signed", math_signed,
+                           brw_inst_math_msg_signed_int(devinfo, inst), &space);
+            err |= control(file, "math scalar", math_scalar,
+                           brw_inst_math_msg_data_type(devinfo, inst), &space);
+            err |= control(file, "math precision", math_precision,
+                           brw_inst_math_msg_precision(devinfo, inst), &space);
+            break;
+         case BRW_SFID_SAMPLER:
+            if (devinfo->gen >= 5) {
+               err |= control(file, "sampler message", gen5_sampler_msg_type,
+                              brw_inst_sampler_msg_type(devinfo, inst), &space);
+               err |= control(file, "sampler simd mode", gen5_sampler_simd_mode,
+                              brw_inst_sampler_simd_mode(devinfo, inst), &space);
+               format(file, " Surface = %"PRIu64" Sampler = %"PRIu64,
+                      brw_inst_binding_table_index(devinfo, inst),
+                      brw_inst_sampler(devinfo, inst));
+            } else {
+               format(file, " (%"PRIu64", %"PRIu64", %"PRIu64", ",
+                      brw_inst_binding_table_index(devinfo, inst),
+                      brw_inst_sampler(devinfo, inst),
+                      brw_inst_sampler_msg_type(devinfo, inst));
+               if (!devinfo->is_g4x) {
+                  err |= control(file, "sampler target format",
+                                 sampler_target_format,
+                                 brw_inst_sampler_return_format(devinfo, inst), NULL);
+               }
+               string(file, ")");
+            }
+            break;
+         case GEN6_SFID_DATAPORT_SAMPLER_CACHE:
+         case GEN6_SFID_DATAPORT_CONSTANT_CACHE:
+            /* aka BRW_SFID_DATAPORT_READ on Gen4-5 */
+            if (devinfo->gen >= 6) {
+               format(file, " (%"PRIu64", %"PRIu64", %"PRIu64", %"PRIu64")",
+                      brw_inst_binding_table_index(devinfo, inst),
+                      brw_inst_dp_msg_control(devinfo, inst),
+                      brw_inst_dp_msg_type(devinfo, inst),
+                      devinfo->gen >= 7 ? 0 : brw_inst_dp_write_commit(devinfo, inst));
+            } else {
+               bool is_965 = devinfo->gen == 4 && !devinfo->is_g4x;
+               err |= control(file, "DP read message type",
+                              is_965 ? gen4_dp_read_port_msg_type :
+                                       g45_dp_read_port_msg_type,
+                              brw_inst_dp_read_msg_type(devinfo, inst),
+                              &space);
+
+               format(file, " MsgCtrl = 0x%"PRIx64,
+                      brw_inst_dp_read_msg_control(devinfo, inst));
+
+               format(file, " Surface = %"PRIu64, brw_inst_binding_table_index(devinfo, inst));
+            }
+            break;
+
+         case GEN6_SFID_DATAPORT_RENDER_CACHE: {
+            /* aka BRW_SFID_DATAPORT_WRITE on Gen4-5 */
+            unsigned msg_type = brw_inst_dp_write_msg_type(devinfo, inst);
+
+            err |= control(file, "DP rc message type",
+                           dp_rc_msg_type(devinfo), msg_type, &space);
+
+            bool is_rt_write = msg_type ==
+               (devinfo->gen >= 6 ? GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE
+                                  : BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE);
+
+            if (is_rt_write) {
+               err |= control(file, "RT message type", m_rt_write_subtype,
+                              brw_inst_rt_message_type(devinfo, inst), &space);
+               if (devinfo->gen >= 6 && brw_inst_rt_slot_group(devinfo, inst))
+                  string(file, " Hi");
+               if (brw_inst_rt_last(devinfo, inst))
+                  string(file, " LastRT");
+               if (devinfo->gen < 7 && brw_inst_dp_write_commit(devinfo, inst))
+                  string(file, " WriteCommit");
+            } else {
+               format(file, " MsgCtrl = 0x%"PRIx64,
+                      brw_inst_dp_write_msg_control(devinfo, inst));
+            }
+
+            format(file, " Surface = %"PRIu64, brw_inst_binding_table_index(devinfo, inst));
+            break;
+         }
+
+         case BRW_SFID_URB: {
+            /* Named urb_opcode so that it does not shadow the instruction
+             * opcode declared at the top of the function.
+             */
+            unsigned urb_opcode = brw_inst_urb_opcode(devinfo, inst);
+
+            format(file, " %"PRIu64, brw_inst_urb_global_offset(devinfo, inst));
+
+            space = 1;
+
+            err |= control(file, "urb opcode",
+                           devinfo->gen >= 7 ? gen7_urb_opcode
+                                             : gen5_urb_opcode,
+                           urb_opcode, &space);
+
+            if (devinfo->gen >= 7 &&
+                brw_inst_urb_per_slot_offset(devinfo, inst)) {
+               string(file, " per-slot");
+            }
+
+            if (urb_opcode == GEN8_URB_OPCODE_SIMD8_WRITE ||
+                urb_opcode == GEN8_URB_OPCODE_SIMD8_READ) {
+               if (brw_inst_urb_channel_mask_present(devinfo, inst))
+                  string(file, " masked");
+            } else {
+               err |= control(file, "urb swizzle", urb_swizzle,
+                              brw_inst_urb_swizzle_control(devinfo, inst),
+                              &space);
+            }
+
+            if (devinfo->gen < 7) {
+               err |= control(file, "urb allocate", urb_allocate,
+                              brw_inst_urb_allocate(devinfo, inst), &space);
+               err |= control(file, "urb used", urb_used,
+                              brw_inst_urb_used(devinfo, inst), &space);
+            }
+            if (devinfo->gen < 8) {
+               err |= control(file, "urb complete", urb_complete,
+                              brw_inst_urb_complete(devinfo, inst), &space);
+            }
+            break;
+         }
+         case BRW_SFID_THREAD_SPAWNER:
+            break;
+
+         case BRW_SFID_MESSAGE_GATEWAY:
+            /* NOTE(review): the sub-function ID indexes the name table
+             * unchecked -- confirm the table covers every encodable value.
+             */
+            format(file, " (%s)",
+                   gen7_gateway_subfuncid[brw_inst_gateway_subfuncid(devinfo, inst)]);
+            break;
+
+         case GEN7_SFID_DATAPORT_DATA_CACHE:
+            if (devinfo->gen >= 7) {
+               format(file, " (");
+
+               err |= control(file, "DP DC0 message type",
+                              dp_dc0_msg_type_gen7,
+                              brw_inst_dp_msg_type(devinfo, inst), &space);
+
+               format(file, ", %"PRIu64", ", brw_inst_binding_table_index(devinfo, inst));
+
+               switch (brw_inst_dp_msg_type(devinfo, inst)) {
+               case GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP:
+                  /* Accumulate the status like every other control() call. */
+                  err |= control(file, "atomic op", aop,
+                                 brw_inst_imm_ud(devinfo, inst) >> 8 & 0xf,
+                                 &space);
+                  break;
+               default:
+                  format(file, "%"PRIu64, brw_inst_dp_msg_control(devinfo, inst));
+               }
+               format(file, ")");
+               break;
+            }
+            /* FALLTHROUGH */
+
+         case HSW_SFID_DATAPORT_DATA_CACHE_1: {
+            if (devinfo->gen >= 7) {
+               format(file, " (");
+
+               unsigned msg_ctrl = brw_inst_dp_msg_control(devinfo, inst);
+
+               err |= control(file, "DP DC1 message type",
+                              dp_dc1_msg_type_hsw,
+                              brw_inst_dp_msg_type(devinfo, inst), &space);
+
+               format(file, ", Surface = %"PRIu64", ",
+                      brw_inst_binding_table_index(devinfo, inst));
+
+               switch (brw_inst_dp_msg_type(devinfo, inst)) {
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP:
+               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP:
+               case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP:
+                  format(file, "SIMD%d,", (msg_ctrl & (1 << 4)) ? 8 : 16);
+                  /* fallthrough */
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2:
+               case HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2:
+               case HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2:
+                  /* Accumulate the status like every other control() call. */
+                  err |= control(file, "atomic op", aop, msg_ctrl & 0xf,
+                                 &space);
+                  break;
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ:
+               case HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE:
+               case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ:
+               case HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE: {
+                  static const char *simd_modes[] = { "4x2", "16", "8" };
+                  const unsigned simd_mode = msg_ctrl >> 4;
+
+                  /* The table has three entries; an encoding that selects
+                   * anything beyond it must not be used as an index.
+                   * Print a placeholder and flag the instruction instead.
+                   */
+                  if (simd_mode < sizeof(simd_modes) / sizeof(simd_modes[0])) {
+                     format(file, "SIMD%s, Mask = 0x%x",
+                            simd_modes[simd_mode], msg_ctrl & 0xf);
+                  } else {
+                     format(file, "SIMD?, Mask = 0x%x", msg_ctrl & 0xf);
+                     err |= 1;
+                  }
+                  break;
+               }
+               default:
+                  format(file, "0x%x", msg_ctrl);
+               }
+               format(file, ")");
+               break;
+            }
+            /* FALLTHROUGH */
+         }
+
+         case GEN7_SFID_PIXEL_INTERPOLATOR:
+            if (devinfo->gen >= 7) {
+               format(file, " (%s, %s, 0x%02"PRIx64")",
+                      brw_inst_pi_nopersp(devinfo, inst) ? "linear" : "persp",
+                      pixel_interpolator_msg_types[brw_inst_pi_message_type(devinfo, inst)],
+                      brw_inst_pi_message_data(devinfo, inst));
+               break;
+            }
+            /* FALLTHROUGH */
+
+         default:
+            format(file, "unsupported shared function ID %d", sfid);
+            break;
+         }
+
+         if (space)
+            string(file, " ");
+         format(file, "mlen %"PRIu64, brw_inst_mlen(devinfo, inst));
+         format(file, " rlen %"PRIu64, brw_inst_rlen(devinfo, inst));
+      }
+   }
+
+   /* Instruction options, printed between braces. */
+   pad(file, 64);
+   if (opcode != BRW_OPCODE_NOP && opcode != BRW_OPCODE_NENOP) {
+      string(file, "{");
+      space = 1;
+      err |= control(file, "access mode", access_mode,
+                     brw_inst_access_mode(devinfo, inst), &space);
+      if (devinfo->gen >= 6) {
+         err |= control(file, "write enable control", wectrl,
+                        brw_inst_mask_control(devinfo, inst), &space);
+      } else {
+         err |= control(file, "mask control", mask_ctrl,
+                        brw_inst_mask_control(devinfo, inst), &space);
+      }
+      err |= control(file, "dependency control", dep_ctrl,
+                     ((brw_inst_no_dd_check(devinfo, inst) << 1) |
+                      brw_inst_no_dd_clear(devinfo, inst)), &space);
+
+      if (devinfo->gen >= 6)
+         err |= qtr_ctrl(file, devinfo, inst);
+      else {
+         if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED &&
+             desc && desc->ndst > 0 &&
+             brw_inst_dst_reg_file(devinfo, inst) == BRW_MESSAGE_REGISTER_FILE &&
+             brw_inst_dst_da_reg_nr(devinfo, inst) & BRW_MRF_COMPR4) {
+            format(file, " compr4");
+         } else {
+            err |= control(file, "compression control", compr_ctrl,
+                           brw_inst_qtr_control(devinfo, inst), &space);
+         }
+      }
+
+      err |= control(file, "compaction", cmpt_ctrl, is_compacted, &space);
+      err |= control(file, "thread control", thread_ctrl,
+                     brw_inst_thread_control(devinfo, inst), &space);
+      if (has_branch_ctrl(devinfo, opcode)) {
+         err |= control(file, "branch ctrl", branch_ctrl,
+                        brw_inst_branch_control(devinfo, inst), &space);
+      } else if (devinfo->gen >= 6) {
+         err |= control(file, "acc write control", accwr,
+                        brw_inst_acc_wr_control(devinfo, inst), &space);
+      }
+      if (opcode == BRW_OPCODE_SEND || opcode == BRW_OPCODE_SENDC)
+         err |= control(file, "end of thread", end_of_thread,
+                        brw_inst_eot(devinfo, inst), &space);
+      if (space)
+         string(file, " ");
+      string(file, "}");
+   }
+   string(file, ";");
+   newline(file);
+   return err;
+}
diff --git a/src/intel/compiler/brw_eu.c b/src/intel/compiler/brw_eu.c
new file mode 100644
index 00000000000..77400c19914
--- /dev/null
+++ b/src/intel/compiler/brw_eu.c
@@ -0,0 +1,719 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+
+#include "brw_eu_defines.h"
+#include "brw_eu.h"
+#include "brw_shader.h"
+#include "common/gen_debug.h"
+
+#include "util/ralloc.h"
+
+/**
+ * Converts a BRW_REGISTER_TYPE_* enum to a short string (F, UD, and so on).
+ *
+ * This is different than reg_encoding from brw_disasm.c in that it operates
+ * on the abstract enum values, rather than the generation-specific encoding.
+ *
+ * \param type  one of the BRW_REGISTER_TYPE_* values listed in the table
+ *              below; anything outside the table trips the assertion.
+ * \return a pointer to a string literal; the caller must not free it.
+ */
+const char *
+brw_reg_type_letters(unsigned type)
+{
+   /* Static and const: the table never changes, so it does not need to be
+    * rebuilt on the stack on every call.
+    */
+   static const char *const names[] = {
+      [BRW_REGISTER_TYPE_UD] = "UD",
+      [BRW_REGISTER_TYPE_D]  = "D",
+      [BRW_REGISTER_TYPE_UW] = "UW",
+      [BRW_REGISTER_TYPE_W]  = "W",
+      [BRW_REGISTER_TYPE_F]  = "F",
+      [BRW_REGISTER_TYPE_UB] = "UB",
+      [BRW_REGISTER_TYPE_B]  = "B",
+      [BRW_REGISTER_TYPE_UV] = "UV",
+      [BRW_REGISTER_TYPE_V]  = "V",
+      [BRW_REGISTER_TYPE_VF] = "VF",
+      [BRW_REGISTER_TYPE_DF] = "DF",
+      [BRW_REGISTER_TYPE_HF] = "HF",
+      [BRW_REGISTER_TYPE_UQ] = "UQ",
+      [BRW_REGISTER_TYPE_Q]  = "Q",
+   };
+
+   /* Check against the table itself so the bound follows the table when an
+    * entry is added, instead of being tied to one particular enumerator.
+    */
+   assert(type < sizeof(names) / sizeof(names[0]));
+   return names[type];
+}
+
+/**
+ * Returns the conditional modifier that tests the opposite condition of
+ * \p cmod.
+ *
+ * Z and NZ, G and LE, and GE and L are each other's opposites.  Any other
+ * value yields ~0.
+ */
+enum brw_conditional_mod
+brw_negate_cmod(uint32_t cmod)
+{
+   static const struct {
+      uint32_t from;
+      enum brw_conditional_mod to;
+   } opposites[] = {
+      { BRW_CONDITIONAL_Z,  BRW_CONDITIONAL_NZ },
+      { BRW_CONDITIONAL_NZ, BRW_CONDITIONAL_Z  },
+      { BRW_CONDITIONAL_G,  BRW_CONDITIONAL_LE },
+      { BRW_CONDITIONAL_GE, BRW_CONDITIONAL_L  },
+      { BRW_CONDITIONAL_L,  BRW_CONDITIONAL_GE },
+      { BRW_CONDITIONAL_LE, BRW_CONDITIONAL_G  },
+   };
+
+   for (unsigned i = 0; i < sizeof(opposites) / sizeof(opposites[0]); i++) {
+      if (opposites[i].from == cmod)
+         return opposites[i].to;
+   }
+
+   /* No known opposite. */
+   return ~0;
+}
+
+/**
+ * Returns the conditional modifier to use after exchanging src0 and src1,
+ * e.g. in a CMP.
+ *
+ * Z and NZ are returned unchanged, G and L are exchanged, and so are GE and
+ * LE.  Any other value yields BRW_CONDITIONAL_NONE.
+ */
+enum brw_conditional_mod
+brw_swap_cmod(uint32_t cmod)
+{
+   if (cmod == BRW_CONDITIONAL_Z || cmod == BRW_CONDITIONAL_NZ)
+      return cmod;
+
+   if (cmod == BRW_CONDITIONAL_G)
+      return BRW_CONDITIONAL_L;
+
+   if (cmod == BRW_CONDITIONAL_GE)
+      return BRW_CONDITIONAL_LE;
+
+   if (cmod == BRW_CONDITIONAL_L)
+      return BRW_CONDITIONAL_G;
+
+   if (cmod == BRW_CONDITIONAL_LE)
+      return BRW_CONDITIONAL_GE;
+
+   return BRW_CONDITIONAL_NONE;
+}
+
+/**
+ * Bit position of the least significant bit of component number \p i
+ * (counting from zero) of an immediate of type \p type.
+ *
+ * Only the two low bits of \p i are used, so passing the two's complement of
+ * j selects the j-th component counted from the end of the vector.  Types
+ * other than VF are treated as scalars and always give zero.  The UV and V
+ * vector types are not implemented and trip the assertion.
+ */
+static unsigned
+imm_shift(enum brw_reg_type type, unsigned i)
+{
+   assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V &&
+          "Not implemented.");
+
+   /* A VF immediate packs its components eight bits apart. */
+   return type == BRW_REGISTER_TYPE_VF ? 8 * (i & 3) : 0;
+}
+
+/**
+ * Apply the permutation \p swz to the components of the immediate \p x of
+ * type \p type and return the result.
+ *
+ * Immediates for which imm_shift() reports a zero component stride are
+ * returned unchanged.
+ */
+uint32_t
+brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz)
+{
+   const unsigned stride = imm_shift(type, 1);
+
+   if (!stride)
+      return x;
+
+   const unsigned count = 32 / stride;
+   /* Offset of the last component, i.e. how far left a component has to be
+    * pushed for everything above it to fall off the top.
+    */
+   const unsigned last = imm_shift(type, ~0u);
+   uint32_t result = 0;
+
+   for (unsigned i = 0; i < count; i++) {
+      const unsigned src = imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3));
+      /* Bring the selected component down to bit 0, which drops the bits
+       * below it, then up to the top, which drops the bits above it.
+       */
+      const uint32_t isolated = x >> src << last;
+
+      /* Finally move it down into slot i of the result. */
+      result |= isolated >> imm_shift(type, ~0u - i);
+   }
+
+   return result;
+}
+
+/**
+ * Store \p value as the execution size of p->current, the instruction state
+ * currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_exec_size(struct brw_codegen *p, unsigned value)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_exec_size(p->devinfo, state, value);
+}
+
+/**
+ * Store \p pc as the predicate control of p->current, the instruction state
+ * currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_predicate_control(struct brw_codegen *p, unsigned pc)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_pred_control(p->devinfo, state, pc);
+}
+
+/**
+ * Store \p predicate_inverse as the predicate-inverse setting of p->current,
+ * the instruction state currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_predicate_inverse(struct brw_codegen *p,
+                                  bool predicate_inverse)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_pred_inv(p->devinfo, state, predicate_inverse);
+}
+
+/**
+ * Select the flag register used by p->current, the instruction state
+ * currently on top of the state stack of \p p.
+ *
+ * \p subreg is always stored; \p reg is only stored on gen7 and later and is
+ * ignored otherwise.
+ */
+void
+brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const state = p->current;
+
+   if (devinfo->gen >= 7)
+      brw_inst_set_flag_reg_nr(devinfo, state, reg);
+
+   brw_inst_set_flag_subreg_nr(devinfo, state, subreg);
+}
+
+/**
+ * Store \p access_mode as the access mode of p->current, the instruction
+ * state currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_access_mode(struct brw_codegen *p, unsigned access_mode)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_access_mode(p->devinfo, state, access_mode);
+}
+
+/**
+ * Set the quarter control field of p->current from a pre-gen6 style
+ * compression control value.
+ *
+ * Before gen6 the value is stored as is.  On gen6 and later it is first
+ * translated to the matching GEN6_COMPRESSION_* value, because the SIMD32
+ * support of those parts is not used.
+ */
+void
+brw_set_default_compression_control(struct brw_codegen *p,
+                                    enum brw_compression compression_control)
+{
+   unsigned qtr_control = compression_control;
+
+   if (p->devinfo->gen >= 6) {
+      switch (compression_control) {
+      case BRW_COMPRESSION_NONE:
+         /* "Use the first set of bits of dmask/vmask/arf according to
+          * execsize".
+          */
+         qtr_control = GEN6_COMPRESSION_1Q;
+         break;
+      case BRW_COMPRESSION_2NDHALF:
+         /* For SIMD8: use the second set of 8 bits. */
+         qtr_control = GEN6_COMPRESSION_2Q;
+         break;
+      case BRW_COMPRESSION_COMPRESSED:
+         /* For SIMD16 instruction compression: use the first set of 16 bits,
+          * since SIMD32 dispatch is not done.
+          */
+         qtr_control = GEN6_COMPRESSION_1H;
+         break;
+      default:
+         unreachable("not reached");
+      }
+   }
+
+   brw_inst_set_qtr_control(p->devinfo, p->current, qtr_control);
+}
+
+/**
+ * Turn instruction compression on or off for \p inst without disturbing the
+ * channel enable group that is currently selected.
+ *
+ * On gen6 and later this does nothing.
+ */
+void
+brw_inst_set_compression(const struct gen_device_info *devinfo,
+                         brw_inst *inst, bool on)
+{
+   /* Nothing to do on gen6+: the EU figures out by itself whether the
+    * instruction needs to be compressed.
+    */
+   if (devinfo->gen >= 6)
+      return;
+
+   /* Channel group and compression share one field and are not orthogonal:
+    * an uncompressed instruction has two possible encodings, and the one in
+    * place may have to be kept so that the selected channel group does not
+    * change by accident.
+    */
+   if (on) {
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_COMPRESSED);
+      return;
+   }
+
+   if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_COMPRESSED)
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
+}
+
+/**
+ * Apply brw_inst_set_compression() to p->current, the instruction state
+ * currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_compression(struct brw_codegen *p, bool on)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_compression(p->devinfo, state, on);
+}
+
+/**
+ * Make \p inst use the channel enable signals in the range
+ * [group, group + exec_size).
+ *
+ * The accepted values of \p group depend on the generation and are enforced
+ * by assertions: a multiple of 4 below 32 on gen7 and later, a multiple of 8
+ * below 32 on gen6, and a multiple of 8 below 16 before that.
+ */
+void
+brw_inst_set_group(const struct gen_device_info *devinfo,
+                   brw_inst *inst, unsigned group)
+{
+   if (devinfo->gen >= 7) {
+      assert(group % 4 == 0 && group < 32);
+
+      const unsigned quarter = group / 8;
+      const unsigned nibble = (group / 4) % 2;
+
+      brw_inst_set_qtr_control(devinfo, inst, quarter);
+      brw_inst_set_nib_control(devinfo, inst, nibble);
+      return;
+   }
+
+   if (devinfo->gen == 6) {
+      assert(group % 8 == 0 && group < 32);
+      brw_inst_set_qtr_control(devinfo, inst, group / 8);
+      return;
+   }
+
+   assert(group % 8 == 0 && group < 16);
+
+   /* Channel group and compression share one field and are not orthogonal:
+    * group zero has two possible encodings, and the one in place may have to
+    * be kept so that the compression enable does not change by accident.
+    */
+   if (group == 8) {
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_2NDHALF);
+      return;
+   }
+
+   if (brw_inst_qtr_control(devinfo, inst) == BRW_COMPRESSION_2NDHALF)
+      brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
+}
+
+/**
+ * Apply brw_inst_set_group() to p->current, the instruction state currently
+ * on top of the state stack of \p p.
+ */
+void
+brw_set_default_group(struct brw_codegen *p, unsigned group)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_group(p->devinfo, state, group);
+}
+
+/**
+ * Store \p value as the mask control of p->current, the instruction state
+ * currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_mask_control(struct brw_codegen *p, unsigned value)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_mask_control(p->devinfo, state, value);
+}
+
+/**
+ * Store \p enable as the saturate setting of p->current, the instruction
+ * state currently on top of the state stack of \p p.
+ */
+void
+brw_set_default_saturate(struct brw_codegen *p, bool enable)
+{
+   brw_inst *const state = p->current;
+
+   brw_inst_set_saturate(p->devinfo, state, enable);
+}
+
+/**
+ * Store \p value as the accumulator write control of p->current, the
+ * instruction state currently on top of the state stack of \p p.
+ *
+ * Does nothing before gen6.
+ */
+void
+brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (devinfo->gen < 6)
+      return;
+
+   brw_inst_set_acc_wr_control(devinfo, p->current, value);
+}
+
+/**
+ * Push a copy of the current instruction state onto the state stack of
+ * \p p and make that copy the current one.
+ *
+ * Asserts that the stack, which holds BRW_EU_MAX_INSN_STACK entries, is not
+ * already full.
+ */
+void
+brw_push_insn_state(struct brw_codegen *p)
+{
+   brw_inst *const top = p->current;
+
+   assert(top != &p->stack[BRW_EU_MAX_INSN_STACK-1]);
+
+   memcpy(top + 1, top, sizeof(brw_inst));
+   p->current = top + 1;
+}
+
+/**
+ * Drop the current instruction state and return to the one below it on the
+ * state stack of \p p.
+ *
+ * Asserts that the current state is not the bottom of the stack.
+ */
+void
+brw_pop_insn_state(struct brw_codegen *p)
+{
+   assert(p->current != p->stack);
+   p->current -= 1;
+}
+
+
+/**
+ * Prepare \p p for emitting instructions for the device described by
+ * \p devinfo.
+ *
+ * The whole structure is cleared first.  The instruction store and the
+ * control flow bookkeeping arrays are then allocated with rzalloc_array() on
+ * \p mem_ctx, and p->current is pointed at the bottom of the instruction
+ * state stack and given its initial default settings.
+ */
+void
+brw_init_codegen(const struct gen_device_info *devinfo,
+                 struct brw_codegen *p, void *mem_ctx)
+{
+   memset(p, 0, sizeof(*p));
+
+   p->devinfo = devinfo;
+   p->mem_ctx = mem_ctx;
+
+   /* The instruction store starts out with room for 1024 instructions.  If
+    * that turns out not to be enough, brw_next_insn() doubles the store size
+    * until it runs out of memory.
+    */
+   p->store_size = 1024;
+   p->store = rzalloc_array(mem_ctx, brw_inst, p->store_size);
+   p->nr_insn = 0;
+
+   /* Start at the bottom of the state stack with an all-zero state. */
+   p->current = p->stack;
+   memset(p->current, 0, sizeof(p->current[0]));
+
+   /* Initial default settings of that state. */
+   brw_set_default_exec_size(p, BRW_EXECUTE_8);
+   brw_set_default_mask_control(p, BRW_MASK_ENABLE);
+   brw_set_default_saturate(p, 0);
+   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+   /* Control flow stacks: both start empty with room for 16 entries. */
+   p->if_stack_depth = 0;
+   p->if_stack_array_size = 16;
+   p->if_stack = rzalloc_array(mem_ctx, int, p->if_stack_array_size);
+
+   p->loop_stack_depth = 0;
+   p->loop_stack_array_size = 16;
+   p->loop_stack = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
+   p->if_depth_in_loop = rzalloc_array(mem_ctx, int, p->loop_stack_array_size);
+}
+
+
+/* Return the instruction store and write p->next_insn_offset to *sz. */
+const unsigned *brw_get_program(struct brw_codegen *p, unsigned *sz)
+{
+   *sz = p->next_insn_offset;
+   return (const unsigned *)p->store;
+}
+
+/* Disassemble every instruction in the byte range [start, end) of `assembly`
+ * to `out`, prefixed by its raw dwords when DEBUG_HEX is set in INTEL_DEBUG.
+ */
+void
+brw_disassemble(const struct gen_device_info *devinfo,
+                void *assembly, int start, int end, FILE *out)
+{
+   const bool dump_hex = (INTEL_DEBUG & DEBUG_HEX) != 0;
+
+   for (int offset = start; offset < end;) {
+      /* Step in bytes via char *; void * arithmetic is a GNU extension. */
+      brw_inst *insn = (brw_inst *)((char *)assembly + offset);
+      const uint32_t *dw = (const uint32_t *)insn;
+      const bool compacted = brw_inst_cmpt_control(devinfo, insn);
+      brw_inst uncompacted;
+      if (0)
+         fprintf(out, "0x%08x: ", offset);
+
+      if (compacted) {
+         if (dump_hex) {
+            fprintf(out, "0x%08x 0x%08x                       ",
+                    dw[1], dw[0]);
+         }
+         brw_uncompact_instruction(devinfo, &uncompacted,
+                                   (brw_compact_inst *)insn);
+         insn = &uncompacted;
+         offset += 8;
+      } else {
+         if (dump_hex) {
+            fprintf(out, "0x%08x 0x%08x 0x%08x 0x%08x ",
+                    dw[3], dw[2], dw[1], dw[0]);
+         }
+         offset += 16;
+      }
+
+      brw_disassemble_inst(out, devinfo, insn, compacted);
+   }
+}
+
+enum gen {           /* one bit per generation, ordered oldest to newest */
+   GEN4  = (1 << 0),
+   GEN45 = (1 << 1), /* returned for gen 4 devices with is_g4x set */
+   GEN5  = (1 << 2),
+   GEN6  = (1 << 3),
+   GEN7  = (1 << 4),
+   GEN75 = (1 << 5), /* returned for gen 7 devices with is_haswell set */
+   GEN8  = (1 << 6),
+   GEN9  = (1 << 7),
+   GEN_ALL = ~0      /* every bit set */
+};
+
+#define GEN_LT(gen) ((gen) - 1)           /* all bits below a single-bit gen */
+#define GEN_GE(gen) (~GEN_LT(gen))        /* gen's bit and all bits above it */
+#define GEN_LE(gen) (GEN_LT(gen) | (gen)) /* gen's bit and all bits below it */
+
+static const struct opcode_desc opcode_10_descs[] = { /* opcode 10 by gen */
+   { .name = "dim",   .nsrc = 1, .ndst = 1, .gens = GEN75 },
+   { .name = "smov",  .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) },
+};
+
+static const struct opcode_desc opcode_35_descs[] = { /* opcode 35 by gen */
+   { .name = "iff",   .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+   { .name = "brc",   .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7) },
+};
+
+static const struct opcode_desc opcode_38_descs[] = { /* opcode 38 by gen */
+   { .name = "do",    .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+   { .name = "case",  .nsrc = 0, .ndst = 0, .gens = GEN6 },
+};
+
+static const struct opcode_desc opcode_44_descs[] = { /* opcode 44 by gen */
+   { .name = "msave", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+   { .name = "call",  .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) },
+};
+
+static const struct opcode_desc opcode_45_descs[] = { /* opcode 45 by gen */
+   { .name = "mrest", .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+   { .name = "ret",   .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN6) },
+};
+
+static const struct opcode_desc opcode_46_descs[] = { /* opcode 46 by gen */
+   { .name = "push",  .nsrc = 0, .ndst = 0, .gens = GEN_LE(GEN5) },
+   { .name = "fork",  .nsrc = 0, .ndst = 0, .gens = GEN6 },
+   { .name = "goto",  .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN8) },
+};
+
+static const struct opcode_desc opcode_descs[128] = { /* indexed by opcode */
+   [BRW_OPCODE_ILLEGAL] = {
+      .name = "illegal", .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_MOV] = {
+      .name = "mov",     .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SEL] = {
+      .name = "sel",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_MOVI] = {
+      .name = "movi",    .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_NOT] = {
+      .name = "not",     .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_AND] = {
+      .name = "and",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_OR] = {
+      .name = "or",      .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_XOR] = {
+      .name = "xor",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SHR] = {
+      .name = "shr",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SHL] = {
+      .name = "shl",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [10] = { /* dim or smov, depending on the generation */
+      .table = opcode_10_descs, .size = ARRAY_SIZE(opcode_10_descs),
+   },
+   /* Reserved - 11 */
+   [BRW_OPCODE_ASR] = {
+      .name = "asr",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   /* Reserved - 13-15 */
+   [BRW_OPCODE_CMP] = {
+      .name = "cmp",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_CMPN] = {
+      .name = "cmpn",    .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_CSEL] = {
+      .name = "csel",    .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8),
+   },
+   [BRW_OPCODE_F32TO16] = {
+      .name = "f32to16", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75,
+   },
+   [BRW_OPCODE_F16TO32] = {
+      .name = "f16to32", .nsrc = 1, .ndst = 1, .gens = GEN7 | GEN75,
+   },
+   /* Reserved - 21-22 */
+   [BRW_OPCODE_BFREV] = {
+      .name = "bfrev",   .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFE] = {
+      .name = "bfe",     .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI1] = {
+      .name = "bfi1",    .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_BFI2] = {
+      .name = "bfi2",    .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   /* Reserved - 27-31 */
+   [BRW_OPCODE_JMPI] = {
+      .name = "jmpi",    .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [33] = {
+      .name = "brd",     .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_IF] = {
+      .name = "if",      .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [35] = { /* iff or brc, depending on the generation */
+      .table = opcode_35_descs, .size = ARRAY_SIZE(opcode_35_descs),
+   },
+   [BRW_OPCODE_ELSE] = {
+      .name = "else",    .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_ENDIF] = {
+      .name = "endif",   .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [38] = { /* do or case, depending on the generation */
+      .table = opcode_38_descs, .size = ARRAY_SIZE(opcode_38_descs),
+   },
+   [BRW_OPCODE_WHILE] = {
+      .name = "while",   .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_BREAK] = {
+      .name = "break",   .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_CONTINUE] = {
+      .name = "cont",    .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_HALT] = {
+      .name = "halt",    .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+   [43] = {
+      .name = "calla",   .nsrc = 0, .ndst = 0, .gens = GEN_GE(GEN75),
+   },
+   [44] = { /* msave or call, depending on the generation */
+      .table = opcode_44_descs, .size = ARRAY_SIZE(opcode_44_descs),
+   },
+   [45] = { /* mrest or ret, depending on the generation */
+      .table = opcode_45_descs, .size = ARRAY_SIZE(opcode_45_descs),
+   },
+   [46] = { /* push, fork or goto, depending on the generation */
+      .table = opcode_46_descs, .size = ARRAY_SIZE(opcode_46_descs),
+   },
+   [47] = {
+      .name = "pop",     .nsrc = 2, .ndst = 0, .gens = GEN_LE(GEN5),
+   },
+   [BRW_OPCODE_WAIT] = {
+      .name = "wait",    .nsrc = 1, .ndst = 0, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SEND] = {
+      .name = "send",    .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDC] = {
+      .name = "sendc",   .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SENDS] = {
+      .name = "sends",   .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9),
+   },
+   [BRW_OPCODE_SENDSC] = {
+      .name = "sendsc",  .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN9),
+   },
+   /* Reserved 53-55 */
+   [BRW_OPCODE_MATH] = {
+      .name = "math",    .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN6),
+   },
+   /* Reserved 57-63 */
+   [BRW_OPCODE_ADD] = {
+      .name = "add",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_MUL] = {
+      .name = "mul",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_AVG] = {
+      .name = "avg",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_FRC] = {
+      .name = "frc",     .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDU] = {
+      .name = "rndu",    .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDD] = {
+      .name = "rndd",    .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDE] = {
+      .name = "rnde",    .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_RNDZ] = {
+      .name = "rndz",    .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_MAC] = {
+      .name = "mac",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_MACH] = {
+      .name = "mach",    .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_LZD] = {
+      .name = "lzd",     .nsrc = 1, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_FBH] = {
+      .name = "fbh",     .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_FBL] = {
+      .name = "fbl",     .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_CBIT] = {
+      .name = "cbit",    .nsrc = 1, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_ADDC] = {
+      .name = "addc",    .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SUBB] = {
+      .name = "subb",    .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN7),
+   },
+   [BRW_OPCODE_SAD2] = {
+      .name = "sad2",    .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_SADA2] = {
+      .name = "sada2",   .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   /* Reserved 82-83 */
+   [BRW_OPCODE_DP4] = {
+      .name = "dp4",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_DPH] = {
+      .name = "dph",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_DP3] = {
+      .name = "dp3",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_DP2] = {
+      .name = "dp2",     .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   /* Reserved 88 */
+   [BRW_OPCODE_LINE] = {
+      .name = "line",    .nsrc = 2, .ndst = 1, .gens = GEN_ALL,
+   },
+   [BRW_OPCODE_PLN] = {
+      .name = "pln",     .nsrc = 2, .ndst = 1, .gens = GEN_GE(GEN45),
+   },
+   [BRW_OPCODE_MAD] = {
+      .name = "mad",     .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6),
+   },
+   [BRW_OPCODE_LRP] = {
+      .name = "lrp",     .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN6),
+   },
+   [93] = {
+      .name = "madm",    .nsrc = 3, .ndst = 1, .gens = GEN_GE(GEN8),
+   },
+   /* Reserved 94-124 */
+   [BRW_OPCODE_NENOP] = {
+      .name = "nenop",   .nsrc = 0, .ndst = 0, .gens = GEN45,
+   },
+   [BRW_OPCODE_NOP] = {
+      .name = "nop",     .nsrc = 0, .ndst = 0, .gens = GEN_ALL,
+   },
+};
+
+/* Map devinfo to its enum gen bit; G4X and Haswell have bits of their own. */
+static enum gen
+gen_from_devinfo(const struct gen_device_info *devinfo)
+{
+   if (devinfo->gen == 4) return devinfo->is_g4x ? GEN45 : GEN4;
+   if (devinfo->gen == 5) return GEN5;
+   if (devinfo->gen == 6) return GEN6;
+   if (devinfo->gen == 7) return devinfo->is_haswell ? GEN75 : GEN7;
+   if (devinfo->gen == 8) return GEN8;
+   if (devinfo->gen == 9) return GEN9;
+
+   /* No other generation has an enum gen bit. */
+   unreachable("not reached");
+}
+
+/* Return the matching opcode_desc for the specified opcode number and
+ * hardware generation, or NULL if the opcode is not supported by the device.
+ */
+const struct opcode_desc *
+brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   if (opcode >= ARRAY_SIZE(opcode_descs))
+      return NULL;
+
+   const struct opcode_desc *const entry = &opcode_descs[opcode];
+   const enum gen gen = gen_from_devinfo(devinfo);
+
+   /* A non-zero gens tag marks a primary entry: it either matches or not. */
+   if (entry->gens != 0)
+      return (entry->gens & gen) != 0 ? entry : NULL;
+
+   /* Otherwise scan the secondary table, when one is attached. */
+   const struct opcode_desc *alt = entry->table;
+   for (unsigned i = 0; alt != NULL && i < entry->size; i++) {
+      if ((alt[i].gens & gen) != 0)
+         return &alt[i];
+   }
+   return NULL;
+}
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
new file mode 100644
index 00000000000..f4225952333
--- /dev/null
+++ b/src/intel/compiler/brw_eu.h
@@ -0,0 +1,612 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+
+#ifndef BRW_EU_H
+#define BRW_EU_H
+
+#include <stdbool.h>
+#include "brw_inst.h"
+#include "brw_eu_defines.h"
+#include "brw_reg.h"
+#include "intel_asm_annotation.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#define BRW_EU_MAX_INSN_STACK 5
+
+/* A helper for accessing the last instruction emitted.  This makes it easy
+ * to set various bits on an instruction without having to create a temporary
+ * variable and assign the emitted instruction to it.
+ */
+#define brw_last_inst (&p->store[p->nr_insn - 1])
+
+struct brw_codegen {
+   brw_inst *store;               /* instruction array with store_size entries */
+   int store_size;
+   unsigned nr_insn;              /* brw_last_inst reads store[nr_insn - 1] */
+   unsigned int next_insn_offset; /* value brw_get_program() reports in *sz */
+
+   void *mem_ctx;                 /* context used for the rzalloc_array() calls */
+
+   /* Allow clients to push/pop instruction state:
+    */
+   brw_inst stack[BRW_EU_MAX_INSN_STACK];
+   bool compressed_stack[BRW_EU_MAX_INSN_STACK];
+   brw_inst *current; /* entry of stack[] the brw_set_default_*() helpers edit */
+
+   bool single_program_flow;
+   const struct gen_device_info *devinfo;
+
+   /* Control flow stacks:
+    * - if_stack contains IF and ELSE instructions which must be patched
+    *   (and popped) once the matching ENDIF instruction is encountered.
+    *
+    *   Just store the instruction pointer(an index).
+    */
+   int *if_stack;
+   int if_stack_depth;
+   int if_stack_array_size;       /* allocated entries in if_stack */
+
+   /**
+    * loop_stack contains the instruction pointers of the starts of loops which
+    * must be patched (and popped) once the matching WHILE instruction is
+    * encountered.
+    */
+   int *loop_stack;
+   /**
+    * pre-gen6, the BREAK and CONT instructions had to tell how many IF/ENDIF
+    * blocks they were popping out of, to fix up the mask stack.  This tracks
+    * the IF/ENDIF nesting in each current nested loop level.
+    */
+   int *if_depth_in_loop;
+   int loop_stack_depth;
+   int loop_stack_array_size;     /* entries in loop_stack / if_depth_in_loop */
+};
+
+void brw_pop_insn_state( struct brw_codegen *p );
+void brw_push_insn_state( struct brw_codegen *p );
+void brw_set_default_exec_size(struct brw_codegen *p, unsigned value);
+void brw_set_default_mask_control( struct brw_codegen *p, unsigned value );
+void brw_set_default_saturate( struct brw_codegen *p, bool enable );
+void brw_set_default_access_mode( struct brw_codegen *p, unsigned access_mode );
+void brw_inst_set_compression(const struct gen_device_info *devinfo,
+                              brw_inst *inst, bool on);
+void brw_set_default_compression(struct brw_codegen *p, bool on);
+void brw_inst_set_group(const struct gen_device_info *devinfo,
+                        brw_inst *inst, unsigned group);
+void brw_set_default_group(struct brw_codegen *p, unsigned group);
+void brw_set_default_compression_control(struct brw_codegen *p, enum brw_compression c);
+void brw_set_default_predicate_control( struct brw_codegen *p, unsigned pc );
+void brw_set_default_predicate_inverse(struct brw_codegen *p, bool predicate_inverse);
+void brw_set_default_flag_reg(struct brw_codegen *p, int reg, int subreg);
+void brw_set_default_acc_write_control(struct brw_codegen *p, unsigned value);
+
+void brw_init_codegen(const struct gen_device_info *, struct brw_codegen *p,
+		      void *mem_ctx);
+int brw_disassemble_inst(FILE *file, const struct gen_device_info *devinfo,
+                         struct brw_inst *inst, bool is_compacted);
+void brw_disassemble(const struct gen_device_info *devinfo, void *assembly,
+                     int start, int end, FILE *out);
+const unsigned *brw_get_program( struct brw_codegen *p, unsigned *sz );
+
+brw_inst *brw_next_insn(struct brw_codegen *p, unsigned opcode);
+void brw_set_dest(struct brw_codegen *p, brw_inst *insn, struct brw_reg dest);
+void brw_set_src0(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg);
+
+void gen6_resolve_implied_move(struct brw_codegen *p,
+			       struct brw_reg *src,
+			       unsigned msg_reg_nr);
+
+/* Helpers for regular instructions:
+ */
+#define ALU1(OP)				\
+brw_inst *brw_##OP(struct brw_codegen *p,	\
+	      struct brw_reg dest,		\
+	      struct brw_reg src0);
+
+#define ALU2(OP)				\
+brw_inst *brw_##OP(struct brw_codegen *p,	\
+	      struct brw_reg dest,		\
+	      struct brw_reg src0,		\
+	      struct brw_reg src1);
+
+#define ALU3(OP)				\
+brw_inst *brw_##OP(struct brw_codegen *p,	\
+	      struct brw_reg dest,		\
+	      struct brw_reg src0,		\
+	      struct brw_reg src1,		\
+	      struct brw_reg src2);
+
+#define ROUND(OP) \
+void brw_##OP(struct brw_codegen *p, struct brw_reg dest, struct brw_reg src0);
+
+ALU1(MOV)
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU1(DIM)
+ALU2(ASR)
+ALU1(F32TO16)
+ALU1(F16TO32)
+ALU2(ADD)
+ALU2(AVG)
+ALU2(MUL)
+ALU1(FRC)
+ALU1(RNDD)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU2(LINE)
+ALU2(PLN)
+ALU3(MAD)
+ALU3(LRP)
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+ALU2(ADDC)
+ALU2(SUBB)
+ALU2(MAC)
+
+ROUND(RNDZ)
+ROUND(RNDE)
+
+#undef ALU1
+#undef ALU2
+#undef ALU3
+#undef ROUND
+
+
+/* Helpers for SEND instruction:
+ */
+void brw_set_sampler_message(struct brw_codegen *p,
+                             brw_inst *insn,
+                             unsigned binding_table_index,
+                             unsigned sampler,
+                             unsigned msg_type,
+                             unsigned response_length,
+                             unsigned msg_length,
+                             unsigned header_present,
+                             unsigned simd_mode,
+                             unsigned return_format);
+
+void brw_set_message_descriptor(struct brw_codegen *p,
+                                brw_inst *inst,
+                                enum brw_message_target sfid,
+                                unsigned msg_length,
+                                unsigned response_length,
+                                bool header_present,
+                                bool end_of_thread);
+
+void brw_set_dp_read_message(struct brw_codegen *p,
+			     brw_inst *insn,
+			     unsigned binding_table_index,
+			     unsigned msg_control,
+			     unsigned msg_type,
+			     unsigned target_cache,
+			     unsigned msg_length,
+                             bool header_present,
+			     unsigned response_length);
+
+void brw_set_dp_write_message(struct brw_codegen *p,
+			      brw_inst *insn,
+			      unsigned binding_table_index,
+			      unsigned msg_control,
+			      unsigned msg_type,
+                              unsigned target_cache,
+			      unsigned msg_length,
+			      bool header_present,
+			      unsigned last_render_target,
+			      unsigned response_length,
+			      unsigned end_of_thread,
+			      unsigned send_commit_msg);
+
+void brw_urb_WRITE(struct brw_codegen *p,
+		   struct brw_reg dest,
+		   unsigned msg_reg_nr,
+		   struct brw_reg src0,
+                   enum brw_urb_write_flags flags,
+		   unsigned msg_length,
+		   unsigned response_length,
+		   unsigned offset,
+		   unsigned swizzle);
+
+/**
+ * Send message to shared unit \p sfid with a possibly indirect descriptor \p
+ * desc.  If \p desc is not an immediate it will be transparently loaded to an
+ * address register using an OR instruction.  The returned instruction can be
+ * passed as argument to the usual brw_set_*_message() functions in order to
+ * specify any additional descriptor bits -- If \p desc is an immediate this
+ * will be the SEND instruction itself, otherwise it will be the OR
+ * instruction.
+ */
+struct brw_inst *
+brw_send_indirect_message(struct brw_codegen *p,
+                          unsigned sfid,
+                          struct brw_reg dst,
+                          struct brw_reg payload,
+                          struct brw_reg desc);
+
+void brw_ff_sync(struct brw_codegen *p,
+		   struct brw_reg dest,
+		   unsigned msg_reg_nr,
+		   struct brw_reg src0,
+		   bool allocate,
+		   unsigned response_length,
+		   bool eot);
+
+void brw_svb_write(struct brw_codegen *p,
+                   struct brw_reg dest,
+                   unsigned msg_reg_nr,
+                   struct brw_reg src0,
+                   unsigned binding_table_index,
+                   bool   send_commit_msg);
+
+void brw_fb_WRITE(struct brw_codegen *p,
+		   struct brw_reg payload,
+		   struct brw_reg implied_header,
+		   unsigned msg_control,
+		   unsigned binding_table_index,
+		   unsigned msg_length,
+		   unsigned response_length,
+		   bool eot,
+		   bool last_render_target,
+		   bool header_present);
+
+brw_inst *gen9_fb_READ(struct brw_codegen *p,
+                       struct brw_reg dst,
+                       struct brw_reg payload,
+                       unsigned binding_table_index,
+                       unsigned msg_length,
+                       unsigned response_length,
+                       bool per_sample);
+
+void brw_SAMPLE(struct brw_codegen *p,
+		struct brw_reg dest,
+		unsigned msg_reg_nr,
+		struct brw_reg src0,
+		unsigned binding_table_index,
+		unsigned sampler,
+		unsigned msg_type,
+		unsigned response_length,
+		unsigned msg_length,
+		unsigned header_present,
+		unsigned simd_mode,
+		unsigned return_format);
+
+void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
+                                      struct brw_reg header,
+                                      struct brw_reg sampler_index);
+
+void gen4_math(struct brw_codegen *p,
+	       struct brw_reg dest,
+	       unsigned function,
+	       unsigned msg_reg_nr,
+	       struct brw_reg src,
+	       unsigned precision );
+
+void gen6_math(struct brw_codegen *p,
+	       struct brw_reg dest,
+	       unsigned function,
+	       struct brw_reg src0,
+	       struct brw_reg src1);
+
+void brw_oword_block_read(struct brw_codegen *p,
+			  struct brw_reg dest,
+			  struct brw_reg mrf,
+			  uint32_t offset,
+			  uint32_t bind_table_index);
+
+unsigned brw_scratch_surface_idx(const struct brw_codegen *p);
+
+void brw_oword_block_read_scratch(struct brw_codegen *p,
+				  struct brw_reg dest,
+				  struct brw_reg mrf,
+				  int num_regs,
+				  unsigned offset);
+
+void brw_oword_block_write_scratch(struct brw_codegen *p,
+				   struct brw_reg mrf,
+				   int num_regs,
+				   unsigned offset);
+
+void gen7_block_read_scratch(struct brw_codegen *p,
+                             struct brw_reg dest,
+                             int num_regs,
+                             unsigned offset);
+
+void brw_shader_time_add(struct brw_codegen *p,
+                         struct brw_reg payload,
+                         uint32_t surf_index);
+
+/**
+ * Return the generation-specific jump distance scaling factor.
+ *
+ * Given the number of instructions to jump, we need to scale by
+ * some number to obtain the actual jump distance to program in an
+ * instruction.
+ */
+static inline unsigned
+brw_jump_scale(const struct gen_device_info *devinfo)
+{
+   if (devinfo->gen < 5) {
+      /* Gen4 simply counts 128-bit instructions. */
+      return 1;
+   }
+   if (devinfo->gen < 8) {
+      /* Ironlake up to Gen7 count 64-bit chunks (in order to support
+       * compaction), so each 128-bit instruction takes 2 of them.
+       */
+      return 2;
+   }
+
+   return 16; /* Broadwell and later measure jump targets in bytes. */
+}
+
+void brw_barrier(struct brw_codegen *p, struct brw_reg src);
+
+/* If/else/endif.  Works by manipulating the execution flags on each
+ * channel.
+ */
+brw_inst *brw_IF(struct brw_codegen *p, unsigned execute_size);
+brw_inst *gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
+                  struct brw_reg src0, struct brw_reg src1);
+
+void brw_ELSE(struct brw_codegen *p);
+void brw_ENDIF(struct brw_codegen *p);
+
+/* DO/WHILE loops:
+ */
+brw_inst *brw_DO(struct brw_codegen *p, unsigned execute_size);
+
+brw_inst *brw_WHILE(struct brw_codegen *p);
+
+brw_inst *brw_BREAK(struct brw_codegen *p);
+brw_inst *brw_CONT(struct brw_codegen *p);
+brw_inst *gen6_HALT(struct brw_codegen *p);
+
+/* Forward jumps:
+ */
+void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx);
+
+brw_inst *brw_JMPI(struct brw_codegen *p, struct brw_reg index,
+                   unsigned predicate_control);
+
+void brw_NOP(struct brw_codegen *p);
+
+void brw_WAIT(struct brw_codegen *p);
+
+/* Special case: there is never a destination, execution size will be
+ * taken from src0:
+ */
+void brw_CMP(struct brw_codegen *p,
+	     struct brw_reg dest,
+	     unsigned conditional,
+	     struct brw_reg src0,
+	     struct brw_reg src1);
+
+void
+brw_untyped_atomic(struct brw_codegen *p,
+                   struct brw_reg dst,
+                   struct brw_reg payload,
+                   struct brw_reg surface,
+                   unsigned atomic_op,
+                   unsigned msg_length,
+                   bool response_expected);
+
+void
+brw_untyped_surface_read(struct brw_codegen *p,
+                         struct brw_reg dst,
+                         struct brw_reg payload,
+                         struct brw_reg surface,
+                         unsigned msg_length,
+                         unsigned num_channels);
+
+void
+brw_untyped_surface_write(struct brw_codegen *p,
+                          struct brw_reg payload,
+                          struct brw_reg surface,
+                          unsigned msg_length,
+                          unsigned num_channels);
+
+void
+brw_typed_atomic(struct brw_codegen *p,
+                 struct brw_reg dst,
+                 struct brw_reg payload,
+                 struct brw_reg surface,
+                 unsigned atomic_op,
+                 unsigned msg_length,
+                 bool response_expected);
+
+void
+brw_typed_surface_read(struct brw_codegen *p,
+                       struct brw_reg dst,
+                       struct brw_reg payload,
+                       struct brw_reg surface,
+                       unsigned msg_length,
+                       unsigned num_channels);
+
+void
+brw_typed_surface_write(struct brw_codegen *p,
+                        struct brw_reg payload,
+                        struct brw_reg surface,
+                        unsigned msg_length,
+                        unsigned num_channels);
+
+void
+brw_memory_fence(struct brw_codegen *p,
+                 struct brw_reg dst);
+
+void
+brw_pixel_interpolator_query(struct brw_codegen *p,
+                             struct brw_reg dest,
+                             struct brw_reg mrf,
+                             bool noperspective,
+                             unsigned mode,
+                             struct brw_reg data,
+                             unsigned msg_length,
+                             unsigned response_length);
+
+void
+brw_find_live_channel(struct brw_codegen *p,
+                      struct brw_reg dst,
+                      struct brw_reg mask);
+
+void
+brw_broadcast(struct brw_codegen *p,
+              struct brw_reg dst,
+              struct brw_reg src,
+              struct brw_reg idx);
+
+/***********************************************************************
+ * brw_eu_util.c:
+ */
+
+void brw_copy_indirect_to_indirect(struct brw_codegen *p,
+				   struct brw_indirect dst_ptr,
+				   struct brw_indirect src_ptr,
+				   unsigned count);
+
+void brw_copy_from_indirect(struct brw_codegen *p,
+			    struct brw_reg dst,
+			    struct brw_indirect ptr,
+			    unsigned count);
+
+void brw_copy4(struct brw_codegen *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       unsigned count);
+
+void brw_copy8(struct brw_codegen *p,
+	       struct brw_reg dst,
+	       struct brw_reg src,
+	       unsigned count);
+
+void brw_math_invert( struct brw_codegen *p,
+		      struct brw_reg dst,
+		      struct brw_reg src);
+
+void brw_set_src1(struct brw_codegen *p, brw_inst *insn, struct brw_reg reg);
+
+void brw_set_uip_jip(struct brw_codegen *p, int start_offset);
+
+enum brw_conditional_mod brw_negate_cmod(uint32_t cmod);
+enum brw_conditional_mod brw_swap_cmod(uint32_t cmod);
+
+/* brw_eu_compact.c */
+void brw_init_compaction_tables(const struct gen_device_info *devinfo);
+void brw_compact_instructions(struct brw_codegen *p, int start_offset,
+                              int num_annotations, struct annotation *annotation);
+void brw_uncompact_instruction(const struct gen_device_info *devinfo,
+                               brw_inst *dst, brw_compact_inst *src);
+bool brw_try_compact_instruction(const struct gen_device_info *devinfo,
+                                 brw_compact_inst *dst, brw_inst *src);
+
+void brw_debug_compact_uncompact(const struct gen_device_info *devinfo,
+                                 brw_inst *orig, brw_inst *uncompacted);
+
+/* brw_eu_validate.c */
+bool brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+                               struct annotation_info *annotation);
+
+/* Return the byte offset of the instruction after the one at `offset` in
+ * `store`: compacted instructions take 8 bytes, uncompacted ones 16.
+ */
+static inline int
+next_offset(const struct gen_device_info *devinfo, void *store, int offset)
+{
+   brw_inst *insn = (brw_inst *)((char *)store + offset);
+
+   return offset + (brw_inst_cmpt_control(devinfo, insn) ? 8 : 16);
+}
+
+struct opcode_desc {
+   /* The union is an implementation detail used by brw_opcode_desc() to handle
+    * opcodes that have been reused for different instructions across hardware
+    * generations.
+    *
+    * The gens field acts as a tag. If it is non-zero, name points to a string
+    * containing the instruction mnemonic. If it is zero, the table field is
+    * valid and either points to a secondary opcode_desc table with 'size'
+    * elements or is NULL and no such instruction exists for the opcode.
+    */
+   union {
+      struct {
+         const char *name; /* only ever a string literal, hence read-only */
+         int      nsrc;    /* source count; is_3src() tests it against 3 */
+      };
+      struct {
+         const struct opcode_desc *table;
+         unsigned size;
+      };
+   };
+   int      ndst;
+   int      gens;          /* tag, and mask of supporting generations */
+};
+
+const struct opcode_desc *
+brw_opcode_desc(const struct gen_device_info *devinfo, enum opcode opcode);
+
+static inline bool
+is_3src(const struct gen_device_info *devinfo, enum opcode opcode)
+{
+   const struct opcode_desc *const info = brw_opcode_desc(devinfo, opcode);
+   return info ? info->nsrc == 3 : false; /* NULL: opcode unsupported */
+}
+
+/** Maximum SEND message length */
+#define BRW_MAX_MSG_LENGTH 15
+
+/** First MRF register used by spills */
+#define FIRST_SPILL_MRF(gen) ((gen) == 6 ? 21 : 13)
+
+/** First MRF register used by pull loads */
+#define FIRST_PULL_LOAD_MRF(gen) ((gen) == 6 ? 16 : 13)
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/intel/compiler/brw_eu_compact.c b/src/intel/compiler/brw_eu_compact.c
new file mode 100644
index 00000000000..b2af76d533a
--- /dev/null
+++ b/src/intel/compiler/brw_eu_compact.c
@@ -0,0 +1,1579 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_eu_compact.c
+ *
+ * Instruction compaction is a feature of G45 and newer hardware that allows
+ * for a smaller instruction encoding.
+ *
+ * The instruction cache is on the order of 32KB, and many programs generate
+ * far more instructions than that.  The instruction cache is built to barely
+ * keep up with instruction dispatch ability in cache hit cases -- L1
+ * instruction cache misses that still hit in the next level could limit
+ * throughput by around 50%.
+ *
+ * The idea of instruction compaction is that most instructions use a tiny
+ * subset of the GPU functionality, so we can encode what would be a 16 byte
+ * instruction in 8 bytes using some lookup tables for various fields.
+ *
+ *
+ * Instruction compaction capabilities vary subtly by generation.
+ *
+ * G45's support for instruction compaction is very limited. Jump counts on
+ * this generation are in units of 16-byte uncompacted instructions. As such,
+ * all jump targets must be 16-byte aligned. Also, all instructions must be
+ * naturally aligned, i.e. uncompacted instructions must be 16-byte aligned.
+ * A G45-only instruction, NENOP, must be used to provide padding to align
+ * uncompacted instructions.
+ *
+ * Gen5 removes these restrictions and changes jump counts to be in units of
+ * 8-byte compacted instructions, allowing jump targets to be only 8-byte
+ * aligned. Uncompacted instructions can also be placed on 8-byte boundaries.
+ *
+ * Gen6 adds the ability to compact instructions with a limited range of
+ * immediate values. Compactable immediates have 12 unrestricted bits, and a
+ * 13th bit that's replicated through the high 20 bits, to create the 32-bit
+ * value of DW3 in the uncompacted instruction word.
+ *
+ * On Gen7 we can compact some control flow instructions with a small positive
+ * immediate in the low bits of DW3, like ENDIF with the JIP field. Other
+ * control flow instructions with UIP cannot be compacted, because of the
+ * replicated 13th bit. No control flow instructions can be compacted on Gen6
+ * since the jump count field is not in DW3.
+ *
+ *    break    JIP/UIP
+ *    cont     JIP/UIP
+ *    halt     JIP/UIP
+ *    if       JIP/UIP
+ *    else     JIP (plus UIP on BDW+)
+ *    endif    JIP
+ *    while    JIP (must be negative)
+ *
+ * Gen 8 adds support for compacting 3-src instructions.
+ */
+
+#include "brw_eu.h"
+#include "brw_shader.h"
+#include "intel_asm_annotation.h"
+#include "common/gen_debug.h"
+
+/* G45 control index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as control_index_table for
+ * this hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t g45_control_index_table[32] = {
+   0b00000000000000000,
+   0b01000000000000000,
+   0b00110000000000000,
+   0b00000000000000010,
+   0b00100000000000000,
+   0b00010000000000000,
+   0b01000000000100000,
+   0b01000000100000000,
+   0b01010000000100000,
+   0b00000000100000010,
+   0b11000000000000000,
+   0b00001000100000010,
+   0b01001000100000000,
+   0b00000000100000000,
+   0b11000000000100000,
+   0b00001000100000000,
+   0b10110000000000000,
+   0b11010000000100000,
+   0b00110000100000000,
+   0b00100000100000000,
+   0b01000000000001000,
+   0b01000000000000100,
+   0b00111100000000000,
+   0b00101011000000000,
+   0b00110000000010000,
+   0b00010000100000000,
+   0b01000000000100100,
+   0b01000000000101000,
+   0b00110000000000110,
+   0b00000000000001010,
+   0b01010000000101000,
+   0b01010000000100100
+};
+
+/* G45 datatype table: 32 uncompacted bit patterns; the array position is the
+ * compacted index.  Presumably installed as datatype_table for this hardware
+ * by code outside this part of the file -- confirm.
+ */
+static const uint32_t g45_datatype_table[32] = {
+   0b001000000000100001,
+   0b001011010110101101,
+   0b001000001000110001,
+   0b001111011110111101,
+   0b001011010110101100,
+   0b001000000110101101,
+   0b001000000000100000,
+   0b010100010110110001,
+   0b001100011000101101,
+   0b001000000000100010,
+   0b001000001000110110,
+   0b010000001000110001,
+   0b001000001000110010,
+   0b011000001000110010,
+   0b001111011110111100,
+   0b001000000100101000,
+   0b010100011000110001,
+   0b001010010100101001,
+   0b001000001000101001,
+   0b010000001000110110,
+   0b101000001000110001,
+   0b001011011000101101,
+   0b001000000100001001,
+   0b001011011000101100,
+   0b110100011000110001,
+   0b001000001110111101,
+   0b110000001000110001,
+   0b011000000100101010,
+   0b101000001000101001,
+   0b001011010110001100,
+   0b001000000110100001,
+   0b001010010100001000
+};
+
+/* G45 subregister table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as subreg_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t g45_subreg_table[32] = {
+   0b000000000000000,
+   0b000000010000000,
+   0b000001000000000,
+   0b000100000000000,
+   0b000000000100000,
+   0b100000000000000,
+   0b000000000010000,
+   0b001100000000000,
+   0b001010000000000,
+   0b000000100000000,
+   0b001000000000000,
+   0b000000000001000,
+   0b000000001000000,
+   0b000000000000001,
+   0b000010000000000,
+   0b000000010100000,
+   0b000000000000111,
+   0b000001000100000,
+   0b011000000000000,
+   0b000000110000000,
+   0b000000000000010,
+   0b000000000000100,
+   0b000000001100000,
+   0b000100000000010,
+   0b001110011000110,
+   0b001110100001000,
+   0b000110011000110,
+   0b000001000011000,
+   0b000110010000100,
+   0b001100000000110,
+   0b000000010000110,
+   0b000001000110000
+};
+
+/* G45 source index table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as src_index_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t g45_src_index_table[32] = {
+   0b000000000000,
+   0b010001101000,
+   0b010110001000,
+   0b011010010000,
+   0b001101001000,
+   0b010110001010,
+   0b010101110000,
+   0b011001111000,
+   0b001000101000,
+   0b000000101000,
+   0b010001010000,
+   0b111101101100,
+   0b010110001100,
+   0b010001101100,
+   0b011010010100,
+   0b010001001100,
+   0b001100101000,
+   0b000000000010,
+   0b111101001100,
+   0b011001101000,
+   0b010101001000,
+   0b000000000100,
+   0b000000101100,
+   0b010001101010,
+   0b000000111000,
+   0b010101011000,
+   0b000100100000,
+   0b010110000000,
+   0b010000000100,
+   0b010000111000,
+   0b000101100000,
+   0b111101110100
+};
+
+/* Gen6 control index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as control_index_table for
+ * this hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t gen6_control_index_table[32] = {
+   0b00000000000000000,
+   0b01000000000000000,
+   0b00110000000000000,
+   0b00000000100000000,
+   0b00010000000000000,
+   0b00001000100000000,
+   0b00000000100000010,
+   0b00000000000000010,
+   0b01000000100000000,
+   0b01010000000000000,
+   0b10110000000000000,
+   0b00100000000000000,
+   0b11010000000000000,
+   0b11000000000000000,
+   0b01001000100000000,
+   0b01000000000001000,
+   0b01000000000000100,
+   0b00000000000001000,
+   0b00000000000000100,
+   0b00111000100000000,
+   0b00001000100000010,
+   0b00110000100000000,
+   0b00110000000000001,
+   0b00100000000000001,
+   0b00110000000000010,
+   0b00110000000000101,
+   0b00110000000001001,
+   0b00110000000010000,
+   0b00110000000000011,
+   0b00110000000000100,
+   0b00110000100001000,
+   0b00100000000001001
+};
+
+/* Gen6 datatype table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as datatype_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t gen6_datatype_table[32] = {
+   0b001001110000000000,
+   0b001000110000100000,
+   0b001001110000000001,
+   0b001000000001100000,
+   0b001010110100101001,
+   0b001000000110101101,
+   0b001100011000101100,
+   0b001011110110101101,
+   0b001000000111101100,
+   0b001000000001100001,
+   0b001000110010100101,
+   0b001000000001000001,
+   0b001000001000110001,
+   0b001000001000101001,
+   0b001000000000100000,
+   0b001000001000110010,
+   0b001010010100101001,
+   0b001011010010100101,
+   0b001000000110100101,
+   0b001100011000101001,
+   0b001011011000101100,
+   0b001011010110100101,
+   0b001011110110100101,
+   0b001111011110111101,
+   0b001111011110111100,
+   0b001111011110111101,
+   0b001111011110011101,
+   0b001111011110111110,
+   0b001000000000100001,
+   0b001000000000100010,
+   0b001001111111011101,
+   0b001000001110111110,
+};
+
+/* Gen6 subregister table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as subreg_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t gen6_subreg_table[32] = {
+   0b000000000000000,
+   0b000000000000100,
+   0b000000110000000,
+   0b111000000000000,
+   0b011110000001000,
+   0b000010000000000,
+   0b000000000010000,
+   0b000110000001100,
+   0b001000000000000,
+   0b000001000000000,
+   0b000001010010100,
+   0b000000001010110,
+   0b010000000000000,
+   0b110000000000000,
+   0b000100000000000,
+   0b000000010000000,
+   0b000000000001000,
+   0b100000000000000,
+   0b000001010000000,
+   0b001010000000000,
+   0b001100000000000,
+   0b000000001010100,
+   0b101101010010100,
+   0b010100000000000,
+   0b000000010001111,
+   0b011000000000000,
+   0b111110000000000,
+   0b101000000000000,
+   0b000000000001111,
+   0b000100010001111,
+   0b001000010001111,
+   0b000110000000000,
+};
+
+/* Gen6 source index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as src_index_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t gen6_src_index_table[32] = {
+   0b000000000000,
+   0b010110001000,
+   0b010001101000,
+   0b001000101000,
+   0b011010010000,
+   0b000100100000,
+   0b010001101100,
+   0b010101110000,
+   0b011001111000,
+   0b001100101000,
+   0b010110001100,
+   0b001000100000,
+   0b010110001010,
+   0b000000000010,
+   0b010101010000,
+   0b010101101000,
+   0b111101001100,
+   0b111100101100,
+   0b011001110000,
+   0b010110001001,
+   0b010101011000,
+   0b001101001000,
+   0b010000101100,
+   0b010000000000,
+   0b001101110000,
+   0b001100010000,
+   0b001100000000,
+   0b010001101010,
+   0b001101111000,
+   0b000001110000,
+   0b001100100000,
+   0b001101010000,
+};
+
+/* Gen7 control index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as control_index_table for
+ * this hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t gen7_control_index_table[32] = {
+   0b0000000000000000010,
+   0b0000100000000000000,
+   0b0000100000000000001,
+   0b0000100000000000010,
+   0b0000100000000000011,
+   0b0000100000000000100,
+   0b0000100000000000101,
+   0b0000100000000000111,
+   0b0000100000000001000,
+   0b0000100000000001001,
+   0b0000100000000001101,
+   0b0000110000000000000,
+   0b0000110000000000001,
+   0b0000110000000000010,
+   0b0000110000000000011,
+   0b0000110000000000100,
+   0b0000110000000000101,
+   0b0000110000000000111,
+   0b0000110000000001001,
+   0b0000110000000001101,
+   0b0000110000000010000,
+   0b0000110000100000000,
+   0b0001000000000000000,
+   0b0001000000000000010,
+   0b0001000000000000100,
+   0b0001000000100000000,
+   0b0010110000000000000,
+   0b0010110000000010000,
+   0b0011000000000000000,
+   0b0011000000100000000,
+   0b0101000000000000000,
+   0b0101000000100000000
+};
+
+/* Gen7 datatype table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as datatype_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t gen7_datatype_table[32] = {
+   0b001000000000000001,
+   0b001000000000100000,
+   0b001000000000100001,
+   0b001000000001100001,
+   0b001000000010111101,
+   0b001000001011111101,
+   0b001000001110100001,
+   0b001000001110100101,
+   0b001000001110111101,
+   0b001000010000100001,
+   0b001000110000100000,
+   0b001000110000100001,
+   0b001001010010100101,
+   0b001001110010100100,
+   0b001001110010100101,
+   0b001111001110111101,
+   0b001111011110011101,
+   0b001111011110111100,
+   0b001111011110111101,
+   0b001111111110111100,
+   0b000000001000001100,
+   0b001000000000111101,
+   0b001000000010100101,
+   0b001000010000100000,
+   0b001001010010100100,
+   0b001001110010000100,
+   0b001010010100001001,
+   0b001101111110111101,
+   0b001111111110111101,
+   0b001011110110101100,
+   0b001010010100101000,
+   0b001010110100101000
+};
+
+/* Gen7 subregister table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as subreg_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t gen7_subreg_table[32] = {
+   0b000000000000000,
+   0b000000000000001,
+   0b000000000001000,
+   0b000000000001111,
+   0b000000000010000,
+   0b000000010000000,
+   0b000000100000000,
+   0b000000110000000,
+   0b000001000000000,
+   0b000001000010000,
+   0b000010100000000,
+   0b001000000000000,
+   0b001000000000001,
+   0b001000010000001,
+   0b001000010000010,
+   0b001000010000011,
+   0b001000010000100,
+   0b001000010000111,
+   0b001000010001000,
+   0b001000010001110,
+   0b001000010001111,
+   0b001000110000000,
+   0b001000111101000,
+   0b010000000000000,
+   0b010000110000000,
+   0b011000000000000,
+   0b011110010000111,
+   0b100000000000000,
+   0b101000000000000,
+   0b110000000000000,
+   0b111000000000000,
+   0b111000000011100
+};
+
+/* Gen7 source index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as src_index_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t gen7_src_index_table[32] = {
+   0b000000000000,
+   0b000000000010,
+   0b000000010000,
+   0b000000010010,
+   0b000000011000,
+   0b000000100000,
+   0b000000101000,
+   0b000001001000,
+   0b000001010000,
+   0b000001110000,
+   0b000001111000,
+   0b001100000000,
+   0b001100000010,
+   0b001100001000,
+   0b001100010000,
+   0b001100010010,
+   0b001100100000,
+   0b001100101000,
+   0b001100111000,
+   0b001101000000,
+   0b001101000010,
+   0b001101001000,
+   0b001101010000,
+   0b001101100000,
+   0b001101101000,
+   0b001101110000,
+   0b001101110001,
+   0b001101111000,
+   0b010001101000,
+   0b010001101001,
+   0b010001101010,
+   0b010110001000
+};
+
+/* Gen8 control index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as control_index_table for
+ * this hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t gen8_control_index_table[32] = {
+   0b0000000000000000010,
+   0b0000100000000000000,
+   0b0000100000000000001,
+   0b0000100000000000010,
+   0b0000100000000000011,
+   0b0000100000000000100,
+   0b0000100000000000101,
+   0b0000100000000000111,
+   0b0000100000000001000,
+   0b0000100000000001001,
+   0b0000100000000001101,
+   0b0000110000000000000,
+   0b0000110000000000001,
+   0b0000110000000000010,
+   0b0000110000000000011,
+   0b0000110000000000100,
+   0b0000110000000000101,
+   0b0000110000000000111,
+   0b0000110000000001001,
+   0b0000110000000001101,
+   0b0000110000000010000,
+   0b0000110000100000000,
+   0b0001000000000000000,
+   0b0001000000000000010,
+   0b0001000000000000100,
+   0b0001000000100000000,
+   0b0010110000000000000,
+   0b0010110000000010000,
+   0b0011000000000000000,
+   0b0011000000100000000,
+   0b0101000000000000000,
+   0b0101000000100000000
+};
+
+/* Gen8 datatype table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as datatype_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint32_t gen8_datatype_table[32] = {
+   0b001000000000000000001,
+   0b001000000000001000000,
+   0b001000000000001000001,
+   0b001000000000011000001,
+   0b001000000000101011101,
+   0b001000000010111011101,
+   0b001000000011101000001,
+   0b001000000011101000101,
+   0b001000000011101011101,
+   0b001000001000001000001,
+   0b001000011000001000000,
+   0b001000011000001000001,
+   0b001000101000101000101,
+   0b001000111000101000100,
+   0b001000111000101000101,
+   0b001011100011101011101,
+   0b001011101011100011101,
+   0b001011101011101011100,
+   0b001011101011101011101,
+   0b001011111011101011100,
+   0b000000000010000001100,
+   0b001000000000001011101,
+   0b001000000000101000101,
+   0b001000001000001000000,
+   0b001000101000101000100,
+   0b001000111000100000100,
+   0b001001001001000001001,
+   0b001010111011101011101,
+   0b001011111011101011101,
+   0b001001111001101001100,
+   0b001001001001001001000,
+   0b001001011001001001000
+};
+
+/* Gen8 subregister table: 32 uncompacted bit patterns; the array position is
+ * the compacted index.  Presumably installed as subreg_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t gen8_subreg_table[32] = {
+   0b000000000000000,
+   0b000000000000001,
+   0b000000000001000,
+   0b000000000001111,
+   0b000000000010000,
+   0b000000010000000,
+   0b000000100000000,
+   0b000000110000000,
+   0b000001000000000,
+   0b000001000010000,
+   0b000001010000000,
+   0b001000000000000,
+   0b001000000000001,
+   0b001000010000001,
+   0b001000010000010,
+   0b001000010000011,
+   0b001000010000100,
+   0b001000010000111,
+   0b001000010001000,
+   0b001000010001110,
+   0b001000010001111,
+   0b001000110000000,
+   0b001000111101000,
+   0b010000000000000,
+   0b010000110000000,
+   0b011000000000000,
+   0b011110010000111,
+   0b100000000000000,
+   0b101000000000000,
+   0b110000000000000,
+   0b111000000000000,
+   0b111000000011100
+};
+
+/* Gen8 source index table: 32 uncompacted bit patterns; the array position
+ * is the compacted index.  Presumably installed as src_index_table for this
+ * hardware by code outside this part of the file -- confirm.
+ */
+static const uint16_t gen8_src_index_table[32] = {
+   0b000000000000,
+   0b000000000010,
+   0b000000010000,
+   0b000000010010,
+   0b000000011000,
+   0b000000100000,
+   0b000000101000,
+   0b000001001000,
+   0b000001010000,
+   0b000001110000,
+   0b000001111000,
+   0b001100000000,
+   0b001100000010,
+   0b001100001000,
+   0b001100010000,
+   0b001100010010,
+   0b001100100000,
+   0b001100101000,
+   0b001100111000,
+   0b001101000000,
+   0b001101000010,
+   0b001101001000,
+   0b001101010000,
+   0b001101100000,
+   0b001101101000,
+   0b001101110000,
+   0b001101110001,
+   0b001101111000,
+   0b010001101000,
+   0b010001101001,
+   0b010001101010,
+   0b010110001000
+};
+
+/* This is actually the control index table for Cherryview (26 bits), but the
+ * only difference from Broadwell (24 bits) is that it has two extra 0-bits at
+ * the start.
+ *
+ * The low 24 bits have the same mappings on both hardware.
+ *
+ * The array position is the compacted 3-src control index; the table is
+ * searched by set_3src_control_index() and indexed directly by
+ * set_uncompacted_3src_control_index().
+ */
+static const uint32_t gen8_3src_control_index_table[4] = {
+   0b00100000000110000000000001,
+   0b00000000000110000000000001,
+   0b00000000001000000000000001,
+   0b00000000001000000000100001
+};
+
+/* This is actually the source index table for Cherryview (49 bits), but the
+ * only difference from Broadwell (46 bits) is that it has three extra 0-bits
+ * at the start.
+ *
+ * The low 44 bits have the same mappings on both hardware, and since the high
+ * three bits on Broadwell are zero, we can reuse Cherryview's table.
+ *
+ * The array position is the compacted 3-src source index; the table is
+ * searched by set_3src_source_index() and indexed directly by
+ * set_uncompacted_3src_source_index().
+ */
+static const uint64_t gen8_3src_source_index_table[4] = {
+   0b0000001110010011100100111001000001111000000000000,
+   0b0000001110010011100100111001000001111000000000010,
+   0b0000001110010011100100111001000001111000000001000,
+   0b0000001110010011100100111001000001111000000100000
+};
+
+/* Lookup tables currently in effect.  The set_*_index() helpers search them
+ * and the set_uncompacted_*() helpers index into them.  Nothing in this part
+ * of the file assigns these pointers; presumably an init routine points them
+ * at the per-generation tables above -- confirm.
+ */
+static const uint32_t *control_index_table;
+static const uint32_t *datatype_table;
+static const uint16_t *subreg_table;
+static const uint16_t *src_index_table;
+
+/**
+ * Gathers the control bits of \p src into a single word and searches the
+ * 32 entries of control_index_table for it.
+ *
+ * On a match the table position is stored as the control index of \p dst and
+ * true is returned.  Otherwise \p dst is not written and false is returned.
+ */
+static bool
+set_control_index(const struct gen_device_info *devinfo,
+                  brw_compact_inst *dst, brw_inst *src)
+{
+   uint32_t pattern; /* 17b/G45; 19b/IVB+ */
+
+   if (devinfo->gen >= 8) {
+      pattern = (brw_inst_bits(src, 33, 31) << 16) | /*  3b */
+                (brw_inst_bits(src, 23, 12) <<  4) | /* 12b */
+                (brw_inst_bits(src, 10,  9) <<  2) | /*  2b */
+                (brw_inst_bits(src, 34, 34) <<  1) | /*  1b */
+                (brw_inst_bits(src,  8,  8));        /*  1b */
+   } else {
+      pattern = (brw_inst_bits(src, 31, 31) << 16) | /*  1b */
+                (brw_inst_bits(src, 23,  8));        /* 16b */
+
+      /* On gen7, the flag register and subregister numbers are integrated
+       * into the control index.
+       */
+      if (devinfo->gen == 7)
+         pattern |= brw_inst_bits(src, 90, 89) << 17; /* 2b */
+   }
+
+   unsigned slot = 0;
+   while (slot < 32 && control_index_table[slot] != pattern)
+      slot++;
+
+   if (slot == 32)
+      return false;
+
+   brw_compact_inst_set_control_index(devinfo, dst, slot);
+   return true;
+}
+
+/**
+ * Gathers the datatype bits of \p src into a single word and searches the
+ * 32 entries of datatype_table for it.
+ *
+ * On a match the table position is stored as the datatype index of \p dst
+ * and true is returned.  Otherwise \p dst is not written and false is
+ * returned.
+ */
+static bool
+set_datatype_index(const struct gen_device_info *devinfo, brw_compact_inst *dst,
+                   brw_inst *src)
+{
+   uint32_t pattern; /* 18b/G45+; 21b/BDW+ */
+
+   if (devinfo->gen >= 8) {
+      pattern = (brw_inst_bits(src, 63, 61) << 18) | /*  3b */
+                (brw_inst_bits(src, 94, 89) << 12) | /*  6b */
+                (brw_inst_bits(src, 46, 35));        /* 12b */
+   } else {
+      pattern = (brw_inst_bits(src, 63, 61) << 15) | /*  3b */
+                (brw_inst_bits(src, 46, 32));        /* 15b */
+   }
+
+   unsigned slot = 0;
+   while (slot < 32 && datatype_table[slot] != pattern)
+      slot++;
+
+   if (slot == 32)
+      return false;
+
+   brw_compact_inst_set_datatype_index(devinfo, dst, slot);
+   return true;
+}
+
+/**
+ * Gathers the subregister bits of \p src into a single word and searches the
+ * 32 entries of subreg_table for it.  Bits 100:96 only take part when
+ * \p is_immediate is false.
+ *
+ * On a match the table position is stored as the subreg index of \p dst and
+ * true is returned.  Otherwise \p dst is not written and false is returned.
+ */
+static bool
+set_subreg_index(const struct gen_device_info *devinfo, brw_compact_inst *dst,
+                 brw_inst *src, bool is_immediate)
+{
+   uint16_t pattern =                     /* 15b */
+      (brw_inst_bits(src, 68, 64) << 5) | /*  5b */
+      (brw_inst_bits(src, 52, 48) << 0);  /*  5b */
+
+   if (!is_immediate)
+      pattern |= brw_inst_bits(src, 100, 96) << 10; /* 5b */
+
+   unsigned slot = 0;
+   while (slot < 32 && subreg_table[slot] != pattern)
+      slot++;
+
+   if (slot == 32)
+      return false;
+
+   brw_compact_inst_set_subreg_index(devinfo, dst, slot);
+   return true;
+}
+
+/**
+ * Searches the 32 entries of src_index_table for \p uncompacted.
+ *
+ * On a match the table position is written to \p compacted and true is
+ * returned.  Otherwise \p compacted is not written and false is returned.
+ */
+static bool
+get_src_index(uint16_t uncompacted,
+              uint16_t *compacted)
+{
+   for (unsigned slot = 0; slot < 32; slot++) {
+      if (src_index_table[slot] != uncompacted)
+         continue;
+
+      *compacted = slot;
+      return true;
+   }
+
+   return false;
+}
+
+/**
+ * Looks up bits 88:77 of \p src with get_src_index() and, when found, stores
+ * the result as the src0 index of \p dst.
+ *
+ * Returns false, without writing \p dst, when the lookup fails.
+ */
+static bool
+set_src0_index(const struct gen_device_info *devinfo,
+               brw_compact_inst *dst, brw_inst *src)
+{
+   uint16_t index;
+
+   if (get_src_index(brw_inst_bits(src, 88, 77) /* 12b */, &index)) {
+      brw_compact_inst_set_src0_index(devinfo, dst, index);
+      return true;
+   }
+
+   return false;
+}
+
+/**
+ * Stores the src1 index of \p dst.
+ *
+ * When \p is_immediate is true the field receives bits 12:8 of the 32-bit
+ * immediate.  Otherwise bits 120:109 of \p src are looked up with
+ * get_src_index(); if that lookup fails, \p dst is not written and false is
+ * returned.
+ */
+static bool
+set_src1_index(const struct gen_device_info *devinfo, brw_compact_inst *dst,
+               brw_inst *src, bool is_immediate)
+{
+   uint16_t index;
+
+   if (!is_immediate) {
+      if (!get_src_index(brw_inst_bits(src, 120, 109) /* 12b */, &index))
+         return false;
+   } else {
+      index = (brw_inst_imm_ud(devinfo, src) >> 8) & 0x1f;
+   }
+
+   brw_compact_inst_set_src1_index(devinfo, dst, index);
+
+   return true;
+}
+
+/**
+ * Gathers the three-source control bits of \p src into a single word and
+ * searches gen8_3src_control_index_table for it.  Bits 36:35 only take part
+ * on gen9+ and Cherryview.
+ *
+ * On a match the table position is stored as the 3-src control index of
+ * \p dst and true is returned.  Otherwise \p dst is not written and false is
+ * returned.
+ */
+static bool
+set_3src_control_index(const struct gen_device_info *devinfo,
+                       brw_compact_inst *dst, brw_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   /* 24b/BDW; 26b/CHV */
+   uint32_t pattern = brw_inst_bits(src, 28,  8); /* 21b */
+   pattern |= brw_inst_bits(src, 34, 32) << 21;   /*  3b */
+
+   if (devinfo->gen >= 9 || devinfo->is_cherryview)
+      pattern |= brw_inst_bits(src, 36, 35) << 24; /* 2b */
+
+   const unsigned count = ARRAY_SIZE(gen8_3src_control_index_table);
+   unsigned slot = 0;
+   while (slot < count && gen8_3src_control_index_table[slot] != pattern)
+      slot++;
+
+   if (slot == count)
+      return false;
+
+   brw_compact_inst_set_3src_control_index(devinfo, dst, slot);
+   return true;
+}
+
+/**
+ * Gathers the three-source source bits of \p src into a single word and
+ * searches gen8_3src_source_index_table for it.  The bits above bit 43 of the
+ * word come from different instruction bits on gen9+/Cherryview than on the
+ * remaining gen8 hardware.
+ *
+ * On a match the table position is stored as the 3-src source index of
+ * \p dst and true is returned.  Otherwise \p dst is not written and false is
+ * returned.
+ */
+static bool
+set_3src_source_index(const struct gen_device_info *devinfo,
+                      brw_compact_inst *dst, brw_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   /* 46b/BDW; 49b/CHV */
+   uint64_t pattern = brw_inst_bits(src,  55,  37); /* 19b */
+   pattern |= brw_inst_bits(src,  72,  65) << 19;   /*  8b */
+   pattern |= brw_inst_bits(src,  93,  86) << 27;   /*  8b */
+   pattern |= brw_inst_bits(src, 114, 107) << 35;   /*  8b */
+   pattern |= brw_inst_bits(src,  83,  83) << 43;   /*  1b */
+
+   if (devinfo->gen >= 9 || devinfo->is_cherryview) {
+      pattern |= brw_inst_bits(src,  84,  84) << 44;  /* 1b */
+      pattern |= brw_inst_bits(src, 105, 104) << 45;  /* 2b */
+      pattern |= brw_inst_bits(src, 126, 125) << 47;  /* 2b */
+   } else {
+      pattern |= brw_inst_bits(src, 104, 104) << 44;  /* 1b */
+      pattern |= brw_inst_bits(src, 125, 125) << 45;  /* 1b */
+   }
+
+   const unsigned count = ARRAY_SIZE(gen8_3src_source_index_table);
+   unsigned slot = 0;
+   while (slot < count && gen8_3src_source_index_table[slot] != pattern)
+      slot++;
+
+   if (slot == count)
+      return false;
+
+   brw_compact_inst_set_3src_source_index(devinfo, dst, slot);
+   return true;
+}
+
+/**
+ * Reports whether \p src has state that the compacted encoding cannot
+ * represent, in which case the instruction must stay uncompacted.
+ */
+static bool
+has_unmapped_bits(const struct gen_device_info *devinfo, brw_inst *src)
+{
+   /* EOT can only be mapped on a send if the src1 is an immediate */
+   const bool is_send =
+      brw_inst_opcode(devinfo, src) == BRW_OPCODE_SENDC ||
+      brw_inst_opcode(devinfo, src) == BRW_OPCODE_SEND;
+
+   if (is_send && brw_inst_eot(devinfo, src))
+      return true;
+
+   /* Check for instruction bits that don't map to any of the fields of the
+    * compacted instruction.  The instruction cannot be compacted if any of
+    * them are set.  They overlap with:
+    *  - NibCtrl (bit 47 on Gen7, bit 11 on Gen8)
+    *  - Dst.AddrImm[9] (bit 47 on Gen8)
+    *  - Src0.AddrImm[9] (bit 95 on Gen8)
+    *  - Imm64[27:31] (bits 91-95 on Gen7, bit 95 on Gen8)
+    *  - UIP[31] (bit 95 on Gen8)
+    */
+   if (devinfo->gen < 8) {
+      assert(!brw_inst_bits(src, 7,  7) &&
+             !(devinfo->gen < 7 && brw_inst_bits(src, 90, 90)));
+
+      if (brw_inst_bits(src, 95, 91))
+         return true;
+
+      return brw_inst_bits(src, 47, 47) != 0;
+   }
+
+   assert(!brw_inst_bits(src, 7,  7));
+
+   if (brw_inst_bits(src, 95, 95))
+      return true;
+   if (brw_inst_bits(src, 47, 47))
+      return true;
+
+   return brw_inst_bits(src, 11, 11) != 0;
+}
+
+/**
+ * Three-source counterpart of has_unmapped_bits().
+ *
+ * Always returns false: the bits without a compacted counterpart are only
+ * asserted to be clear, they never cause compaction to be refused.
+ */
+static bool
+has_3src_unmapped_bits(const struct gen_device_info *devinfo, brw_inst *src)
+{
+   /* Check for three-source instruction bits that don't map to any of the
+    * fields of the compacted instruction.  All of them seem to be reserved
+    * bits currently.
+    */
+   const bool gen9_or_chv = devinfo->gen >= 9 || devinfo->is_cherryview;
+
+   if (!gen9_or_chv) {
+      assert(devinfo->gen >= 8);
+      assert(!brw_inst_bits(src, 127, 126) &&
+             !brw_inst_bits(src, 105, 105) &&
+             !brw_inst_bits(src, 84, 84) &&
+             !brw_inst_bits(src, 36, 35) &&
+             !brw_inst_bits(src, 7,  7));
+   } else {
+      assert(!brw_inst_bits(src, 127, 127) &&
+             !brw_inst_bits(src, 7,  7));
+   }
+
+   return false;
+}
+
+/**
+ * Tries to encode the three-source instruction \p src into \p dst.
+ *
+ * Returns false when either index lookup finds no table entry.  \p dst may
+ * already hold some fields at that point, so callers should hand in a
+ * scratch instruction.
+ */
+static bool
+brw_try_compact_3src_instruction(const struct gen_device_info *devinfo,
+                                 brw_compact_inst *dst, brw_inst *src)
+{
+   assert(devinfo->gen >= 8);
+
+   if (has_3src_unmapped_bits(devinfo, src))
+      return false;
+
+/* Copies one same-named 3-src field from src straight into dst. */
+#define compact_3src(field) \
+   brw_compact_inst_set_3src_##field(devinfo, dst, \
+                                     brw_inst_3src_##field(devinfo, src))
+
+   compact_3src(opcode);
+
+   if (!set_3src_control_index(devinfo, dst, src) ||
+       !set_3src_source_index(devinfo, dst, src))
+      return false;
+
+   compact_3src(dst_reg_nr);
+   compact_3src(src0_rep_ctrl);
+   brw_compact_inst_set_3src_cmpt_control(devinfo, dst, true);
+   compact_3src(debug_control);
+   compact_3src(saturate);
+   compact_3src(src1_rep_ctrl);
+   compact_3src(src2_rep_ctrl);
+   compact_3src(src0_reg_nr);
+   compact_3src(src1_reg_nr);
+   compact_3src(src2_reg_nr);
+   compact_3src(src0_subreg_nr);
+   compact_3src(src1_subreg_nr);
+   compact_3src(src2_subreg_nr);
+
+#undef compact_3src
+
+   return true;
+}
+
+/* A compacted instruction stores the low 12 bits of an immediate verbatim
+ * plus a 13th bit that is replicated through the remaining 20 high bits.
+ *
+ * An immediate is therefore compactable exactly when its upper 20 bits are
+ * either all zero or all one: 12-bit integers, 0.0f, and some limited uses
+ * of packed vectors.
+ */
+static bool
+is_compactable_immediate(unsigned imm)
+{
+   /* Drop the 12 bits that are stored as-is; only the rest matters. */
+   const unsigned high_bits = imm & ~0xfff;
+
+   if (high_bits == 0)
+      return true;
+
+   return high_bits == 0xfffff000;
+}
+
+/**
+ * Tries to compact instruction src into dst.
+ *
+ * dst is written only when all of src could be encoded: every field is first
+ * assembled in a local temporary that is copied out on success.
+ * brw_compact_instructions() relies on dst staying untouched on failure.
+ *
+ * Returns true if src was compacted into dst, false otherwise.
+ */
+bool
+brw_try_compact_instruction(const struct gen_device_info *devinfo,
+                            brw_compact_inst *dst, brw_inst *src)
+{
+   brw_compact_inst temp;
+
+   assert(brw_inst_cmpt_control(devinfo, src) == 0);
+
+   if (is_3src(devinfo, brw_inst_opcode(devinfo, src))) {
+      /* Three-source instructions are only compacted on gen8+. */
+      if (devinfo->gen < 8)
+         return false;
+
+      memset(&temp, 0, sizeof(temp));
+      if (!brw_try_compact_3src_instruction(devinfo, &temp, src))
+         return false;
+
+      *dst = temp;
+      return true;
+   }
+
+   const bool is_immediate =
+      brw_inst_src0_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE ||
+      brw_inst_src1_reg_file(devinfo, src) == BRW_IMMEDIATE_VALUE;
+
+   /* Immediates are never compacted before gen6, and from gen6 on only when
+    * the value fits the compacted encoding.
+    */
+   if (is_immediate) {
+      if (devinfo->gen < 6)
+         return false;
+      if (!is_compactable_immediate(brw_inst_imm_ud(devinfo, src)))
+         return false;
+   }
+
+   if (has_unmapped_bits(devinfo, src))
+      return false;
+
+   memset(&temp, 0, sizeof(temp));
+
+/* Copies one same-named field from src straight into temp. */
+#define copy_field(field) \
+   brw_compact_inst_set_##field(devinfo, &temp, brw_inst_##field(devinfo, src))
+
+   copy_field(opcode);
+   copy_field(debug_control);
+
+   if (!set_control_index(devinfo, &temp, src) ||
+       !set_datatype_index(devinfo, &temp, src) ||
+       !set_subreg_index(devinfo, &temp, src, is_immediate))
+      return false;
+
+   if (devinfo->gen < 6) {
+      copy_field(mask_control_ex);
+   } else {
+      copy_field(acc_wr_control);
+   }
+
+   copy_field(cond_modifier);
+
+   if (devinfo->gen <= 6)
+      copy_field(flag_subreg_nr);
+
+   brw_compact_inst_set_cmpt_control(devinfo, &temp, true);
+
+   if (!set_src0_index(devinfo, &temp, src) ||
+       !set_src1_index(devinfo, &temp, src, is_immediate))
+      return false;
+
+   brw_compact_inst_set_dst_reg_nr(devinfo, &temp,
+                                   brw_inst_dst_da_reg_nr(devinfo, src));
+   brw_compact_inst_set_src0_reg_nr(devinfo, &temp,
+                                    brw_inst_src0_da_reg_nr(devinfo, src));
+
+   if (!is_immediate) {
+      brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
+                                       brw_inst_src1_da_reg_nr(devinfo, src));
+   } else {
+      /* The src1 register number field carries the low 8 immediate bits. */
+      brw_compact_inst_set_src1_reg_nr(devinfo, &temp,
+                                       brw_inst_imm_ud(devinfo, src) & 0xff);
+   }
+
+#undef copy_field
+
+   *dst = temp;
+
+   return true;
+}
+
+/**
+ * Expands the control index of \p src through control_index_table and writes
+ * the resulting bits into \p dst.  This is the inverse of the packing done
+ * by set_control_index().
+ */
+static void
+set_uncompacted_control(const struct gen_device_info *devinfo, brw_inst *dst,
+                        brw_compact_inst *src)
+{
+   const uint32_t index = brw_compact_inst_control_index(devinfo, src);
+   const uint32_t bits = control_index_table[index];
+
+   if (devinfo->gen < 8) {
+      brw_inst_set_bits(dst, 31, 31, (bits >> 16) & 0x1);
+      brw_inst_set_bits(dst, 23,  8, (bits & 0xffff));
+
+      /* Gen7 keeps the flag register/subregister numbers in the top bits. */
+      if (devinfo->gen == 7)
+         brw_inst_set_bits(dst, 90, 89, bits >> 17);
+
+      return;
+   }
+
+   brw_inst_set_bits(dst, 33, 31, (bits >> 16));
+   brw_inst_set_bits(dst, 23, 12, (bits >>  4) & 0xfff);
+   brw_inst_set_bits(dst, 10,  9, (bits >>  2) & 0x3);
+   brw_inst_set_bits(dst, 34, 34, (bits >>  1) & 0x1);
+   brw_inst_set_bits(dst,  8,  8, (bits >>  0) & 0x1);
+}
+
+/**
+ * Expands the datatype index of \p src through datatype_table and writes the
+ * resulting bits into \p dst.  This is the inverse of the packing done by
+ * set_datatype_index().
+ */
+static void
+set_uncompacted_datatype(const struct gen_device_info *devinfo, brw_inst *dst,
+                         brw_compact_inst *src)
+{
+   const uint32_t index = brw_compact_inst_datatype_index(devinfo, src);
+   const uint32_t bits = datatype_table[index];
+
+   if (devinfo->gen < 8) {
+      brw_inst_set_bits(dst, 63, 61, (bits >> 15));
+      brw_inst_set_bits(dst, 46, 32, (bits & 0x7fff));
+      return;
+   }
+
+   brw_inst_set_bits(dst, 63, 61, (bits >> 18));
+   brw_inst_set_bits(dst, 94, 89, (bits >> 12) & 0x3f);
+   brw_inst_set_bits(dst, 46, 35, (bits >>  0) & 0xfff);
+}
+
+/**
+ * Expands the subreg index of \p src through subreg_table and writes the
+ * three 5-bit groups into bits 100:96, 68:64 and 52:48 of \p dst.
+ */
+static void
+set_uncompacted_subreg(const struct gen_device_info *devinfo, brw_inst *dst,
+                       brw_compact_inst *src)
+{
+   const uint16_t bits =
+      subreg_table[brw_compact_inst_subreg_index(devinfo, src)];
+
+   brw_inst_set_bits(dst, 100, 96, (bits >> 10));
+   brw_inst_set_bits(dst,  68, 64, (bits >>  5) & 0x1f);
+   brw_inst_set_bits(dst,  52, 48, (bits >>  0) & 0x1f);
+}
+
+/**
+ * Expands the src0 index of \p src through src_index_table and writes the
+ * entry into bits 88:77 of \p dst.
+ */
+static void
+set_uncompacted_src0(const struct gen_device_info *devinfo, brw_inst *dst,
+                     brw_compact_inst *src)
+{
+   const uint32_t index = brw_compact_inst_src0_index(devinfo, src);
+   const uint16_t bits = src_index_table[index];
+
+   brw_inst_set_bits(dst, 88, 77, bits);
+}
+
+/**
+ * Expands the src1 index of \p src into \p dst.
+ *
+ * When \p is_immediate is true, the 5-bit src1 index holds bits 12:8 of the
+ * immediate.  Bit 12 is replicated through bits 31:13 and the result is
+ * written with brw_inst_set_imm_ud(); bits 7:0 are written as zero here.
+ * Otherwise the index is expanded through src_index_table into bits 120:109.
+ *
+ * The sign replication is done with unsigned arithmetic: left-shifting a
+ * signed value into the sign bit is undefined behaviour in C, and right-
+ * shifting a negative value is implementation-defined.
+ */
+static void
+set_uncompacted_src1(const struct gen_device_info *devinfo, brw_inst *dst,
+                     brw_compact_inst *src, bool is_immediate)
+{
+   if (is_immediate) {
+      const uint32_t high5 = brw_compact_inst_src1_index(devinfo, src) & 0x1f;
+      uint32_t imm = high5 << 8;
+
+      /* Replicate top bit of src1_index into high 20 bits of the immediate. */
+      if (high5 & 0x10)
+         imm |= 0xffffe000u;
+
+      brw_inst_set_imm_ud(devinfo, dst, imm);
+   } else {
+      uint16_t uncompacted =
+         src_index_table[brw_compact_inst_src1_index(devinfo, src)];
+
+      brw_inst_set_bits(dst, 120, 109, uncompacted);
+   }
+}
+
+static void
+set_uncompacted_3src_control_index(const struct gen_device_info *devinfo,
+                                   brw_inst *dst, brw_compact_inst *src)
+{ /* Expand src's 3-src control index via gen8_3src_control_index_table into dst. */
+   assert(devinfo->gen >= 8);
+
+   uint32_t compacted = brw_compact_inst_3src_control_index(devinfo, src);
+   uint32_t uncompacted = gen8_3src_control_index_table[compacted];
+
+   brw_inst_set_bits(dst, 34, 32, (uncompacted >> 21) & 0x7); /* entry bits 23:21 */
+   brw_inst_set_bits(dst, 28,  8, (uncompacted >>  0) & 0x1fffff); /* entry bits 20:0 */
+
+   if (devinfo->gen >= 9 || devinfo->is_cherryview) /* two more entry bits (25:24) */
+      brw_inst_set_bits(dst, 36, 35, (uncompacted >> 24) & 0x3);
+}
+
+static void
+set_uncompacted_3src_source_index(const struct gen_device_info *devinfo,
+                                  brw_inst *dst, brw_compact_inst *src)
+{ /* Expand src's 3-src source index via gen8_3src_source_index_table into dst. */
+   assert(devinfo->gen >= 8);
+
+   uint32_t compacted = brw_compact_inst_3src_source_index(devinfo, src);
+   uint64_t uncompacted = gen8_3src_source_index_table[compacted]; /* 64-bit entry */
+
+   brw_inst_set_bits(dst,  83,  83, (uncompacted >> 43) & 0x1);
+   brw_inst_set_bits(dst, 114, 107, (uncompacted >> 35) & 0xff);
+   brw_inst_set_bits(dst,  93,  86, (uncompacted >> 27) & 0xff);
+   brw_inst_set_bits(dst,  72,  65, (uncompacted >> 19) & 0xff);
+   brw_inst_set_bits(dst,  55,  37, (uncompacted >>  0) & 0x7ffff);
+
+   if (devinfo->gen >= 9 || devinfo->is_cherryview) { /* entry bits 48:44 are used */
+      brw_inst_set_bits(dst, 126, 125, (uncompacted >> 47) & 0x3);
+      brw_inst_set_bits(dst, 105, 104, (uncompacted >> 45) & 0x3);
+      brw_inst_set_bits(dst,  84,  84, (uncompacted >> 44) & 0x1);
+   } else { /* otherwise only entry bits 45:44 are used, with a different layout */
+      brw_inst_set_bits(dst, 125, 125, (uncompacted >> 45) & 0x1);
+      brw_inst_set_bits(dst, 104, 104, (uncompacted >> 44) & 0x1);
+   }
+}
+
+static void
+brw_uncompact_3src_instruction(const struct gen_device_info *devinfo,
+                               brw_inst *dst, brw_compact_inst *src)
+{ /* Rebuild a full three-source instruction in dst from its compact form src. */
+   assert(devinfo->gen >= 8);
+
+#define uncompact(field) \
+   brw_inst_set_3src_##field(devinfo, dst, brw_compact_inst_3src_##field(devinfo, src))
+
+   uncompact(opcode); /* copies the same-named 3src field from src into dst */
+
+   set_uncompacted_3src_control_index(devinfo, dst, src); /* table-driven fields */
+   set_uncompacted_3src_source_index(devinfo, dst, src);
+
+   uncompact(dst_reg_nr);
+   uncompact(src0_rep_ctrl);
+   brw_inst_set_3src_cmpt_control(devinfo, dst, false); /* dst is the full form */
+   uncompact(debug_control);
+   uncompact(saturate);
+   uncompact(src1_rep_ctrl);
+   uncompact(src2_rep_ctrl);
+   uncompact(src0_reg_nr);
+   uncompact(src1_reg_nr);
+   uncompact(src2_reg_nr);
+   uncompact(src0_subreg_nr);
+   uncompact(src1_subreg_nr);
+   uncompact(src2_subreg_nr);
+
+#undef uncompact
+}
+
+void
+brw_uncompact_instruction(const struct gen_device_info *devinfo, brw_inst *dst,
+                          brw_compact_inst *src)
+{ /* Expand the compact instruction src into the full-size instruction dst. */
+   memset(dst, 0, sizeof(*dst)); /* start from all-zero; fields are set below */
+
+   if (devinfo->gen >= 8 && /* three-source opcodes take a separate path on Gen8+ */
+       is_3src(devinfo, brw_compact_inst_3src_opcode(devinfo, src))) {
+      brw_uncompact_3src_instruction(devinfo, dst, src);
+      return;
+   }
+
+#define uncompact(field) \
+   brw_inst_set_##field(devinfo, dst, brw_compact_inst_##field(devinfo, src))
+
+   uncompact(opcode); /* copies the same-named field from src into dst */
+   uncompact(debug_control);
+
+   set_uncompacted_control(devinfo, dst, src);
+   set_uncompacted_datatype(devinfo, dst, src);
+
+   /* src0/1 register file fields are in the datatype table. */
+   bool is_immediate = brw_inst_src0_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE ||
+                       brw_inst_src1_reg_file(devinfo, dst) == BRW_IMMEDIATE_VALUE;
+
+   set_uncompacted_subreg(devinfo, dst, src);
+
+   if (devinfo->gen >= 6) { /* which of the two fields is copied depends on gen */
+      uncompact(acc_wr_control);
+   } else {
+      uncompact(mask_control_ex);
+   }
+
+   uncompact(cond_modifier);
+
+   if (devinfo->gen <= 6) /* copied only on Gen6 and earlier */
+      uncompact(flag_subreg_nr);
+
+   set_uncompacted_src0(devinfo, dst, src);
+   set_uncompacted_src1(devinfo, dst, src, is_immediate);
+
+   brw_inst_set_dst_da_reg_nr(devinfo, dst,
+                              brw_compact_inst_dst_reg_nr(devinfo, src));
+   brw_inst_set_src0_da_reg_nr(devinfo, dst,
+                               brw_compact_inst_src0_reg_nr(devinfo, src));
+
+   if (is_immediate) { /* src1_reg_nr is OR'd into the immediate set by set_uncompacted_src1 */
+      brw_inst_set_imm_ud(devinfo, dst,
+                          brw_inst_imm_ud(devinfo, dst) |
+                          brw_compact_inst_src1_reg_nr(devinfo, src));
+   } else {
+      brw_inst_set_src1_da_reg_nr(devinfo, dst,
+                                  brw_compact_inst_src1_reg_nr(devinfo, src));
+   }
+
+#undef uncompact
+}
+
+void brw_debug_compact_uncompact(const struct gen_device_info *devinfo,
+                                 brw_inst *orig,
+                                 brw_inst *uncompacted)
+{ /* Print to stderr how orig differs from uncompacted, bit by bit. */
+   fprintf(stderr, "Instruction compact/uncompact changed (gen%d):\n",
+           devinfo->gen);
+
+   fprintf(stderr, "  before: ");
+   brw_disassemble_inst(stderr, devinfo, orig, true);
+
+   fprintf(stderr, "  after:  ");
+   brw_disassemble_inst(stderr, devinfo, uncompacted, false);
+
+   uint32_t *before_bits = (uint32_t *)orig; /* both viewed as four 32-bit words */
+   uint32_t *after_bits = (uint32_t *)uncompacted;
+   fprintf(stderr, "  changed bits:\n");
+   for (int i = 0; i < 128; i++) {
+      uint32_t before = before_bits[i / 32] & (1u << (i & 31)); /* 1u: bit 31 must not overflow a signed int */
+      uint32_t after = after_bits[i / 32] & (1u << (i & 31));
+
+      if (before != after) {
+         fprintf(stderr, "  bit %d, %s to %s\n", i,
+                 before ? "set" : "unset",
+                 after ? "set" : "unset");
+      }
+   }
+}
+
+static int
+compacted_between(int old_ip, int old_target_ip, int *compacted_counts)
+{ /* Returns compacted_counts[old_target_ip] - compacted_counts[old_ip]. */
+   int this_compacted_count = compacted_counts[old_ip];
+   int target_compacted_count = compacted_counts[old_target_ip];
+   return target_compacted_count - this_compacted_count; /* negative for backward targets with a smaller count */
+}
+
+static void
+update_uip_jip(const struct gen_device_info *devinfo, brw_inst *insn,
+               int this_old_ip, int *compacted_counts)
+{ /* Shrink insn's JIP (and usually UIP) by the compactions between it and its target. */
+   /* JIP and UIP are in units of:
+    *    - bytes on Gen8+; and
+    *    - compacted instructions on Gen6+.
+    */
+   int shift = devinfo->gen >= 8 ? 3 : 0; /* Gen8+: divide bytes by 8 */
+
+   int32_t jip_compacted = brw_inst_jip(devinfo, insn) >> shift;
+   jip_compacted -= compacted_between(this_old_ip,
+                                      this_old_ip + (jip_compacted / 2), /* / 2: index in full-size instructions */
+                                      compacted_counts);
+   brw_inst_set_jip(devinfo, insn, jip_compacted << shift);
+
+   if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_ENDIF || /* UIP is left untouched for these */
+       brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE ||
+       (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_ELSE && devinfo->gen <= 7))
+      return;
+
+   int32_t uip_compacted = brw_inst_uip(devinfo, insn) >> shift;
+   uip_compacted -= compacted_between(this_old_ip,
+                                      this_old_ip + (uip_compacted / 2),
+                                      compacted_counts);
+   brw_inst_set_uip(devinfo, insn, uip_compacted << shift);
+}
+
+static void
+update_gen4_jump_count(const struct gen_device_info *devinfo, brw_inst *insn,
+                       int this_old_ip, int *compacted_counts)
+{ /* Shrink insn's gen4 jump count by the compactions between it and its target. */
+   assert(devinfo->gen == 5 || devinfo->is_g4x);
+
+   /* Jump Count is in units of:
+    *    - uncompacted instructions on G45; and
+    *    - compacted instructions on Gen5.
+    */
+   int shift = devinfo->is_g4x ? 1 : 0; /* G45: double to get compacted units */
+
+   int jump_count_compacted = brw_inst_gen4_jump_count(devinfo, insn) << shift;
+
+   int target_old_ip = this_old_ip + (jump_count_compacted / 2); /* index in full-size instructions */
+
+   int this_compacted_count = compacted_counts[this_old_ip];
+   int target_compacted_count = compacted_counts[target_old_ip];
+
+   jump_count_compacted -= (target_compacted_count - this_compacted_count);
+   brw_inst_set_gen4_jump_count(devinfo, insn, jump_count_compacted >> shift); /* back to the hardware's units */
+}
+
+void
+brw_init_compaction_tables(const struct gen_device_info *devinfo)
+{ /* Point the four file-level table pointers at the tables for devinfo->gen. */
+   assert(g45_control_index_table[ARRAY_SIZE(g45_control_index_table) - 1] != 0); /* last entry of every table must be non-zero */
+   assert(g45_datatype_table[ARRAY_SIZE(g45_datatype_table) - 1] != 0);
+   assert(g45_subreg_table[ARRAY_SIZE(g45_subreg_table) - 1] != 0);
+   assert(g45_src_index_table[ARRAY_SIZE(g45_src_index_table) - 1] != 0);
+   assert(gen6_control_index_table[ARRAY_SIZE(gen6_control_index_table) - 1] != 0);
+   assert(gen6_datatype_table[ARRAY_SIZE(gen6_datatype_table) - 1] != 0);
+   assert(gen6_subreg_table[ARRAY_SIZE(gen6_subreg_table) - 1] != 0);
+   assert(gen6_src_index_table[ARRAY_SIZE(gen6_src_index_table) - 1] != 0);
+   assert(gen7_control_index_table[ARRAY_SIZE(gen7_control_index_table) - 1] != 0);
+   assert(gen7_datatype_table[ARRAY_SIZE(gen7_datatype_table) - 1] != 0);
+   assert(gen7_subreg_table[ARRAY_SIZE(gen7_subreg_table) - 1] != 0);
+   assert(gen7_src_index_table[ARRAY_SIZE(gen7_src_index_table) - 1] != 0);
+   assert(gen8_control_index_table[ARRAY_SIZE(gen8_control_index_table) - 1] != 0);
+   assert(gen8_datatype_table[ARRAY_SIZE(gen8_datatype_table) - 1] != 0);
+   assert(gen8_subreg_table[ARRAY_SIZE(gen8_subreg_table) - 1] != 0);
+   assert(gen8_src_index_table[ARRAY_SIZE(gen8_src_index_table) - 1] != 0);
+
+   switch (devinfo->gen) {
+   case 9: /* Gen9 shares the Gen8 tables */
+   case 8:
+      control_index_table = gen8_control_index_table;
+      datatype_table = gen8_datatype_table;
+      subreg_table = gen8_subreg_table;
+      src_index_table = gen8_src_index_table;
+      break;
+   case 7:
+      control_index_table = gen7_control_index_table;
+      datatype_table = gen7_datatype_table;
+      subreg_table = gen7_subreg_table;
+      src_index_table = gen7_src_index_table;
+      break;
+   case 6:
+      control_index_table = gen6_control_index_table;
+      datatype_table = gen6_datatype_table;
+      subreg_table = gen6_subreg_table;
+      src_index_table = gen6_src_index_table;
+      break;
+   case 5: /* Gen5 and Gen4 both use the g45 tables */
+   case 4:
+      control_index_table = g45_control_index_table;
+      datatype_table = g45_datatype_table;
+      subreg_table = g45_subreg_table;
+      src_index_table = g45_src_index_table;
+      break;
+   default:
+      unreachable("unknown generation"); /* any other gen value is not handled */
+   }
+}
+
+void
+brw_compact_instructions(struct brw_codegen *p, int start_offset,
+                         int num_annotations, struct annotation *annotation)
+{ /* Compact p's instructions from start_offset on, in place; then fix jumps and annotations. */
+   if (unlikely(INTEL_DEBUG & DEBUG_NO_COMPACTION)) /* compaction disabled by debug flag */
+      return;
+
+   const struct gen_device_info *devinfo = p->devinfo;
+   void *store = p->store + start_offset / 16;
+   /* For an instruction at byte offset 16*i before compaction, this is the
+    * number of compacted instructions minus the number of padding NOP/NENOPs
+    * that preceded it.
+    */
+   int compacted_counts[(p->next_insn_offset - start_offset) / sizeof(brw_inst)];
+   /* For an instruction at byte offset 8*i after compaction, this was its IP
+    * (in 16-byte units) before compaction.
+    */
+   int old_ip[(p->next_insn_offset - start_offset) / sizeof(brw_compact_inst)];
+
+   if (devinfo->gen == 4 && !devinfo->is_g4x) /* Gen4 other than G4x: nothing is compacted */
+      return;
+
+   int offset = 0; /* write position in bytes, relative to store */
+   int compacted_count = 0;
+   for (int src_offset = 0; src_offset < p->next_insn_offset - start_offset;
+        src_offset += sizeof(brw_inst)) {
+      brw_inst *src = store + src_offset;
+      void *dst = store + offset;
+
+      old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst);
+      compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count;
+
+      brw_inst saved = *src; /* copy kept for the round-trip comparison below */
+
+      if (brw_try_compact_instruction(devinfo, dst, src)) {
+         compacted_count++;
+
+         if (INTEL_DEBUG) { /* check that uncompacting reproduces the original */
+            brw_inst uncompacted;
+            brw_uncompact_instruction(devinfo, &uncompacted, dst);
+            if (memcmp(&saved, &uncompacted, sizeof(uncompacted))) {
+               brw_debug_compact_uncompact(devinfo, &saved, &uncompacted);
+            }
+         }
+
+         offset += sizeof(brw_compact_inst);
+      } else {
+         /* All uncompacted instructions need to be aligned on G45. */
+         if ((offset & sizeof(brw_compact_inst)) != 0 && devinfo->is_g4x){
+            brw_compact_inst *align = store + offset; /* emit a compact NENOP as padding */
+            memset(align, 0, sizeof(*align));
+            brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NENOP);
+            brw_compact_inst_set_cmpt_control(devinfo, align, true);
+            offset += sizeof(brw_compact_inst);
+            compacted_count--; /* the padding cancels one compaction in the count */
+            compacted_counts[src_offset / sizeof(brw_inst)] = compacted_count;
+            old_ip[offset / sizeof(brw_compact_inst)] = src_offset / sizeof(brw_inst);
+
+            dst = store + offset;
+         }
+
+         /* If we didn't compact this instruction, we need to move it down into
+          * place.
+          */
+         if (offset != src_offset) {
+            memmove(dst, src, sizeof(brw_inst));
+         }
+         offset += sizeof(brw_inst);
+      }
+   }
+
+   /* Fix up control flow offsets. */
+   p->next_insn_offset = start_offset + offset;
+   for (offset = 0; offset < p->next_insn_offset - start_offset;
+        offset = next_offset(devinfo, store, offset)) {
+      brw_inst *insn = store + offset;
+      int this_old_ip = old_ip[offset / sizeof(brw_compact_inst)];
+      int this_compacted_count = compacted_counts[this_old_ip];
+
+      switch (brw_inst_opcode(devinfo, insn)) {
+      case BRW_OPCODE_BREAK:
+      case BRW_OPCODE_CONTINUE:
+      case BRW_OPCODE_HALT:
+         if (devinfo->gen >= 6) {
+            update_uip_jip(devinfo, insn, this_old_ip, compacted_counts);
+         } else {
+            update_gen4_jump_count(devinfo, insn, this_old_ip,
+                                   compacted_counts);
+         }
+         break;
+
+      case BRW_OPCODE_IF:
+      case BRW_OPCODE_IFF:
+      case BRW_OPCODE_ELSE:
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_WHILE:
+         if (devinfo->gen >= 7) {
+            if (brw_inst_cmpt_control(devinfo, insn)) { /* compacted: expand, patch, re-compact */
+               brw_inst uncompacted;
+               brw_uncompact_instruction(devinfo, &uncompacted,
+                                         (brw_compact_inst *)insn);
+
+               update_uip_jip(devinfo, &uncompacted, this_old_ip,
+                              compacted_counts);
+
+               bool ret = brw_try_compact_instruction(devinfo,
+                                                      (brw_compact_inst *)insn,
+                                                      &uncompacted);
+               assert(ret); (void)ret; /* (void) silences unused-variable when asserts are compiled out */
+            } else {
+               update_uip_jip(devinfo, insn, this_old_ip, compacted_counts);
+            }
+         } else if (devinfo->gen == 6) {
+            assert(!brw_inst_cmpt_control(devinfo, insn));
+
+            /* Jump Count is in units of compacted instructions on Gen6. */
+            int jump_count_compacted = brw_inst_gen6_jump_count(devinfo, insn);
+
+            int target_old_ip = this_old_ip + (jump_count_compacted / 2);
+            int target_compacted_count = compacted_counts[target_old_ip];
+            jump_count_compacted -= (target_compacted_count - this_compacted_count);
+            brw_inst_set_gen6_jump_count(devinfo, insn, jump_count_compacted);
+         } else {
+            update_gen4_jump_count(devinfo, insn, this_old_ip,
+                                   compacted_counts);
+         }
+         break;
+
+      case BRW_OPCODE_ADD:
+         /* Add instructions modifying the IP register use an immediate src1,
+          * and Gens that use this cannot compact instructions with immediate
+          * operands.
+          */
+         if (brw_inst_cmpt_control(devinfo, insn))
+            break;
+
+         if (brw_inst_dst_reg_file(devinfo, insn) == BRW_ARCHITECTURE_REGISTER_FILE &&
+             brw_inst_dst_da_reg_nr(devinfo, insn) == BRW_ARF_IP) {
+            assert(brw_inst_src1_reg_file(devinfo, insn) == BRW_IMMEDIATE_VALUE);
+
+            int shift = 3; /* presumably the immediate is in bytes; >> 3 gives 8-byte units -- confirm */
+            int jump_compacted = brw_inst_imm_d(devinfo, insn) >> shift;
+
+            int target_old_ip = this_old_ip + (jump_compacted / 2);
+            int target_compacted_count = compacted_counts[target_old_ip];
+            jump_compacted -= (target_compacted_count - this_compacted_count);
+            brw_inst_set_imm_ud(devinfo, insn, jump_compacted << shift);
+         }
+         break;
+      }
+   }
+
+   /* p->nr_insn is counting the number of uncompacted instructions still, so
+    * divide.  We do want to be sure there's a valid instruction in any
+    * alignment padding, so that the next compression pass (for the FS 8/16
+    * compile passes) parses correctly.
+    */
+   if (p->next_insn_offset & sizeof(brw_compact_inst)) {
+      brw_compact_inst *align = store + offset; /* offset is at the end of the code after the loop above */
+      memset(align, 0, sizeof(*align));
+      brw_compact_inst_set_opcode(devinfo, align, BRW_OPCODE_NOP);
+      brw_compact_inst_set_cmpt_control(devinfo, align, true);
+      p->next_insn_offset += sizeof(brw_compact_inst);
+   }
+   p->nr_insn = p->next_insn_offset / sizeof(brw_inst);
+
+   /* Update the instruction offsets for each annotation. */
+   if (annotation) {
+      for (int offset = 0, i = 0; i < num_annotations; i++) { /* this offset shadows the outer one */
+         while (start_offset + old_ip[offset / sizeof(brw_compact_inst)] *
+                sizeof(brw_inst) != annotation[i].offset) {
+            assert(start_offset + old_ip[offset / sizeof(brw_compact_inst)] *
+                   sizeof(brw_inst) < annotation[i].offset);
+            offset = next_offset(devinfo, store, offset);
+         }
+
+         annotation[i].offset = start_offset + offset;
+
+         offset = next_offset(devinfo, store, offset);
+      }
+
+      annotation[num_annotations].offset = p->next_insn_offset; /* NOTE(review): writes index num_annotations; presumably a trailing entry exists -- confirm */
+   }
+}
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
new file mode 100644
index 00000000000..5848f920448
--- /dev/null
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -0,0 +1,1246 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+#ifndef BRW_EU_DEFINES_H
+#define BRW_EU_DEFINES_H
+
+#include "util/macros.h"
+
+/* The following hunk, up to "Execution Unit", is used by both the
+ * intel/compiler and i965 codebases. */
+
+#define INTEL_MASK(high, low) (((1u<<((high)-(low)+1))-1)<<(low)) /* bits high..low set; field must be narrower than 32 bits */
+/* Shift value into field and assert it fits; uses the GNU statement expression extension */
+#define SET_FIELD(value, field)                                         \
+   ({                                                                   \
+      uint32_t fieldval = (value) << field ## _SHIFT;                   \
+      assert((fieldval & ~ field ## _MASK) == 0);                       \
+      fieldval & field ## _MASK;                                        \
+   })
+
+#define GET_BITS(data, high, low) (((data) & INTEL_MASK((high), (low))) >> (low)) /* data is parenthesized so expression arguments bind correctly */
+#define GET_FIELD(word, field) (((word)  & field ## _MASK) >> field ## _SHIFT)
+
+#define _3DPRIM_POINTLIST         0x01 /* _3DPRIM_*: primitive type codes */
+#define _3DPRIM_LINELIST          0x02
+#define _3DPRIM_LINESTRIP         0x03
+#define _3DPRIM_TRILIST           0x04
+#define _3DPRIM_TRISTRIP          0x05
+#define _3DPRIM_TRIFAN            0x06
+#define _3DPRIM_QUADLIST          0x07
+#define _3DPRIM_QUADSTRIP         0x08
+#define _3DPRIM_LINELIST_ADJ      0x09 /* G45+ */
+#define _3DPRIM_LINESTRIP_ADJ     0x0A /* G45+ */
+#define _3DPRIM_TRILIST_ADJ       0x0B /* G45+ */
+#define _3DPRIM_TRISTRIP_ADJ      0x0C /* G45+ */
+#define _3DPRIM_TRISTRIP_REVERSE  0x0D
+#define _3DPRIM_POLYGON           0x0E
+#define _3DPRIM_RECTLIST          0x0F
+#define _3DPRIM_LINELOOP          0x10
+#define _3DPRIM_POINTLIST_BF      0x11
+#define _3DPRIM_LINESTRIP_CONT    0x12
+#define _3DPRIM_LINESTRIP_BF      0x13
+#define _3DPRIM_LINESTRIP_CONT_BF 0x14
+#define _3DPRIM_TRIFAN_NOSTIPPLE  0x16 /* 0x15 is not defined here */
+#define _3DPRIM_PATCHLIST(n) ({ assert((n) > 0 && (n) <= 32); 0x20 + ((n) - 1); }) /* n in 1..32 -> 0x20..0x3F; n is evaluated more than once */
+
+enum brw_barycentric_mode { /* values double as bit positions in the mask defined below */
+   BRW_BARYCENTRIC_PERSPECTIVE_PIXEL       = 0,
+   BRW_BARYCENTRIC_PERSPECTIVE_CENTROID    = 1,
+   BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE      = 2,
+   BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL    = 3,
+   BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID = 4,
+   BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE   = 5,
+   BRW_BARYCENTRIC_MODE_COUNT              = 6 /* number of modes listed above */
+};
+#define BRW_BARYCENTRIC_NONPERSPECTIVE_BITS \
+   ((1 << BRW_BARYCENTRIC_NONPERSPECTIVE_PIXEL) | \
+    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID) | \
+    (1 << BRW_BARYCENTRIC_NONPERSPECTIVE_SAMPLE)) /* bitmask with one bit per NONPERSPECTIVE mode */
+
+enum brw_pixel_shader_computed_depth_mode { /* whether and how the pixel shader (PS) computes depth */
+   BRW_PSCDEPTH_OFF   = 0, /* PS does not compute depth */
+   BRW_PSCDEPTH_ON    = 1, /* PS computes depth; no guarantee about value */
+   BRW_PSCDEPTH_ON_GE = 2, /* PS guarantees output depth >= source depth */
+   BRW_PSCDEPTH_ON_LE = 3, /* PS guarantees output depth <= source depth */
+};
+
+/* Bitfields for the URB_WRITE message, DW2 of message header: */
+#define URB_WRITE_PRIM_END		0x1 /* bit 0 */
+#define URB_WRITE_PRIM_START		0x2 /* bit 1 */
+#define URB_WRITE_PRIM_TYPE_SHIFT	2 /* shift amount: the field starts at bit 2 */
+
+# define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT		0 /* the two GS control data format values */
+# define GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID		1
+
+/* Execution Unit (EU) defines: numeric encodings used when building EU instructions.
+ */
+
+#define BRW_ALIGN_1   0
+#define BRW_ALIGN_16  1
+
+#define BRW_ADDRESS_DIRECT                        0
+#define BRW_ADDRESS_REGISTER_INDIRECT_REGISTER    1
+
+#define BRW_CHANNEL_X     0 /* channels X, Y, Z, W are numbered 0..3 */
+#define BRW_CHANNEL_Y     1
+#define BRW_CHANNEL_Z     2
+#define BRW_CHANNEL_W     3
+
+enum brw_compression { /* three-valued compression setting */
+   BRW_COMPRESSION_NONE       = 0,
+   BRW_COMPRESSION_2NDHALF    = 1,
+   BRW_COMPRESSION_COMPRESSED = 2,
+};
+
+#define GEN6_COMPRESSION_1Q		0 /* 1Q..4Q take values 0..3 */
+#define GEN6_COMPRESSION_2Q		1
+#define GEN6_COMPRESSION_3Q		2
+#define GEN6_COMPRESSION_4Q		3
+#define GEN6_COMPRESSION_1H		0 /* same value as 1Q */
+#define GEN6_COMPRESSION_2H		2 /* same value as 3Q */
+
+enum PACKED brw_conditional_mod { /* EQ and NEQ are aliases for Z and NZ */
+   BRW_CONDITIONAL_NONE = 0,
+   BRW_CONDITIONAL_Z    = 1,
+   BRW_CONDITIONAL_NZ   = 2,
+   BRW_CONDITIONAL_EQ   = 1,	/* Z */
+   BRW_CONDITIONAL_NEQ  = 2,	/* NZ */
+   BRW_CONDITIONAL_G    = 3,
+   BRW_CONDITIONAL_GE   = 4,
+   BRW_CONDITIONAL_L    = 5,
+   BRW_CONDITIONAL_LE   = 6,
+   BRW_CONDITIONAL_R    = 7,    /* Gen <= 5 */
+   BRW_CONDITIONAL_O    = 8,
+   BRW_CONDITIONAL_U    = 9, /* highest value defined here */
+};
+
+#define BRW_DEBUG_NONE        0
+#define BRW_DEBUG_BREAKPOINT  1
+
+#define BRW_DEPENDENCY_NORMAL         0
+#define BRW_DEPENDENCY_NOTCLEARED     1
+#define BRW_DEPENDENCY_NOTCHECKED     2
+#define BRW_DEPENDENCY_DISABLE        3 /* numerically NOTCLEARED | NOTCHECKED */
+
+enum PACKED brw_execution_size { /* each value is log2 of the width in the name */
+   BRW_EXECUTE_1  = 0,
+   BRW_EXECUTE_2  = 1,
+   BRW_EXECUTE_4  = 2,
+   BRW_EXECUTE_8  = 3,
+   BRW_EXECUTE_16 = 4,
+   BRW_EXECUTE_32 = 5,
+};
+
+enum PACKED brw_horizontal_stride { /* encoded values, not the stride itself */
+   BRW_HORIZONTAL_STRIDE_0 = 0,
+   BRW_HORIZONTAL_STRIDE_1 = 1,
+   BRW_HORIZONTAL_STRIDE_2 = 2,
+   BRW_HORIZONTAL_STRIDE_4 = 3, /* stride 4 is encoded as 3 */
+};
+
+#define BRW_INSTRUCTION_NORMAL    0
+#define BRW_INSTRUCTION_SATURATE  1
+
+#define BRW_MASK_ENABLE   0 /* note: ENABLE is 0 and DISABLE is 1 */
+#define BRW_MASK_DISABLE  1
+
+/** @{
+ *
+ * Gen6 has replaced "mask enable/disable" with WECtrl, which is
+ * effectively the same but much simpler to think about.  Now, there
+ * are two contributors ANDed together to whether channels are
+ * executed: The predication on the instruction, and the channel write
+ * enable.
+ */
+/**
+ * This is the default value.  It means that a channel's write enable is set
+ * if the per-channel IP is pointing at this instruction.
+ */
+#define BRW_WE_NORMAL		0 /* same value as BRW_MASK_ENABLE */
+/**
+ * This is used like BRW_MASK_DISABLE, and causes all channels to have
+ * their write enable set.  Note that predication still contributes to
+ * whether the channel actually gets written.
+ */
+#define BRW_WE_ALL		1 /* same value as BRW_MASK_DISABLE */
+/** @} */
+
+enum opcode {
+   /* These are the actual hardware opcodes. */
+   BRW_OPCODE_ILLEGAL = 0,
+   BRW_OPCODE_MOV =	1,
+   BRW_OPCODE_SEL =	2,
+   BRW_OPCODE_MOVI =	3,   /**< G45+ */
+   BRW_OPCODE_NOT =	4,
+   BRW_OPCODE_AND =	5,
+   BRW_OPCODE_OR =	6,
+   BRW_OPCODE_XOR =	7,
+   BRW_OPCODE_SHR =	8,
+   BRW_OPCODE_SHL =	9,
+   BRW_OPCODE_DIM =	10,  /**< Gen7.5 only */ /* Reused */
+   // BRW_OPCODE_SMOV =	10,  /**< Gen8+       */ /* Reused */
+   /* Reserved - 11 */
+   BRW_OPCODE_ASR =	12,
+   /* Reserved - 13-15 */
+   BRW_OPCODE_CMP =	16,
+   BRW_OPCODE_CMPN =	17,
+   BRW_OPCODE_CSEL =	18,  /**< Gen8+ */
+   BRW_OPCODE_F32TO16 = 19,  /**< Gen7 only */
+   BRW_OPCODE_F16TO32 = 20,  /**< Gen7 only */
+   /* Reserved - 21-22 */
+   BRW_OPCODE_BFREV =	23,  /**< Gen7+ */
+   BRW_OPCODE_BFE =	24,  /**< Gen7+ */
+   BRW_OPCODE_BFI1 =	25,  /**< Gen7+ */
+   BRW_OPCODE_BFI2 =	26,  /**< Gen7+ */
+   /* Reserved - 27-31 */
+   BRW_OPCODE_JMPI =	32,
+   // BRW_OPCODE_BRD =	33,  /**< Gen7+ */
+   BRW_OPCODE_IF =	34,
+   BRW_OPCODE_IFF =	35,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_BRC =	35,  /**< Gen7+       */ /* Reused */
+   BRW_OPCODE_ELSE =	36,
+   BRW_OPCODE_ENDIF =	37,
+   BRW_OPCODE_DO =	38,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_CASE =	38,  /**< Gen6 only   */ /* Reused */
+   BRW_OPCODE_WHILE =	39,
+   BRW_OPCODE_BREAK =	40,
+   BRW_OPCODE_CONTINUE = 41,
+   BRW_OPCODE_HALT =	42,
+   // BRW_OPCODE_CALLA =	43,  /**< Gen7.5+     */
+   // BRW_OPCODE_MSAVE =	44,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_CALL =	44,  /**< Gen6+       */ /* Reused */
+   // BRW_OPCODE_MREST =	45,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_RET =	45,  /**< Gen6+       */ /* Reused */
+   // BRW_OPCODE_PUSH =	46,  /**< Pre-Gen6    */ /* Reused */
+   // BRW_OPCODE_FORK =	46,  /**< Gen6 only   */ /* Reused */
+   // BRW_OPCODE_GOTO =	46,  /**< Gen8+       */ /* Reused */
+   // BRW_OPCODE_POP =	47,  /**< Pre-Gen6    */
+   BRW_OPCODE_WAIT =	48,
+   BRW_OPCODE_SEND =	49,
+   BRW_OPCODE_SENDC =	50,
+   BRW_OPCODE_SENDS =	51,  /**< Gen9+ */
+   BRW_OPCODE_SENDSC =	52,  /**< Gen9+ */
+   /* Reserved 53-55 */
+   BRW_OPCODE_MATH =	56,  /**< Gen6+ */
+   /* Reserved 57-63 */
+   BRW_OPCODE_ADD =	64,
+   BRW_OPCODE_MUL =	65,
+   BRW_OPCODE_AVG =	66,
+   BRW_OPCODE_FRC =	67,
+   BRW_OPCODE_RNDU =	68,
+   BRW_OPCODE_RNDD =	69,
+   BRW_OPCODE_RNDE =	70,
+   BRW_OPCODE_RNDZ =	71,
+   BRW_OPCODE_MAC =	72,
+   BRW_OPCODE_MACH =	73,
+   BRW_OPCODE_LZD =	74,
+   BRW_OPCODE_FBH =	75,  /**< Gen7+ */
+   BRW_OPCODE_FBL =	76,  /**< Gen7+ */
+   BRW_OPCODE_CBIT =	77,  /**< Gen7+ */
+   BRW_OPCODE_ADDC =	78,  /**< Gen7+ */
+   BRW_OPCODE_SUBB =	79,  /**< Gen7+ */
+   BRW_OPCODE_SAD2 =	80,
+   BRW_OPCODE_SADA2 =	81,
+   /* Reserved 82-83 */
+   BRW_OPCODE_DP4 =	84,
+   BRW_OPCODE_DPH =	85,
+   BRW_OPCODE_DP3 =	86,
+   BRW_OPCODE_DP2 =	87,
+   /* Reserved 88 */
+   BRW_OPCODE_LINE =	89,
+   BRW_OPCODE_PLN =	90,  /**< G45+ */
+   BRW_OPCODE_MAD =	91,  /**< Gen6+ */
+   BRW_OPCODE_LRP =	92,  /**< Gen6+ */
+   // BRW_OPCODE_MADM =	93,  /**< Gen8+ */
+   /* Reserved 94-124 */
+   BRW_OPCODE_NENOP =	125, /**< G45 only */
+   BRW_OPCODE_NOP =	126,
+   /* Reserved 127 */
+
+   /* These are compiler backend opcodes that get translated into other
+    * instructions.
+    */
+   FS_OPCODE_FB_WRITE = 128,
+
+   /**
+    * Same as FS_OPCODE_FB_WRITE but expects its arguments separately as
+    * individual sources instead of as a single payload blob. The
+    * position/ordering of the arguments are defined by the enum
+    * fb_write_logical_srcs.
+    */
+   FS_OPCODE_FB_WRITE_LOGICAL,
+
+   FS_OPCODE_REP_FB_WRITE,
+
+   FS_OPCODE_FB_READ,
+   FS_OPCODE_FB_READ_LOGICAL,
+
+   SHADER_OPCODE_RCP,
+   SHADER_OPCODE_RSQ,
+   SHADER_OPCODE_SQRT,
+   SHADER_OPCODE_EXP2,
+   SHADER_OPCODE_LOG2,
+   SHADER_OPCODE_POW,
+   SHADER_OPCODE_INT_QUOTIENT,
+   SHADER_OPCODE_INT_REMAINDER,
+   SHADER_OPCODE_SIN,
+   SHADER_OPCODE_COS,
+
+   /**
+    * Texture sampling opcodes.
+    *
+    * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+    * opcode but instead of taking a single payload blob they expect their
+    * arguments separately as individual sources. The position/ordering of the
+    * arguments are defined by the enum tex_logical_srcs.
+    */
+   SHADER_OPCODE_TEX,
+   SHADER_OPCODE_TEX_LOGICAL,
+   SHADER_OPCODE_TXD,
+   SHADER_OPCODE_TXD_LOGICAL,
+   SHADER_OPCODE_TXF,
+   SHADER_OPCODE_TXF_LOGICAL,
+   SHADER_OPCODE_TXF_LZ,
+   SHADER_OPCODE_TXL,
+   SHADER_OPCODE_TXL_LOGICAL,
+   SHADER_OPCODE_TXL_LZ,
+   SHADER_OPCODE_TXS,
+   SHADER_OPCODE_TXS_LOGICAL,
+   FS_OPCODE_TXB,
+   FS_OPCODE_TXB_LOGICAL,
+   SHADER_OPCODE_TXF_CMS,
+   SHADER_OPCODE_TXF_CMS_LOGICAL,
+   SHADER_OPCODE_TXF_CMS_W,
+   SHADER_OPCODE_TXF_CMS_W_LOGICAL,
+   SHADER_OPCODE_TXF_UMS,
+   SHADER_OPCODE_TXF_UMS_LOGICAL,
+   SHADER_OPCODE_TXF_MCS,
+   SHADER_OPCODE_TXF_MCS_LOGICAL,
+   SHADER_OPCODE_LOD,
+   SHADER_OPCODE_LOD_LOGICAL,
+   SHADER_OPCODE_TG4,
+   SHADER_OPCODE_TG4_LOGICAL,
+   SHADER_OPCODE_TG4_OFFSET,
+   SHADER_OPCODE_TG4_OFFSET_LOGICAL,
+   SHADER_OPCODE_SAMPLEINFO,
+   SHADER_OPCODE_SAMPLEINFO_LOGICAL,
+
+   /**
+    * Combines multiple sources of size 1 into a larger virtual GRF.
+    * For example, parameters for a send-from-GRF message.  Or, updating
+    * channels of a size 4 VGRF used to store vec4s such as texturing results.
+    *
+    * This will be lowered into MOVs from each source to consecutive offsets
+    * of the destination VGRF.
+    *
+    * src[0] may be BAD_FILE.  If so, the lowering pass skips emitting the MOV,
+    * but still reserves the first channel of the destination VGRF.  This can be
+    * used to reserve space for, say, a message header set up by the generators.
+    */
+   SHADER_OPCODE_LOAD_PAYLOAD,
+
+   /**
+    * Packs a number of sources into a single value. Unlike LOAD_PAYLOAD, this
+    * acts intra-channel, obtaining the final value for each channel by
+    * combining the sources values for the same channel, the first source
+    * occupying the lowest bits and the last source occupying the highest
+    * bits.
+    */
+   FS_OPCODE_PACK,
+
+   SHADER_OPCODE_SHADER_TIME_ADD,
+
+   /**
+    * Typed and untyped surface access opcodes.
+    *
+    * LOGICAL opcodes are eventually translated to the matching non-LOGICAL
+    * opcode but instead of taking a single payload blob they expect their
+    * arguments separately as individual sources:
+    *
+    * Source 0: [required] Surface coordinates.
+    * Source 1: [optional] Operation source.
+    * Source 2: [required] Surface index.
+    * Source 3: [required] Number of coordinate components (as UD immediate).
+    * Source 4: [required] Opcode-specific control immediate, same as source 2
+    *                      of the matching non-LOGICAL opcode.
+    */
+   SHADER_OPCODE_UNTYPED_ATOMIC,
+   SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
+   SHADER_OPCODE_UNTYPED_SURFACE_READ,
+   SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+   SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+   SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+
+   SHADER_OPCODE_TYPED_ATOMIC,
+   SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
+   SHADER_OPCODE_TYPED_SURFACE_READ,
+   SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
+   SHADER_OPCODE_TYPED_SURFACE_WRITE,
+   SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
+
+   SHADER_OPCODE_MEMORY_FENCE,
+
+   SHADER_OPCODE_GEN4_SCRATCH_READ,
+   SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+   SHADER_OPCODE_GEN7_SCRATCH_READ,
+
+   /**
+    * Gen8+ SIMD8 URB Read messages.
+    */
+   SHADER_OPCODE_URB_READ_SIMD8,
+   SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT,
+
+   SHADER_OPCODE_URB_WRITE_SIMD8,
+   SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT,
+   SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+   SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT,
+
+   /**
+    * Return the index of an arbitrary live channel (i.e. one of the channels
+    * enabled in the current execution mask) and assign it to the first
+    * component of the destination.  Expected to be used as input for the
+    * BROADCAST pseudo-opcode.
+    */
+   SHADER_OPCODE_FIND_LIVE_CHANNEL,
+
+   /**
+    * Pick the channel from its first source register given by the index
+    * specified as second source.  Useful for variable indexing of surfaces.
+    *
+    * Note that because the result of this instruction is by definition
+    * uniform and it can always be splatted to multiple channels using a
+    * scalar regioning mode, only the first channel of the destination region
+    * is guaranteed to be updated, which implies that BROADCAST instructions
+    * should usually be marked force_writemask_all.
+    */
+   SHADER_OPCODE_BROADCAST,
+
+   VEC4_OPCODE_MOV_BYTES,
+   VEC4_OPCODE_PACK_BYTES,
+   VEC4_OPCODE_UNPACK_UNIFORM,
+   VEC4_OPCODE_FROM_DOUBLE,
+   VEC4_OPCODE_TO_DOUBLE,
+   VEC4_OPCODE_PICK_LOW_32BIT,
+   VEC4_OPCODE_PICK_HIGH_32BIT,
+   VEC4_OPCODE_SET_LOW_32BIT,
+   VEC4_OPCODE_SET_HIGH_32BIT,
+
+   FS_OPCODE_DDX_COARSE,
+   FS_OPCODE_DDX_FINE,
+   /**
+    * Compute dFdy(), dFdyCoarse(), or dFdyFine().
+    */
+   FS_OPCODE_DDY_COARSE,
+   FS_OPCODE_DDY_FINE,
+   FS_OPCODE_CINTERP,
+   FS_OPCODE_LINTERP,
+   FS_OPCODE_PIXEL_X,
+   FS_OPCODE_PIXEL_Y,
+   FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+   FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7,
+   FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4,
+   FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
+   FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
+   FS_OPCODE_GET_BUFFER_SIZE,
+   FS_OPCODE_MOV_DISPATCH_TO_FLAGS,
+   FS_OPCODE_DISCARD_JUMP,
+   FS_OPCODE_SET_SAMPLE_ID,
+   FS_OPCODE_PACK_HALF_2x16_SPLIT,
+   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X,
+   FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y,
+   FS_OPCODE_PLACEHOLDER_HALT,
+   FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+   FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+   FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET,
+
+   VS_OPCODE_URB_WRITE,
+   VS_OPCODE_PULL_CONSTANT_LOAD,
+   VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+   VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+
+   VS_OPCODE_GET_BUFFER_SIZE,
+
+   VS_OPCODE_UNPACK_FLAGS_SIMD4X2,
+
+   /**
+    * Write geometry shader output data to the URB.
+    *
+    * Unlike VS_OPCODE_URB_WRITE, this opcode doesn't do an implied move from
+    * R0 to the first MRF.  This allows the geometry shader to override the
+    * "Slot {0,1} Offset" fields in the message header.
+    */
+   GS_OPCODE_URB_WRITE,
+
+   /**
+    * Write geometry shader output data to the URB and request a new URB
+    * handle (gen6).
+    *
+    * This opcode doesn't do an implied move from R0 to the first MRF.
+    */
+   GS_OPCODE_URB_WRITE_ALLOCATE,
+
+   /**
+    * Terminate the geometry shader thread by doing an empty URB write.
+    *
+    * This opcode doesn't do an implied move from R0 to the first MRF.  This
+    * allows the geometry shader to override the "GS Number of Output Vertices
+    * for Slot {0,1}" fields in the message header.
+    */
+   GS_OPCODE_THREAD_END,
+
+   /**
+    * Set the "Slot {0,1} Offset" fields of a URB_WRITE message header.
+    *
+    * - dst is the MRF containing the message header.
+    *
+    * - src0.x indicates which portion of the URB should be written to (e.g. a
+    *   vertex number)
+    *
+    * - src1 is an immediate multiplier which will be applied to src0
+    *   (e.g. the size of a single vertex in the URB).
+    *
+    * Note: the hardware will apply this offset *in addition to* the offset in
+    * vec4_instruction::offset.
+    */
+   GS_OPCODE_SET_WRITE_OFFSET,
+
+   /**
+    * Set the "GS Number of Output Vertices for Slot {0,1}" fields of a
+    * URB_WRITE message header.
+    *
+    * - dst is the MRF containing the message header.
+    *
+    * - src0.x is the vertex count.  The upper 16 bits will be ignored.
+    */
+   GS_OPCODE_SET_VERTEX_COUNT,
+
+   /**
+    * Set DWORD 2 of dst to the value in src.
+    */
+   GS_OPCODE_SET_DWORD_2,
+
+   /**
+    * Prepare the dst register for storage in the "Channel Mask" fields of a
+    * URB_WRITE message header.
+    *
+    * DWORD 4 of dst is shifted left by 4 bits, so that later,
+    * GS_OPCODE_SET_CHANNEL_MASKS can OR DWORDs 0 and 4 together to form the
+    * final channel mask.
+    *
+    * Note: since GS_OPCODE_SET_CHANNEL_MASKS ORs DWORDs 0 and 4 together to
+    * form the final channel mask, DWORDs 0 and 4 of the dst register must not
+    * have any extraneous bits set prior to execution of this opcode (that is,
+    * they should be in the range 0x0 to 0xf).
+    */
+   GS_OPCODE_PREPARE_CHANNEL_MASKS,
+
+   /**
+    * Set the "Channel Mask" fields of a URB_WRITE message header.
+    *
+    * - dst is the MRF containing the message header.
+    *
+    * - src.x is the channel mask, as prepared by
+    *   GS_OPCODE_PREPARE_CHANNEL_MASKS.  DWORDs 0 and 4 are OR'ed together to
+    *   form the final channel mask.
+    */
+   GS_OPCODE_SET_CHANNEL_MASKS,
+
+   /**
+    * Get the "Instance ID" fields from the payload.
+    *
+    * - dst is the GRF for gl_InvocationID.
+    */
+   GS_OPCODE_GET_INSTANCE_ID,
+
+   /**
+    * Send a FF_SYNC message to allocate initial URB handles (gen6).
+    *
+    * - dst will be used as the writeback register for the FF_SYNC operation.
+    *
+    * - src0 is the number of primitives written.
+    *
+    * - src1 is the value to hold in M0.0: number of SO vertices to write
+    *   and number of SO primitives needed. Its value will be overwritten
+    *   with the SVBI values if transform feedback is enabled.
+    *
+    * Note: This opcode uses an implicit MRF register for the ff_sync message
+    * header, so the caller is expected to set inst->base_mrf and initialize
+    * that MRF register to r0. This opcode will also write to this MRF register
+    * to include the allocated URB handle so it can then be reused directly as
+    * the header in the URB write operation we are allocating the handle for.
+    */
+   GS_OPCODE_FF_SYNC,
+
+   /**
+    * Move r0.1 (which holds PrimitiveID information in gen6) to a separate
+    * register.
+    *
+    * - dst is the GRF where PrimitiveID information will be moved.
+    */
+   GS_OPCODE_SET_PRIMITIVE_ID,
+
+   /**
+    * Write transform feedback data to the SVB by sending a SVB WRITE message.
+    * Used in gen6.
+    *
+    * - dst is the MRF register containing the message header.
+    *
+    * - src0 is the register where the vertex data is going to be copied from.
+    *
+    * - src1 is the destination register when write commit occurs.
+    */
+   GS_OPCODE_SVB_WRITE,
+
+   /**
+    * Set destination index in the SVB write message payload (M0.5). Used
+    * in gen6 for transform feedback.
+    *
+    * - dst is the header to save the destination indices for SVB WRITE.
+    * - src is the register that holds the destination indices value.
+    */
+   GS_OPCODE_SVB_SET_DST_INDEX,
+
+   /**
+    * Prepare Mx.0 subregister for being used in the FF_SYNC message header.
+    * Used in gen6 for transform feedback.
+    *
+    * - dst will hold the register with the final Mx.0 value.
+    *
+    * - src0 has the number of vertices emitted in SO (NumSOVertsToWrite)
+    *
+    * - src1 has the number of needed primitives for SO (NumSOPrimsNeeded)
+    *
+    * - src2 is the value to hold in M0: number of SO vertices to write
+    *   and number of SO primitives needed.
+    */
+   GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
+
+   /**
+    * Terminate the compute shader.
+    */
+   CS_OPCODE_CS_TERMINATE,
+
+   /**
+    * GLSL barrier()
+    */
+   SHADER_OPCODE_BARRIER,
+
+   /**
+    * Calculate the high 32-bits of a 32x32 multiply.
+    */
+   SHADER_OPCODE_MULH,
+
+   /**
+    * A MOV that uses VxH indirect addressing.
+    *
+    * Source 0: A register to start from (HW_REG).
+    * Source 1: An indirect offset (in bytes, UD GRF).
+    * Source 2: The length of the region that could be accessed (in bytes,
+    *           UD immediate).
+    */
+   SHADER_OPCODE_MOV_INDIRECT,
+
+   VEC4_OPCODE_URB_READ,
+   TCS_OPCODE_GET_INSTANCE_ID,
+   TCS_OPCODE_URB_WRITE,
+   TCS_OPCODE_SET_INPUT_URB_OFFSETS,
+   TCS_OPCODE_SET_OUTPUT_URB_OFFSETS,
+   TCS_OPCODE_GET_PRIMITIVE_ID,
+   TCS_OPCODE_CREATE_BARRIER_HEADER,
+   TCS_OPCODE_SRC0_010_IS_ZERO,
+   TCS_OPCODE_RELEASE_INPUT,
+   TCS_OPCODE_THREAD_END,
+
+   TES_OPCODE_GET_PRIMITIVE_ID,
+   TES_OPCODE_CREATE_INPUT_READ_HEADER,
+   TES_OPCODE_ADD_INDIRECT_URB_OFFSET,
+};
+
+enum brw_urb_write_flags { /* bit flags, meant to be ORed together */
+   BRW_URB_WRITE_NO_FLAGS = 0,
+
+   /**
+    * Causes a new URB entry to be allocated, and its address stored in the
+    * destination register (gen < 7).
+    */
+   BRW_URB_WRITE_ALLOCATE = 0x1,
+
+   /**
+    * Causes the current URB entry to be deallocated (gen < 7).
+    */
+   BRW_URB_WRITE_UNUSED = 0x2,
+
+   /**
+    * Causes the thread to terminate.
+    */
+   BRW_URB_WRITE_EOT = 0x4,
+
+   /**
+    * Indicates that the given URB entry is complete, and may be sent further
+    * down the 3D pipeline (gen < 7).
+    */
+   BRW_URB_WRITE_COMPLETE = 0x8,
+
+   /**
+    * Indicates that an additional offset (which may be different for the two
+    * vec4 slots) is stored in the message header (gen == 7).
+    */
+   BRW_URB_WRITE_PER_SLOT_OFFSET = 0x10,
+
+   /**
+    * Indicates that the channel masks in the URB_WRITE message header should
+    * not be overridden to 0xff (gen == 7).
+    */
+   BRW_URB_WRITE_USE_CHANNEL_MASKS = 0x20,
+
+   /**
+    * Indicates that the data should be sent to the URB using the
+    * URB_WRITE_OWORD message rather than URB_WRITE_HWORD (gen == 7).  This
+    * causes offsets to be interpreted as multiples of an OWORD instead of an
+    * HWORD, and only allows one OWORD to be written.
+    */
+   BRW_URB_WRITE_OWORD = 0x40,
+
+   /**
+    * Convenient combination of flags: end the thread while simultaneously
+    * marking the given URB entry as complete (0x4 | 0x8 == 0xc).
+    */
+   BRW_URB_WRITE_EOT_COMPLETE = BRW_URB_WRITE_EOT | BRW_URB_WRITE_COMPLETE,
+
+   /**
+    * Convenient combination of flags: mark the given URB entry as complete
+    * and simultaneously allocate a new one (0x1 | 0x8 == 0x9).
+    */
+   BRW_URB_WRITE_ALLOCATE_COMPLETE =
+      BRW_URB_WRITE_ALLOCATE | BRW_URB_WRITE_COMPLETE,
+};
+
+enum fb_write_logical_srcs {
+   FB_WRITE_LOGICAL_SRC_COLOR0,      /* REQUIRED */
+   FB_WRITE_LOGICAL_SRC_COLOR1,      /* for dual source blend messages */
+   FB_WRITE_LOGICAL_SRC_SRC0_ALPHA,
+   FB_WRITE_LOGICAL_SRC_SRC_DEPTH,   /* gl_FragDepth */
+   FB_WRITE_LOGICAL_SRC_DST_DEPTH,   /* GEN4-5: passthrough from thread */
+   FB_WRITE_LOGICAL_SRC_SRC_STENCIL, /* gl_FragStencilRefARB */
+   FB_WRITE_LOGICAL_SRC_OMASK,       /* Sample Mask (gl_SampleMask) */
+   FB_WRITE_LOGICAL_SRC_COMPONENTS,  /* REQUIRED */
+   FB_WRITE_LOGICAL_NUM_SRCS         /* count of the entries above (8) */
+};
+
+enum tex_logical_srcs {
+   /** Texture coordinates */
+   TEX_LOGICAL_SRC_COORDINATE,
+   /** Shadow comparator */
+   TEX_LOGICAL_SRC_SHADOW_C,
+   /** dPdx if the operation takes explicit derivatives, otherwise LOD value */
+   TEX_LOGICAL_SRC_LOD,
+   /** dPdy if the operation takes explicit derivatives */
+   TEX_LOGICAL_SRC_LOD2,
+   /** Sample index */
+   TEX_LOGICAL_SRC_SAMPLE_INDEX,
+   /** MCS data */
+   TEX_LOGICAL_SRC_MCS,
+   /** REQUIRED: Texture surface index */
+   TEX_LOGICAL_SRC_SURFACE,
+   /** Texture sampler index */
+   TEX_LOGICAL_SRC_SAMPLER,
+   /** Texel offset for gathers */
+   TEX_LOGICAL_SRC_TG4_OFFSET,
+   /** REQUIRED: Number of coordinate components (as UD immediate) */
+   TEX_LOGICAL_SRC_COORD_COMPONENTS,
+   /** REQUIRED: Number of derivative components (as UD immediate) */
+   TEX_LOGICAL_SRC_GRAD_COMPONENTS,
+
+   TEX_LOGICAL_NUM_SRCS, /* count of the entries above (11) */
+};
+
+#ifdef __cplusplus
+/**
+ * OR two brw_urb_write_flags values, keeping the enum type for the result.
+ */
+inline brw_urb_write_flags
+operator|(brw_urb_write_flags x, brw_urb_write_flags y)
+{
+   const int merged = static_cast<int>(x) | static_cast<int>(y);
+   return static_cast<brw_urb_write_flags>(merged);
+}
+#endif
+
+enum PACKED brw_predicate {
+   BRW_PREDICATE_NONE                =  0,
+   BRW_PREDICATE_NORMAL              =  1,
+   BRW_PREDICATE_ALIGN1_ANYV         =  2,
+   BRW_PREDICATE_ALIGN1_ALLV         =  3,
+   BRW_PREDICATE_ALIGN1_ANY2H        =  4,
+   BRW_PREDICATE_ALIGN1_ALL2H        =  5,
+   BRW_PREDICATE_ALIGN1_ANY4H        =  6,
+   BRW_PREDICATE_ALIGN1_ALL4H        =  7,
+   BRW_PREDICATE_ALIGN1_ANY8H        =  8,
+   BRW_PREDICATE_ALIGN1_ALL8H        =  9,
+   BRW_PREDICATE_ALIGN1_ANY16H       = 10,
+   BRW_PREDICATE_ALIGN1_ALL16H       = 11,
+   BRW_PREDICATE_ALIGN1_ANY32H       = 12,
+   BRW_PREDICATE_ALIGN1_ALL32H       = 13,
+   BRW_PREDICATE_ALIGN16_REPLICATE_X =  2, /* ALIGN16_* reuse values 2..7 */
+   BRW_PREDICATE_ALIGN16_REPLICATE_Y =  3,
+   BRW_PREDICATE_ALIGN16_REPLICATE_Z =  4,
+   BRW_PREDICATE_ALIGN16_REPLICATE_W =  5,
+   BRW_PREDICATE_ALIGN16_ANY4H       =  6, /* same value as ALIGN1_ANY4H */
+   BRW_PREDICATE_ALIGN16_ALL4H       =  7, /* same value as ALIGN1_ALL4H */
+};
+
+enum PACKED brw_reg_file {
+   BRW_ARCHITECTURE_REGISTER_FILE = 0,
+   BRW_GENERAL_REGISTER_FILE      = 1,
+   BRW_MESSAGE_REGISTER_FILE      = 2,
+   BRW_IMMEDIATE_VALUE            = 3,
+
+   ARF = BRW_ARCHITECTURE_REGISTER_FILE, /* short aliases for the above */
+   FIXED_GRF = BRW_GENERAL_REGISTER_FILE,
+   MRF = BRW_MESSAGE_REGISTER_FILE,
+   IMM = BRW_IMMEDIATE_VALUE,
+
+   /* These are not hardware values; numbering continues at 4 after IMM */
+   VGRF,
+   ATTR,
+   UNIFORM, /* prog_data->params[reg] */
+   BAD_FILE,
+};
+
+#define BRW_HW_REG_TYPE_UD  0
+#define BRW_HW_REG_TYPE_D   1
+#define BRW_HW_REG_TYPE_UW  2
+#define BRW_HW_REG_TYPE_W   3
+#define BRW_HW_REG_TYPE_F   7
+#define GEN8_HW_REG_TYPE_UQ 8
+#define GEN8_HW_REG_TYPE_Q  9
+
+#define BRW_HW_REG_NON_IMM_TYPE_UB  4 /* 4-6, 10: also used by IMM types */
+#define BRW_HW_REG_NON_IMM_TYPE_B   5
+#define GEN7_HW_REG_NON_IMM_TYPE_DF 6
+#define GEN8_HW_REG_NON_IMM_TYPE_HF 10
+
+#define BRW_HW_REG_IMM_TYPE_UV  4 /* Gen6+ packed unsigned immediate vector */
+#define BRW_HW_REG_IMM_TYPE_VF  5 /* packed float immediate vector */
+#define BRW_HW_REG_IMM_TYPE_V   6 /* packed int imm. vector; uword dest only */
+#define GEN8_HW_REG_IMM_TYPE_DF 10
+#define GEN8_HW_REG_IMM_TYPE_HF 11 /* differs from NON_IMM_TYPE_HF (10) */
+
+/* SNB adds 3-src instructions (MAD and LRP) that only operate on floats, so
+ * the types were implied. IVB adds BFE and BFI2 that operate on doublewords
+ * and unsigned doublewords, so a new field is also available in the da3src
+ * struct (part of struct brw_instruction.bits1 in brw_structs.h) to select
+ * dst and shared-src types. The values are different from BRW_REGISTER_TYPE_*.
+ */
+#define BRW_3SRC_TYPE_F  0
+#define BRW_3SRC_TYPE_D  1
+#define BRW_3SRC_TYPE_UD 2
+#define BRW_3SRC_TYPE_DF 3
+
+#define BRW_ARF_NULL                  0x00
+#define BRW_ARF_ADDRESS               0x10
+#define BRW_ARF_ACCUMULATOR           0x20
+#define BRW_ARF_FLAG                  0x30
+#define BRW_ARF_MASK                  0x40
+#define BRW_ARF_MASK_STACK            0x50
+#define BRW_ARF_MASK_STACK_DEPTH      0x60
+#define BRW_ARF_STATE                 0x70
+#define BRW_ARF_CONTROL               0x80
+#define BRW_ARF_NOTIFICATION_COUNT    0x90
+#define BRW_ARF_IP                    0xA0
+#define BRW_ARF_TDR                   0xB0
+#define BRW_ARF_TIMESTAMP             0xC0
+
+#define BRW_MRF_COMPR4			(1 << 7)
+
+#define BRW_AMASK   0
+#define BRW_IMASK   1
+#define BRW_LMASK   2
+#define BRW_CMASK   3
+
+
+
+#define BRW_THREAD_NORMAL     0
+#define BRW_THREAD_ATOMIC     1
+#define BRW_THREAD_SWITCH     2
+
+enum PACKED brw_vertical_stride { /* numeric stride N > 0 is log2(N) + 1 */
+   BRW_VERTICAL_STRIDE_0               = 0,
+   BRW_VERTICAL_STRIDE_1               = 1,
+   BRW_VERTICAL_STRIDE_2               = 2,
+   BRW_VERTICAL_STRIDE_4               = 3,
+   BRW_VERTICAL_STRIDE_8               = 4,
+   BRW_VERTICAL_STRIDE_16              = 5,
+   BRW_VERTICAL_STRIDE_32              = 6,
+   BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL = 0xF,
+};
+
+enum PACKED brw_width { /* width N is encoded as log2(N) */
+   BRW_WIDTH_1  = 0,
+   BRW_WIDTH_2  = 1,
+   BRW_WIDTH_4  = 2,
+   BRW_WIDTH_8  = 3,
+   BRW_WIDTH_16 = 4,
+};
+
+/**
+ * Message target: Shared Function ID for where to SEND a message.
+ *
+ * These are enumerated in the ISA reference under "send - Send Message".
+ * In particular, see the following tables:
+ * - G45 PRM, Volume 4, Table 14-15 "Message Descriptor Definition"
+ * - Sandybridge PRM, Volume 4 Part 2, Table 8-16 "Extended Message Descriptor"
+ * - Ivybridge PRM, Volume 1 Part 1, section 3.2.7 "GPE Function IDs"
+ */
+enum brw_message_target {
+   BRW_SFID_NULL                     = 0,
+   BRW_SFID_MATH                     = 1, /* Only valid on Gen4-5 */
+   BRW_SFID_SAMPLER                  = 2,
+   BRW_SFID_MESSAGE_GATEWAY          = 3,
+   BRW_SFID_DATAPORT_READ            = 4,
+   BRW_SFID_DATAPORT_WRITE           = 5,
+   BRW_SFID_URB                      = 6,
+   BRW_SFID_THREAD_SPAWNER           = 7,
+   BRW_SFID_VME                      = 8,
+
+   GEN6_SFID_DATAPORT_SAMPLER_CACHE  = 4, /* == BRW_SFID_DATAPORT_READ */
+   GEN6_SFID_DATAPORT_RENDER_CACHE   = 5, /* == BRW_SFID_DATAPORT_WRITE */
+   GEN6_SFID_DATAPORT_CONSTANT_CACHE = 9,
+
+   GEN7_SFID_DATAPORT_DATA_CACHE     = 10,
+   GEN7_SFID_PIXEL_INTERPOLATOR      = 11,
+   HSW_SFID_DATAPORT_DATA_CACHE_1    = 12,
+   HSW_SFID_CRE                      = 13,
+};
+/* Same value as GEN7_SFID_DATAPORT_DATA_CACHE in the enum above. */
+#define GEN7_MESSAGE_TARGET_DP_DATA_CACHE     10
+
+#define BRW_SAMPLER_RETURN_FORMAT_FLOAT32     0
+#define BRW_SAMPLER_RETURN_FORMAT_UINT32      2
+#define BRW_SAMPLER_RETURN_FORMAT_SINT32      3
+
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE              0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE             0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS        0
+#define BRW_SAMPLER_MESSAGE_SIMD8_KILLPIX             1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD        1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD         1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS  2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE 1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE  1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
+#define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_LD                3
+#define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
+
+#define GEN5_SAMPLER_MESSAGE_SAMPLE              0
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS         1
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD          2
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE      3
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS       4
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE 5
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE  6
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_LD           7
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4      8
+#define GEN5_SAMPLER_MESSAGE_LOD                 9
+#define GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO      10
+#define GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO   11
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C    16
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO   17
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C 18
+#define HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE 20
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LZ           24
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ         25
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ        26
+#define GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W     28
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS       29
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS       30
+#define GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS       31
+
+/* for GEN5 only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8                     1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16                    2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64                 3
+
+/* GEN9 changes SIMD mode 0 to mean SIMD8D, but lets us get the SIMD4x2
+ * behavior by setting bit 22 of dword 2 in the message header. */
+#define GEN9_SAMPLER_SIMD_MODE_SIMD8D                   0
+#define GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2        (1 << 22)
+
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
+#define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
+#define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
+#define BRW_DATAPORT_OWORD_BLOCK_4_OWORDS     3
+#define BRW_DATAPORT_OWORD_BLOCK_8_OWORDS     4
+#define BRW_DATAPORT_OWORD_BLOCK_DWORDS(n)              \
+   ((n) == 4 ? BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW :    \
+    (n) == 8 ? BRW_DATAPORT_OWORD_BLOCK_2_OWORDS :      \
+    (n) == 16 ? BRW_DATAPORT_OWORD_BLOCK_4_OWORDS :     \
+    (n) == 32 ? BRW_DATAPORT_OWORD_BLOCK_8_OWORDS :     \
+    (abort(), ~0)) /* any other n aborts; needs abort() from <stdlib.h> */
+
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD     0
+#define BRW_DATAPORT_OWORD_DUAL_BLOCK_4OWORDS    2
+
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_8DWORDS   2
+#define BRW_DATAPORT_DWORD_SCATTERED_BLOCK_16DWORDS  3
+
+/* This one stays the same across generations. */
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ          0
+/* GEN4 */
+#define BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     1
+#define BRW_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          2
+#define BRW_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      3
+/* G45, GEN5 */
+#define G45_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ	    1
+#define G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     2
+#define G45_DATAPORT_READ_MESSAGE_AVC_LOOP_FILTER_READ	    3
+#define G45_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          4
+#define G45_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      6
+/* GEN6 */
+#define GEN6_DATAPORT_READ_MESSAGE_RENDER_UNORM_READ	    1
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ     2
+#define GEN6_DATAPORT_READ_MESSAGE_MEDIA_BLOCK_READ          4
+#define GEN6_DATAPORT_READ_MESSAGE_OWORD_UNALIGN_BLOCK_READ  5
+#define GEN6_DATAPORT_READ_MESSAGE_DWORD_SCATTERED_READ      6
+
+#define BRW_DATAPORT_READ_TARGET_DATA_CACHE      0
+#define BRW_DATAPORT_READ_TARGET_RENDER_CACHE    1
+#define BRW_DATAPORT_READ_TARGET_SAMPLER_CACHE   2
+
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE                0
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED     1
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01         2
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23         3
+#define BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01       4
+
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE                0
+#define BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE           1
+#define BRW_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE                2
+#define BRW_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE            3
+#define BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE              4
+#define BRW_DATAPORT_WRITE_MESSAGE_STREAMED_VERTEX_BUFFER_WRITE     5
+#define BRW_DATAPORT_WRITE_MESSAGE_FLUSH_RENDER_CACHE               7
+
+/* GEN6 */
+#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_ATOMIC_WRITE              7
+#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE               8
+#define GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE          9
+#define GEN6_DATAPORT_WRITE_MESSAGE_MEDIA_BLOCK_WRITE               10
+#define GEN6_DATAPORT_WRITE_MESSAGE_DWORD_SCATTERED_WRITE           11
+#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE             12
+#define GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE               13
+#define GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_UNORM_WRITE       14
+
+/* GEN7 */
+#define GEN7_DATAPORT_RC_MEDIA_BLOCK_READ                           4
+#define GEN7_DATAPORT_RC_TYPED_SURFACE_READ                         5
+#define GEN7_DATAPORT_RC_TYPED_ATOMIC_OP                            6
+#define GEN7_DATAPORT_RC_MEMORY_FENCE                               7
+#define GEN7_DATAPORT_RC_MEDIA_BLOCK_WRITE                          10
+#define GEN7_DATAPORT_RC_RENDER_TARGET_WRITE                        12
+#define GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE                        13
+#define GEN7_DATAPORT_DC_OWORD_BLOCK_READ                           0
+#define GEN7_DATAPORT_DC_UNALIGNED_OWORD_BLOCK_READ                 1
+#define GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_READ                      2
+#define GEN7_DATAPORT_DC_DWORD_SCATTERED_READ                       3
+#define GEN7_DATAPORT_DC_BYTE_SCATTERED_READ                        4
+#define GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ                       5
+#define GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP                          6
+#define GEN7_DATAPORT_DC_MEMORY_FENCE                               7
+#define GEN7_DATAPORT_DC_OWORD_BLOCK_WRITE                          8
+#define GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE                     10
+#define GEN7_DATAPORT_DC_DWORD_SCATTERED_WRITE                      11
+#define GEN7_DATAPORT_DC_BYTE_SCATTERED_WRITE                       12
+#define GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE                      13
+
+#define GEN7_DATAPORT_SCRATCH_READ                            ((1 << 18) | \
+                                                               (0 << 17))
+#define GEN7_DATAPORT_SCRATCH_WRITE                           ((1 << 18) | \
+                                                               (1 << 17))
+#define GEN7_DATAPORT_SCRATCH_NUM_REGS_SHIFT                        12
+
+#define GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET     0
+#define GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE            1
+#define GEN7_PIXEL_INTERPOLATOR_LOC_CENTROID          2
+#define GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET   3
+
+/* HSW */
+#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_READ                      0
+#define HSW_DATAPORT_DC_PORT0_UNALIGNED_OWORD_BLOCK_READ            1
+#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_READ                 2
+#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_READ                  3
+#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_READ                   4
+#define HSW_DATAPORT_DC_PORT0_MEMORY_FENCE                          7
+#define HSW_DATAPORT_DC_PORT0_OWORD_BLOCK_WRITE                     8
+#define HSW_DATAPORT_DC_PORT0_OWORD_DUAL_BLOCK_WRITE                10
+#define HSW_DATAPORT_DC_PORT0_DWORD_SCATTERED_WRITE                 11
+#define HSW_DATAPORT_DC_PORT0_BYTE_SCATTERED_WRITE                  12
+
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ                  1
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP                     2
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2             3
+#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_READ                      4
+#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ                    5
+#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP                       6
+#define HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2               7
+#define HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE                 9
+#define HSW_DATAPORT_DC_PORT1_MEDIA_BLOCK_WRITE                     10
+#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP                     11
+#define HSW_DATAPORT_DC_PORT1_ATOMIC_COUNTER_OP_SIMD4X2             12
+#define HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE                   13
+
+/* GEN9 */
+#define GEN9_DATAPORT_RC_RENDER_TARGET_WRITE                        12
+#define GEN9_DATAPORT_RC_RENDER_TARGET_READ                         13
+
+/* Dataport special binding table indices: */
+#define BRW_BTI_STATELESS                255
+#define GEN7_BTI_SLM                     254
+/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the
+ * hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW,
+ * CHV and at least some pre-production steppings of SKL due to
+ * WaForceEnableNonCoherent, HDC memory access may have been overridden by the
+ * kernel to be non-coherent (matching the behavior of the same BTI on
+ * pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253.
+ */
+#define GEN8_BTI_STATELESS_IA_COHERENT   255
+#define GEN8_BTI_STATELESS_NON_COHERENT  253
+
+/* dataport atomic operations. */
+#define BRW_AOP_AND                   1
+#define BRW_AOP_OR                    2
+#define BRW_AOP_XOR                   3
+#define BRW_AOP_MOV                   4
+#define BRW_AOP_INC                   5
+#define BRW_AOP_DEC                   6
+#define BRW_AOP_ADD                   7
+#define BRW_AOP_SUB                   8
+#define BRW_AOP_REVSUB                9
+#define BRW_AOP_IMAX                  10
+#define BRW_AOP_IMIN                  11
+#define BRW_AOP_UMAX                  12
+#define BRW_AOP_UMIN                  13
+#define BRW_AOP_CMPWR                 14
+#define BRW_AOP_PREDEC                15
+
+#define BRW_MATH_FUNCTION_INV                              1
+#define BRW_MATH_FUNCTION_LOG                              2
+#define BRW_MATH_FUNCTION_EXP                              3
+#define BRW_MATH_FUNCTION_SQRT                             4
+#define BRW_MATH_FUNCTION_RSQ                              5
+#define BRW_MATH_FUNCTION_SIN                              6
+#define BRW_MATH_FUNCTION_COS                              7
+#define BRW_MATH_FUNCTION_SINCOS                           8 /* gen4, gen5 */
+#define BRW_MATH_FUNCTION_FDIV                             9 /* gen6+ */
+#define BRW_MATH_FUNCTION_POW                              10
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER   11
+#define BRW_MATH_FUNCTION_INT_DIV_QUOTIENT                 12
+#define BRW_MATH_FUNCTION_INT_DIV_REMAINDER                13
+#define GEN8_MATH_FUNCTION_INVM                            14
+#define GEN8_MATH_FUNCTION_RSQRTM                          15
+
+#define BRW_MATH_INTEGER_UNSIGNED     0
+#define BRW_MATH_INTEGER_SIGNED       1
+
+#define BRW_MATH_PRECISION_FULL        0
+#define BRW_MATH_PRECISION_PARTIAL     1
+
+#define BRW_MATH_SATURATE_NONE         0
+#define BRW_MATH_SATURATE_SATURATE     1
+
+#define BRW_MATH_DATA_VECTOR  0
+#define BRW_MATH_DATA_SCALAR  1
+
+#define BRW_URB_OPCODE_WRITE_HWORD  0
+#define BRW_URB_OPCODE_WRITE_OWORD  1
+#define BRW_URB_OPCODE_READ_HWORD   2
+#define BRW_URB_OPCODE_READ_OWORD   3
+#define GEN7_URB_OPCODE_ATOMIC_MOV  4
+#define GEN7_URB_OPCODE_ATOMIC_INC  5
+#define GEN8_URB_OPCODE_ATOMIC_ADD  6
+#define GEN8_URB_OPCODE_SIMD8_WRITE 7
+#define GEN8_URB_OPCODE_SIMD8_READ  8
+
+#define BRW_URB_SWIZZLE_NONE          0
+#define BRW_URB_SWIZZLE_INTERLEAVE    1
+#define BRW_URB_SWIZZLE_TRANSPOSE     2
+
+#define BRW_SCRATCH_SPACE_SIZE_1K     0
+#define BRW_SCRATCH_SPACE_SIZE_2K     1
+#define BRW_SCRATCH_SPACE_SIZE_4K     2
+#define BRW_SCRATCH_SPACE_SIZE_8K     3
+#define BRW_SCRATCH_SPACE_SIZE_16K    4
+#define BRW_SCRATCH_SPACE_SIZE_32K    5
+#define BRW_SCRATCH_SPACE_SIZE_64K    6
+#define BRW_SCRATCH_SPACE_SIZE_128K   7
+#define BRW_SCRATCH_SPACE_SIZE_256K   8
+#define BRW_SCRATCH_SPACE_SIZE_512K   9
+#define BRW_SCRATCH_SPACE_SIZE_1M     10
+#define BRW_SCRATCH_SPACE_SIZE_2M     11
+
+#define BRW_MESSAGE_GATEWAY_SFID_OPEN_GATEWAY         0
+#define BRW_MESSAGE_GATEWAY_SFID_CLOSE_GATEWAY        1
+#define BRW_MESSAGE_GATEWAY_SFID_FORWARD_MSG          2
+#define BRW_MESSAGE_GATEWAY_SFID_GET_TIMESTAMP        3
+#define BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG          4
+#define BRW_MESSAGE_GATEWAY_SFID_UPDATE_GATEWAY_STATE 5
+#define BRW_MESSAGE_GATEWAY_SFID_MMIO_READ_WRITE      6
+
+
+/* Gen7 "GS URB Entry Allocation Size" is a U9-1 field, so the maximum gs_size
+ * is 2^9, or 512.  It's counted in multiples of 64 bytes.
+ *
+ * Identical for VS, DS, and HS.
+ */
+#define GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES                (512*64)
+#define GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES                (512*64)
+#define GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES                (512*64)
+#define GEN7_MAX_VS_URB_ENTRY_SIZE_BYTES                (512*64)
+
+/* Gen6 "GS URB Entry Allocation Size" is defined as a number of 1024-bit
+ * (128 bytes) URB rows and the maximum allowed value is 5 rows.
+ */
+#define GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES                (5*128)
+
+/* GS Thread Payload
+ */
+/* R0 */
+# define GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT		27
+
+#endif /* BRW_EU_DEFINES_H */
diff --git a/src/intel/compiler/brw_eu_emit.c b/src/intel/compiler/brw_eu_emit.c
new file mode 100644
index 00000000000..058742d4f6e
--- /dev/null
+++ b/src/intel/compiler/brw_eu_emit.c
@@ -0,0 +1,3675 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+
+#include "brw_eu_defines.h"
+#include "brw_eu.h"
+
+#include "util/ralloc.h"
+
+/**
+ * Make the implicit SEND operand move explicit on Sandybridge and later.
+ *
+ * Before Sandybridge a SEND could name a non-MRF source and the operand was
+ * moved to a message register implicitly.  From Sandybridge on the move has
+ * to be emitted by hand, so call this ahead of emitting the SEND.
+ */
+void
+gen6_resolve_implied_move(struct brw_codegen *p,
+                          struct brw_reg *src,
+                          unsigned msg_reg_nr)
+{
+   /* Nothing to do pre-gen6, or when the operand already lives in an MRF. */
+   if (p->devinfo->gen < 6 || src->file == BRW_MESSAGE_REGISTER_FILE)
+      return;
+
+   const bool src_is_null = src->file == BRW_ARCHITECTURE_REGISTER_FILE &&
+                            src->nr == BRW_ARF_NULL;
+   if (!src_is_null) {
+      /* Copy the payload as raw dwords: SIMD8, unmasked, uncompressed. */
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
+              retype(*src, BRW_REGISTER_TYPE_UD));
+      brw_pop_insn_state(p);
+   }
+   *src = brw_message_reg(msg_reg_nr);
+}
+
+static void
+gen7_convert_mrf_to_grf(struct brw_codegen *p, struct brw_reg *reg)
+{
+   /* Only gen7+ message-register operands are rewritten. */
+   if (p->devinfo->gen < 7 || reg->file != BRW_MESSAGE_REGISTER_FILE)
+      return;
+   /* The Ivybridge PRM, Volume 4 Part 3, page 218 ("send") says:
+    * "The send with EOT should use register space R112-R127 for <src>. This is
+    *  to enable loading of a new thread into the same slot while the message
+    *  with EOT for current thread is pending dispatch."
+    *
+    * We pretend to have 16 MRFs regardless, so they may as well be placed in
+    * the registers required for messages with EOT.
+    */
+   reg->nr += GEN7_MRF_HACK_START;
+   reg->file = BRW_GENERAL_REGISTER_FILE;
+}
+
+/**
+ * Map a brw_reg_type to its hardware type encoding.  Immediates and register
+ * operands use separate tables; a type marked -1 in the selected table, or
+ * one the gen checks below reject, trips an assertion instead of encoding.
+ */
+unsigned
+brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
+                        enum brw_reg_type type, enum brw_reg_file file)
+{
+   if (file == BRW_IMMEDIATE_VALUE) {
+      static const int imm_hw_types[] = {
+         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
+         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
+         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
+         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
+         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
+         [BRW_REGISTER_TYPE_UB] = -1, /* -1: asserts if used as an immediate */
+         [BRW_REGISTER_TYPE_B]  = -1,
+         [BRW_REGISTER_TYPE_UV] = BRW_HW_REG_IMM_TYPE_UV,
+         [BRW_REGISTER_TYPE_VF] = BRW_HW_REG_IMM_TYPE_VF,
+         [BRW_REGISTER_TYPE_V]  = BRW_HW_REG_IMM_TYPE_V,
+         [BRW_REGISTER_TYPE_DF] = GEN8_HW_REG_IMM_TYPE_DF,
+         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_IMM_TYPE_HF,
+         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
+         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
+      };
+      assert(type < ARRAY_SIZE(imm_hw_types));
+      assert(imm_hw_types[type] != -1);
+      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_DF);
+      return imm_hw_types[type];
+   } else {
+      /* Register (non-immediate) operands: any file but BRW_IMMEDIATE_VALUE */
+      static const int hw_types[] = {
+         [BRW_REGISTER_TYPE_UD] = BRW_HW_REG_TYPE_UD,
+         [BRW_REGISTER_TYPE_D]  = BRW_HW_REG_TYPE_D,
+         [BRW_REGISTER_TYPE_UW] = BRW_HW_REG_TYPE_UW,
+         [BRW_REGISTER_TYPE_W]  = BRW_HW_REG_TYPE_W,
+         [BRW_REGISTER_TYPE_UB] = BRW_HW_REG_NON_IMM_TYPE_UB,
+         [BRW_REGISTER_TYPE_B]  = BRW_HW_REG_NON_IMM_TYPE_B,
+         [BRW_REGISTER_TYPE_F]  = BRW_HW_REG_TYPE_F,
+         [BRW_REGISTER_TYPE_UV] = -1, /* -1: asserts if used as a register */
+         [BRW_REGISTER_TYPE_VF] = -1,
+         [BRW_REGISTER_TYPE_V]  = -1,
+         [BRW_REGISTER_TYPE_DF] = GEN7_HW_REG_NON_IMM_TYPE_DF,
+         [BRW_REGISTER_TYPE_HF] = GEN8_HW_REG_NON_IMM_TYPE_HF,
+         [BRW_REGISTER_TYPE_UQ] = GEN8_HW_REG_TYPE_UQ,
+         [BRW_REGISTER_TYPE_Q]  = GEN8_HW_REG_TYPE_Q,
+      };
+      assert(type < ARRAY_SIZE(hw_types));
+      assert(hw_types[type] != -1);
+      assert(devinfo->gen >= 7 || type < BRW_REGISTER_TYPE_DF);
+      assert(devinfo->gen >= 8 || type < BRW_REGISTER_TYPE_Q);
+      return hw_types[type];
+   }
+}
+
+/**
+ * Return the element size for a hardware register type encoding.  The sizes
+ * come from one of two tables: one for immediates (file ==
+ * BRW_IMMEDIATE_VALUE) and one for every other register file.
+ */
+unsigned
+brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
+                        unsigned type, enum brw_reg_file file)
+{
+   if (file == BRW_IMMEDIATE_VALUE) {
+      static const unsigned imm_hw_sizes[] = {
+         [BRW_HW_REG_TYPE_UD]      = 4,
+         [BRW_HW_REG_TYPE_D]       = 4,
+         [BRW_HW_REG_TYPE_UW]      = 2,
+         [BRW_HW_REG_TYPE_W]       = 2,
+         [BRW_HW_REG_IMM_TYPE_UV]  = 2,
+         [BRW_HW_REG_IMM_TYPE_VF]  = 4,
+         [BRW_HW_REG_IMM_TYPE_V]   = 2,
+         [BRW_HW_REG_TYPE_F]       = 4,
+         [GEN8_HW_REG_TYPE_UQ]     = 8,
+         [GEN8_HW_REG_TYPE_Q]      = 8,
+         [GEN8_HW_REG_IMM_TYPE_DF] = 8,
+         [GEN8_HW_REG_IMM_TYPE_HF] = 2,
+      };
+      assert(type < ARRAY_SIZE(imm_hw_sizes));
+      assert(devinfo->gen >= 6 || type != BRW_HW_REG_IMM_TYPE_UV);
+      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
+      return imm_hw_sizes[type];
+   } else {
+      /* Register (non-immediate) operands: any file but BRW_IMMEDIATE_VALUE */
+      static const unsigned hw_sizes[] = {
+         [BRW_HW_REG_TYPE_UD]          = 4,
+         [BRW_HW_REG_TYPE_D]           = 4,
+         [BRW_HW_REG_TYPE_UW]          = 2,
+         [BRW_HW_REG_TYPE_W]           = 2,
+         [BRW_HW_REG_NON_IMM_TYPE_UB]  = 1,
+         [BRW_HW_REG_NON_IMM_TYPE_B]   = 1,
+         [GEN7_HW_REG_NON_IMM_TYPE_DF] = 8,
+         [BRW_HW_REG_TYPE_F]           = 4,
+         [GEN8_HW_REG_TYPE_UQ]         = 8,
+         [GEN8_HW_REG_TYPE_Q]          = 8,
+         [GEN8_HW_REG_NON_IMM_TYPE_HF] = 2,
+      };
+      assert(type < ARRAY_SIZE(hw_sizes));
+      assert(devinfo->gen >= 7 ||
+             (type < GEN7_HW_REG_NON_IMM_TYPE_DF || type == BRW_HW_REG_TYPE_F));
+      assert(devinfo->gen >= 8 || type <= BRW_HW_REG_TYPE_F);
+      return hw_sizes[type];
+   }
+}
+
+void
+brw_set_dest(struct brw_codegen *p, brw_inst *inst, struct brw_reg dest)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* Encode dest (file, type, addressing, region) into inst's dst fields. */
+   if (dest.file == BRW_MESSAGE_REGISTER_FILE)
+      assert((dest.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+   else if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(dest.nr < 128);
+
+   gen7_convert_mrf_to_grf(p, &dest);
+
+   brw_inst_set_dst_reg_file(devinfo, inst, dest.file);
+   brw_inst_set_dst_reg_type(devinfo, inst,
+                             brw_reg_type_to_hw_type(devinfo, dest.type,
+                                                     dest.file));
+   brw_inst_set_dst_address_mode(devinfo, inst, dest.address_mode);
+   /* Direct and non-direct addressing are encoded through different setters. */
+   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
+      brw_inst_set_dst_da_reg_nr(devinfo, inst, dest.nr);
+
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+         brw_inst_set_dst_da1_subreg_nr(devinfo, inst, dest.subnr);
+	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
+      } else {
+         brw_inst_set_dst_da16_subreg_nr(devinfo, inst, dest.subnr / 16);
+         brw_inst_set_da16_writemask(devinfo, inst, dest.writemask);
+         if (dest.file == BRW_GENERAL_REGISTER_FILE ||
+             dest.file == BRW_MESSAGE_REGISTER_FILE) {
+            assert(dest.writemask != 0);
+         }
+	 /* From the Ivybridge PRM, Vol 4, Part 3, Section 5.2.4.1:
+	  *    Although Dst.HorzStride is a don't care for Align16, HW needs
+	  *    this to be programmed as "01".
+	  */
+         brw_inst_set_dst_hstride(devinfo, inst, 1);
+      }
+   } else {
+      brw_inst_set_dst_ia_subreg_nr(devinfo, inst, dest.subnr);
+
+      /* These are different sizes in align1 vs align16:
+       */
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+         brw_inst_set_dst_ia1_addr_imm(devinfo, inst,
+                                       dest.indirect_offset);
+	 if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
+	    dest.hstride = BRW_HORIZONTAL_STRIDE_1;
+         brw_inst_set_dst_hstride(devinfo, inst, dest.hstride);
+      } else {
+         brw_inst_set_dst_ia16_addr_imm(devinfo, inst,
+                                        dest.indirect_offset);
+	 /* even ignored in da16, still need to set as '01' */
+         brw_inst_set_dst_hstride(devinfo, inst, 1);
+      }
+   }
+
+   /* Generators should set a default exec_size of either 8 (SIMD4x2 or SIMD8)
+    * or 16 (SIMD16), as that's normally correct.  However, when dealing with
+    * small registers, we automatically reduce it to match the register size.
+    *
+    * In platforms that support fp64 we can emit instructions with a width of
+    * 4 that need two SIMD8 registers and an exec_size of 8 or 16. In these
+    * cases we need to make sure that these instructions have their exec sizes
+    * set properly when they are emitted and we can't rely on this code to fix
+    * it.
+    */
+   bool fix_exec_size;
+   if (devinfo->gen >= 6)
+      fix_exec_size = dest.width < BRW_EXECUTE_4;
+   else
+      fix_exec_size = dest.width < BRW_EXECUTE_8;
+   /* The override writes dest.width straight into the exec_size field. */
+   if (fix_exec_size)
+      brw_inst_set_exec_size(devinfo, inst, dest.width);
+}
+
+static void
+validate_reg(const struct gen_device_info *devinfo,
+             brw_inst *inst, struct brw_reg reg)
+{
+   const int hstride_for_reg[] = {0, 1, 2, 4};
+   const int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32};
+   const int width_for_reg[] = {1, 2, 4, 8, 16};
+   const int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
+   int width, hstride, vstride, execsize;
+   /* Assert-only checks of a source operand's region; inst is not modified. */
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
+       * mean the destination has to be 128-bit aligned and the
+       * destination horiz stride has to be a word.
+       */
+      if (reg.type == BRW_REGISTER_TYPE_V) {
+         unsigned UNUSED elem_size = brw_element_size(devinfo, inst, dst);
+         assert(hstride_for_reg[brw_inst_dst_hstride(devinfo, inst)] *
+                elem_size == 2);
+      }
+
+      return;
+   }
+   /* NOTE(review): both tests below read reg.file; reg.nr looks intended. */
+   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+       reg.file == BRW_ARF_NULL)
+      return;
+   /* TODO confirm ARF operands pass the asserts below before changing it. */
+   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
+    *
+    *    "Swizzling is not allowed when an accumulator is used as an implicit
+    *    source or an explicit source in an instruction."
+    */
+   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+       reg.nr == BRW_ARF_ACCUMULATOR)
+      assert(reg.swizzle == BRW_SWIZZLE_XYZW);
+
+   assert(reg.hstride >= 0 && reg.hstride < ARRAY_SIZE(hstride_for_reg));
+   hstride = hstride_for_reg[reg.hstride];
+
+   if (reg.vstride == 0xf) {
+      vstride = -1; /* 0xf has no table entry; -1 exempts it from check 4 */
+   } else {
+      assert(reg.vstride >= 0 && reg.vstride < ARRAY_SIZE(vstride_for_reg));
+      vstride = vstride_for_reg[reg.vstride];
+   }
+
+   assert(reg.width >= 0 && reg.width < ARRAY_SIZE(width_for_reg));
+   width = width_for_reg[reg.width];
+
+   assert(brw_inst_exec_size(devinfo, inst) >= 0 &&
+          brw_inst_exec_size(devinfo, inst) < ARRAY_SIZE(execsize_for_reg));
+   execsize = execsize_for_reg[brw_inst_exec_size(devinfo, inst)];
+
+   /* Restrictions from 3.3.10: Register Region Restrictions. */
+   /* 3. */
+   assert(execsize >= width);
+
+   /* 4. */
+   if (execsize == width && hstride != 0) {
+      assert(vstride == -1 || vstride == width * hstride);
+   }
+
+   /* 5. */
+   if (execsize == width && hstride == 0) {
+      /* no restriction on vstride. */
+   }
+
+   /* 6. */
+   if (width == 1) {
+      assert(hstride == 0);
+   }
+
+   /* 7. */
+   if (execsize == 1 && width == 1) {
+      assert(hstride == 0);
+      assert(vstride == 0);
+   }
+
+   /* 8. */
+   if (vstride == 0 && hstride == 0) {
+      assert(width == 1);
+   }
+
+   /* 10. Check destination issues. (Nothing is checked here yet.) */
+}
+
+static bool
+is_compactable_immediate(unsigned imm)
+{
+   /* The low 12 bits are kept verbatim, so only the bits above them matter. */
+   const unsigned upper_bits = imm & ~0xfffu;
+
+   /* The top 20 bits replicate a single bit: either all clear or all set. */
+   return upper_bits == 0 || upper_bits == 0xfffff000;
+}
+
+void
+brw_set_src0(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* Validate reg and encode it into inst's src0 fields (or the immediate). */
+   if (reg.file == BRW_MESSAGE_REGISTER_FILE)
+      assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+   else if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128);
+
+   gen7_convert_mrf_to_grf(p, &reg);
+
+   if (devinfo->gen >= 6 && (brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SEND ||
+                             brw_inst_opcode(devinfo, inst) == BRW_OPCODE_SENDC)) {
+      /* Any source modifiers or regions will be ignored, since this just
+       * identifies the MRF/GRF to start reading the message contents from.
+       * Check for some likely failures.
+       */
+      assert(!reg.negate);
+      assert(!reg.abs);
+      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
+   }
+
+   validate_reg(devinfo, inst, reg);
+
+   brw_inst_set_src0_reg_file(devinfo, inst, reg.file);
+   brw_inst_set_src0_reg_type(devinfo, inst,
+                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
+   brw_inst_set_src0_abs(devinfo, inst, reg.abs);
+   brw_inst_set_src0_negate(devinfo, inst, reg.negate);
+   brw_inst_set_src0_address_mode(devinfo, inst, reg.address_mode);
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      if (reg.type == BRW_REGISTER_TYPE_DF ||
+          brw_inst_opcode(devinfo, inst) == BRW_OPCODE_DIM)
+         brw_inst_set_imm_df(devinfo, inst, reg.df);
+      else if (reg.type == BRW_REGISTER_TYPE_UQ ||
+               reg.type == BRW_REGISTER_TYPE_Q)
+         brw_inst_set_imm_uq(devinfo, inst, reg.u64);
+      else
+         brw_inst_set_imm_ud(devinfo, inst, reg.ud);
+
+      /* The Bspec's section titled "Non-present Operands" claims that if src0
+       * is an immediate that src1's type must be the same as that of src0.
+       *
+       * The SNB+ DataTypeIndex instruction compaction tables contain mappings
+       * that do not follow this rule. E.g., from the IVB/HSW table:
+       *
+       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
+       *        3         001000001011111101   r:f | i:vf | a:ud | <1> | dir |
+       *
+       * And from the SNB table:
+       *
+       *  DataTypeIndex   18-Bit Mapping       Mapped Meaning
+       *        8         001000000111101100   a:w | i:w | a:ud | <1> | dir |
+       *
+       * Neither of these cause warnings from the simulator when used,
+       * compacted or otherwise. In fact, all compaction mappings that have an
+       * immediate in src0 use a:ud for src1.
+       *
+       * The GM45 instruction compaction tables do not contain mapped meanings
+       * so it's not clear whether it has the restriction. We'll assume it was
+       * lifted on SNB. (FINISHME: decode the GM45 tables and check.)
+       *
+       * Don't do any of this for 64-bit immediates, since the src1 fields
+       * overlap with the immediate and setting them would overwrite the
+       * immediate we set.
+       */
+      if (type_sz(reg.type) < 8) {
+         brw_inst_set_src1_reg_file(devinfo, inst,
+                                    BRW_ARCHITECTURE_REGISTER_FILE);
+         if (devinfo->gen < 6) {
+            brw_inst_set_src1_reg_type(devinfo, inst,
+                                       brw_inst_src0_reg_type(devinfo, inst));
+         } else {
+            brw_inst_set_src1_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
+         }
+      }
+
+      /* Compacted instructions only have 12-bits (plus 1 for the other 20)
+       * for immediate values. Presumably the hardware engineers realized
+       * that the only useful floating-point value that could be represented
+       * in this format is 0.0, which can also be represented as a VF-typed
+       * immediate, so they gave us the previously mentioned mapping on IVB+.
+       *
+       * Strangely, we do have a mapping for imm:f in src1, so we don't need
+       * to do this there.
+       *
+       * If we see a 0.0:F, change the type to VF so that it can be compacted.
+       */
+      if (brw_inst_imm_ud(devinfo, inst) == 0x0 &&
+          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_F &&
+          brw_inst_dst_reg_type(devinfo, inst) != GEN7_HW_REG_NON_IMM_TYPE_DF) {
+         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_IMM_TYPE_VF);
+      }
+
+      /* There are no mappings for dst:d | i:d, so if the immediate is suitable
+       * set the types to :UD so the instruction can be compacted.
+       */
+      if (is_compactable_immediate(brw_inst_imm_ud(devinfo, inst)) &&
+          brw_inst_cond_modifier(devinfo, inst) == BRW_CONDITIONAL_NONE &&
+          brw_inst_src0_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D &&
+          brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_TYPE_D) {
+         brw_inst_set_src0_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
+         brw_inst_set_dst_reg_type(devinfo, inst, BRW_HW_REG_TYPE_UD);
+      }
+   } else {
+      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
+         brw_inst_set_src0_da_reg_nr(devinfo, inst, reg.nr);
+         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+             brw_inst_set_src0_da1_subreg_nr(devinfo, inst, reg.subnr);
+	 } else {
+            brw_inst_set_src0_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
+	 }
+      } else {
+         brw_inst_set_src0_ia_subreg_nr(devinfo, inst, reg.subnr);
+
+         if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+            brw_inst_set_src0_ia1_addr_imm(devinfo, inst, reg.indirect_offset);
+	 } else {
+            brw_inst_set_src0_ia16_addr_imm(devinfo, inst, reg.indirect_offset);
+	 }
+      }
+
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+	 if (reg.width == BRW_WIDTH_1 &&
+             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
+            brw_inst_set_src0_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
+            brw_inst_set_src0_width(devinfo, inst, BRW_WIDTH_1);
+            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
+	 } else {
+            brw_inst_set_src0_hstride(devinfo, inst, reg.hstride);
+            brw_inst_set_src0_width(devinfo, inst, reg.width);
+            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
+	 }
+      } else {
+         brw_inst_set_src0_da16_swiz_x(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
+         brw_inst_set_src0_da16_swiz_y(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
+         brw_inst_set_src0_da16_swiz_z(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
+         brw_inst_set_src0_da16_swiz_w(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
+
+	 /* This is an oddity of the fact we're using the same
+	  * descriptions for registers in align_16 as align_1:
+	  */
+	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+            brw_inst_set_src0_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
+	 else
+            brw_inst_set_src0_vstride(devinfo, inst, reg.vstride);
+      }
+   }
+}
+
+
+void
+brw_set_src1(struct brw_codegen *p, brw_inst *inst, struct brw_reg reg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* Encode reg as inst's src1: an immediate or a direct-addressed register. */
+   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(reg.nr < 128);
+
+   /* From the IVB PRM Vol. 4, Pt. 3, Section 3.3.3.5:
+    *
+    *    "Accumulator registers may be accessed explicitly as src0
+    *    operands only."
+    */
+   assert(reg.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+          reg.nr != BRW_ARF_ACCUMULATOR);
+
+   gen7_convert_mrf_to_grf(p, &reg);
+   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);
+
+   validate_reg(devinfo, inst, reg);
+
+   brw_inst_set_src1_reg_file(devinfo, inst, reg.file);
+   brw_inst_set_src1_reg_type(devinfo, inst,
+                              brw_reg_type_to_hw_type(devinfo, reg.type, reg.file));
+   brw_inst_set_src1_abs(devinfo, inst, reg.abs);
+   brw_inst_set_src1_negate(devinfo, inst, reg.negate);
+
+   /* Only src1 can be immediate in two-argument instructions.
+    */
+   assert(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE);
+
+   if (reg.file == BRW_IMMEDIATE_VALUE) {
+      /* two-argument instructions can only use 32-bit immediates */
+      assert(type_sz(reg.type) < 8);
+      brw_inst_set_imm_ud(devinfo, inst, reg.ud);
+   } else {
+      /* This is a hardware restriction, which may or may not be lifted
+       * in the future:
+       */
+      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
+      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */
+
+      brw_inst_set_src1_da_reg_nr(devinfo, inst, reg.nr);
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+         brw_inst_set_src1_da1_subreg_nr(devinfo, inst, reg.subnr);
+      } else {
+         brw_inst_set_src1_da16_subreg_nr(devinfo, inst, reg.subnr / 16);
+      }
+
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1) {
+	 if (reg.width == BRW_WIDTH_1 &&
+             brw_inst_exec_size(devinfo, inst) == BRW_EXECUTE_1) {
+            brw_inst_set_src1_hstride(devinfo, inst, BRW_HORIZONTAL_STRIDE_0);
+            brw_inst_set_src1_width(devinfo, inst, BRW_WIDTH_1);
+            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_0);
+	 } else {
+            brw_inst_set_src1_hstride(devinfo, inst, reg.hstride);
+            brw_inst_set_src1_width(devinfo, inst, reg.width);
+            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
+	 }
+      } else {
+         brw_inst_set_src1_da16_swiz_x(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_X));
+         brw_inst_set_src1_da16_swiz_y(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Y));
+         brw_inst_set_src1_da16_swiz_z(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_Z));
+         brw_inst_set_src1_da16_swiz_w(devinfo, inst,
+            BRW_GET_SWZ(reg.swizzle, BRW_CHANNEL_W));
+
+	 /* This is an oddity of the fact we're using the same
+	  * descriptions for registers in align_16 as align_1:
+	  */
+	 if (reg.vstride == BRW_VERTICAL_STRIDE_8)
+            brw_inst_set_src1_vstride(devinfo, inst, BRW_VERTICAL_STRIDE_4);
+	 else
+            brw_inst_set_src1_vstride(devinfo, inst, reg.vstride);
+      }
+   }
+}
+
+/**
+ * Program the fields common to SEND messages: SFID (on SEND/SENDC only),
+ * message and response lengths, EOT and, from gen5 on, the header bit.
+ *
+ * \note src1 is first reset to an immediate zero, which clears the Function
+ *       Control bits, so this must run \b before any message-specific
+ *       fields are written.  Bits a caller never fills in stay zero.
+ */
+void
+brw_set_message_descriptor(struct brw_codegen *p,
+                           brw_inst *inst,
+                           enum brw_message_target sfid,
+                           unsigned msg_length,
+                           unsigned response_length,
+                           bool header_present,
+                           bool end_of_thread)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   brw_set_src1(p, inst, brw_imm_d(0));
+
+   /* With an indirect send, `inst` is the MOV/OR that fills the address
+    * register, not the SEND/SENDC.  The SFID goes on the later SEND/SENDC;
+    * set here it would clobber the MOV/OR's conditionalmod bits instead.
+    */
+   switch (brw_inst_opcode(devinfo, inst)) {
+   case BRW_OPCODE_SEND:
+   case BRW_OPCODE_SENDC:
+      brw_inst_set_sfid(devinfo, inst, sfid);
+      break;
+   default:
+      break;
+   }
+
+   brw_inst_set_mlen(devinfo, inst, msg_length);
+   brw_inst_set_rlen(devinfo, inst, response_length);
+   brw_inst_set_eot(devinfo, inst, end_of_thread);
+
+   if (devinfo->gen >= 5)
+      brw_inst_set_header_present(devinfo, inst, header_present);
+}
+
+static void brw_set_math_message( struct brw_codegen *p,
+                                  brw_inst *inst,
+                                  unsigned function,
+                                  unsigned integer_type,
+                                  bool low_precision,
+                                  unsigned dataType )
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* The message length follows from the function alone: POW and the
+    * three integer-divide variants take 2, every other function takes 1.
+    */
+   const bool long_message =
+      function == BRW_MATH_FUNCTION_POW ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
+      function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
+   const unsigned msg_length = long_message ? 2 : 1;
+
+   /* Likewise the response length: SINCOS and the combined
+    * quotient-and-remainder divide get 2, everything else 1.
+    */
+   const bool long_response =
+      function == BRW_MATH_FUNCTION_SINCOS ||
+      function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER;
+   const unsigned response_length = long_response ? 2 : 1;
+
+   /* Descriptor first: the math-specific fields are filled in only after
+    * the generic SEND fields have been laid down.
+    */
+   brw_set_message_descriptor(p, inst, BRW_SFID_MATH,
+                              msg_length, response_length, false, false);
+   brw_inst_set_math_msg_function(devinfo, inst, function);
+   brw_inst_set_math_msg_signed_int(devinfo, inst, integer_type);
+   brw_inst_set_math_msg_precision(devinfo, inst, low_precision);
+
+   /* The instruction's own saturate bit is read and copied into the
+    * message's saturate field here; once the data type has been written
+    * the bit is cleared on the instruction itself.
+    */
+   brw_inst_set_math_msg_saturate(devinfo, inst,
+                                  brw_inst_saturate(devinfo, inst));
+   brw_inst_set_math_msg_data_type(devinfo, inst, dataType);
+   brw_inst_set_saturate(devinfo, inst, 0);
+}
+
+
+static void brw_set_ff_sync_message(struct brw_codegen *p,
+				    brw_inst *insn,
+				    bool allocate,
+				    unsigned response_length,
+				    bool end_of_thread)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* URB message with a header and a fixed message length of 1. */
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
+			      1, response_length, true, end_of_thread);
+   brw_inst_set_urb_opcode(devinfo, insn, 1); /* FF_SYNC */
+   brw_inst_set_urb_allocate(devinfo, insn, allocate);
+   /* The following fields are not used by FF_SYNC: */
+   brw_inst_set_urb_global_offset(devinfo, insn, 0);
+   brw_inst_set_urb_swizzle_control(devinfo, insn, 0);
+   brw_inst_set_urb_used(devinfo, insn, 0);
+   brw_inst_set_urb_complete(devinfo, insn, 0);
+}
+
+static void brw_set_urb_message( struct brw_codegen *p,
+				 brw_inst *insn,
+                                 enum brw_urb_write_flags flags,
+				 unsigned msg_length,
+				 unsigned response_length,
+				 unsigned offset,
+				 unsigned swizzle_control )
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* Gen7+ rejects TRANSPOSE and ALLOCATE; PER_SLOT_OFFSET needs gen7+. */
+   assert(devinfo->gen < 7 || swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
+   assert(devinfo->gen < 7 || !(flags & BRW_URB_WRITE_ALLOCATE));
+   assert(devinfo->gen >= 7 || !(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
+   /* URB writes always carry a header; EOT comes from the flags. */
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
+			      msg_length, response_length, true,
+                              flags & BRW_URB_WRITE_EOT);
+
+   if (flags & BRW_URB_WRITE_OWORD) {
+      assert(msg_length == 2); /* header + one OWORD of data */
+      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
+   } else {
+      brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_HWORD);
+   }
+
+   brw_inst_set_urb_global_offset(devinfo, insn, offset);
+   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle_control);
+   /* The remaining fields are only written on the gens checked below. */
+   if (devinfo->gen < 8) {
+      brw_inst_set_urb_complete(devinfo, insn, !!(flags & BRW_URB_WRITE_COMPLETE));
+   }
+
+   if (devinfo->gen < 7) {
+      brw_inst_set_urb_allocate(devinfo, insn, !!(flags & BRW_URB_WRITE_ALLOCATE));
+      brw_inst_set_urb_used(devinfo, insn, !(flags & BRW_URB_WRITE_UNUSED));
+   } else {
+      brw_inst_set_urb_per_slot_offset(devinfo, insn,
+         !!(flags & BRW_URB_WRITE_PER_SLOT_OFFSET));
+   }
+}
+
+void
+brw_set_dp_write_message(struct brw_codegen *p,
+                         brw_inst *insn,
+                         unsigned binding_table_index,
+                         unsigned msg_control,
+                         unsigned msg_type,
+                         unsigned target_cache,
+                         unsigned msg_length,
+                         bool header_present,
+                         unsigned last_render_target,
+                         unsigned response_length,
+                         unsigned end_of_thread,
+                         unsigned send_commit_msg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* Gen6+ uses the target cache as the SFID; older gens the write dataport. */
+   unsigned sfid = BRW_SFID_DATAPORT_WRITE;
+   if (devinfo->gen >= 6)
+      sfid = target_cache;
+
+   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
+                              header_present, end_of_thread);
+   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
+   brw_inst_set_dp_write_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_write_msg_control(devinfo, insn, msg_control);
+   brw_inst_set_rt_last(devinfo, insn, last_render_target);
+   if (devinfo->gen < 7)   /* the commit bit is only written before gen7 */
+      brw_inst_set_dp_write_commit(devinfo, insn, send_commit_msg);
+}
+
+void
+brw_set_dp_read_message(struct brw_codegen *p,
+                        brw_inst *insn,
+                        unsigned binding_table_index,
+                        unsigned msg_control,
+                        unsigned msg_type,
+                        unsigned target_cache,
+                        unsigned msg_length,
+                        bool header_present,
+                        unsigned response_length)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   unsigned sfid = BRW_SFID_DATAPORT_READ;   /* used as-is before gen6 */
+   if (devinfo->gen >= 6)
+      sfid = target_cache;                   /* gen6+: the cache is the SFID */
+   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
+                              header_present, false);
+   brw_inst_set_binding_table_index(devinfo, insn, binding_table_index);
+   brw_inst_set_dp_read_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_read_msg_control(devinfo, insn, msg_control);
+   /* Before gen6 the cache is written to its own descriptor field instead. */
+   if (devinfo->gen < 6)
+      brw_inst_set_dp_read_target_cache(devinfo, insn, target_cache);
+}
+
+void
+brw_set_sampler_message(struct brw_codegen *p,
+                        brw_inst *inst,
+                        unsigned binding_table_index,
+                        unsigned sampler,
+                        unsigned msg_type,
+                        unsigned response_length,
+                        unsigned msg_length,
+                        unsigned header_present,
+                        unsigned simd_mode,
+                        unsigned return_format)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* Sampler messages are never sent with EOT (last argument is false). */
+   brw_set_message_descriptor(p, inst, BRW_SFID_SAMPLER, msg_length,
+			      response_length, header_present, false);
+
+   brw_inst_set_binding_table_index(devinfo, inst, binding_table_index);
+   brw_inst_set_sampler(devinfo, inst, sampler);
+   brw_inst_set_sampler_msg_type(devinfo, inst, msg_type);
+   if (devinfo->gen >= 5) {
+      brw_inst_set_sampler_simd_mode(devinfo, inst, simd_mode);
+   } else if (devinfo->gen == 4 && !devinfo->is_g4x) {
+      brw_inst_set_sampler_return_format(devinfo, inst, return_format);
+   } /* gen4 with is_g4x set gets neither simd_mode nor return_format */
+}
+
+static void
+gen7_set_dp_scratch_message(struct brw_codegen *p,
+                            brw_inst *inst,
+                            bool write,
+                            bool dword,
+                            bool invalidate_after_read,
+                            unsigned num_regs,
+                            unsigned addr_offset,
+                            unsigned mlen,
+                            unsigned rlen,
+                            bool header_present)
+{  /* Encode a scratch block read/write message on the gen7 data-cache SFID. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   assert(num_regs == 1 || num_regs == 2 || num_regs == 4 ||
+          (devinfo->gen >= 8 && num_regs == 8));  /* 8 registers only on gen8+ */
+   const unsigned block_size = (devinfo->gen >= 8 ? _mesa_logbase2(num_regs) :
+                                num_regs - 1);  /* field encoding differs: gen8+ vs earlier */
+
+   brw_set_message_descriptor(p, inst, GEN7_SFID_DATAPORT_DATA_CACHE,
+                              mlen, rlen, header_present, false);
+   brw_inst_set_dp_category(devinfo, inst, 1); /* Scratch Block Read/Write msgs */
+   brw_inst_set_scratch_read_write(devinfo, inst, write);
+   brw_inst_set_scratch_type(devinfo, inst, dword);
+   brw_inst_set_scratch_invalidate_after_read(devinfo, inst, invalidate_after_read);
+   brw_inst_set_scratch_block_size(devinfo, inst, block_size);
+   brw_inst_set_scratch_addr_offset(devinfo, inst, addr_offset);
+}
+
+#define next_insn brw_next_insn  /* short alias used by the emitters in this file */
+brw_inst *
+brw_next_insn(struct brw_codegen *p, unsigned opcode)
+{  /* Append one instruction to p->store, copied from *p->current, with opcode set. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn;
+
+   if (p->nr_insn + 1 > p->store_size) {  /* store is full: double its capacity */
+      p->store_size <<= 1;
+      p->store = reralloc(p->mem_ctx, p->store, brw_inst, p->store_size);  /* p->store may move; older brw_inst pointers go stale */
+   }
+
+   p->next_insn_offset += 16;  /* presumably 16 bytes per uncompacted instruction -- confirm */
+   insn = &p->store[p->nr_insn++];
+   memcpy(insn, p->current, sizeof(*insn));  /* start from the state held in *p->current */
+
+   brw_inst_set_opcode(devinfo, insn, opcode);
+   return insn;
+}
+
+static brw_inst *
+brw_alu1(struct brw_codegen *p, unsigned opcode,
+         struct brw_reg dest, struct brw_reg src)
+{  /* Emit a one-source instruction: set opcode, dest and src0; return it. */
+   brw_inst *insn = next_insn(p, opcode);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src);
+   return insn;
+}
+
+static brw_inst *
+brw_alu2(struct brw_codegen *p, unsigned opcode,
+         struct brw_reg dest, struct brw_reg src0, struct brw_reg src1)
+{  /* Emit a two-source instruction: set opcode, dest, src0 and src1; return it. */
+   /* 64-bit immediates are only supported on 1-src instructions */
+   assert(src0.file != BRW_IMMEDIATE_VALUE || type_sz(src0.type) <= 4);
+   assert(src1.file != BRW_IMMEDIATE_VALUE || type_sz(src1.type) <= 4);
+
+   brw_inst *insn = next_insn(p, opcode);
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, src1);
+   return insn;
+}
+
+static int
+get_3src_subreg_nr(struct brw_reg reg)
+{  /* Convert reg.subnr to the sub-register unit used by 3-src source fields. */
+   /* Normally, SubRegNum is in bytes (0..31).  However, 3-src instructions
+    * use 32-bit units (components 0..7).  Since they only support F/D/UD
+    * types, this doesn't lose any flexibility, but uses fewer bits.
+    */
+   return reg.subnr / 4;  /* bytes -> 32-bit units; any remainder is dropped */
+}
+
+static brw_inst *
+brw_alu3(struct brw_codegen *p, unsigned opcode, struct brw_reg dest,
+         struct brw_reg src0, struct brw_reg src1, struct brw_reg src2)
+{  /* Emit a three-source instruction through the dedicated 3src encoding fields. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *inst = next_insn(p, opcode);
+
+   gen7_convert_mrf_to_grf(p, &dest);  /* helper defined elsewhere; may rewrite dest */
+
+   assert(brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16);  /* caller must have Align16 as the default */
+
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+	  dest.file == BRW_MESSAGE_REGISTER_FILE);
+   assert(dest.nr < 128);
+   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
+   assert(dest.type == BRW_REGISTER_TYPE_F  ||
+          dest.type == BRW_REGISTER_TYPE_DF ||
+          dest.type == BRW_REGISTER_TYPE_D  ||
+          dest.type == BRW_REGISTER_TYPE_UD);
+   if (devinfo->gen == 6) {  /* only gen6 records whether dest is an MRF */
+      brw_inst_set_3src_dst_reg_file(devinfo, inst,
+                                     dest.file == BRW_MESSAGE_REGISTER_FILE);
+   }
+   brw_inst_set_3src_dst_reg_nr(devinfo, inst, dest.nr);
+   brw_inst_set_3src_dst_subreg_nr(devinfo, inst, dest.subnr / 16);  /* NOTE(review): dst divides by 16, sources by 4 -- confirm units */
+   brw_inst_set_3src_dst_writemask(devinfo, inst, dest.writemask);
+
+   assert(src0.file == BRW_GENERAL_REGISTER_FILE);  /* all three sources: direct GRF, nr < 128 */
+   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
+   assert(src0.nr < 128);
+   brw_inst_set_3src_src0_swizzle(devinfo, inst, src0.swizzle);
+   brw_inst_set_3src_src0_subreg_nr(devinfo, inst, get_3src_subreg_nr(src0));
+   brw_inst_set_3src_src0_reg_nr(devinfo, inst, src0.nr);
+   brw_inst_set_3src_src0_abs(devinfo, inst, src0.abs);
+   brw_inst_set_3src_src0_negate(devinfo, inst, src0.negate);
+   brw_inst_set_3src_src0_rep_ctrl(devinfo, inst,
+                                   src0.vstride == BRW_VERTICAL_STRIDE_0);  /* rep_ctrl is set for vertical stride 0 */
+
+   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
+   assert(src1.nr < 128);
+   brw_inst_set_3src_src1_swizzle(devinfo, inst, src1.swizzle);
+   brw_inst_set_3src_src1_subreg_nr(devinfo, inst, get_3src_subreg_nr(src1));
+   brw_inst_set_3src_src1_reg_nr(devinfo, inst, src1.nr);
+   brw_inst_set_3src_src1_abs(devinfo, inst, src1.abs);
+   brw_inst_set_3src_src1_negate(devinfo, inst, src1.negate);
+   brw_inst_set_3src_src1_rep_ctrl(devinfo, inst,
+                                   src1.vstride == BRW_VERTICAL_STRIDE_0);
+
+   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
+   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
+   assert(src2.nr < 128);
+   brw_inst_set_3src_src2_swizzle(devinfo, inst, src2.swizzle);
+   brw_inst_set_3src_src2_subreg_nr(devinfo, inst, get_3src_subreg_nr(src2));
+   brw_inst_set_3src_src2_reg_nr(devinfo, inst, src2.nr);
+   brw_inst_set_3src_src2_abs(devinfo, inst, src2.abs);
+   brw_inst_set_3src_src2_negate(devinfo, inst, src2.negate);
+   brw_inst_set_3src_src2_rep_ctrl(devinfo, inst,
+                                   src2.vstride == BRW_VERTICAL_STRIDE_0);
+
+   if (devinfo->gen >= 7) {  /* no type fields are written for gen6 */
+      /* Set both the source and destination types based on dest.type,
+       * ignoring the source register types.  The MAD and LRP emitters ensure
+       * that all four types are float.  The BFE and BFI2 emitters, however,
+       * may send us mixed D and UD types and want us to ignore that and use
+       * the destination type.
+       */
+      switch (dest.type) {
+      case BRW_REGISTER_TYPE_F:
+         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_F);
+         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_F);
+         break;
+      case BRW_REGISTER_TYPE_DF:
+         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_DF);
+         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_DF);
+         break;
+      case BRW_REGISTER_TYPE_D:
+         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_D);
+         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_D);
+         break;
+      case BRW_REGISTER_TYPE_UD:
+         brw_inst_set_3src_src_type(devinfo, inst, BRW_3SRC_TYPE_UD);
+         brw_inst_set_3src_dst_type(devinfo, inst, BRW_3SRC_TYPE_UD);
+         break;
+      default:
+         unreachable("not reached");  /* excluded by the dest.type assert above */
+      }
+   }
+
+   return inst;
+}
+
+
+/***********************************************************************
+ * Convenience routines.
+ */
+#define ALU1(OP)  /* defines brw_<OP>(p, dest, src0) as a call to brw_alu1() */	\
+brw_inst *brw_##OP(struct brw_codegen *p,		\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0)   			\
+{							\
+   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);    	\
+}
+
+#define ALU2(OP)  /* defines brw_<OP>(p, dest, src0, src1) as a call to brw_alu2() */	\
+brw_inst *brw_##OP(struct brw_codegen *p,		\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0,			\
+	      struct brw_reg src1)   			\
+{							\
+   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);	\
+}
+
+#define ALU3(OP)  /* defines brw_<OP>(p, dest, src0, src1, src2) as a call to brw_alu3() */	\
+brw_inst *brw_##OP(struct brw_codegen *p,		\
+	      struct brw_reg dest,			\
+	      struct brw_reg src0,			\
+	      struct brw_reg src1,			\
+	      struct brw_reg src2)   			\
+{							\
+   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);	\
+}
+
+#define ALU3F(OP)  /* like ALU3, but asserts all-F or all-DF operands */ \
+brw_inst *brw_##OP(struct brw_codegen *p,         \
+                                 struct brw_reg dest,           \
+                                 struct brw_reg src0,           \
+                                 struct brw_reg src1,           \
+                                 struct brw_reg src2)           \
+{                                                               \
+   assert(dest.type == BRW_REGISTER_TYPE_F ||                   \
+          dest.type == BRW_REGISTER_TYPE_DF);                   \
+   if (dest.type == BRW_REGISTER_TYPE_F) {                      \
+      assert(src0.type == BRW_REGISTER_TYPE_F);                 \
+      assert(src1.type == BRW_REGISTER_TYPE_F);                 \
+      assert(src2.type == BRW_REGISTER_TYPE_F);                 \
+   } else if (dest.type == BRW_REGISTER_TYPE_DF) {              \
+      assert(src0.type == BRW_REGISTER_TYPE_DF);                \
+      assert(src1.type == BRW_REGISTER_TYPE_DF);                \
+      assert(src2.type == BRW_REGISTER_TYPE_DF);                \
+   }                                                            \
+   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
+}
+
+/* Rounding operations (other than RNDD) require two instructions - the first
+ * stores a rounded value (possibly the wrong way) in the dest register, but
+ * also sets a per-channel "increment bit" in the flag register.  A predicated
+ * add of 1.0 fixes dest to contain the desired result.
+ * ROUND(OP) defines brw_<OP>(); it emits that ADD only when gen < 6, since
+ * Sandybridge and later appear to round correctly without an ADD.
+ */
+#define ROUND(OP)							      \
+void brw_##OP(struct brw_codegen *p,					      \
+	      struct brw_reg dest,					      \
+	      struct brw_reg src)					      \
+{									      \
+   const struct gen_device_info *devinfo = p->devinfo;					      \
+   brw_inst *rnd, *add;							      \
+   rnd = next_insn(p, BRW_OPCODE_##OP);					      \
+   brw_set_dest(p, rnd, dest);						      \
+   brw_set_src0(p, rnd, src);						      \
+									      \
+   if (devinfo->gen < 6) {							      \
+      /* turn on round-increments */					      \
+      brw_inst_set_cond_modifier(devinfo, rnd, BRW_CONDITIONAL_R);            \
+      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			      \
+      brw_inst_set_pred_control(devinfo, add, BRW_PREDICATE_NORMAL);          \
+   }									      \
+}
+
+
+ALU1(MOV)  /* each line in this list defines one public brw_<OP>() emitter */
+ALU2(SEL)
+ALU1(NOT)
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(SHR)
+ALU2(SHL)
+ALU1(DIM)
+ALU2(ASR)
+ALU1(FRC)
+ALU1(RNDD)
+ALU2(MAC)
+ALU2(MACH)
+ALU1(LZD)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(DP3)
+ALU2(DP2)
+ALU3F(MAD)  /* ALU3F: asserts operands are all F or all DF */
+ALU3F(LRP)
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+ALU2(ADDC)
+ALU2(SUBB)
+
+ROUND(RNDZ)  /* ROUND: returns void; emits a fix-up ADD when gen < 6 */
+ROUND(RNDE)
+
+
+brw_inst *
+brw_ADD(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{  /* Emit ADD after asserting the source types are compatible. */
+   /* 6.2.2: add -- an F (or immediate VF) source may not pair with D/UD */
+   if (src0.type == BRW_REGISTER_TYPE_F ||
+       (src0.file == BRW_IMMEDIATE_VALUE &&
+	src0.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src1.type != BRW_REGISTER_TYPE_UD);
+      assert(src1.type != BRW_REGISTER_TYPE_D);
+   }
+
+   if (src1.type == BRW_REGISTER_TYPE_F ||  /* same rule with the operands swapped */
+       (src1.file == BRW_IMMEDIATE_VALUE &&
+	src1.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src0.type != BRW_REGISTER_TYPE_UD);
+      assert(src0.type != BRW_REGISTER_TYPE_D);
+   }
+
+   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
+}
+
+brw_inst *
+brw_AVG(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{  /* Emit AVG; dest, src0 and src1 must all have the same type. */
+   assert(dest.type == src0.type);
+   assert(src0.type == src1.type);
+   switch (src0.type) {  /* only B/UB/W/UW/D/UD are accepted */
+   case BRW_REGISTER_TYPE_B:
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+      break;
+   default:
+      unreachable("Bad type for brw_AVG");
+   }
+
+   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
+}
+
+brw_inst *
+brw_MUL(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{  /* Emit MUL after asserting operand type and register-file restrictions. */
+   /* 6.32.38: mul -- a D/UD source rules out an F destination */
+   if (src0.type == BRW_REGISTER_TYPE_D ||
+       src0.type == BRW_REGISTER_TYPE_UD ||
+       src1.type == BRW_REGISTER_TYPE_D ||
+       src1.type == BRW_REGISTER_TYPE_UD) {
+      assert(dest.type != BRW_REGISTER_TYPE_F);
+   }
+
+   if (src0.type == BRW_REGISTER_TYPE_F ||  /* an F (or immediate VF) source may not pair with D/UD */
+       (src0.file == BRW_IMMEDIATE_VALUE &&
+	src0.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src1.type != BRW_REGISTER_TYPE_UD);
+      assert(src1.type != BRW_REGISTER_TYPE_D);
+   }
+
+   if (src1.type == BRW_REGISTER_TYPE_F ||  /* same rule with the operands swapped */
+       (src1.file == BRW_IMMEDIATE_VALUE &&
+	src1.type == BRW_REGISTER_TYPE_VF)) {
+      assert(src0.type != BRW_REGISTER_TYPE_UD);
+      assert(src0.type != BRW_REGISTER_TYPE_D);
+   }
+
+   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+	  src0.nr != BRW_ARF_ACCUMULATOR);  /* neither source may be the accumulator */
+   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
+	  src1.nr != BRW_ARF_ACCUMULATOR);
+
+   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
+}
+
+brw_inst *
+brw_LINE(struct brw_codegen *p, struct brw_reg dest,
+         struct brw_reg src0, struct brw_reg src1)
+{  /* Emit LINE; src0's region is overridden, src1 is passed through as given. */
+   src0.vstride = BRW_VERTICAL_STRIDE_0;  /* src0 is a by-value copy: caller's reg is untouched */
+   src0.width = BRW_WIDTH_1;
+   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+   return brw_alu2(p, BRW_OPCODE_LINE, dest, src0, src1);
+}
+
+brw_inst *
+brw_PLN(struct brw_codegen *p, struct brw_reg dest,
+        struct brw_reg src0, struct brw_reg src1)
+{  /* Emit PLN; the regions of both sources are overridden before emission. */
+   src0.vstride = BRW_VERTICAL_STRIDE_0;  /* src0: vstride 0, width 1, hstride 0 */
+   src0.width = BRW_WIDTH_1;
+   src0.hstride = BRW_HORIZONTAL_STRIDE_0;
+   src1.vstride = BRW_VERTICAL_STRIDE_8;  /* src1: vstride 8, width 8, hstride 1 */
+   src1.width = BRW_WIDTH_8;
+   src1.hstride = BRW_HORIZONTAL_STRIDE_1;
+   return brw_alu2(p, BRW_OPCODE_PLN, dest, src0, src1);
+}
+
+brw_inst *
+brw_F32TO16(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
+{  /* Emit a float32 -> float16 conversion; returns the last instruction emitted. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
+   /* The F32TO16 instruction doesn't support 32-bit destination types in
+    * Align1 mode, and neither does the Gen8 implementation in terms of a
+    * converting MOV.  Gen7 does zero out the high 16 bits in Align16 mode as
+    * an undocumented feature.
+    */
+   const bool needs_zero_fill = (dst.type == BRW_REGISTER_TYPE_UD &&
+                                 (!align16 || devinfo->gen >= 8));
+   brw_inst *inst;
+
+   if (align16) {
+      assert(dst.type == BRW_REGISTER_TYPE_UD);
+   } else {
+      assert(dst.type == BRW_REGISTER_TYPE_UD ||
+             dst.type == BRW_REGISTER_TYPE_W ||
+             dst.type == BRW_REGISTER_TYPE_UW ||
+             dst.type == BRW_REGISTER_TYPE_HF);
+   }
+
+   brw_push_insn_state(p);  /* the access-mode change below is undone by the pop at the end */
+
+   if (needs_zero_fill) {
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      dst = spread(retype(dst, BRW_REGISTER_TYPE_W), 2);  /* presumably every other word of dst -- confirm */
+   }
+
+   if (devinfo->gen >= 8) {  /* gen8+: the conversion is a MOV to an HF-typed dst */
+      inst = brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_HF), src);
+   } else {
+      assert(devinfo->gen == 7);
+      inst = brw_alu1(p, BRW_OPCODE_F32TO16, dst, src);
+   }
+
+   if (needs_zero_fill) {
+      brw_inst_set_no_dd_clear(devinfo, inst, true);
+      inst = brw_MOV(p, suboffset(dst, 1), brw_imm_w(0));  /* write 0 to the words the conversion skipped */
+      brw_inst_set_no_dd_check(devinfo, inst, true);
+   }
+
+   brw_pop_insn_state(p);
+   return inst;
+}
+
+brw_inst *
+brw_F16TO32(struct brw_codegen *p, struct brw_reg dst, struct brw_reg src)
+{  /* Emit a float16 -> float32 conversion of src into dst; returns the instruction. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   bool align16 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16;
+
+   if (align16) {
+      assert(src.type == BRW_REGISTER_TYPE_UD);
+   } else {
+      /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+       *
+       *   Because this instruction does not have a 16-bit floating-point
+       *   type, the source data type must be Word (W). The destination type
+       *   must be F (Float).
+       */
+      if (src.type == BRW_REGISTER_TYPE_UD)  /* a UD source is re-viewed as W with spread(..., 2) */
+         src = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
+
+      assert(src.type == BRW_REGISTER_TYPE_W ||
+             src.type == BRW_REGISTER_TYPE_UW ||
+             src.type == BRW_REGISTER_TYPE_HF);
+   }
+
+   if (devinfo->gen >= 8) {  /* gen8+: the conversion is a MOV from an HF-typed src */
+      return brw_MOV(p, dst, retype(src, BRW_REGISTER_TYPE_HF));
+   } else {
+      assert(devinfo->gen == 7);
+      return brw_alu1(p, BRW_OPCODE_F16TO32, dst, src);
+   }
+}
+
+
+void brw_NOP(struct brw_codegen *p)
+{  /* Emit a NOP whose bits are all zero apart from the opcode. */
+   brw_inst *insn = next_insn(p, BRW_OPCODE_NOP);
+   memset(insn, 0, sizeof(*insn));  /* discard the state next_insn() copied from p->current */
+   brw_inst_set_opcode(p->devinfo, insn, BRW_OPCODE_NOP);  /* the memset also cleared the opcode */
+}
+
+
+
+
+
+/***********************************************************************
+ * Comparisons, if/else/endif
+ */
+
+brw_inst *
+brw_JMPI(struct brw_codegen *p, struct brw_reg index,
+         unsigned predicate_control)
+{  /* Emit JMPI with IP as dest and src0, index as src1, and the given predicate. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   struct brw_reg ip = brw_ip_reg();
+   brw_inst *inst = brw_alu2(p, BRW_OPCODE_JMPI, ip, ip, index);
+
+   brw_inst_set_exec_size(devinfo, inst, BRW_EXECUTE_2);  /* these four fields override the defaults */
+   brw_inst_set_qtr_control(devinfo, inst, BRW_COMPRESSION_NONE);
+   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
+   brw_inst_set_pred_control(devinfo, inst, predicate_control);
+
+   return inst;
+}
+
+static void
+push_if_stack(struct brw_codegen *p, brw_inst *inst)
+{  /* Push inst onto p->if_stack, growing the array when it becomes full. */
+   p->if_stack[p->if_stack_depth] = inst - p->store;  /* stored as an index: p->store can be reallocated */
+
+   p->if_stack_depth++;
+   if (p->if_stack_array_size <= p->if_stack_depth) {  /* keep a free slot for the next push */
+      p->if_stack_array_size *= 2;
+      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
+			     p->if_stack_array_size);
+   }
+}
+
+static brw_inst *
+pop_if_stack(struct brw_codegen *p)
+{  /* Pop the top p->if_stack entry and return it as a pointer into p->store. */
+   p->if_stack_depth--;  /* NOTE(review): no underflow check; callers must balance pushes */
+   return &p->store[p->if_stack[p->if_stack_depth]];
+}
+
+static void
+push_loop_stack(struct brw_codegen *p, brw_inst *inst)
+{  /* Push inst onto p->loop_stack and reset the IF depth for the new loop level. */
+   if (p->loop_stack_array_size <= (p->loop_stack_depth + 1)) {  /* index depth + 1 is written below */
+      p->loop_stack_array_size *= 2;
+      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
+			       p->loop_stack_array_size);
+      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
+				     p->loop_stack_array_size);  /* both arrays share one size */
+   }
+
+   p->loop_stack[p->loop_stack_depth] = inst - p->store;  /* stored as an index into p->store */
+   p->loop_stack_depth++;
+   p->if_depth_in_loop[p->loop_stack_depth] = 0;  /* indexed by the already-incremented depth */
+}
+
+static brw_inst *
+get_inner_do_insn(struct brw_codegen *p)
+{  /* Return the instruction at the top of p->loop_stack without popping it. */
+   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
+}
+
+/* EU takes the value from the flag register and pushes it onto some
+ * sort of a stack (presumably merging with any flag value already on
+ * the stack).  Within an if block, the flags at the top of the stack
+ * control execution on each channel of the unit, eg. on each of the
+ * 16 pixel values in our wm programs.
+ *
+ * When the matching 'else' instruction is reached (presumably by
+ * countdown of the instruction count patched in by our ELSE/ENDIF
+ * functions), the relevant flags are inverted.
+ *
+ * When the matching 'endif' instruction is reached, the flags are
+ * popped off.  If the stack is now empty, normal execution resumes.
+ */
+brw_inst *
+brw_IF(struct brw_codegen *p, unsigned execute_size)
+{  /* Emit a predicated IF with zeroed jump fields; brw_ENDIF() patches or converts it. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn;
+
+   insn = next_insn(p, BRW_OPCODE_IF);
+
+   /* Override the defaults for this instruction:
+    */
+   if (devinfo->gen < 6) {
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else if (devinfo->gen == 6) {
+      brw_set_dest(p, insn, brw_imm_w(0));
+      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
+      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
+      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
+   } else if (devinfo->gen == 7) {
+      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
+      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
+      brw_set_src1(p, insn, brw_imm_w(0));
+      brw_inst_set_jip(devinfo, insn, 0);
+      brw_inst_set_uip(devinfo, insn, 0);
+   } else {  /* gen8+ */
+      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
+      brw_set_src0(p, insn, brw_imm_d(0));
+      brw_inst_set_jip(devinfo, insn, 0);
+      brw_inst_set_uip(devinfo, insn, 0);
+   }
+
+   brw_inst_set_exec_size(devinfo, insn, execute_size);
+   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
+   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NORMAL);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
+   if (!p->single_program_flow && devinfo->gen < 6)
+      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
+
+   push_if_stack(p, insn);  /* popped again by brw_ENDIF() */
+   p->if_depth_in_loop[p->loop_stack_depth]++;  /* supplies the pre-gen6 pop count in brw_BREAK() */
+   return insn;
+}
+
+/* This function is only used for gen6-style IF instructions with an
+ * embedded comparison (conditional modifier).  It is not used on gen7.
+ * NOTE(review): unlike brw_IF(), if_depth_in_loop is not incremented. */
+brw_inst *
+gen6_IF(struct brw_codegen *p, enum brw_conditional_mod conditional,
+	struct brw_reg src0, struct brw_reg src1)
+{  /* Emit an IF that compares src0 with src1 using the given conditional modifier. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn;
+
+   insn = next_insn(p, BRW_OPCODE_IF);
+
+   brw_set_dest(p, insn, brw_imm_w(0));
+   brw_inst_set_exec_size(devinfo, insn,
+                          brw_inst_exec_size(devinfo, p->current));  /* exec size comes from p->current */
+   brw_inst_set_gen6_jump_count(devinfo, insn, 0);  /* placeholder; patch_IF_ELSE() writes the real count */
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, src1);
+
+   assert(brw_inst_qtr_control(devinfo, insn) == BRW_COMPRESSION_NONE);  /* defaults must be uncompressed, unpredicated */
+   assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
+   brw_inst_set_cond_modifier(devinfo, insn, conditional);
+
+   push_if_stack(p, insn);  /* popped again by brw_ENDIF() */
+   return insn;
+}
+
+/**
+ * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
+ * else_inst may be NULL when the IF has no ELSE block. */
+static void
+convert_IF_ELSE_to_ADD(struct brw_codegen *p,
+                       brw_inst *if_inst, brw_inst *else_inst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* The next instruction (where the ENDIF would be, if it existed) */
+   brw_inst *next_inst = &p->store[p->nr_insn];
+
+   assert(p->single_program_flow);
+   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
+   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
+   assert(brw_inst_exec_size(devinfo, if_inst) == BRW_EXECUTE_1);
+
+   /* Convert IF to an ADD instruction that moves the instruction pointer
+    * to the first instruction of the ELSE block.  If there is no ELSE
+    * block, point to where ENDIF would be.  Reverse the predicate.
+    *
+    * There's no need to execute an ENDIF since we don't need to do any
+    * stack operations, and if we're currently executing, we just want to
+    * continue normally.
+    */
+   brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_ADD);
+   brw_inst_set_pred_inv(devinfo, if_inst, true);
+
+   if (else_inst != NULL) {
+      /* Convert ELSE to an ADD instruction that points where the ENDIF
+       * would be.
+       */
+      brw_inst_set_opcode(devinfo, else_inst, BRW_OPCODE_ADD);
+
+      brw_inst_set_imm_ud(devinfo, if_inst, (else_inst - if_inst + 1) * 16);  /* + 1: land just past the ELSE */
+      brw_inst_set_imm_ud(devinfo, else_inst, (next_inst - else_inst) * 16);  /* 16 per instruction, presumably bytes */
+   } else {
+      brw_inst_set_imm_ud(devinfo, if_inst, (next_inst - if_inst) * 16);
+   }
+}
+
+/**
+ * Patch IF and ELSE instructions with appropriate jump targets.
+ * else_inst may be NULL; ENDIF and ELSE also take the IF's exec size. */
+static void
+patch_IF_ELSE(struct brw_codegen *p,
+              brw_inst *if_inst, brw_inst *else_inst, brw_inst *endif_inst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* We shouldn't be patching IF and ELSE instructions in single program flow
+    * mode when gen < 6, because in single program flow mode on those
+    * platforms, we convert flow control instructions to conditional ADDs that
+    * operate on IP (see brw_ENDIF).
+    *
+    * However, on Gen6, writing to IP doesn't work in single program flow mode
+    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
+    * not be updated by non-flow control instructions.").  And on later
+    * platforms, there is no significant benefit to converting control flow
+    * instructions to conditional ADDs.  So we do patch IF and ELSE
+    * instructions in single program flow mode on those platforms.
+    */
+   if (devinfo->gen < 6)
+      assert(!p->single_program_flow);
+
+   assert(if_inst != NULL && brw_inst_opcode(devinfo, if_inst) == BRW_OPCODE_IF);
+   assert(endif_inst != NULL);
+   assert(else_inst == NULL || brw_inst_opcode(devinfo, else_inst) == BRW_OPCODE_ELSE);
+
+   unsigned br = brw_jump_scale(devinfo);  /* multiplier applied to every instruction distance below */
+
+   assert(brw_inst_opcode(devinfo, endif_inst) == BRW_OPCODE_ENDIF);
+   brw_inst_set_exec_size(devinfo, endif_inst, brw_inst_exec_size(devinfo, if_inst));
+
+   if (else_inst == NULL) {
+      /* Patch IF -> ENDIF */
+      if (devinfo->gen < 6) {
+	 /* Turn it into an IFF, which means no mask stack operations for
+	  * all-false and jumping past the ENDIF.
+	  */
+         brw_inst_set_opcode(devinfo, if_inst, BRW_OPCODE_IFF);
+         brw_inst_set_gen4_jump_count(devinfo, if_inst,
+                                      br * (endif_inst - if_inst + 1));
+         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
+      } else if (devinfo->gen == 6) {
+	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
+         brw_inst_set_gen6_jump_count(devinfo, if_inst, br*(endif_inst - if_inst));
+      } else {  /* gen7+: UIP and JIP both point at the ENDIF */
+         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
+         brw_inst_set_jip(devinfo, if_inst, br * (endif_inst - if_inst));
+      }
+   } else {
+      brw_inst_set_exec_size(devinfo, else_inst, brw_inst_exec_size(devinfo, if_inst));
+
+      /* Patch IF -> ELSE */
+      if (devinfo->gen < 6) {
+         brw_inst_set_gen4_jump_count(devinfo, if_inst,
+                                      br * (else_inst - if_inst));
+         brw_inst_set_gen4_pop_count(devinfo, if_inst, 0);
+      } else if (devinfo->gen == 6) {
+         brw_inst_set_gen6_jump_count(devinfo, if_inst,
+                                      br * (else_inst - if_inst + 1));
+      }  /* gen7+: the IF is patched in the ELSE -> ENDIF step below */
+
+      /* Patch ELSE -> ENDIF */
+      if (devinfo->gen < 6) {
+	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
+	  * matching ENDIF.
+	  */
+         brw_inst_set_gen4_jump_count(devinfo, else_inst,
+                                      br * (endif_inst - else_inst + 1));
+         brw_inst_set_gen4_pop_count(devinfo, else_inst, 1);
+      } else if (devinfo->gen == 6) {
+	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
+         brw_inst_set_gen6_jump_count(devinfo, else_inst,
+                                      br * (endif_inst - else_inst));
+      } else {
+	 /* The IF instruction's JIP should point just past the ELSE */
+         brw_inst_set_jip(devinfo, if_inst, br * (else_inst - if_inst + 1));
+	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
+         brw_inst_set_uip(devinfo, if_inst, br * (endif_inst - if_inst));
+         brw_inst_set_jip(devinfo, else_inst, br * (endif_inst - else_inst));
+         if (devinfo->gen >= 8) {
+            /* Since we don't set branch_ctrl, the ELSE's JIP and UIP both
+             * should point to ENDIF.
+             */
+            brw_inst_set_uip(devinfo, else_inst, br * (endif_inst - else_inst));
+         }
+      }
+   }
+}
+
+void
+brw_ELSE(struct brw_codegen *p)
+{  /* Emit an ELSE with zeroed jump fields and push it for brw_ENDIF() to resolve. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn;
+
+   insn = next_insn(p, BRW_OPCODE_ELSE);
+
+   if (devinfo->gen < 6) {  /* operand layout differs per generation */
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else if (devinfo->gen == 6) {
+      brw_set_dest(p, insn, brw_imm_w(0));
+      brw_inst_set_gen6_jump_count(devinfo, insn, 0);
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else if (devinfo->gen == 7) {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_w(0));
+      brw_inst_set_jip(devinfo, insn, 0);
+      brw_inst_set_uip(devinfo, insn, 0);
+   } else {  /* gen8+ */
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, brw_imm_d(0));
+      brw_inst_set_jip(devinfo, insn, 0);
+      brw_inst_set_uip(devinfo, insn, 0);
+   }
+
+   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
+   if (!p->single_program_flow && devinfo->gen < 6)
+      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
+
+   push_if_stack(p, insn);  /* brw_ENDIF() pops this ELSE first, then the matching IF */
+}
+
+void
+brw_ENDIF(struct brw_codegen *p)
+{  /* Close the innermost IF/ELSE: emit and patch an ENDIF, or convert to ADDs. */
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn = NULL;
+   brw_inst *else_inst = NULL;
+   brw_inst *if_inst = NULL;
+   brw_inst *tmp;
+   bool emit_endif = true;
+
+   /* In single program flow mode, we can express IF and ELSE instructions
+    * equivalently as ADD instructions that operate on IP.  On platforms prior
+    * to Gen6, flow control instructions cause an implied thread switch, so
+    * this is a significant savings.
+    *
+    * However, on Gen6, writing to IP doesn't work in single program flow mode
+    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
+    * not be updated by non-flow control instructions.").  And on later
+    * platforms, there is no significant benefit to converting control flow
+    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
+    * Gen5.
+    */
+   if (devinfo->gen < 6 && p->single_program_flow)
+      emit_endif = false;
+
+   /*
+    * A single next_insn() may change the base address of instruction store
+    * memory (p->store), so call it first before referencing the instruction
+    * store pointer from an index
+    */
+   if (emit_endif)
+      insn = next_insn(p, BRW_OPCODE_ENDIF);
+
+   /* Pop the IF and (optional) ELSE instructions from the stack */
+   p->if_depth_in_loop[p->loop_stack_depth]--;
+   tmp = pop_if_stack(p);
+   if (brw_inst_opcode(devinfo, tmp) == BRW_OPCODE_ELSE) {
+      else_inst = tmp;
+      tmp = pop_if_stack(p);
+   }
+   if_inst = tmp;
+
+   if (!emit_endif) {
+      /* ENDIF is useless; don't bother emitting it. */
+      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
+      return;
+   }
+
+   if (devinfo->gen < 6) {  /* operand layout differs per generation */
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_d(0x0));
+   } else if (devinfo->gen == 6) {
+      brw_set_dest(p, insn, brw_imm_w(0));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   } else if (devinfo->gen == 7) {
+      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      brw_set_src1(p, insn, brw_imm_w(0));
+   } else {  /* gen8+: only src0 is written here */
+      brw_set_src0(p, insn, brw_imm_d(0));
+   }
+
+   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
+   brw_inst_set_mask_control(devinfo, insn, BRW_MASK_ENABLE);
+   if (devinfo->gen < 6)
+      brw_inst_set_thread_control(devinfo, insn, BRW_THREAD_SWITCH);
+
+   /* Also pop item off the stack in the endif instruction: */
+   if (devinfo->gen < 6) {
+      brw_inst_set_gen4_jump_count(devinfo, insn, 0);
+      brw_inst_set_gen4_pop_count(devinfo, insn, 1);
+   } else if (devinfo->gen == 6) {
+      brw_inst_set_gen6_jump_count(devinfo, insn, 2);
+   } else {
+      brw_inst_set_jip(devinfo, insn, 2);
+   }
+   patch_IF_ELSE(p, if_inst, else_inst, insn);  /* fill in the IF/ELSE jump targets */
+}
+
+/**
+ * Emit a BREAK instruction.
+ *
+ * The operand layout depends on the hardware generation.  Before Gen6 the
+ * instruction operates on the IP register and carries a pop count taken
+ * from the IF nesting depth of the current loop.  The immediate is emitted
+ * as zero here.
+ *
+ * \return the newly emitted instruction.
+ */
+brw_inst *
+brw_BREAK(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const brk = next_insn(p, BRW_OPCODE_BREAK);
+
+   if (devinfo->gen < 6) {
+      brw_set_dest(p, brk, brw_ip_reg());
+      brw_set_src0(p, brk, brw_ip_reg());
+      brw_set_src1(p, brk, brw_imm_d(0x0));
+      brw_inst_set_gen4_pop_count(devinfo, brk,
+                                  p->if_depth_in_loop[p->loop_stack_depth]);
+   } else {
+      brw_set_dest(p, brk, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+
+      if (devinfo->gen >= 8) {
+         /* Gen8+: the immediate lives in src0. */
+         brw_set_src0(p, brk, brw_imm_d(0x0));
+      } else {
+         /* Gen6/7: null src0, immediate in src1. */
+         brw_set_src0(p, brk, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+         brw_set_src1(p, brk, brw_imm_d(0x0));
+      }
+   }
+
+   /* Never compressed; execution size follows the current default state. */
+   brw_inst_set_qtr_control(devinfo, brk, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, brk,
+                          brw_inst_exec_size(devinfo, p->current));
+
+   return brk;
+}
+
+/**
+ * Emit a CONTINUE instruction.
+ *
+ * The destination is always the IP register.  Gen8+ takes a single
+ * immediate source; earlier generations use IP as src0 and an immediate as
+ * src1.  Before Gen6 the instruction also carries a pop count taken from
+ * the IF nesting depth of the current loop.
+ *
+ * \return the newly emitted instruction.
+ */
+brw_inst *
+brw_CONT(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const cont = next_insn(p, BRW_OPCODE_CONTINUE);
+
+   brw_set_dest(p, cont, brw_ip_reg());
+
+   if (devinfo->gen >= 8) {
+      brw_set_src0(p, cont, brw_imm_d(0x0));
+   } else {
+      brw_set_src0(p, cont, brw_ip_reg());
+      brw_set_src1(p, cont, brw_imm_d(0x0));
+
+      if (devinfo->gen < 6) {
+         brw_inst_set_gen4_pop_count(devinfo, cont,
+                                     p->if_depth_in_loop[p->loop_stack_depth]);
+      }
+   }
+
+   /* Never compressed; execution size follows the current default state. */
+   brw_inst_set_qtr_control(devinfo, cont, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, cont,
+                          brw_inst_exec_size(devinfo, p->current));
+
+   return cont;
+}
+
+/**
+ * Emit a HALT instruction with a null destination.
+ *
+ * The jump targets are emitted as zero; UIP and JIP are updated later.
+ *
+ * \return the newly emitted instruction.
+ */
+brw_inst *
+gen6_HALT(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const halt = next_insn(p, BRW_OPCODE_HALT);
+
+   brw_set_dest(p, halt, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+
+   if (devinfo->gen < 8) {
+      brw_set_src0(p, halt, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      /* UIP and JIP, updated later. */
+      brw_set_src1(p, halt, brw_imm_d(0x0));
+   } else {
+      brw_set_src0(p, halt, brw_imm_d(0x0));
+   }
+
+   /* Never compressed; execution size follows the current default state. */
+   brw_inst_set_qtr_control(devinfo, halt, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, halt,
+                          brw_inst_exec_size(devinfo, p->current));
+
+   return halt;
+}
+
+/* DO/WHILE loop:
+ *
+ * A DO/WHILE pair forms an unterminated loop; BREAK and CONTINUE provide
+ * the control flow inside it.  How the loop is emitted depends on the
+ * platform and flow mode:
+ *
+ *  - Uniform control flow: WHILE is a plain jump (ADD ip, ip, jip) and no
+ *    DO instruction is emitted.
+ *
+ *  - Non-uniform control flow before gen6: a DO instruction pushes the
+ *    mask, WHILE jumps back, and BREAK leaves the loop and pops the mask.
+ *
+ *  - Gen6: there is no mask stack any more, so DO is not needed and WHILE
+ *    simply points back at the first instruction of the loop.
+ *
+ * In every case the loop start is pushed on the loop stack.  The return
+ * value is the DO instruction when one is emitted, otherwise the slot the
+ * next instruction will occupy.
+ */
+brw_inst *
+brw_DO(struct brw_codegen *p, unsigned execute_size)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *do_inst;
+
+   /* No real DO instruction: just remember where the loop body begins. */
+   if (devinfo->gen >= 6 || p->single_program_flow) {
+      push_loop_stack(p, &p->store[p->nr_insn]);
+      return &p->store[p->nr_insn];
+   }
+
+   do_inst = next_insn(p, BRW_OPCODE_DO);
+   push_loop_stack(p, do_inst);
+
+   /* Replace the default operands and state for this instruction. */
+   brw_set_dest(p, do_inst, brw_null_reg());
+   brw_set_src0(p, do_inst, brw_null_reg());
+   brw_set_src1(p, do_inst, brw_null_reg());
+
+   brw_inst_set_qtr_control(devinfo, do_inst, BRW_COMPRESSION_NONE);
+   brw_inst_set_exec_size(devinfo, do_inst, execute_size);
+   brw_inst_set_pred_control(devinfo, do_inst, BRW_PREDICATE_NONE);
+
+   return do_inst;
+}
+
+/**
+ * Pre-gen6 only: point the BREAK/CONT instructions of the innermost loop at
+ * its WHILE instruction.
+ *
+ * Walks backwards from the instruction preceding \p while_inst to the
+ * innermost DO and fills in the jump count of every BREAK or CONTINUE whose
+ * jump count is still zero.
+ *
+ * Gen6+ is handled by brw_set_uip_jip() instead, which doesn't care so much
+ * about loop nesting, since it can always just point to the end of the
+ * block/current loop.
+ */
+static void
+brw_patch_break_cont(struct brw_codegen *p, brw_inst *while_inst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const loop_start = get_inner_do_insn(p);
+   const unsigned br = brw_jump_scale(devinfo);
+
+   assert(devinfo->gen < 6);
+
+   for (brw_inst *cur = while_inst - 1; cur != loop_start; cur--) {
+      const unsigned opcode = brw_inst_opcode(devinfo, cur);
+
+      if (opcode != BRW_OPCODE_BREAK && opcode != BRW_OPCODE_CONTINUE)
+         continue;
+
+      /* A non-zero jump count means the instruction was already patched as
+       * part of a loop nested inside the one being patched now.
+       */
+      if (brw_inst_gen4_jump_count(devinfo, cur) != 0)
+         continue;
+
+      if (opcode == BRW_OPCODE_BREAK) {
+         /* BREAK jumps one instruction further than CONTINUE does. */
+         brw_inst_set_gen4_jump_count(devinfo, cur,
+                                      br * ((while_inst - cur) + 1));
+      } else {
+         brw_inst_set_gen4_jump_count(devinfo, cur, br * (while_inst - cur));
+      }
+   }
+}
+
+/**
+ * Close the innermost loop opened with brw_DO().
+ *
+ * Three encodings are used:
+ *  - Gen6+: a WHILE whose jump offset points back at the loop start.
+ *  - Pre-gen6, single program flow: an ADD on the IP register.
+ *  - Pre-gen6 otherwise: a WHILE paired with a DO instruction, after which
+ *    the loop's BREAK/CONT instructions are patched.
+ *
+ * Pops one level off the loop stack and returns the emitted instruction.
+ */
+brw_inst *
+brw_WHILE(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned br = brw_jump_scale(devinfo);
+   const bool uniform_jump = devinfo->gen < 6 && p->single_program_flow;
+
+   /* Emit first, look up the loop start second: the lookup has to happen
+    * after next_insn() in every path.
+    */
+   brw_inst *const insn =
+      next_insn(p, uniform_jump ? BRW_OPCODE_ADD : BRW_OPCODE_WHILE);
+   brw_inst *const do_insn = get_inner_do_insn(p);
+
+   if (uniform_jump) {
+      /* The jump distance is expressed as (instruction delta) * 16. */
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
+      brw_inst_set_exec_size(devinfo, insn, BRW_EXECUTE_1);
+   } else if (devinfo->gen < 6) {
+      assert(brw_inst_opcode(devinfo, do_insn) == BRW_OPCODE_DO);
+
+      brw_set_dest(p, insn, brw_ip_reg());
+      brw_set_src0(p, insn, brw_ip_reg());
+      brw_set_src1(p, insn, brw_imm_d(0));
+
+      /* Reuse the execution size of the matching DO. */
+      brw_inst_set_exec_size(devinfo, insn,
+                             brw_inst_exec_size(devinfo, do_insn));
+      brw_inst_set_gen4_jump_count(devinfo, insn, br * (do_insn - insn + 1));
+      brw_inst_set_gen4_pop_count(devinfo, insn, 0);
+
+      brw_patch_break_cont(p, insn);
+   } else {
+      if (devinfo->gen >= 8) {
+         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+         brw_set_src0(p, insn, brw_imm_d(0));
+         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
+      } else if (devinfo->gen == 7) {
+         brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+         brw_set_src1(p, insn, brw_imm_w(0));
+         brw_inst_set_jip(devinfo, insn, br * (do_insn - insn));
+      } else {
+         /* Gen6: immediate destination, then the jump count. */
+         brw_set_dest(p, insn, brw_imm_w(0));
+         brw_inst_set_gen6_jump_count(devinfo, insn, br * (do_insn - insn));
+         brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+         brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      }
+
+      brw_inst_set_exec_size(devinfo, insn,
+                             brw_inst_exec_size(devinfo, p->current));
+   }
+
+   brw_inst_set_qtr_control(devinfo, insn, BRW_COMPRESSION_NONE);
+
+   /* The loop is finished: drop it from the loop stack. */
+   p->loop_stack_depth--;
+
+   return insn;
+}
+
+/* FORWARD JUMPS:
+ *
+ * Patch the JMPI stored at index jmp_insn_idx so that its jump count covers
+ * every instruction emitted after it so far.  The count is scaled by two
+ * on Gen5 and later.
+ */
+void brw_land_fwd_jump(struct brw_codegen *p, int jmp_insn_idx)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const jump = &p->store[jmp_insn_idx];
+   const unsigned scale = devinfo->gen >= 5 ? 2 : 1;
+
+   /* Only a JMPI with an immediate src1 can be patched here. */
+   assert(brw_inst_opcode(devinfo, jump) == BRW_OPCODE_JMPI);
+   assert(brw_inst_src1_reg_file(devinfo, jump) == BRW_IMMEDIATE_VALUE);
+
+   brw_inst_set_gen4_jump_count(devinfo, jump,
+                                scale * (p->nr_insn - jmp_insn_idx - 1));
+}
+
+/* Emit a CMP with the given conditional modifier.
+ *
+ * To integrate with the above, it makes sense that the comparison
+ * instruction should populate the flag register.  It might be simpler
+ * just to use the flag reg for most WM tasks?
+ */
+void brw_CMP(struct brw_codegen *p,
+             struct brw_reg dest,
+             unsigned conditional,
+             struct brw_reg src0,
+             struct brw_reg src1)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const cmp = next_insn(p, BRW_OPCODE_CMP);
+
+   brw_inst_set_cond_modifier(devinfo, cmp, conditional);
+   brw_set_dest(p, cmp, dest);
+   brw_set_src0(p, cmp, src0);
+   brw_set_src1(p, cmp, src1);
+
+   /* Item WaCMPInstNullDstForcesThreadSwitch in the Haswell Bspec workarounds
+    * page says:
+    *    "Any CMP instruction with a null destination must use a {switch}."
+    *
+    * It also applies to other Gen7 platforms (IVB, BYT) even though it isn't
+    * mentioned on their work-arounds pages.
+    */
+   if (devinfo->gen == 7 &&
+       dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
+       dest.nr == BRW_ARF_NULL) {
+      brw_inst_set_thread_control(devinfo, cmp, BRW_THREAD_SWITCH);
+   }
+}
+
+/***********************************************************************
+ * Helpers for the various SEND message types:
+ */
+
+/** Extended math function, float[8].
+ *
+ * Pre-Gen6 only: math is issued as a SEND whose message is built by
+ * brw_set_math_message().  The message is flagged as scalar or vector data
+ * depending on whether \p src has a scalar region, and as an integer
+ * operation when \p src has type D.
+ */
+void gen4_math(struct brw_codegen *p,
+               struct brw_reg dest,
+               unsigned function,
+               unsigned msg_reg_nr,
+               struct brw_reg src,
+               unsigned precision )
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const send = next_insn(p, BRW_OPCODE_SEND);
+   const unsigned data_type = has_scalar_region(src) ?
+                              BRW_MATH_DATA_SCALAR : BRW_MATH_DATA_VECTOR;
+
+   assert(devinfo->gen < 6);
+
+   /* Example code doesn't set predicate_control for send
+    * instructions.
+    */
+   brw_inst_set_pred_control(devinfo, send, 0);
+   brw_inst_set_base_mrf(devinfo, send, msg_reg_nr);
+
+   brw_set_dest(p, send, dest);
+   brw_set_src0(p, send, src);
+   brw_set_math_message(p, send, function,
+                        src.type == BRW_REGISTER_TYPE_D,
+                        precision, data_type);
+}
+
+/**
+ * Emit a Gen6+ MATH instruction.
+ *
+ * Operand restrictions are checked with assertions only:
+ *  - dest must be a GRF (or, on Gen7+, a message register) with unit
+ *    horizontal stride;
+ *  - on Gen6 both sources need unit horizontal stride and no source
+ *    modifiers;
+ *  - the integer division functions take non-float sources, and src1 must
+ *    be a GRF (an immediate is also accepted on Gen8+); every other
+ *    function takes float sources.
+ */
+void gen6_math(struct brw_codegen *p,
+               struct brw_reg dest,
+               unsigned function,
+               struct brw_reg src0,
+               struct brw_reg src1)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *const math = next_insn(p, BRW_OPCODE_MATH);
+
+   assert(devinfo->gen >= 6);
+
+   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
+          (devinfo->gen >= 7 && dest.file == BRW_MESSAGE_REGISTER_FILE));
+
+   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
+   if (devinfo->gen == 6) {
+      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
+      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
+   }
+
+   switch (function) {
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
+   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+      assert(src0.type != BRW_REGISTER_TYPE_F);
+      assert(src1.type != BRW_REGISTER_TYPE_F);
+      assert(src1.file == BRW_GENERAL_REGISTER_FILE ||
+             (devinfo->gen >= 8 && src1.file == BRW_IMMEDIATE_VALUE));
+      break;
+   default:
+      assert(src0.type == BRW_REGISTER_TYPE_F);
+      assert(src1.type == BRW_REGISTER_TYPE_F);
+      break;
+   }
+
+   /* Source modifiers are ignored for extended math instructions on Gen6. */
+   if (devinfo->gen == 6) {
+      assert(!src0.negate);
+      assert(!src0.abs);
+      assert(!src1.negate);
+      assert(!src1.abs);
+   }
+
+   brw_inst_set_math_function(devinfo, math, function);
+
+   brw_set_dest(p, math, dest);
+   brw_set_src0(p, math, src0);
+   brw_set_src1(p, math, src1);
+}
+
+/**
+ * Return the binding table index used to reach the thread scratch space
+ * with stateless dataport messages.
+ *
+ * Gen8+ uses the non-coherent stateless index; earlier generations use the
+ * plain stateless index.
+ */
+unsigned
+brw_scratch_surface_idx(const struct brw_codegen *p)
+{
+   /* The scratch space is thread-local so IA coherency is unnecessary. */
+   return p->devinfo->gen >= 8 ? GEN8_BTI_STATELESS_NON_COHERENT
+                               : BRW_BTI_STATELESS;
+}
+
+/**
+ * Write a block of OWORDs (half a GRF each) to the scratch buffer,
+ * using a constant offset per channel.
+ *
+ * The offset must be aligned to oword size (16 bytes).  Used for
+ * register spilling.
+ *
+ * \param mrf       first message register; it receives the header (a copy
+ *                  of g0 with element 2 replaced by the offset).  The
+ *                  message length is 1 + num_regs.
+ * \param num_regs  number of registers to write.
+ * \param offset    byte offset into scratch; converted to oword units on
+ *                  Gen6+.
+ */
+void brw_oword_block_write_scratch(struct brw_codegen *p,
+                                   struct brw_reg mrf,
+                                   int num_regs,
+                                   unsigned offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+   uint32_t msg_type;
+
+   if (devinfo->gen >= 6)
+      offset /= 16;
+
+   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+   const unsigned mlen = 1 + num_regs;
+
+   /* Set up the message header.  This is g0, with g0.2 filled with
+    * the offset.  We don't want to leave our offset around in g0 or
+    * it'll screw up texture samples, so set it up inside the message
+    * reg.
+    */
+   {
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+
+      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+      /* set message header global offset field (reg 0, element 2) */
+      brw_MOV(p,
+              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+                                  mrf.nr,
+                                  2), BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(offset));
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      struct brw_reg dest;
+      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+      int send_commit_msg;
+      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
+                                         BRW_REGISTER_TYPE_UW);
+
+      brw_inst_set_compression(devinfo, insn, false);
+
+      /* brw_inst_exec_size() yields the encoded BRW_EXECUTE_* value rather
+       * than a channel count, so the SIMD16 check has to compare against
+       * BRW_EXECUTE_16 and not the literal 16.
+       */
+      if (brw_inst_exec_size(devinfo, insn) >= BRW_EXECUTE_16)
+         src_header = vec16(src_header);
+
+      assert(brw_inst_pred_control(devinfo, insn) == BRW_PREDICATE_NONE);
+      if (devinfo->gen < 6)
+         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
+
+      /* Until gen6, writes followed by reads from the same location
+       * are not guaranteed to be ordered unless write_commit is set.
+       * If set, then a no-op write is issued to the destination
+       * register to set a dependency, and a read from the destination
+       * can be used to ensure the ordering.
+       *
+       * For gen6, only writes between different threads need ordering
+       * protection.  Our use of DP writes is all about register
+       * spilling within a thread.
+       */
+      if (devinfo->gen >= 6) {
+         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
+         send_commit_msg = 0;
+      } else {
+         dest = src_header;
+         send_commit_msg = 1;
+      }
+
+      brw_set_dest(p, insn, dest);
+      if (devinfo->gen >= 6) {
+         brw_set_src0(p, insn, mrf);
+      } else {
+         brw_set_src0(p, insn, brw_null_reg());
+      }
+
+      if (devinfo->gen >= 6)
+         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
+      else
+         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
+
+      brw_set_dp_write_message(p,
+                               insn,
+                               brw_scratch_surface_idx(p),
+                               BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+                               msg_type,
+                               target_cache,
+                               mlen,
+                               true, /* header_present */
+                               0, /* not a render target */
+                               send_commit_msg, /* response_length */
+                               0, /* eot */
+                               send_commit_msg);
+   }
+}
+
+
+/**
+ * Read a block of owords (half a GRF each) from the scratch buffer
+ * using a constant index per channel.
+ *
+ * Offset must be aligned to oword size (16 bytes).  Used for register
+ * spilling.
+ *
+ * \param dest      register receiving the data; on Gen7+ it is also used
+ *                  as the register the message header is built in.
+ * \param mrf       message register the header is built in before Gen7;
+ *                  the value passed in is replaced by dest on Gen7+.
+ * \param num_regs  number of registers to read; also used as the response
+ *                  length of the message.
+ * \param offset    byte offset into scratch; converted to oword units on
+ *                  Gen6+.
+ */
+void
+brw_oword_block_read_scratch(struct brw_codegen *p,
+			     struct brw_reg dest,
+			     struct brw_reg mrf,
+			     int num_regs,
+			     unsigned offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (devinfo->gen >= 6)
+      offset /= 16;
+
+   if (p->devinfo->gen >= 7) {
+      /* On gen 7 and above, we no longer have message registers and we can
+       * send from any register we want.  By using the destination register
+       * for the message, we guarantee that the implied message write won't
+       * accidentally overwrite anything.  This has been a problem because
+       * the MRF registers and source for the final FB write are both fixed
+       * and may overlap.
+       */
+      mrf = retype(dest, BRW_REGISTER_TYPE_UD);
+   } else {
+      mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+   }
+   dest = retype(dest, BRW_REGISTER_TYPE_UW);
+
+   const unsigned rlen = num_regs;
+   const unsigned target_cache =
+      (devinfo->gen >= 7 ? GEN7_SFID_DATAPORT_DATA_CACHE :
+       devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+
+   {
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+      /* set message header global offset field (reg 0, element 2) */
+      brw_MOV(p, get_element_ud(mrf, 2), brw_imm_ud(offset));
+
+      brw_pop_insn_state(p);
+   }
+
+   {
+      brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+
+      assert(brw_inst_pred_control(devinfo, insn) == 0);
+      brw_inst_set_compression(devinfo, insn, false);
+
+      brw_set_dest(p, insn, dest);	/* UW? */
+      if (devinfo->gen >= 6) {
+	 brw_set_src0(p, insn, mrf);
+      } else {
+	 brw_set_src0(p, insn, brw_null_reg());
+         brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
+      }
+
+      brw_set_dp_read_message(p,
+			      insn,
+                              brw_scratch_surface_idx(p),
+			      BRW_DATAPORT_OWORD_BLOCK_DWORDS(num_regs * 8),
+			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
+			      target_cache,
+			      1, /* msg_length */
+                              true, /* header_present */
+			      rlen);
+   }
+}
+
+/**
+ * Emit a scratch block read using the Gen7 scratch message.
+ *
+ * g0 is sent as the (required) header and \p num_regs registers are read
+ * back into \p dest.  \p offset is given in bytes and is converted to
+ * register-sized units, which must fit in 12 bits.
+ */
+void
+gen7_block_read_scratch(struct brw_codegen *p,
+                        struct brw_reg dest,
+                        int num_regs,
+                        unsigned offset)
+{
+   brw_inst *const send = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(brw_inst_pred_control(p->devinfo, send) == BRW_PREDICATE_NONE);
+
+   brw_set_dest(p, send, retype(dest, BRW_REGISTER_TYPE_UW));
+
+   /* The HW requires that the header is present; this is to get the g0.5
+    * scratch offset.
+    */
+   brw_set_src0(p, send, brw_vec8_grf(0, 0));
+
+   /* According to the docs, offset is "A 12-bit HWord offset into the memory
+    * Immediate Memory buffer as specified by binding table 0xFF."  An HWORD
+    * is 32 bytes, which happens to be the size of a register.
+    */
+   const unsigned hword_offset = offset / REG_SIZE;
+   assert(hword_offset < (1 << 12));
+
+   gen7_set_dp_scratch_message(p, send,
+                               false,        /* scratch read */
+                               false,        /* OWords */
+                               false,        /* invalidate after read */
+                               num_regs,
+                               hword_offset,
+                               1,            /* mlen: just g0 */
+                               num_regs,     /* rlen */
+                               true);        /* header present */
+}
+
+/**
+ * Read float[4] vectors from the data port constant cache.
+ * Location (in buffer) should be a multiple of 16.
+ * Used for fetching shader constants.
+ *
+ * The amount of data read follows the execution size of the current
+ * default instruction state: the block size is that many dwords and the
+ * response length is that size divided by 8, rounded up.
+ *
+ * \param dest              register receiving the data.
+ * \param mrf               message register the header (a copy of g0 with
+ *                          element 2 replaced by the offset) is built in.
+ * \param offset            byte offset into the buffer; converted to oword
+ *                          units on Gen6+.
+ * \param bind_table_index  binding table index passed to the read message.
+ */
+void brw_oword_block_read(struct brw_codegen *p,
+			  struct brw_reg dest,
+			  struct brw_reg mrf,
+			  uint32_t offset,
+			  uint32_t bind_table_index)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_CONSTANT_CACHE :
+       BRW_DATAPORT_READ_TARGET_DATA_CACHE);
+   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
+
+   /* On newer hardware, offset is in units of owords. */
+   if (devinfo->gen >= 6)
+      offset /= 16;
+
+   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   brw_push_insn_state(p);
+   brw_set_default_exec_size(p, BRW_EXECUTE_8);
+   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   /* set message header global offset field (reg 0, element 2) */
+   brw_MOV(p,
+	   retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
+			       mrf.nr,
+			       2), BRW_REGISTER_TYPE_UD),
+	   brw_imm_ud(offset));
+   brw_pop_insn_state(p);
+
+   brw_inst *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   /* cast dest to a uword[8] vector */
+   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
+
+   brw_set_dest(p, insn, dest);
+   if (devinfo->gen >= 6) {
+      brw_set_src0(p, insn, mrf);
+   } else {
+      brw_set_src0(p, insn, brw_null_reg());
+      brw_inst_set_base_mrf(devinfo, insn, mrf.nr);
+   }
+
+   brw_set_dp_read_message(p, insn, bind_table_index,
+                           BRW_DATAPORT_OWORD_BLOCK_DWORDS(exec_size),
+			   BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
+			   target_cache,
+			   1, /* msg_length */
+                           true, /* header_present */
+			   DIV_ROUND_UP(exec_size, 8)); /* response_length */
+
+   brw_pop_insn_state(p);
+}
+
+
+/**
+ * Emit a render target write.
+ *
+ * Gen6+ issues a SENDC whose source is \p payload directly.  Earlier
+ * generations issue a SEND with \p payload (which must be a message
+ * register) as the base MRF and \p implied_header as the source.  The
+ * destination is a null register sized after the current default execution
+ * size.
+ */
+void brw_fb_WRITE(struct brw_codegen *p,
+                  struct brw_reg payload,
+                  struct brw_reg implied_header,
+                  unsigned msg_control,
+                  unsigned binding_table_index,
+                  unsigned msg_length,
+                  unsigned response_length,
+                  bool eot,
+                  bool last_render_target,
+                  bool header_present)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned target_cache =
+      (devinfo->gen >= 6 ? GEN6_SFID_DATAPORT_RENDER_CACHE :
+       BRW_DATAPORT_READ_TARGET_RENDER_CACHE);
+   const bool simd16 =
+      brw_inst_exec_size(devinfo, p->current) >= BRW_EXECUTE_16;
+   const struct brw_reg null_dest =
+      retype(simd16 ? vec16(brw_null_reg()) : vec8(brw_null_reg()),
+             BRW_REGISTER_TYPE_UW);
+   struct brw_reg source;
+   unsigned msg_type;
+
+   brw_inst *const send =
+      next_insn(p, devinfo->gen >= 6 ? BRW_OPCODE_SENDC : BRW_OPCODE_SEND);
+   brw_inst_set_compression(devinfo, send, false);
+
+   if (devinfo->gen < 6) {
+      assert(payload.file == BRW_MESSAGE_REGISTER_FILE);
+      brw_inst_set_base_mrf(devinfo, send, payload.nr);
+      source = implied_header;
+
+      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
+   } else {
+      /* headerless version, just submit color payload */
+      source = payload;
+
+      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
+   }
+
+   brw_set_dest(p, send, null_dest);
+   brw_set_src0(p, send, source);
+   brw_set_dp_write_message(p,
+                            send,
+                            binding_table_index,
+                            msg_control,
+                            msg_type,
+                            target_cache,
+                            msg_length,
+                            header_present,
+                            last_render_target,
+                            response_length,
+                            eot,
+                            0 /* send_commit_msg */);
+}
+
+/**
+ * Emit a Gen9+ render target read as a SENDC to the render cache.
+ *
+ * The message control carries \p per_sample in bit 5 together with a
+ * subtype that is 0 when the current default execution size is
+ * BRW_EXECUTE_16 and 1 otherwise.  The render target slot group is the
+ * current quarter control divided by two.
+ *
+ * \return the emitted instruction.
+ */
+brw_inst *
+gen9_fb_READ(struct brw_codegen *p,
+             struct brw_reg dst,
+             struct brw_reg payload,
+             unsigned binding_table_index,
+             unsigned msg_length,
+             unsigned response_length,
+             bool per_sample)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   assert(devinfo->gen >= 9);
+
+   const bool simd16 =
+      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16;
+   const unsigned msg_subtype = simd16 ? 0 : 1;
+   brw_inst *const read = next_insn(p, BRW_OPCODE_SENDC);
+
+   brw_set_dest(p, read, dst);
+   brw_set_src0(p, read, payload);
+   brw_set_dp_read_message(p, read, binding_table_index,
+                           per_sample << 5 | msg_subtype,
+                           GEN9_DATAPORT_RC_RENDER_TARGET_READ,
+                           GEN6_SFID_DATAPORT_RENDER_CACHE,
+                           msg_length,
+                           true /* header_present */,
+                           response_length);
+
+   const unsigned slot_group = brw_inst_qtr_control(devinfo, p->current) / 2;
+   brw_inst_set_rt_slot_group(devinfo, read, slot_group);
+
+   return read;
+}
+
+/**
+ * Texture sample instruction.
+ * Note: the msg_type plus msg_length values determine exactly what kind
+ * of sampling operation is performed.  See volume 4, page 161 of docs.
+ *
+ * When msg_reg_nr is not -1, src0 is first passed through
+ * gen6_resolve_implied_move() for that message register.  Before Gen6,
+ * msg_reg_nr is also programmed as the base MRF of the SEND.  The remaining
+ * parameters are forwarded unchanged to brw_set_sampler_message().
+ */
+void brw_SAMPLE(struct brw_codegen *p,
+		struct brw_reg dest,
+		unsigned msg_reg_nr,
+		struct brw_reg src0,
+		unsigned binding_table_index,
+		unsigned sampler,
+		unsigned msg_type,
+		unsigned response_length,
+		unsigned msg_length,
+		unsigned header_present,
+		unsigned simd_mode,
+		unsigned return_format)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn;
+
+   if (msg_reg_nr != -1)
+      gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_pred_control(devinfo, insn, BRW_PREDICATE_NONE); /* XXX */
+
+   /* From the 965 PRM (volume 4, part 1, section 14.2.41):
+    *
+    *    "Instruction compression is not allowed for this instruction (that
+    *     is, send). The hardware behavior is undefined if this instruction is
+    *     set as compressed. However, compress control can be set to "SecHalf"
+    *     to affect the EMask generation."
+    *
+    * No similar wording is found in later PRMs, but there are examples
+    * utilizing send with SecHalf.  More importantly, SIMD8 sampler messages
+    * are allowed in SIMD16 mode and they could not work without SecHalf.  For
+    * these reasons, we allow BRW_COMPRESSION_2NDHALF here.
+    */
+   brw_inst_set_compression(devinfo, insn, false);
+
+   if (devinfo->gen < 6)
+      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
+
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_sampler_message(p, insn,
+                           binding_table_index,
+                           sampler,
+                           msg_type,
+                           response_length,
+                           msg_length,
+                           header_present,
+                           simd_mode,
+                           return_format);
+}
+
+/* Adjust the message header's sampler state pointer to
+ * select the correct group of 16 samplers.
+ *
+ * The adjusted pointer is written to element 3 of header and is computed
+ * from element 3 of g0.
+ *
+ * With an immediate sampler_index nothing is emitted for indices below 16;
+ * for larger indices the offset of the containing group of 16 sampler
+ * states is added (asserted to happen only on Haswell or Gen8+).
+ *
+ * With a non-immediate sampler_index nothing is emitted before Haswell;
+ * otherwise the offset is derived from bits 7:4 of the index at run time.
+ */
+void brw_adjust_sampler_state_pointer(struct brw_codegen *p,
+                                      struct brw_reg header,
+                                      struct brw_reg sampler_index)
+{
+   /* The "Sampler Index" field can only store values between 0 and 15.
+    * However, we can add an offset to the "Sampler State Pointer"
+    * field, effectively selecting a different set of 16 samplers.
+    *
+    * The "Sampler State Pointer" needs to be aligned to a 32-byte
+    * offset, and each sampler state is only 16-bytes, so we can't
+    * exclusively use the offset - we have to use both.
+    */
+
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      const int sampler_state_size = 16; /* 16 bytes */
+      uint32_t sampler = sampler_index.ud;
+
+      if (sampler >= 16) {
+         assert(devinfo->is_haswell || devinfo->gen >= 8);
+         brw_ADD(p,
+                 get_element_ud(header, 3),
+                 get_element_ud(brw_vec8_grf(0, 0), 3),
+                 brw_imm_ud(16 * (sampler / 16) * sampler_state_size));
+      }
+   } else {
+      /* Non-const sampler array indexing case */
+      if (devinfo->gen < 8 && !devinfo->is_haswell) {
+         return;
+      }
+
+      struct brw_reg temp = get_element_ud(header, 3);
+
+      brw_AND(p, temp, get_element_ud(sampler_index, 0), brw_imm_ud(0x0f0));
+      brw_SHL(p, temp, temp, brw_imm_ud(4));
+      brw_ADD(p,
+              get_element_ud(header, 3),
+              get_element_ud(brw_vec8_grf(0, 0), 3),
+              temp);
+   }
+}
+
+/* Emit a SEND carrying a URB write message.
+ *
+ * src0 is first passed through gen6_resolve_implied_move() for message
+ * register msg_reg_nr; before Gen6 that register is also programmed as the
+ * base MRF.  On Gen7+, unless BRW_URB_WRITE_USE_CHANNEL_MASKS is set in
+ * flags, an extra OR is emitted first that writes g0.5 | 0xff00 into
+ * element 5 of the message register.  msg_length is asserted to be below
+ * BRW_MAX_MRF for the generation.
+ *
+ * All these variables are pretty confusing - we might be better off
+ * using bitmasks and macros for this, in the old style.  Or perhaps
+ * just having the caller instantiate the fields in dword3 itself.
+ */
+void brw_urb_WRITE(struct brw_codegen *p,
+		   struct brw_reg dest,
+		   unsigned msg_reg_nr,
+		   struct brw_reg src0,
+                   enum brw_urb_write_flags flags,
+		   unsigned msg_length,
+		   unsigned response_length,
+		   unsigned offset,
+		   unsigned swizzle)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   brw_inst *insn;
+
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   if (devinfo->gen >= 7 && !(flags & BRW_URB_WRITE_USE_CHANNEL_MASKS)) {
+      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
+		       BRW_REGISTER_TYPE_UD),
+	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
+		brw_imm_ud(0xff00));
+      brw_pop_insn_state(p);
+   }
+
+   insn = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(msg_length < BRW_MAX_MRF(devinfo->gen));
+
+   brw_set_dest(p, insn, dest);
+   brw_set_src0(p, insn, src0);
+   brw_set_src1(p, insn, brw_imm_d(0));
+
+   if (devinfo->gen < 6)
+      brw_inst_set_base_mrf(devinfo, insn, msg_reg_nr);
+
+   brw_set_urb_message(p,
+		       insn,
+		       flags,
+		       msg_length,
+		       response_length,
+		       offset,
+		       swizzle);
+}
+
+/**
+ * Emit a SEND to shared function \p sfid whose descriptor is either an
+ * immediate or a register.
+ *
+ * For a register descriptor, an OR with zero first copies it into address
+ * register a0, and the SEND then takes a0 as src1.
+ *
+ * \return the "setup" instruction: the SEND itself for an immediate
+ *         descriptor, otherwise the OR.  Callers can set additional
+ *         descriptor bits on it with the usual brw_set_*_message()
+ *         helpers.
+ */
+struct brw_inst *
+brw_send_indirect_message(struct brw_codegen *p,
+                          unsigned sfid,
+                          struct brw_reg dst,
+                          struct brw_reg payload,
+                          struct brw_reg desc)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   struct brw_inst *send_insn;
+   int setup_idx;
+
+   dst = retype(dst, BRW_REGISTER_TYPE_UW);
+
+   assert(desc.type == BRW_REGISTER_TYPE_UD);
+
+   /* The setup instruction is tracked by its index in the instruction
+    * store rather than by pointer: emitting the SEND in the indirect case
+    * may realloc the store and invalidate a pointer obtained earlier.
+    */
+   if (desc.file != BRW_IMMEDIATE_VALUE) {
+      const struct brw_reg a0 =
+         retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+      /* Load the indirect descriptor to an address register using OR so the
+       * caller can specify additional descriptor bits with the usual
+       * brw_set_*_message() helper functions.
+       */
+      setup_idx = p->nr_insn;
+      brw_OR(p, a0, desc, brw_imm_ud(0));
+
+      brw_pop_insn_state(p);
+
+      send_insn = next_insn(p, BRW_OPCODE_SEND);
+      brw_set_src1(p, send_insn, a0);
+   } else {
+      /* Direct case: the SEND is its own setup instruction. */
+      setup_idx = p->nr_insn;
+      send_insn = next_insn(p, BRW_OPCODE_SEND);
+      brw_set_src1(p, send_insn, desc);
+   }
+
+   /* Narrow destinations shrink the execution size of the SEND. */
+   if (dst.width < BRW_EXECUTE_8)
+      brw_inst_set_exec_size(devinfo, send_insn, dst.width);
+
+   brw_set_dest(p, send_insn, dst);
+   brw_set_src0(p, send_insn, retype(payload, BRW_REGISTER_TYPE_UD));
+   brw_inst_set_sfid(devinfo, send_insn, sfid);
+
+   return &p->store[setup_idx];
+}
+
+/**
+ * Emit a SEND to \p sfid for a surface message whose surface index
+ * \p surface may be an immediate or a register.
+ *
+ * A non-immediate \p surface is first ANDed with 0xff into address register
+ * a0 (Align1, execution masking and predication disabled) and a0 is used as
+ * the descriptor from then on.
+ *
+ * \param message_len    value written to the message-length field.
+ * \param response_len   value written to the response-length field.
+ * \param header_present value written to the header-present field.
+ *
+ * \return the setup instruction obtained from brw_send_indirect_message(),
+ *         with the three fields above already filled in; callers set the
+ *         message type and control bits on it.
+ */
+static struct brw_inst *
+brw_send_indirect_surface_message(struct brw_codegen *p,
+                                  unsigned sfid,
+                                  struct brw_reg dst,
+                                  struct brw_reg payload,
+                                  struct brw_reg surface,
+                                  unsigned message_len,
+                                  unsigned response_len,
+                                  bool header_present)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (surface.file != BRW_IMMEDIATE_VALUE) {
+      struct brw_reg addr = retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+      /* Mask out invalid bits from the surface index to avoid hangs e.g. when
+       * some surface array is accessed out of bounds.  The instruction
+       * returned by brw_AND() is not needed, so it is not kept.
+       */
+      brw_AND(p, addr,
+              suboffset(vec1(retype(surface, BRW_REGISTER_TYPE_UD)),
+                        BRW_GET_SWZ(surface.swizzle, 0)),
+              brw_imm_ud(0xff));
+
+      brw_pop_insn_state(p);
+
+      surface = addr;
+   }
+
+   struct brw_inst *insn =
+      brw_send_indirect_message(p, sfid, dst, payload, surface);
+   brw_inst_set_mlen(devinfo, insn, message_len);
+   brw_inst_set_rlen(devinfo, insn, response_len);
+   brw_inst_set_header_present(devinfo, insn, header_present);
+
+   return insn;
+}
+
+/**
+ * Tell whether the WHILE instruction \p insn, located at byte offset
+ * \p while_offset, jumps back to byte offset \p start_offset or earlier.
+ *
+ * The jump distance is read from the Gen6 jump count field on Gen6 and from
+ * JIP otherwise, and is asserted to be negative.
+ */
+static bool
+while_jumps_before_offset(const struct gen_device_info *devinfo,
+                          brw_inst *insn, int while_offset, int start_offset)
+{
+   /* Number of bytes covered by one unit of the jump field. */
+   const int bytes_per_unit = 16 / brw_jump_scale(devinfo);
+   int jip;
+
+   if (devinfo->gen == 6)
+      jip = brw_inst_gen6_jump_count(devinfo, insn);
+   else
+      jip = brw_inst_jip(devinfo, insn);
+
+   assert(jip < 0);
+
+   const int target_offset = while_offset + jip * bytes_per_unit;
+   return target_offset <= start_offset;
+}
+
+
+/**
+ * Starting with the instruction after \p start_offset, look for the
+ * instruction that closes the control-flow block enclosing \p start_offset.
+ *
+ * Nested IF...ENDIF pairs are skipped by tracking their depth.  At depth
+ * zero the search stops on an ENDIF, ELSE or HALT, or on a WHILE that jumps
+ * back to \p start_offset or earlier.
+ *
+ * \return the byte offset of the instruction found, or 0 if the end of the
+ *         program is reached first.
+ */
+static int
+brw_find_next_block_end(struct brw_codegen *p, int start_offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   void *store = p->store;
+   int nesting = 0;
+   int offset;
+
+   for (offset = next_offset(devinfo, store, start_offset);
+        offset < p->next_insn_offset;
+        offset = next_offset(devinfo, store, offset)) {
+      brw_inst *insn = store + offset;
+
+      switch (brw_inst_opcode(devinfo, insn)) {
+      case BRW_OPCODE_IF:
+         nesting++;
+         break;
+
+      case BRW_OPCODE_ENDIF:
+         if (nesting == 0)
+            return offset;
+         nesting--;
+         break;
+
+      case BRW_OPCODE_WHILE:
+         /* A WHILE that doesn't jump before our instruction is the end of a
+          * sibling do...while loop and is ignored.  Otherwise it ends a
+          * block just like ELSE and HALT do.
+          */
+         if (while_jumps_before_offset(devinfo, insn, offset, start_offset) &&
+             nesting == 0)
+            return offset;
+         break;
+
+      case BRW_OPCODE_ELSE:
+      case BRW_OPCODE_HALT:
+         if (nesting == 0)
+            return offset;
+         break;
+      }
+   }
+
+   /* Ran off the end of the program without finding a block end. */
+   return 0;
+}
+
+/* Find the WHILE instruction closing the loop that contains the instruction
+ * at start_offset and return its byte offset.
+ *
+ * There is no DO instruction on gen6, so the end of the loop is recognized
+ * as the first following WHILE which jumps back to start_offset or before.
+ * Only valid on gen6 and later.
+ */
+static int
+brw_find_loop_end(struct brw_codegen *p, int start_offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   void *store = p->store;
+
+   assert(devinfo->gen >= 6);
+
+   /* The scan begins one instruction past start_offset so that the
+    * instruction being fixed up (such as a WHILE) is never matched itself.
+    */
+   int offset = next_offset(devinfo, store, start_offset);
+
+   while (offset < p->next_insn_offset) {
+      brw_inst *insn = store + offset;
+
+      if (brw_inst_opcode(devinfo, insn) == BRW_OPCODE_WHILE &&
+          while_jumps_before_offset(devinfo, insn, offset, start_offset))
+         return offset;
+
+      offset = next_offset(devinfo, store, offset);
+   }
+
+   assert(!"not reached");
+   return start_offset;
+}
+
+/* After program generation, go back and update the jump targets of the
+ * control-flow instructions found from start_offset onwards: JIP and UIP of
+ * BREAK and CONTINUE, JIP of HALT (its UIP is expected to have been set by
+ * whoever emitted it), and the JIP (Gen7+) or jump count (Gen6) of ENDIF.
+ *
+ * Does nothing before Gen6.  The instructions must not be compacted.
+ */
+void
+brw_set_uip_jip(struct brw_codegen *p, int start_offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   int offset;
+   int br = brw_jump_scale(devinfo);
+   int scale = 16 / br;
+   void *store = p->store;
+
+   if (devinfo->gen < 6)
+      return;
+
+   for (offset = start_offset; offset < p->next_insn_offset; offset += 16) {
+      brw_inst *insn = store + offset;
+      assert(brw_inst_cmpt_control(devinfo, insn) == 0);
+
+      /* brw_find_next_block_end() walks the remainder of the program, so it
+       * is only called for the opcodes that actually use its result rather
+       * than once for every instruction.
+       */
+      switch (brw_inst_opcode(devinfo, insn)) {
+      case BRW_OPCODE_BREAK: {
+         const int block_end_offset = brw_find_next_block_end(p, offset);
+         assert(block_end_offset != 0);
+         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
+         /* Gen7 UIP points to WHILE; Gen6 points just after it */
+         brw_inst_set_uip(devinfo, insn,
+            (brw_find_loop_end(p, offset) - offset +
+             (devinfo->gen == 6 ? 16 : 0)) / scale);
+         break;
+      }
+
+      case BRW_OPCODE_CONTINUE: {
+         const int block_end_offset = brw_find_next_block_end(p, offset);
+         assert(block_end_offset != 0);
+         brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
+         brw_inst_set_uip(devinfo, insn,
+            (brw_find_loop_end(p, offset) - offset) / scale);
+
+         assert(brw_inst_uip(devinfo, insn) != 0);
+         assert(brw_inst_jip(devinfo, insn) != 0);
+         break;
+      }
+
+      case BRW_OPCODE_ENDIF: {
+         const int block_end_offset = brw_find_next_block_end(p, offset);
+         /* With no enclosing block end, jump to the next instruction. */
+         int32_t jump = (block_end_offset == 0) ?
+                        1 * br : (block_end_offset - offset) / scale;
+         if (devinfo->gen >= 7)
+            brw_inst_set_jip(devinfo, insn, jump);
+         else
+            brw_inst_set_gen6_jump_count(devinfo, insn, jump);
+         break;
+      }
+
+      case BRW_OPCODE_HALT: {
+         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
+          *
+          *    "In case of the halt instruction not inside any conditional
+          *     code block, the value of <JIP> and <UIP> should be the
+          *     same. In case of the halt instruction inside conditional code
+          *     block, the <UIP> should be the end of the program, and the
+          *     <JIP> should be end of the most inner conditional code block."
+          *
+          * The uip will have already been set by whoever set up the
+          * instruction.
+          */
+         const int block_end_offset = brw_find_next_block_end(p, offset);
+         if (block_end_offset == 0) {
+            brw_inst_set_jip(devinfo, insn, brw_inst_uip(devinfo, insn));
+         } else {
+            brw_inst_set_jip(devinfo, insn, (block_end_offset - offset) / scale);
+         }
+         assert(brw_inst_uip(devinfo, insn) != 0);
+         assert(brw_inst_jip(devinfo, insn) != 0);
+         break;
+      }
+      }
+   }
+}
+
+/**
+ * Emit a SEND carrying an FF_SYNC message.
+ *
+ * \p src0 is first passed through gen6_resolve_implied_move() together with
+ * \p msg_reg_nr.  On hardware older than Gen6 \p msg_reg_nr is also written
+ * to the instruction's base MRF field.  \p allocate, \p response_length and
+ * \p eot are forwarded unchanged to brw_set_ff_sync_message().
+ */
+void
+brw_ff_sync(struct brw_codegen *p,
+            struct brw_reg dest,
+            unsigned msg_reg_nr,
+            struct brw_reg src0,
+            bool allocate,
+            unsigned response_length,
+            bool eot)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   brw_inst *send = next_insn(p, BRW_OPCODE_SEND);
+
+   brw_set_dest(p, send, dest);
+   brw_set_src0(p, send, src0);
+   brw_set_src1(p, send, brw_imm_d(0));
+
+   /* Only pre-Gen6 instructions carry an explicit base MRF number. */
+   if (devinfo->gen < 6)
+      brw_inst_set_base_mrf(devinfo, send, msg_reg_nr);
+
+   brw_set_ff_sync_message(p, send, allocate, response_length, eot);
+}
+
+/**
+ * Emit the SEND instruction that writes stream output data on Gen6 (used for
+ * transform feedback).
+ *
+ * When \p send_commit_msg is true this is the last piece of stream output
+ * data from this thread, so the data is sent as a committed write and a
+ * response of length one is requested.  According to the Sandy Bridge PRM
+ * (volume 2 part 1, section 4.5.1):
+ *
+ *   "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
+ *   writes are complete by sending the final write as a committed write."
+ */
+void
+brw_svb_write(struct brw_codegen *p,
+              struct brw_reg dest,
+              unsigned msg_reg_nr,
+              struct brw_reg src0,
+              unsigned binding_table_index,
+              bool   send_commit_msg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   unsigned target_cache;
+
+   /* Pick the cache the write goes through for this hardware generation. */
+   if (devinfo->gen >= 7)
+      target_cache = GEN7_SFID_DATAPORT_DATA_CACHE;
+   else if (devinfo->gen >= 6)
+      target_cache = GEN6_SFID_DATAPORT_RENDER_CACHE;
+   else
+      target_cache = BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+
+   gen6_resolve_implied_move(p, &src0, msg_reg_nr);
+
+   brw_inst *send = next_insn(p, BRW_OPCODE_SEND);
+
+   brw_set_dest(p, send, dest);
+   brw_set_src0(p, send, src0);
+   brw_set_src1(p, send, brw_imm_d(0));
+   brw_set_dp_write_message(p, send,
+                            binding_table_index,
+                            0, /* msg_control: ignored */
+                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
+                            target_cache,
+                            1, /* msg_length */
+                            true, /* header_present */
+                            0, /* last_render_target: ignored */
+                            send_commit_msg, /* response_length: 0 or 1 */
+                            0, /* end_of_thread */
+                            send_commit_msg); /* send_commit_msg */
+}
+
+/**
+ * Compute a payload size from \p num_channels based on the current default
+ * instruction state:
+ *
+ *  - 1 if \p has_simd4x2 is set and the access mode is Align16;
+ *  - twice \p num_channels if \p has_simd16 is set and the execution size is
+ *    16;
+ *  - \p num_channels otherwise.
+ */
+static unsigned
+brw_surface_payload_size(struct brw_codegen *p,
+                         unsigned num_channels,
+                         bool has_simd4x2,
+                         bool has_simd16)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   if (has_simd4x2 &&
+       brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_16)
+      return 1;
+
+   if (has_simd16 &&
+       brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16)
+      return 2 * num_channels;
+
+   return num_channels;
+}
+
+/**
+ * Fill in the data port message type and message control fields of \p insn
+ * for an untyped atomic operation.
+ *
+ * The message control holds \p atomic_op (a BRW_AOP_* value), bit 5 when
+ * \p response_expected is set, and bit 4 (SIMD8 mode) when the current
+ * execution size is not 16 -- except for the Haswell/Gen8+ Align16 case,
+ * which uses the dedicated SIMD4x2 message type instead.
+ */
+static void
+brw_set_dp_untyped_atomic_message(struct brw_codegen *p,
+                                  brw_inst *insn,
+                                  unsigned atomic_op,
+                                  bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   unsigned msg_type;
+
+   /* Atomic Operation Type: BRW_AOP_* */
+   unsigned msg_control = atomic_op;
+
+   if (response_expected)
+      msg_control |= 1 << 5; /* Return data expected */
+
+   if (hsw_or_later && !align1) {
+      msg_type = HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP_SIMD4X2;
+   } else {
+      msg_type = hsw_or_later ? HSW_DATAPORT_DC_PORT1_UNTYPED_ATOMIC_OP
+                              : GEN7_DATAPORT_DC_UNTYPED_ATOMIC_OP;
+
+      if (brw_inst_exec_size(devinfo, p->current) != BRW_EXECUTE_16)
+         msg_control |= 1 << 4; /* SIMD8 mode */
+   }
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit an untyped atomic operation \p atomic_op on \p surface.
+ *
+ * The message goes to data cache 1 on Haswell and Gen8+, and to the Gen7
+ * data cache otherwise.  A header is present only in Align1 mode.  The
+ * response length is derived from \p response_expected through
+ * brw_surface_payload_size().
+ */
+void
+brw_untyped_atomic(struct brw_codegen *p,
+                   struct brw_reg dst,
+                   struct brw_reg payload,
+                   struct brw_reg surface,
+                   unsigned atomic_op,
+                   unsigned msg_length,
+                   bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1
+                                      : GEN7_SFID_DATAPORT_DATA_CACHE;
+
+   /* Mask out unused components -- This is especially important in Align16
+    * mode on generations that don't have native support for SIMD4x2 atomics,
+    * because unused but enabled components will cause the dataport to perform
+    * additional atomic operations on the addresses that happen to be in the
+    * uninitialized Y, Z and W coordinates of the payload.
+    */
+   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
+
+   /* Native SIMD4x2 atomics exist on Haswell and later only. */
+   const unsigned response_len =
+      brw_surface_payload_size(p, response_expected, hsw_or_later, true);
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
+                                        payload, surface, msg_length,
+                                        response_len, align1);
+
+   brw_set_dp_untyped_atomic_message(p, insn, atomic_op, response_expected);
+}
+
+/**
+ * Fill in the data port message type and message control fields of \p insn
+ * for an untyped surface read of \p num_channels 32-bit channels.
+ *
+ * The low four control bits form the mask of channels to drop.  In Align1
+ * mode bits 5:4 additionally select SIMD16 (1) or SIMD8 (2) depending on the
+ * current execution size; in Align16 mode they are left at zero.
+ */
+static void
+brw_set_dp_untyped_surface_read_message(struct brw_codegen *p,
+                                        struct brw_inst *insn,
+                                        unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const unsigned msg_type =
+      hsw_or_later ? HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_READ
+                   : GEN7_DATAPORT_DC_UNTYPED_SURFACE_READ;
+
+   /* Set mask of 32-bit channels to drop. */
+   unsigned msg_control = 0xf & (0xf << num_channels);
+
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool simd16 =
+         brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16;
+
+      /* 1 selects SIMD16 mode, 2 selects SIMD8 mode. */
+      msg_control |= (simd16 ? 1 : 2) << 4;
+   }
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit an untyped surface read of \p num_channels channels from \p surface
+ * into \p dst.
+ *
+ * The message goes to data cache 1 on Haswell and Gen8+, and to the Gen7
+ * data cache otherwise, and is sent without a header.  The response length
+ * is derived from \p num_channels through brw_surface_payload_size().
+ */
+void
+brw_untyped_surface_read(struct brw_codegen *p,
+                         struct brw_reg dst,
+                         struct brw_reg payload,
+                         struct brw_reg surface,
+                         unsigned msg_length,
+                         unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1
+                                      : GEN7_SFID_DATAPORT_DATA_CACHE;
+   const unsigned response_len =
+      brw_surface_payload_size(p, num_channels, true, true);
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, dst, payload, surface,
+                                        msg_length, response_len,
+                                        false /* header_present */);
+
+   brw_set_dp_untyped_surface_read_message(p, insn, num_channels);
+}
+
+/**
+ * Fill in the data port message type and message control fields of \p insn
+ * for an untyped surface write of \p num_channels 32-bit channels.
+ *
+ * The low four control bits form the mask of channels to drop and bits 5:4
+ * carry the SIMD mode:
+ *
+ *  - Align1: SIMD16 (1) or SIMD8 (2) depending on the execution size;
+ *  - Align16 on Haswell/Gen8+: SIMD4x2 (0);
+ *  - Align16 elsewhere: SIMD8 (2).
+ */
+static void
+brw_set_dp_untyped_surface_write_message(struct brw_codegen *p,
+                                         struct brw_inst *insn,
+                                         unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const unsigned msg_type =
+      hsw_or_later ? HSW_DATAPORT_DC_PORT1_UNTYPED_SURFACE_WRITE
+                   : GEN7_DATAPORT_DC_UNTYPED_SURFACE_WRITE;
+   unsigned simd_mode;
+
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool simd16 =
+         brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16;
+
+      simd_mode = simd16 ? 1 /* SIMD16 mode */ : 2 /* SIMD8 mode */;
+   } else {
+      simd_mode = hsw_or_later ? 0 /* SIMD4x2 mode */ : 2 /* SIMD8 mode */;
+   }
+
+   /* Mask of 32-bit channels to drop, combined with the SIMD mode. */
+   const unsigned msg_control =
+      (0xf & (0xf << num_channels)) | (simd_mode << 4);
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit an untyped surface write of \p num_channels channels to \p surface.
+ *
+ * The message goes to data cache 1 on Haswell and Gen8+, and to the Gen7
+ * data cache otherwise.  The destination is the null register, no response
+ * is requested, and a header is present only in Align1 mode.
+ */
+void
+brw_untyped_surface_write(struct brw_codegen *p,
+                          struct brw_reg payload,
+                          struct brw_reg surface,
+                          unsigned msg_length,
+                          unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1
+                                      : GEN7_SFID_DATAPORT_DATA_CACHE;
+
+   /* Mask out unused components -- See comment in brw_untyped_atomic().
+    * Only needed for Align16 on Gen7 parts other than Haswell.
+    */
+   unsigned mask = WRITEMASK_XYZW;
+   if (devinfo->gen == 7 && !devinfo->is_haswell && !align1)
+      mask = WRITEMASK_X;
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid,
+                                        brw_writemask(brw_null_reg(), mask),
+                                        payload, surface, msg_length,
+                                        0 /* response_len */, align1);
+
+   brw_set_dp_untyped_surface_write_message(p, insn, num_channels);
+}
+
+/**
+ * Fill in the data port message type and message control fields of \p insn
+ * for a typed atomic operation.
+ *
+ * The message control holds \p atomic_op (a BRW_AOP_* value), bit 5 when
+ * \p response_expected is set, and bit 4 when the current quarter control is
+ * odd -- except for the Haswell/Gen8+ Align16 case, which uses the dedicated
+ * SIMD4x2 message type instead.
+ */
+static void
+brw_set_dp_typed_atomic_message(struct brw_codegen *p,
+                                struct brw_inst *insn,
+                                unsigned atomic_op,
+                                bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   unsigned msg_type;
+
+   /* Atomic Operation Type: BRW_AOP_* */
+   unsigned msg_control = atomic_op;
+
+   if (response_expected)
+      msg_control |= 1 << 5; /* Return data expected */
+
+   if (hsw_or_later && !align1) {
+      msg_type = HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP_SIMD4X2;
+   } else {
+      msg_type = hsw_or_later ? HSW_DATAPORT_DC_PORT1_TYPED_ATOMIC_OP
+                              : GEN7_DATAPORT_RC_TYPED_ATOMIC_OP;
+
+      if (brw_inst_qtr_control(devinfo, p->current) % 2 == 1)
+         msg_control |= 1 << 4; /* Use high 8 slots of the sample mask */
+   }
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit a typed atomic operation \p atomic_op on \p surface.
+ *
+ * The message goes to data cache 1 on Haswell and Gen8+, and to the render
+ * cache otherwise, and always carries a header.  The response length is
+ * derived from \p response_expected through brw_surface_payload_size().
+ */
+void
+brw_typed_atomic(struct brw_codegen *p,
+                 struct brw_reg dst,
+                 struct brw_reg payload,
+                 struct brw_reg surface,
+                 unsigned atomic_op,
+                 unsigned msg_length,
+                 bool response_expected)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1
+                                      : GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+   /* Mask out unused components -- See comment in brw_untyped_atomic(). */
+   const unsigned mask = align1 ? WRITEMASK_XYZW : WRITEMASK_X;
+
+   const unsigned response_len =
+      brw_surface_payload_size(p, response_expected, hsw_or_later, false);
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, brw_writemask(dst, mask),
+                                        payload, surface, msg_length,
+                                        response_len,
+                                        true /* header_present */);
+
+   brw_set_dp_typed_atomic_message(p, insn, atomic_op, response_expected);
+}
+
+/**
+ * Fill in the data port message type and message control fields of \p insn
+ * for a typed surface read of \p num_channels channels.
+ *
+ * The low four control bits form the mask of unused channels.  In Align1
+ * mode the half of the sample mask to use is encoded from the parity of the
+ * current quarter control: in bits 5:4 (1 = low, 2 = high) on Haswell and
+ * Gen8+, and as bit 5 (set = high) otherwise.
+ */
+static void
+brw_set_dp_typed_surface_read_message(struct brw_codegen *p,
+                                      struct brw_inst *insn,
+                                      unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const unsigned msg_type =
+      hsw_or_later ? HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_READ
+                   : GEN7_DATAPORT_RC_TYPED_SURFACE_READ;
+
+   /* Set mask of unused channels. */
+   unsigned msg_control = 0xf & (0xf << num_channels);
+
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool high_slots =
+         brw_inst_qtr_control(devinfo, p->current) % 2 == 1;
+
+      if (hsw_or_later) {
+         /* 2: use high 8 slots of the sample mask, 1: use low 8 slots. */
+         msg_control |= (high_slots ? 2 : 1) << 4;
+      } else if (high_slots) {
+         msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
+      }
+   }
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit a typed surface read of \p num_channels channels from \p surface into
+ * \p dst.
+ *
+ * The message goes to data cache 1 on Haswell and Gen8+, and to the render
+ * cache otherwise, and always carries a header.  The response length is
+ * derived from \p num_channels through brw_surface_payload_size().
+ */
+void
+brw_typed_surface_read(struct brw_codegen *p,
+                       struct brw_reg dst,
+                       struct brw_reg payload,
+                       struct brw_reg surface,
+                       unsigned msg_length,
+                       unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1
+                                      : GEN6_SFID_DATAPORT_RENDER_CACHE;
+   const unsigned response_len =
+      brw_surface_payload_size(p, num_channels, hsw_or_later, false);
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid, dst, payload, surface,
+                                        msg_length, response_len,
+                                        true /* header_present */);
+
+   brw_set_dp_typed_surface_read_message(p, insn, num_channels);
+}
+
+/**
+ * Fill in the data port message type and message control fields of \p insn
+ * for a typed surface write of \p num_channels channels.
+ *
+ * The low four control bits form the mask of unused channels.  In Align1
+ * mode the half of the sample mask to use is encoded from the parity of the
+ * current quarter control: in bits 5:4 (1 = low, 2 = high) on Haswell and
+ * Gen8+, and as bit 5 (set = high) otherwise.
+ */
+static void
+brw_set_dp_typed_surface_write_message(struct brw_codegen *p,
+                                       struct brw_inst *insn,
+                                       unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const unsigned msg_type =
+      hsw_or_later ? HSW_DATAPORT_DC_PORT1_TYPED_SURFACE_WRITE
+                   : GEN7_DATAPORT_RC_TYPED_SURFACE_WRITE;
+
+   /* Set mask of unused channels. */
+   unsigned msg_control = 0xf & (0xf << num_channels);
+
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      const bool high_slots =
+         brw_inst_qtr_control(devinfo, p->current) % 2 == 1;
+
+      if (hsw_or_later) {
+         /* 2: use high 8 slots of the sample mask, 1: use low 8 slots. */
+         msg_control |= (high_slots ? 2 : 1) << 4;
+      } else if (high_slots) {
+         msg_control |= 1 << 5; /* Use high 8 slots of the sample mask */
+      }
+   }
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+   brw_inst_set_dp_msg_control(devinfo, insn, msg_control);
+}
+
+/**
+ * Emit a typed surface write of \p num_channels channels to \p surface.
+ *
+ * The message goes to data cache 1 on Haswell and Gen8+, and to the render
+ * cache otherwise.  The destination is the null register, no response is
+ * requested, and a header is always present.
+ */
+void
+brw_typed_surface_write(struct brw_codegen *p,
+                        struct brw_reg payload,
+                        struct brw_reg surface,
+                        unsigned msg_length,
+                        unsigned num_channels)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool hsw_or_later = devinfo->gen >= 8 || devinfo->is_haswell;
+   const bool align1 =
+      brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   const unsigned sfid = hsw_or_later ? HSW_SFID_DATAPORT_DATA_CACHE_1
+                                      : GEN6_SFID_DATAPORT_RENDER_CACHE;
+
+   /* Mask out unused components -- See comment in brw_untyped_atomic().
+    * Only needed for Align16 on Gen7 parts other than Haswell.
+    */
+   unsigned mask = WRITEMASK_XYZW;
+   if (devinfo->gen == 7 && !devinfo->is_haswell && !align1)
+      mask = WRITEMASK_X;
+
+   struct brw_inst *insn =
+      brw_send_indirect_surface_message(p, sfid,
+                                        brw_writemask(brw_null_reg(), mask),
+                                        payload, surface, msg_length,
+                                        0 /* response_len */,
+                                        true /* header_present */);
+
+   brw_set_dp_typed_surface_write_message(p, insn, num_channels);
+}
+
+/**
+ * Set up \p insn as a memory fence message to \p sfid.
+ *
+ * The message has length one, a header, and a response of length one only
+ * when \p commit_enable is set, in which case bit 5 of the message control is
+ * set as well.  Only the render cache and the Gen7 data cache are accepted
+ * as \p sfid.
+ */
+static void
+brw_set_memory_fence_message(struct brw_codegen *p,
+                             struct brw_inst *insn,
+                             enum brw_message_target sfid,
+                             bool commit_enable)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned response_len = commit_enable ? 1 : 0;
+   unsigned msg_type;
+
+   brw_set_message_descriptor(p, insn, sfid,
+                              1 /* message length */,
+                              response_len,
+                              true /* header present */,
+                              false);
+
+   /* Each cache has its own fence message type. */
+   switch (sfid) {
+   case GEN6_SFID_DATAPORT_RENDER_CACHE:
+      msg_type = GEN7_DATAPORT_RC_MEMORY_FENCE;
+      break;
+   case GEN7_SFID_DATAPORT_DATA_CACHE:
+      msg_type = GEN7_DATAPORT_DC_MEMORY_FENCE;
+      break;
+   default:
+      unreachable("Not reached");
+   }
+
+   brw_inst_set_dp_msg_type(devinfo, insn, msg_type);
+
+   if (commit_enable)
+      brw_inst_set_dp_msg_control(devinfo, insn, 1 << 5);
+}
+
+/**
+ * Emit a memory fence.
+ *
+ * A data cache fence SEND is always emitted.  On Gen7 parts other than
+ * Haswell a second fence is sent to the render cache using the register
+ * following \p dst, followed by a MOV from that register into \p dst.  On
+ * those same parts the fences are sent with commit enabled.
+ *
+ * \p dst is reduced to a single UW component and serves as both destination
+ * and source of the fence messages.
+ */
+void
+brw_memory_fence(struct brw_codegen *p,
+                 struct brw_reg dst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool gen7_non_hsw = devinfo->gen == 7 && !devinfo->is_haswell;
+   const bool commit_enable = gen7_non_hsw;
+   struct brw_inst *fence;
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_exec_size(p, BRW_EXECUTE_1);
+   dst = retype(vec1(dst), BRW_REGISTER_TYPE_UW);
+
+   /* Set dst as destination for dependency tracking, the MEMORY_FENCE
+    * message doesn't write anything back.
+    */
+   fence = next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, fence, dst);
+   brw_set_src0(p, fence, dst);
+   brw_set_memory_fence_message(p, fence, GEN7_SFID_DATAPORT_DATA_CACHE,
+                                commit_enable);
+
+   if (gen7_non_hsw) {
+      /* IVB does typed surface access through the render cache, so we need to
+       * flush it too.  Use a different register so both flushes can be
+       * pipelined by the hardware.
+       */
+      const struct brw_reg dst2 = offset(dst, 1);
+
+      fence = next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, fence, dst2);
+      brw_set_src0(p, fence, dst2);
+      brw_set_memory_fence_message(p, fence, GEN6_SFID_DATAPORT_RENDER_CACHE,
+                                   commit_enable);
+
+      /* Now write the response of the second message into the response of the
+       * first to trigger a pipeline stall -- This way future render and data
+       * cache messages will be properly ordered with respect to past data and
+       * render cache messages.
+       */
+      brw_MOV(p, dst, dst2);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+/**
+ * Emit a SEND to the pixel interpolator shared function.
+ *
+ * \param mrf             message payload register.
+ * \param noperspective   value written to the "nopersp" field.
+ * \param mode            value written to the message type field.
+ * \param data            message descriptor, either immediate or register;
+ *                        only its first component is used.
+ * \param msg_length      value written to the message-length field.
+ * \param response_length value written to the response-length field.
+ */
+void
+brw_pixel_interpolator_query(struct brw_codegen *p,
+                             struct brw_reg dest,
+                             struct brw_reg mrf,
+                             bool noperspective,
+                             unsigned mode,
+                             struct brw_reg data,
+                             unsigned msg_length,
+                             unsigned response_length)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool simd16 =
+      brw_inst_exec_size(devinfo, p->current) == BRW_EXECUTE_16;
+
+   /* brw_send_indirect_message will automatically use a direct send message
+    * if data is actually immediate.
+    */
+   struct brw_inst *query =
+      brw_send_indirect_message(p, GEN7_SFID_PIXEL_INTERPOLATOR,
+                                dest, mrf, vec1(data));
+
+   brw_inst_set_mlen(devinfo, query, msg_length);
+   brw_inst_set_rlen(devinfo, query, response_length);
+
+   brw_inst_set_pi_simd_mode(devinfo, query, simd16);
+   brw_inst_set_pi_slot_group(devinfo, query, 0); /* zero unless 32/64px dispatch */
+   brw_inst_set_pi_nopersp(devinfo, query, noperspective);
+   brw_inst_set_pi_message_type(devinfo, query, mode);
+}
+
+/**
+ * Emit code that stores in \p dst the index of the first active channel,
+ * relative to the quarter control of the current default instruction state.
+ *
+ * \p mask is the dispatch (or vector) mask; it must have type UD.  Requires
+ * Gen7 or later.
+ *
+ * Four code sequences are possible, chosen by the current access mode and by
+ * the hardware generation:
+ *
+ *  - Align1, Gen8+: FBL of the execution mask register, first combined with
+ *    \p mask unless \p mask is the immediate 0xffffffff.
+ *  - Align1, Gen7: build the execution mask in flag register f1.0 with masked
+ *    MOVs that have a conditional modifier, then FBL of the flag register.
+ *  - Align16, Gen8+ with an immediate 0xffffffff \p mask: AND of the negated
+ *    mask register with 1.
+ *  - Align16 otherwise: write 1 without execution masking, then 0 with
+ *    execution masking enabled.
+ *
+ * The default instruction state is saved on entry and restored on exit.
+ */
 +void
+brw_find_live_channel(struct brw_codegen *p, struct brw_reg dst,
+                      struct brw_reg mask)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   /* exec_size is the channel count decoded from the encoded field. */
+   const unsigned exec_size = 1 << brw_inst_exec_size(devinfo, p->current);
+   const unsigned qtr_control = brw_inst_qtr_control(devinfo, p->current);
+   brw_inst *inst;
+
+   assert(devinfo->gen >= 7);
+   assert(mask.type == BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+
+   if (brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1) {
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      if (devinfo->gen >= 8) {
+         /* Getting the first active channel index is easy on Gen8: Just find
+          * the first bit set in the execution mask.  The register exists on
+          * HSW already but it reads back as all ones when the current
+          * instruction has execution masking disabled, so it's kind of
+          * useless.
+          */
+         struct brw_reg exec_mask =
+            retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD);
+
+         if (mask.file != BRW_IMMEDIATE_VALUE || mask.ud != 0xffffffff) {
+            /* Unfortunately, ce0 does not take into account the thread
+             * dispatch mask, which may be a problem in cases where it's not
+             * tightly packed (i.e. it doesn't have the form '2^n - 1' for
+             * some n).  Combine ce0 with the given dispatch (or vector) mask
+             * to mask off those channels which were never dispatched by the
+             * hardware.
+             */
+            brw_SHR(p, vec1(dst), mask, brw_imm_ud(qtr_control * 8));
+            brw_AND(p, vec1(dst), exec_mask, vec1(dst));
+            exec_mask = vec1(dst);
+         }
+
+         /* Quarter control has the effect of magically shifting the value of
+          * ce0 so you'll get the first active channel relative to the
+          * specified quarter control as result.
+          */
+         inst = brw_FBL(p, vec1(dst), exec_mask);
+      } else {
+         const struct brw_reg flag = brw_flag_reg(1, 0);
+
+         /* Clear the flag register before accumulating the mask in it. */
+         brw_MOV(p, retype(flag, BRW_REGISTER_TYPE_UD), brw_imm_ud(0));
+
+         /* Run enough instructions returning zero with execution masking and
+          * a conditional modifier enabled in order to get the full execution
+          * mask in f1.0.  We could use a single 32-wide move here if it
+          * weren't because of the hardware bug that causes channel enables to
+          * be applied incorrectly to the second half of 32-wide instructions
+          * on Gen7.
+          */
+         const unsigned lower_size = MIN2(16, exec_size);
+         for (unsigned i = 0; i < exec_size / lower_size; i++) {
+            inst = brw_MOV(p, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW),
+                           brw_imm_uw(0));
+            /* Override the defaults on this MOV only: masking back on, the
+             * channel group it covers, and the flag update.
+             */
+            brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
+            brw_inst_set_group(devinfo, inst, lower_size * i + 8 * qtr_control);
+            brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_Z);
+            brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+            /* NOTE(review): cvt() is defined elsewhere; presumably it maps
+             * the channel count to the encoded exec size plus one -- confirm.
+             */
+            brw_inst_set_exec_size(devinfo, inst, cvt(lower_size) - 1);
+         }
+
+         /* Find the first bit set in the exec_size-wide portion of the flag
+          * register that was updated by the last sequence of MOV
+          * instructions.
+          */
+         const enum brw_reg_type type = brw_int_type(exec_size / 8, false);
+         brw_FBL(p, vec1(dst), byte_offset(retype(flag, type), qtr_control));
+      }
+   } else {
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      if (devinfo->gen >= 8 &&
+          mask.file == BRW_IMMEDIATE_VALUE && mask.ud == 0xffffffff) {
+         /* In SIMD4x2 mode the first active channel index is just the
+          * negation of the first bit of the mask register.  Note that ce0
+          * doesn't take into account the dispatch mask, so the Gen7 path
+          * should be used instead unless you have the guarantee that the
+          * dispatch mask is tightly packed (i.e. it has the form '2^n - 1'
+          * for some n).
+          */
+         inst = brw_AND(p, brw_writemask(dst, WRITEMASK_X),
+                        negate(retype(brw_mask_reg(0), BRW_REGISTER_TYPE_UD)),
+                        brw_imm_ud(1));
+
+      } else {
+         /* Overwrite the destination without and with execution masking to
+          * find out which of the channels is active.
+          */
+         brw_push_insn_state(p);
+         brw_set_default_exec_size(p, BRW_EXECUTE_4);
+         brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
+                 brw_imm_ud(1));
+
+         inst = brw_MOV(p, brw_writemask(vec4(dst), WRITEMASK_X),
+                        brw_imm_ud(0));
+         brw_pop_insn_state(p);
+         /* Only the second MOV (the one writing 0) is execution-masked. */
+         brw_inst_set_mask_control(devinfo, inst, BRW_MASK_ENABLE);
+      }
+   }
+
+   brw_pop_insn_state(p);
+}
+
+void
+brw_broadcast(struct brw_codegen *p,
+              struct brw_reg dst,
+              struct brw_reg src,
+              struct brw_reg idx)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool align1 = brw_inst_access_mode(devinfo, p->current) == BRW_ALIGN_1;
+   brw_inst *inst;
+   /* Emit code copying the element of src selected by idx into dst, unmasked. */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_exec_size(p, align1 ? BRW_EXECUTE_1 : BRW_EXECUTE_4);
+
+   assert(src.file == BRW_GENERAL_REGISTER_FILE &&
+          src.address_mode == BRW_ADDRESS_DIRECT);
+
+   if ((src.vstride == 0 && (src.hstride == 0 || !align1)) ||
+       idx.file == BRW_IMMEDIATE_VALUE) {
+      /* Trivial: the source is already uniform or the index is a constant, so
+       * one MOV of the selected element is enough.  We will typically not get
+       * here if the optimizer is doing its job, but asserting would be mean.
+       */
+      const unsigned i = idx.file == BRW_IMMEDIATE_VALUE ? idx.ud : 0;
+      brw_MOV(p, dst,
+              (align1 ? stride(suboffset(src, i), 0, 1, 0) :
+               stride(suboffset(src, 4 * i), 0, 4, 1)));
+   } else {
+      if (align1) {
+         const struct brw_reg addr =
+            retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD);
+         const unsigned offset = src.nr * REG_SIZE + src.subnr;
+         /* Limit in bytes of the signed indirect addressing immediate. */
+         const unsigned limit = 512;
+
+         brw_push_insn_state(p);
+         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+
+         /* addr = idx scaled by the component size and horizontal stride. */
+         assert(src.vstride == src.hstride + src.width);
+         brw_SHL(p, addr, vec1(idx),
+                 brw_imm_ud(_mesa_logbase2(type_sz(src.type)) +
+                            src.hstride - 1));
+
+         /* We can only address up to limit bytes using the indirect
+          * addressing immediate, so when the source register lies above that
+          * limit the excess (a multiple of limit) is added to addr instead.
+          */
+         if (offset >= limit)
+            brw_ADD(p, addr, addr, brw_imm_ud(offset - offset % limit));
+
+         brw_pop_insn_state(p);
+
+         /* Use indirect addressing to fetch the specified component. */
+         brw_MOV(p, dst,
+                 retype(brw_vec1_indirect(addr.subnr, offset % limit),
+                        src.type));
+      } else {
+         /* In SIMD4x2 mode the index can be either zero or one, replicate it
+          * to all bits of a flag register (flag register 1, via a .nz MOV),
+          */
+         inst = brw_MOV(p,
+                        brw_null_reg(),
+                        stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 4, 4, 1));
+         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE);
+         brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ);
+         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+
+         /* and use predicated SEL to pick the right channel. */
+         inst = brw_SEL(p, dst,
+                        stride(suboffset(src, 4), 4, 4, 1),
+                        stride(src, 4, 4, 1));
+         brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NORMAL);
+         brw_inst_set_flag_reg_nr(devinfo, inst, 1);
+      }
+   }
+
+   brw_pop_insn_state(p);
+}
+
+/**
+ * This instruction is generated as a single-channel align1 instruction by
+ * both the VS and FS stages when using INTEL_DEBUG=shader_time.
+ *
+ * We can't use the typed atomic op in the FS because that has the execution
+ * mask ANDed with the pixel mask, but we just want to write the one dword for
+ * all the pixels.
+ *
+ * We don't use the SIMD4x2 atomic ops in the VS because we want to just write
+ * one u32.  So we use the same untyped atomic write message as the pixel
+ * shader.
+ *
+ * The untyped atomic operation requires a BUFFER surface type with RAW
+ * format, and is only accessible through the legacy DATA_CACHE dataport
+ * messages.
+ */
+void brw_shader_time_add(struct brw_codegen *p,
+                         struct brw_reg payload,
+                         uint32_t surf_index)
+{
+   const unsigned sfid = (p->devinfo->gen >= 8 || p->devinfo->is_haswell ?
+                          HSW_SFID_DATAPORT_DATA_CACHE_1 :
+                          GEN7_SFID_DATAPORT_DATA_CACHE);
+   assert(p->devinfo->gen >= 7);
+   /* The SEND is emitted as an unmasked, uncompressed align1 instruction. */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+
+   /* We use brw_vec1_reg and unmasked because we want to increment the given
+    * offset only once.  Only the file and register number of payload are used.
+    */
+   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                                      BRW_ARF_NULL, 0));
+   brw_set_src0(p, send, brw_vec1_reg(payload.file,
+                                      payload.nr, 0));
+   brw_set_src1(p, send, brw_imm_ud(0));
+   brw_set_message_descriptor(p, send, sfid, 2, 0, false, false);
+   brw_inst_set_binding_table_index(p->devinfo, send, surf_index);
+   brw_set_dp_untyped_atomic_message(p, send, BRW_AOP_ADD, false);
+
+   brw_pop_insn_state(p);
+}
+
+
+/**
+ * Emit the barrier SEND to the message gateway, with \p src as its payload.
+ */
+void
+brw_barrier(struct brw_codegen *p, struct brw_reg src)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   struct brw_inst *inst;
+
+   assert(devinfo->gen >= 7);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   inst = next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, inst, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
+   brw_set_src0(p, inst, src);
+   brw_set_src1(p, inst, brw_null_reg());
+
+   brw_set_message_descriptor(p, inst, BRW_SFID_MESSAGE_GATEWAY,
+                              1 /* msg_length */,
+                              0 /* response_length */,
+                              false /* header_present */,
+                              false /* end_of_thread */);
+   /* Select the gateway's barrier sub-function and set its notify field. */
+   brw_inst_set_gateway_notify(devinfo, inst, 1);
+   brw_inst_set_gateway_subfuncid(devinfo, inst,
+                                  BRW_MESSAGE_GATEWAY_SFID_BARRIER_MSG);
+   /* The message itself is sent with execution masking disabled. */
+   brw_inst_set_mask_control(devinfo, inst, BRW_MASK_DISABLE);
+   brw_pop_insn_state(p);
+}
+
+
+/**
+ * Emit the WAIT instruction used together with a barrier message.
+ */
+void
+brw_WAIT(struct brw_codegen *p)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* The notification register is both destination and first source. */
+   const struct brw_reg notify = brw_notification_reg();
+   struct brw_inst *wait_insn = next_insn(p, BRW_OPCODE_WAIT);
+
+   brw_set_dest(p, wait_insn, notify);
+   brw_set_src0(p, wait_insn, notify);
+   brw_set_src1(p, wait_insn, brw_null_reg());
+
+   brw_inst_set_exec_size(devinfo, wait_insn, BRW_EXECUTE_1);
+   brw_inst_set_mask_control(devinfo, wait_insn, BRW_MASK_DISABLE);
+}
diff --git a/src/intel/compiler/brw_eu_util.c b/src/intel/compiler/brw_eu_util.c
new file mode 100644
index 00000000000..8c84cb45008
--- /dev/null
+++ b/src/intel/compiler/brw_eu_util.c
@@ -0,0 +1,123 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+
+#include "brw_eu_defines.h"
+#include "brw_eu.h"
+
+
+/* Emit a gen4_math() instruction applying BRW_MATH_FUNCTION_INV to src and
+ * writing dst, at full precision.
+ */
+void
+brw_math_invert(struct brw_codegen *p,
+                struct brw_reg dst,
+                struct brw_reg src)
+{
+   gen4_math(p, dst, BRW_MATH_FUNCTION_INV, 0, src,
+             BRW_MATH_PRECISION_FULL);
+}
+
+
+
+/* Copy count 32-byte blocks from src to dst, as two vec4 MOVs 16 bytes apart. */
+void brw_copy4(struct brw_codegen *p,
+               struct brw_reg dst,
+               struct brw_reg src,
+               unsigned count)
+{
+   const struct brw_reg dst4 = vec4(dst);
+   const struct brw_reg src4 = vec4(src);
+
+   for (unsigned blk = 0; blk < count; blk++) {
+      const unsigned lo = blk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, byte_offset(dst4, lo), byte_offset(src4, lo));
+      brw_MOV(p, byte_offset(dst4, hi), byte_offset(src4, hi));
+   }
+}
+
+
+/* Copy count 32-byte blocks from src to dst, using a single MOV of a vec8
+ * region for each block. */
+void brw_copy8(struct brw_codegen *p,
+               struct brw_reg dst,
+               struct brw_reg src,
+               unsigned count)
+{
+   const struct brw_reg dst8 = vec8(dst);
+   const struct brw_reg src8 = vec8(src);
+   unsigned offset = 0;
+
+   for (unsigned blk = 0; blk < count; blk++) {
+      brw_MOV(p, byte_offset(dst8, offset), byte_offset(src8, offset));
+      offset += 32;
+   }
+}
+
+
+/* Copy count 32-byte blocks from the src_ptr location to the dst_ptr one. */
+void brw_copy_indirect_to_indirect(struct brw_codegen *p,
+                                   struct brw_indirect dst_ptr,
+                                   struct brw_indirect src_ptr,
+                                   unsigned count)
+{
+   for (unsigned blk = 0; blk < count; blk++) {
+      const unsigned lo = blk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, deref_4f(dst_ptr, lo), deref_4f(src_ptr, lo));
+      brw_MOV(p, deref_4f(dst_ptr, hi), deref_4f(src_ptr, hi));
+   }
+}
+
+
+/* Copy count 32-byte blocks from the location addressed by ptr into dst. */
+void brw_copy_from_indirect(struct brw_codegen *p,
+                            struct brw_reg dst,
+                            struct brw_indirect ptr,
+                            unsigned count)
+{
+   const struct brw_reg dst4 = vec4(dst);
+
+   for (unsigned blk = 0; blk < count; blk++) {
+      const unsigned lo = blk * 32;
+      const unsigned hi = lo + 16;
+
+      brw_MOV(p, byte_offset(dst4, lo), deref_4f(ptr, lo));
+      brw_MOV(p, byte_offset(dst4, hi), deref_4f(ptr, hi));
+   }
+}
+
+
+
+
diff --git a/src/intel/compiler/brw_eu_validate.c b/src/intel/compiler/brw_eu_validate.c
new file mode 100644
index 00000000000..64615af44ac
--- /dev/null
+++ b/src/intel/compiler/brw_eu_validate.c
@@ -0,0 +1,1051 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_eu_validate.c
+ *
+ * This file implements a pass that validates shader assembly.
+ */
+
+#include "brw_eu.h"
+
+/* Heap string with an explicit length; cat() appends to it (OOM unchecked). */
+struct string {
+   char *str;
+   size_t len;
+};
+
+static void
+cat(struct string *dest, const struct string src)
+{
+   dest->str = realloc(dest->str, dest->len + src.len + 1);
+   memcpy(&dest->str[dest->len], src.str, src.len);
+   dest->len += src.len;
+   dest->str[dest->len] = '\0';
+}
+#define CAT(dest, src) cat(&dest, (struct string){src, strlen(src)})
+
+#define error(str)   "\tERROR: " str "\n"
+#define ERROR_INDENT "\t       "
+/* Append msg to the error_msg in scope; ERROR_IF does so only if cond holds. */
+#define ERROR(msg) ERROR_IF(true, msg)
+#define ERROR_IF(cond, msg)          \
+   do {                              \
+      if (cond) {                    \
+         CAT(error_msg, error(msg)); \
+      }                              \
+   } while(0)
+/* Call func(devinfo, inst, ...); append any message it returns, then free it. */
+#define CHECK(func, args...)                             \
+   do {                                                  \
+      struct string __msg = func(devinfo, inst, ##args); \
+      if (__msg.str) {                                   \
+         cat(&error_msg, __msg);                         \
+         free(__msg.str);                                \
+      }                                                  \
+   } while (0)
+
+/**
+ * Returns whether the opcode of \p inst is SEND, SENDC, SENDS or SENDSC.
+ */
+static bool
+inst_is_send(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const unsigned opcode = brw_inst_opcode(devinfo, inst);
+
+   return opcode == BRW_OPCODE_SEND ||
+          opcode == BRW_OPCODE_SENDC ||
+          opcode == BRW_OPCODE_SENDS ||
+          opcode == BRW_OPCODE_SENDSC;
+}
+
+/* Map UD, UW, UB and UQ to their signed counterparts; others pass through. */
+static unsigned
+signed_type(unsigned type)
+{
+   if (type == BRW_HW_REG_TYPE_UD)         return BRW_HW_REG_TYPE_D;
+   if (type == BRW_HW_REG_TYPE_UW)         return BRW_HW_REG_TYPE_W;
+   if (type == BRW_HW_REG_NON_IMM_TYPE_UB) return BRW_HW_REG_NON_IMM_TYPE_B;
+   if (type == GEN8_HW_REG_TYPE_UQ)        return GEN8_HW_REG_TYPE_Q;
+
+   return type;
+}
+
+/* A MOV without saturate or src0 modifiers whose types differ only in sign. */
+static bool
+inst_is_raw_move(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   if (brw_inst_opcode(devinfo, inst) != BRW_OPCODE_MOV ||
+       brw_inst_saturate(devinfo, inst) != 0)
+      return false;
+   const bool src0_is_imm =
+      brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE;
+   if (!src0_is_imm && (brw_inst_src0_negate(devinfo, inst) ||
+                        brw_inst_src0_abs(devinfo, inst)))
+      return false;
+   return signed_type(brw_inst_dst_reg_type(devinfo, inst)) ==
+          signed_type(brw_inst_src0_reg_type(devinfo, inst));
+}
+
+static bool
+dst_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const bool in_arf = brw_inst_dst_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE;
+   return in_arf && brw_inst_dst_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+static bool
+src0_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const bool in_arf = brw_inst_src0_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE;
+   return in_arf && brw_inst_src0_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+static bool
+src1_is_null(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const bool in_arf = brw_inst_src1_reg_file(devinfo, inst) == BRW_ARCHITECTURE_REGISTER_FILE;
+   return in_arf && brw_inst_src1_da_reg_nr(devinfo, inst) == BRW_ARF_NULL;
+}
+
+static bool
+src0_is_grf(const struct gen_device_info *devinfo, const brw_inst *inst)
+{  /* True when src0's register file field is the general register file. */
+   return brw_inst_src0_reg_file(devinfo, inst) == BRW_GENERAL_REGISTER_FILE;
+}
+
+static bool
+src0_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const bool unit_width = brw_inst_src0_width(devinfo, inst) == BRW_WIDTH_1;
+   return unit_width && brw_inst_src0_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
+          brw_inst_src0_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
+}
+
+static bool
+src1_has_scalar_region(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   const bool unit_width = brw_inst_src1_width(devinfo, inst) == BRW_WIDTH_1;
+   return unit_width && brw_inst_src1_vstride(devinfo, inst) == BRW_VERTICAL_STRIDE_0 &&
+          brw_inst_src1_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0;
+}
+
+/* Number of source operands \p inst reads.  MATH depends on the math function
+ * and pre-Gen6 SEND on its SFID; everything else comes from the opcode table.
+ */
+static unsigned
+num_sources_from_inst(const struct gen_device_info *devinfo,
+                      const brw_inst *inst)
+{
+   const unsigned opcode = brw_inst_opcode(devinfo, inst);
+   const struct opcode_desc *desc = brw_opcode_desc(devinfo, opcode);
+
+   if (devinfo->gen < 6 && opcode == BRW_OPCODE_SEND) {
+      /* For an extended math SEND, src1 must be a descriptor (including the
+       * information to determine that the SEND is doing an extended math
+       * operation), but src0 can actually be null since it serves as the
+       * source of the implicit GRF to MRF move.  If we stop using that
+       * functionality, we'll have to revisit this.
+       *
+       * Other send instructions are allowed to have null sources since they
+       * use the base_mrf field to specify which message register source.
+       */
+      return brw_inst_sfid(devinfo, inst) == BRW_SFID_MATH ? 2 : 0;
+   }
+
+   /* Everything but MATH takes its source count straight from the opcode
+    * description table.
+    */
+   if (opcode != BRW_OPCODE_MATH) {
+      assert(desc->nsrc < 4);
+      return desc->nsrc;
+   }
+
+   /* For MATH the count depends on the selected math function. */
+   switch (brw_inst_math_function(devinfo, inst)) {
+   case BRW_MATH_FUNCTION_INV:
+   case BRW_MATH_FUNCTION_LOG:
+   case BRW_MATH_FUNCTION_EXP:
+   case BRW_MATH_FUNCTION_SQRT:
+   case BRW_MATH_FUNCTION_RSQ:
+   case BRW_MATH_FUNCTION_SIN:
+   case BRW_MATH_FUNCTION_COS:
+   case BRW_MATH_FUNCTION_SINCOS:
+   case GEN8_MATH_FUNCTION_INVM:
+   case GEN8_MATH_FUNCTION_RSQRTM:
+      return 1;
+   case BRW_MATH_FUNCTION_FDIV:
+   case BRW_MATH_FUNCTION_POW:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
+   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
+   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
+      return 2;
+   default:
+      unreachable("not reached");
+   }
+}
+
+/* Reports each source operand of \p inst that is the null register. */
+static struct string
+sources_not_null(const struct gen_device_info *devinfo,
+                 const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+   const unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   /* Nothing to test. 3-src instructions can only have GRF sources, and
+    * there's no bit to control the file.
+    */
+   if (num_sources == 3)
+      return error_msg;
+
+   if (num_sources >= 1 && src0_is_null(devinfo, inst))
+      ERROR("src0 is null");
+
+   if (num_sources == 2 && src1_is_null(devinfo, inst))
+      ERROR("src1 is null");
+
+   return error_msg;
+}
+
+/* Checks the operand rules that apply to BRW_OPCODE_SEND instructions. */
+static struct string
+send_restrictions(const struct gen_device_info *devinfo,
+                  const brw_inst *inst)
+{
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SEND)
+      return error_msg;
+   ERROR_IF(brw_inst_src0_address_mode(devinfo, inst) != BRW_ADDRESS_DIRECT,
+            "send must use direct addressing");
+   if (devinfo->gen < 7)
+      return error_msg;
+   ERROR_IF(!src0_is_grf(devinfo, inst), "send from non-GRF");
+   ERROR_IF(brw_inst_eot(devinfo, inst) &&
+            brw_inst_src0_da_reg_nr(devinfo, inst) < 112,
+            "send with EOT must use g112-g127");
+
+   return error_msg;
+}
+
+static bool
+is_unsupported_inst(const struct gen_device_info *devinfo,
+                    const brw_inst *inst)
+{
+   return !brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+}
+
+/* Collapse a HW register type encoding to the type it executes as.  The
+ * meaning of the type bits depends on whether the operand is an immediate,
+ * so immediate-only encodings are first mapped to a register encoding.
+ */
+static unsigned
+execution_type_for_type(unsigned type, bool is_immediate)
+{
+   if (is_immediate) {
+      if (type == BRW_HW_REG_IMM_TYPE_UV || type == BRW_HW_REG_IMM_TYPE_V)
+         type = BRW_HW_REG_TYPE_W;
+      else if (type == BRW_HW_REG_IMM_TYPE_VF)
+         type = BRW_HW_REG_TYPE_F;
+      else if (type == GEN8_HW_REG_IMM_TYPE_DF)
+         type = GEN7_HW_REG_NON_IMM_TYPE_DF;
+      else if (type == GEN8_HW_REG_IMM_TYPE_HF)
+         type = GEN8_HW_REG_NON_IMM_TYPE_HF;
+   }
+
+   switch (type) {
+   /* F, DF and HF execute as themselves. */
+   case BRW_HW_REG_TYPE_F:
+   case GEN7_HW_REG_NON_IMM_TYPE_DF:
+   case GEN8_HW_REG_NON_IMM_TYPE_HF:
+      return type;
+
+   /* UQ and Q execute as Q. */
+   case GEN8_HW_REG_TYPE_UQ:
+   case GEN8_HW_REG_TYPE_Q:
+      return GEN8_HW_REG_TYPE_Q;
+
+   /* UD and D execute as D. */
+   case BRW_HW_REG_TYPE_UD:
+   case BRW_HW_REG_TYPE_D:
+      return BRW_HW_REG_TYPE_D;
+
+   /* UW, W, UB and B all execute as W. */
+   case BRW_HW_REG_TYPE_UW:
+   case BRW_HW_REG_TYPE_W:
+   case BRW_HW_REG_NON_IMM_TYPE_UB:
+   case BRW_HW_REG_NON_IMM_TYPE_B:
+      return BRW_HW_REG_TYPE_W;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Returns the execution type (a HW type encoding) of an instruction \p inst
+ */
+static unsigned
+execution_type(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned src0_exec_type, src1_exec_type;
+   unsigned src0_type = brw_inst_src0_reg_type(devinfo, inst);
+   unsigned src1_type = brw_inst_src1_reg_type(devinfo, inst);
+
+   bool src0_is_immediate =
+      brw_inst_src0_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE;
+   bool src1_is_immediate =
+      brw_inst_src1_reg_file(devinfo, inst) == BRW_IMMEDIATE_VALUE;
+
+   /* Execution data type is independent of destination data type, except in
+    * mixed F/HF instructions on CHV and SKL+.
+    */
+   unsigned dst_exec_type = brw_inst_dst_reg_type(devinfo, inst);
+   /* One source: src0 decides, except that HF on CHV/SKL+ follows the dst. */
+   src0_exec_type = execution_type_for_type(src0_type, src0_is_immediate);
+   if (num_sources == 1) {
+      if ((devinfo->gen >= 9 || devinfo->is_cherryview) &&
+          src0_exec_type == GEN8_HW_REG_NON_IMM_TYPE_HF) {
+         return dst_exec_type;
+      }
+      return src0_exec_type;
+   }
+   /* Two sources: identical execution types need no further resolution. */
+   src1_exec_type = execution_type_for_type(src1_type, src1_is_immediate);
+   if (src0_exec_type == src1_exec_type)
+      return src0_exec_type;
+
+   /* Mixed operand types where one is float is float on Gen < 6
+    * (and not allowed on later platforms)
+    */
+   if (devinfo->gen < 6 &&
+       (src0_exec_type == BRW_HW_REG_TYPE_F ||
+        src1_exec_type == BRW_HW_REG_TYPE_F))
+      return BRW_HW_REG_TYPE_F;
+   /* Otherwise the first type found on either source wins: Q, D, W, DF. */
+   if (src0_exec_type == GEN8_HW_REG_TYPE_Q ||
+       src1_exec_type == GEN8_HW_REG_TYPE_Q)
+      return GEN8_HW_REG_TYPE_Q;
+
+   if (src0_exec_type == BRW_HW_REG_TYPE_D ||
+       src1_exec_type == BRW_HW_REG_TYPE_D)
+      return BRW_HW_REG_TYPE_D;
+
+   if (src0_exec_type == BRW_HW_REG_TYPE_W ||
+       src1_exec_type == BRW_HW_REG_TYPE_W)
+      return BRW_HW_REG_TYPE_W;
+
+   if (src0_exec_type == GEN7_HW_REG_NON_IMM_TYPE_DF ||
+       src1_exec_type == GEN7_HW_REG_NON_IMM_TYPE_DF)
+      return GEN7_HW_REG_NON_IMM_TYPE_DF;
+   /* Mixed F/HF on CHV and SKL+: any F operand, dst included, selects F. */
+   if (devinfo->gen >= 9 || devinfo->is_cherryview) {
+      if (dst_exec_type == BRW_HW_REG_TYPE_F ||
+          src0_exec_type == BRW_HW_REG_TYPE_F ||
+          src1_exec_type == BRW_HW_REG_TYPE_F) {
+         return BRW_HW_REG_TYPE_F;
+      } else {
+         return GEN8_HW_REG_NON_IMM_TYPE_HF;
+      }
+   }
+
+   assert(src0_exec_type == BRW_HW_REG_TYPE_F);
+   return BRW_HW_REG_TYPE_F;
+}
+
+/**
+ * Returns whether a region is packed
+ *
+ * A region is packed if its elements are adjacent in memory, with no
+ * intervening space, no overlap, and no replicated values.
+ */
+static bool
+is_packed(unsigned vstride, unsigned width, unsigned hstride)
+{
+   /* The vertical stride has to equal the row width. */
+   if (vstride != width)
+      return false;
+
+   /* A one-element row (vstride == width == 1) is packed with hstride 0;
+    * any wider row needs hstride 1.
+    */
+   const unsigned packed_hstride = vstride == 1 ? 0 : 1;
+   return hstride == packed_hstride;
+}
+
+/**
+ * Checks restrictions listed in "General Restrictions Based on Operand Types"
+ * in the "Register Region Restrictions" section.
+ *
+ * Returns the accumulated error text; .str is NULL when no rule is violated.
+ * Three-source instructions, sends, ExecSize 1 instructions and opcodes
+ * without a destination are skipped.
+ */
+static struct string
+general_restrictions_based_on_operand_types(const struct gen_device_info *devinfo,
+                                            const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (inst_is_send(devinfo, inst))
+      return (struct string){};
+
+   if (exec_size == 1)
+      return (struct string){};
+
+   if (desc->ndst == 0)
+      return (struct string){};
+
+   /* The PRMs say:
+    *
+    *    Where n is the largest element size in bytes for any source or
+    *    destination operand type, ExecSize * n must be <= 64.
+    *
+    * But we do not attempt to enforce it, because it is implied by other
+    * rules:
+    *
+    *    - that the destination stride must match the execution data type
+    *    - sources may not span more than two adjacent GRF registers
+    *    - destination may not span more than two adjacent GRF registers
+    *
+    * In fact, checking it would weaken testing of the other rules.
+    */
+
+   /* An encoded DstHorzStride of 0 is decoded as a stride of 0 elements
+    * rather than shifted by (0 - 1), which would be undefined behaviour.  A
+    * zero stride can never equal the execution-type ratio, so the stride
+    * check below reports it whenever that check applies.
+    */
+   const unsigned dst_hstride = brw_inst_dst_hstride(devinfo, inst);
+   unsigned dst_stride = dst_hstride ? 1 << (dst_hstride - 1) : 0;
+   bool dst_type_is_byte =
+      brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_B ||
+      brw_inst_dst_reg_type(devinfo, inst) == BRW_HW_REG_NON_IMM_TYPE_UB;
+
+   /* A packed byte destination is only legal for a raw MOV, which in turn
+    * is exempt from the remaining rules. */
+   if (dst_type_is_byte) {
+      if (is_packed(exec_size * dst_stride, exec_size, dst_stride)) {
+         if (!inst_is_raw_move(devinfo, inst)) {
+            ERROR("Only raw MOV supports a packed-byte destination");
+            return error_msg;
+         } else {
+            return (struct string){};
+         }
+      }
+   }
+
+   unsigned exec_type = execution_type(devinfo, inst);
+   unsigned exec_type_size =
+      brw_hw_reg_type_to_size(devinfo, exec_type, BRW_GENERAL_REGISTER_FILE);
+   unsigned dst_type_size = brw_element_size(devinfo, inst, dst);
+
+   if (exec_type_size > dst_type_size) {
+      ERROR_IF(dst_stride * dst_type_size != exec_type_size,
+               "Destination stride must be equal to the ratio of the sizes of "
+               "the execution data type to the destination type");
+
+      unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+
+      if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_1 &&
+          brw_inst_dst_address_mode(devinfo, inst) == BRW_ADDRESS_DIRECT) {
+         /* The i965 PRM says:
+          *
+          *    Implementation Restriction: The relaxed alignment rule for byte
+          *    destination (#10.5) is not supported.
+          */
+         if ((devinfo->gen > 4 || devinfo->is_g4x) && dst_type_is_byte) {
+            ERROR_IF(subreg % exec_type_size != 0 &&
+                     subreg % exec_type_size != 1,
+                     "Destination subreg must be aligned to the size of the "
+                     "execution data type (or to the next lowest byte for byte "
+                     "destinations)");
+         } else {
+            ERROR_IF(subreg % exec_type_size != 0,
+                     "Destination subreg must be aligned to the size of the "
+                     "execution data type");
+         }
+      }
+   }
+
+   return error_msg;
+}
+
+/**
+ * Checks restrictions listed in "General Restrictions on Regioning Parameters"
+ * in the "Register Region Restrictions" section.
+ */
+static struct string
+general_restrictions_on_region_parameters(const struct gen_device_info *devinfo,
+                                          const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16) {
+      if (desc->ndst != 0 && !dst_is_null(devinfo, inst))
+         ERROR_IF(brw_inst_dst_hstride(devinfo, inst) != BRW_HORIZONTAL_STRIDE_1,
+                  "Destination Horizontal Stride must be 1");
+
+      if (num_sources >= 1) {
+         if (devinfo->is_haswell || devinfo->gen >= 8) {
+            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
+         } else {
+            ERROR_IF(brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src0_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
+         }
+      }
+
+      if (num_sources == 2) {
+         if (devinfo->is_haswell || devinfo->gen >= 8) {
+            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_2 &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0, 2, or 4 is allowed");
+         } else {
+            ERROR_IF(brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_0 &&
+                     brw_inst_src1_vstride(devinfo, inst) != BRW_VERTICAL_STRIDE_4,
+                     "In Align16 mode, only VertStride of 0 or 4 is allowed");
+         }
+      }
+
+      return error_msg;
+   }
+
+   for (unsigned i = 0; i < num_sources; i++) {
+      unsigned vstride, width, hstride, element_size, subreg;
+
+#define DO_SRC(n)                                                              \
+      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
+          BRW_IMMEDIATE_VALUE)                                                 \
+         continue;                                                             \
+                                                                               \
+      vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \
+      width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                 \
+      hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \
+      element_size = brw_element_size(devinfo, inst, src ## n);                \
+      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst)
+
+      if (i == 0) {
+         DO_SRC(0);
+      } else if (i == 1) {
+         DO_SRC(1);
+      }
+#undef DO_SRC
+
+      /* ExecSize must be greater than or equal to Width. */
+      ERROR_IF(exec_size < width, "ExecSize must be greater than or equal "
+                                  "to Width");
+
+      /* If ExecSize = Width and HorzStride ≠ 0,
+       * VertStride must be set to Width * HorzStride.
+       */
+      if (exec_size == width && hstride != 0) {
+         ERROR_IF(vstride != width * hstride,
+                  "If ExecSize = Width and HorzStride ≠ 0, "
+                  "VertStride must be set to Width * HorzStride");
+      }
+
+      /* If Width = 1, HorzStride must be 0 regardless of the values of
+       * ExecSize and VertStride.
+       */
+      if (width == 1) {
+         ERROR_IF(hstride != 0,
+                  "If Width = 1, HorzStride must be 0 regardless "
+                  "of the values of ExecSize and VertStride");
+      }
+
+      /* If ExecSize = Width = 1, both VertStride and HorzStride must be 0. */
+      if (exec_size == 1 && width == 1) {
+         ERROR_IF(vstride != 0 || hstride != 0,
+                  "If ExecSize = Width = 1, both VertStride "
+                  "and HorzStride must be 0");
+      }
+
+      /* If VertStride = HorzStride = 0, Width must be 1 regardless of the
+       * value of ExecSize.
+       */
+      if (vstride == 0 && hstride == 0) {
+         ERROR_IF(width != 1,
+                  "If VertStride = HorzStride = 0, Width must be "
+                  "1 regardless of the value of ExecSize");
+      }
+
+      /* VertStride must be used to cross GRF register boundaries. This rule
+       * implies that elements within a 'Width' cannot cross GRF boundaries.
+       */
+      const uint64_t mask = (1 << element_size) - 1;
+      unsigned rowbase = subreg;
+
+      for (int y = 0; y < exec_size / width; y++) {
+         uint64_t access_mask = 0;
+         unsigned offset = rowbase;
+
+         for (int x = 0; x < width; x++) {
+            access_mask |= mask << offset;
+            offset += hstride * element_size;
+         }
+
+         rowbase += vstride * element_size;
+
+         if ((uint32_t)access_mask != 0 && (access_mask >> 32) != 0) {
+            ERROR("VertStride must be used to cross GRF register boundaries");
+            break;
+         }
+      }
+   }
+
+   /* Dst.HorzStride must not be 0. */
+   if (desc->ndst != 0 && !dst_is_null(devinfo, inst)) {
+      ERROR_IF(brw_inst_dst_hstride(devinfo, inst) == BRW_HORIZONTAL_STRIDE_0,
+               "Destination Horizontal Stride must not be 0");
+   }
+
+   return error_msg;
+}
+
+/**
+ * Creates an \p access_mask for an \p exec_size, \p element_size, and a region
+ *
+ * An \p access_mask is a 32-element array of uint64_t, where each uint64_t is
+ * a bitmask of bytes accessed by the region.
+ *
+ * For instance the access mask of the source gX.1<4,2,2>F in an exec_size = 4
+ * instruction would be
+ *
+ *    access_mask[0] = 0x00000000000000F0
+ *    access_mask[1] = 0x000000000000F000
+ *    access_mask[2] = 0x0000000000F00000
+ *    access_mask[3] = 0x00000000F0000000
+ *    access_mask[4-31] = 0
+ *
+ * because the first execution channel accesses bytes 7-4 and the second
+ * execution channel accesses bytes 15-12, etc.
+ */
+static void
+align1_access_mask(uint64_t access_mask[static 32],
+                   unsigned exec_size, unsigned element_size, unsigned subreg,
+                   unsigned vstride, unsigned width, unsigned hstride)
+{
+   /* One bit per byte of an element; 1ULL keeps the shift 64 bits wide. */
+   const uint64_t mask = (1ULL << element_size) - 1;
+   unsigned rowbase = subreg;
+   unsigned element = 0;
+
+   for (unsigned y = 0; y < exec_size / width; y++) {
+      unsigned offset = rowbase;
+      for (unsigned x = 0; x < width; x++) {
+         /* Wrap byte offsets past the 64-bit window: shifting a 64-bit
+          * value by 64 or more is undefined behaviour in C. */
+         access_mask[element++] = mask << (offset % 64);
+         offset += hstride * element_size;
+      }
+      rowbase += vstride * element_size;
+   }
+   assert(element == 0 || element == exec_size);
+}
+
+/**
+ * Returns the number of registers accessed according to the \p access_mask
+ */
+static int
+registers_read(const uint64_t access_mask[static 32])
+{
+   /* Any bit above the low 32 makes the original return 2. */
+   const uint64_t upper_bits = ~(uint64_t)0xFFFFFFFF;
+   uint64_t touched = 0;
+
+   /* OR all channels together; the answer depends only on the union. */
+   for (unsigned chan = 0; chan < 32; chan++)
+      touched |= access_mask[chan];
+
+   if (touched & upper_bits)
+      return 2;
+   return touched ? 1 : 0;
+}
+
+/**
+ * Checks restrictions listed in "Region Alignment Rules" in the "Register
+ * Region Restrictions" section.
+ */
+static struct string
+region_alignment_rules(const struct gen_device_info *devinfo,
+                       const brw_inst *inst)
+{
+   const struct opcode_desc *desc =
+      brw_opcode_desc(devinfo, brw_inst_opcode(devinfo, inst));
+   unsigned num_sources = num_sources_from_inst(devinfo, inst);
+   unsigned exec_size = 1 << brw_inst_exec_size(devinfo, inst);
+   uint64_t dst_access_mask[32], src0_access_mask[32], src1_access_mask[32];
+   struct string error_msg = { .str = NULL, .len = 0 };
+
+   if (num_sources == 3)
+      return (struct string){};
+
+   if (brw_inst_access_mode(devinfo, inst) == BRW_ALIGN_16)
+      return (struct string){};
+
+   if (inst_is_send(devinfo, inst))
+      return (struct string){};
+
+   memset(dst_access_mask, 0, sizeof(dst_access_mask));
+   memset(src0_access_mask, 0, sizeof(src0_access_mask));
+   memset(src1_access_mask, 0, sizeof(src1_access_mask));
+
+   for (unsigned i = 0; i < num_sources; i++) {
+      unsigned vstride, width, hstride, element_size, subreg;
+
+      /* In Direct Addressing mode, a source cannot span more than 2 adjacent
+       * GRF registers.
+       */
+#define DO_SRC(n)                                                              \
+      if (brw_inst_src ## n ## _address_mode(devinfo, inst) !=                 \
+          BRW_ADDRESS_DIRECT)                                                  \
+         continue;                                                             \
+                                                                               \
+      if (brw_inst_src ## n ## _reg_file(devinfo, inst) ==                     \
+          BRW_IMMEDIATE_VALUE)                                                 \
+         continue;                                                             \
+                                                                               \
+      vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0; \
+      width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                 \
+      hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                 \
+                (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0; \
+      element_size = brw_element_size(devinfo, inst, src ## n);                \
+      subreg = brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);             \
+      align1_access_mask(src ## n ## _access_mask,                             \
+                         exec_size, element_size, subreg,                      \
+                         vstride, width, hstride)
+
+      if (i == 0) {
+         DO_SRC(0);
+      } else if (i == 1) {
+         DO_SRC(1);
+      }
+#undef DO_SRC
+
+      unsigned num_vstride = exec_size / width;
+      unsigned num_hstride = width;
+      unsigned vstride_elements = (num_vstride - 1) * vstride;
+      unsigned hstride_elements = (num_hstride - 1) * hstride;
+      unsigned offset = (vstride_elements + hstride_elements) * element_size +
+                        subreg;
+      ERROR_IF(offset >= 64,
+               "A source cannot span more than 2 adjacent GRF registers");
+   }
+
+   if (desc->ndst == 0 || dst_is_null(devinfo, inst))
+      return error_msg;
+
+   /* Encoding 0 means stride 0 (as for sources); 1 << (0 - 1) is undefined. */
+   unsigned stride = brw_inst_dst_hstride(devinfo, inst) ?
+                     (1 << (brw_inst_dst_hstride(devinfo, inst) - 1)) : 0;
+   unsigned element_size = brw_element_size(devinfo, inst, dst);
+   unsigned subreg = brw_inst_dst_da1_subreg_nr(devinfo, inst);
+   unsigned offset = ((exec_size - 1) * stride * element_size) + subreg;
+   ERROR_IF(offset >= 64,
+            "A destination cannot span more than 2 adjacent GRF registers");
+
+   if (error_msg.str)
+      return error_msg;
+
+   align1_access_mask(dst_access_mask, exec_size, element_size, subreg,
+                      exec_size == 1 ? 0 : exec_size * stride,
+                      exec_size == 1 ? 1 : exec_size,
+                      exec_size == 1 ? 0 : stride);
+
+   unsigned dst_regs = registers_read(dst_access_mask);
+   unsigned src0_regs = registers_read(src0_access_mask);
+   unsigned src1_regs = registers_read(src1_access_mask);
+
+   /* The SNB, IVB, HSW, BDW, and CHV PRMs say:
+    *
+    *    When an instruction has a source region spanning two registers and a
+    *    destination region contained in one register, the number of elements
+    *    must be the same between two sources and one of the following must be
+    *    true:
+    *
+    *       1. The destination region is entirely contained in the lower OWord
+    *          of a register.
+    *       2. The destination region is entirely contained in the upper OWord
+    *          of a register.
+    *       3. The destination elements are evenly split between the two OWords
+    *          of a register.
+    */
+   if (devinfo->gen <= 8) {
+      if (dst_regs == 1 && (src0_regs == 2 || src1_regs == 2)) {
+         unsigned upper_oword_writes = 0, lower_oword_writes = 0;
+
+         for (unsigned i = 0; i < exec_size; i++) {
+            if (dst_access_mask[i] > 0x0000FFFF) {
+               upper_oword_writes++;
+            } else {
+               assert(dst_access_mask[i] != 0);
+               lower_oword_writes++;
+            }
+         }
+
+         ERROR_IF(lower_oword_writes != 0 &&
+                  upper_oword_writes != 0 &&
+                  upper_oword_writes != lower_oword_writes,
+                  "Writes must be to only one OWord or "
+                  "evenly split between OWords");
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When an instruction has a source region that spans two registers and
+    *    the destination spans two registers, the destination elements must be
+    *    evenly split between the two registers [...]
+    *
+    * The SNB PRM contains similar wording (but written in a much more
+    * confusing manner).
+    *
+    * The BDW PRM says:
+    *
+    *    When destination spans two registers, the source may be one or two
+    *    registers. The destination elements must be evenly split between the
+    *    two registers.
+    *
+    * The SKL PRM says:
+    *
+    *    When destination of MATH instruction spans two registers, the
+    *    destination elements must be evenly split between the two registers.
+    *
+    * It is not known whether this restriction applies to KBL or other Gens
+    * after SKL.
+    */
+   if (devinfo->gen <= 8 ||
+       brw_inst_opcode(devinfo, inst) == BRW_OPCODE_MATH) {
+      /* Nothing explicitly states that on Gen < 8 elements must be evenly
+       * split between two destination registers in the two exceptional
+       * source-region-spans-one-register cases, but since Broadwell requires
+       * evenly split writes regardless of source region, we assume that it was
+       * an oversight and require it.
+       */
+      if (dst_regs == 2) {
+         unsigned upper_reg_writes = 0, lower_reg_writes = 0;
+
+         for (unsigned i = 0; i < exec_size; i++) {
+            if (dst_access_mask[i] > 0xFFFFFFFF) {
+               upper_reg_writes++;
+            } else {
+               assert(dst_access_mask[i] != 0);
+               lower_reg_writes++;
+            }
+         }
+
+         ERROR_IF(upper_reg_writes != lower_reg_writes,
+                  "Writes must be evenly split between the two "
+                  "destination registers");
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When an instruction has a source region that spans two registers and
+    *    the destination spans two registers, the destination elements must be
+    *    evenly split between the two registers and each destination register
+    *    must be entirely derived from one source register.
+    *
+    *    Note: In such cases, the regioning parameters must ensure that the
+    *    offset from the two source registers is the same.
+    *
+    * The SNB PRM contains similar wording (but written in a much more
+    * confusing manner).
+    *
+    * There are effectively three rules stated here:
+    *
+    *    For an instruction with a source and a destination spanning two
+    *    registers,
+    *
+    *       (1) destination elements must be evenly split between the two
+    *           registers
+    *       (2) all destination elements in a register must be derived
+    *           from one source register
+    *       (3) the offset (i.e. the starting location in each of the two
+    *           registers spanned by a region) must be the same in the two
+    *           registers spanned by a region
+    *
+    * It is impossible to violate rule (1) without violating (2) or (3), so we
+    * do not attempt to validate it.
+    */
+   if (devinfo->gen <= 7 && dst_regs == 2) {
+      for (unsigned i = 0; i < num_sources; i++) {
+#define DO_SRC(n)                                                             \
+         if (src ## n ## _regs <= 1)                                          \
+            continue;                                                         \
+                                                                              \
+         for (unsigned i = 0; i < exec_size; i++) {                           \
+            if ((dst_access_mask[i] > 0xFFFFFFFF) !=                          \
+                (src ## n ## _access_mask[i] > 0xFFFFFFFF)) {                 \
+               ERROR("Each destination register must be entirely derived "    \
+                     "from one source register");                             \
+               break;                                                         \
+            }                                                                 \
+         }                                                                    \
+                                                                              \
+         unsigned offset_0 =                                                  \
+            brw_inst_src ## n ## _da1_subreg_nr(devinfo, inst);               \
+         unsigned offset_1 = offset_0;                                        \
+                                                                              \
+         for (unsigned i = 0; i < exec_size; i++) {                           \
+            if (src ## n ## _access_mask[i] > 0xFFFFFFFF) {                   \
+               offset_1 = __builtin_ctzll(src ## n ## _access_mask[i]) - 32;  \
+               break;                                                         \
+            }                                                                 \
+         }                                                                    \
+                                                                              \
+         ERROR_IF(offset_0 != offset_1,                                       \
+                  "The offset from the two source registers "                 \
+                  "must be the same")
+
+         if (i == 0) {
+            DO_SRC(0);
+         } else if (i == 1) {
+            DO_SRC(1);
+         }
+#undef DO_SRC
+      }
+   }
+
+   /* The IVB and HSW PRMs say:
+    *
+    *    When destination spans two registers, the source MUST span two
+    *    registers. The exception to the above rule:
+    *        1. When source is scalar, the source registers are not
+    *           incremented.
+    *        2. When source is packed integer Word and destination is packed
+    *           integer DWord, the source register is not incremented by the
+    *           source sub register is incremented.
+    *
+    * The SNB PRM does not contain this rule, but the internal documentation
+    * indicates that it applies to SNB as well. We assume that the rule applies
+    * to Gen <= 5 although their PRMs do not state it.
+    *
+    * While the documentation explicitly says in exception (2) that the
+    * destination must be an integer DWord, the hardware allows at least a
+    * float destination type as well. We emit such instructions from
+    *
+    *    fs_visitor::emit_interpolation_setup_gen6
+    *    fs_visitor::emit_fragcoord_interpolation
+    *
+    * and have for years with no ill effects.
+    *
+    * Additionally the simulator source code indicates that the real condition
+    * is that the size of the destination type is 4 bytes.
+    */
+   if (devinfo->gen <= 7 && dst_regs == 2) {
+      bool dst_is_packed_dword =
+         is_packed(exec_size * stride, exec_size, stride) &&
+         brw_element_size(devinfo, inst, dst) == 4;
+
+      for (unsigned i = 0; i < num_sources; i++) {
+#define DO_SRC(n)                                                                  \
+         unsigned vstride, width, hstride;                                         \
+         vstride = brw_inst_src ## n ## _vstride(devinfo, inst) ?                  \
+                   (1 << (brw_inst_src ## n ## _vstride(devinfo, inst) - 1)) : 0;  \
+         width = 1 << brw_inst_src ## n ## _width(devinfo, inst);                  \
+         hstride = brw_inst_src ## n ## _hstride(devinfo, inst) ?                  \
+                   (1 << (brw_inst_src ## n ## _hstride(devinfo, inst) - 1)) : 0;  \
+         bool src ## n ## _is_packed_word =                                        \
+            is_packed(vstride, width, hstride) &&                                  \
+            (brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_W || \
+             brw_inst_src ## n ## _reg_type(devinfo, inst) == BRW_HW_REG_TYPE_UW); \
+                                                                                   \
+         ERROR_IF(src ## n ## _regs == 1 &&                                        \
+                  !src ## n ## _has_scalar_region(devinfo, inst) &&                \
+                  !(dst_is_packed_dword && src ## n ## _is_packed_word),           \
+                  "When the destination spans two registers, the source must "     \
+                  "span two registers\n" ERROR_INDENT "(exceptions for scalar "    \
+                  "source and packed-word to packed-dword expansion)")
+
+         if (i == 0) {
+            DO_SRC(0);
+         } else if (i == 1) {
+            DO_SRC(1);
+         }
+#undef DO_SRC
+      }
+   }
+
+   return error_msg;
+}
+
+bool
+brw_validate_instructions(const struct brw_codegen *p, int start_offset,
+                          struct annotation_info *annotation)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const char *const insns = (const char *)p->store;
+   bool valid = true;
+   int src_offset = start_offset;
+
+   /* Step through the store in sizeof(brw_inst) strides, one check each. */
+   while (src_offset < p->next_insn_offset) {
+      struct string error_msg = { .str = NULL, .len = 0 };
+      const brw_inst *inst = (const brw_inst *)(insns + src_offset);
+
+      if (!is_unsupported_inst(devinfo, inst)) {
+         CHECK(sources_not_null);
+         CHECK(send_restrictions);
+         CHECK(general_restrictions_based_on_operand_types);
+         CHECK(general_restrictions_on_region_parameters);
+         CHECK(region_alignment_rules);
+      } else {
+         ERROR("Instruction not supported on this Gen");
+      }
+      if (annotation && error_msg.str)
+         annotation_insert_error(annotation, src_offset, error_msg.str);
+      if (error_msg.len != 0)
+         valid = false;
+      free(error_msg.str);
+      src_offset += sizeof(brw_inst);
+   }
+   return valid;
+}
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
new file mode 100644
index 00000000000..c410efc29d6
--- /dev/null
+++ b/src/intel/compiler/brw_fs.cpp
@@ -0,0 +1,6805 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs.cpp
+ *
+ * This file drives the GLSL IR -> LIR translation, contains the
+ * optimizations on the LIR, and drives the generation of native code
+ * from the LIR.
+ */
+
+#include "main/macros.h"
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_nir.h"
+#include "brw_vec4_gs_visitor.h"
+#include "brw_cfg.h"
+#include "brw_dead_control_flow.h"
+#include "common/gen_debug.h"
+#include "compiler/glsl_types.h"
+#include "compiler/nir/nir_builder.h"
+#include "program/prog_parameter.h"
+
+using namespace brw;
+
+static unsigned get_lowered_simd_width(const struct gen_device_info *devinfo,
+                                       const fs_inst *inst);
+
+void
+fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+              const fs_reg *src, unsigned sources)
+{
+   memset(this, 0, sizeof(*this)); /* fields not set below stay all-zero */
+   /* At least three source slots are allocated, whatever `sources` is. */
+   this->src = new fs_reg[MAX2(sources, 3)];
+   for (unsigned i = 0; i < sources; i++)
+      this->src[i] = src[i];
+
+   this->opcode = opcode;
+   this->dst = dst;
+   this->sources = sources;
+   this->exec_size = exec_size;
+   this->base_mrf = -1;
+
+   assert(dst.file != IMM && dst.file != UNIFORM);
+   /* A zero execution size is never accepted. */
+   assert(this->exec_size != 0);
+
+   this->conditional_mod = BRW_CONDITIONAL_NONE;
+
+   /* This will be the case for almost all instructions. */
+   switch (dst.file) {
+   case VGRF:
+   case ARF:
+   case FIXED_GRF:
+   case MRF:
+   case ATTR:
+      this->size_written = dst.component_size(exec_size);
+      break;
+   case BAD_FILE:
+      this->size_written = 0;
+      break;
+   case IMM:
+   case UNIFORM:
+      unreachable("Invalid destination register file");
+   }
+
+   this->writes_accumulator = false;
+}
+
+fs_inst::fs_inst()
+{
+   init(BRW_OPCODE_NOP, 8, dst, NULL, 0); /* dst here is this->dst itself */
+}
+
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
+{
+   init(opcode, exec_size, reg_undef, NULL, 0); /* reg_undef dst, no sources */
+}
+
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst)
+{
+   init(opcode, exec_size, dst, NULL, 0); /* destination only, no sources */
+}
+
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+                 const fs_reg &src0)
+{
+   const fs_reg one_src[] = { src0 };
+   init(opcode, exec_size, dst, one_src, 1);
+}
+
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+                 const fs_reg &src0, const fs_reg &src1)
+{
+   const fs_reg two_srcs[] = { src0, src1 };
+   init(opcode, exec_size, dst, two_srcs, 2);
+}
+
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
+{
+   const fs_reg three_srcs[] = { src0, src1, src2 };
+   init(opcode, exec_size, dst, three_srcs, 3);
+}
+
+fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
+                 const fs_reg src[], unsigned sources)
+{
+   init(opcode, exec_width, dst, src, sources); /* caller-supplied array */
+}
+
+fs_inst::fs_inst(const fs_inst &that)
+{
+   /* Bitwise-copy every field, then replace the copied src pointer with a
+    * fresh array so the two instructions do not share source storage. */
+   memcpy(this, &that, sizeof(that));
+   this->src = new fs_reg[MAX2(that.sources, 3)];
+   for (unsigned n = 0; n < that.sources; n++)
+      this->src[n] = that.src[n];
+}
+
+fs_inst::~fs_inst()
+{
+   delete[] src; /* array from init(), the copy ctor or resize_sources() */
+}
+
+void
+fs_inst::resize_sources(uint8_t num_sources)
+{
+   if (this->sources == num_sources)
+      return;
+   /* Allocate at least three slots and carry over the common prefix. */
+   fs_reg *const resized = new fs_reg[MAX2(num_sources, 3)];
+   const unsigned kept = MIN2(this->sources, num_sources);
+   for (unsigned i = 0; i < kept; ++i)
+      resized[i] = this->src[i];
+   delete[] this->src;
+   this->src = resized;
+   this->sources = num_sources;
+}
+
+void
+fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
+                                       const fs_reg &dst,
+                                       const fs_reg &surf_index,
+                                       const fs_reg &varying_offset,
+                                       uint32_t const_offset)
+{
+   /* Split const_offset in two: the 16-byte-aligned part is folded into the
+    * variable offset here, the low four bits select a component of the
+    * loaded vec4 at the end.
+    *
+    * The constant surface uses a pitch of 4 bytes, so the index can be any
+    * component of a vector and 4 contiguous components are loaded starting
+    * there.  For GLSL such as "uniform vec4 a[20]; gl_FragColor = a[i]"
+    * this temporarily produces 4 identical vec4 loads, which CSE can later
+    * collapse into one.
+    */
+   const uint32_t aligned_part = const_offset & ~0xf;
+   const uint32_t residual = const_offset & 0xf;
+   fs_reg load_offset = vgrf(glsl_type::uint_type);
+   bld.ADD(load_offset, varying_offset, brw_imm_ud(aligned_part));
+
+   /* The pull load message fetches a vec4 (16 bytes), i.e. only 2 elements
+    * when loading doubles.  The load destination is given a 32-bit type so
+    * other parts of the driver don't get confused about the result size.
+    */
+   fs_reg loaded = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+   fs_inst *const load =
+      bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
+               loaded, surf_index, load_offset);
+   load->size_written = 4 * loaded.component_size(load->exec_size);
+
+   if (type_sz(dst.type) == 8) {
+      shuffle_32bit_load_result_to_64bit_data(
+         bld, retype(loaded, dst.type), loaded, 2);
+   }
+
+   /* Pick the requested component, viewed with the destination's type. */
+   loaded.type = dst.type;
+   bld.MOV(dst, offset(loaded, bld, residual / type_sz(loaded.type)));
+}
+
+/**
+ * A helper for MOV generation for fixing up broken hardware SEND dependency
+ * handling.
+ */
+void
+fs_visitor::DEP_RESOLVE_MOV(const fs_builder &bld, int grf)
+{
+   /* The caller always wants an uncompressed MOV: it adds the fewest extra
+    * dependencies and avoids having to align its registers to 2.
+    */
+   const fs_builder half_bld =
+      bld.annotate("send dependency resolve").half(0);
+   const fs_reg resolved(VGRF, grf, BRW_REGISTER_TYPE_F);
+   half_bld.MOV(half_bld.null_reg_f(), resolved);
+}
+
+bool
+fs_inst::equals(fs_inst *inst) const
+{
+   /* Operands first; src[0..2] are compared whatever `sources` is. */
+   if (opcode != inst->opcode || !dst.equals(inst->dst))
+      return false;
+   for (unsigned i = 0; i < 3; i++) {
+      if (!src[i].equals(inst->src[i]))
+         return false;
+   }
+   /* Then the scalar fields listed below, in the same order as before. */
+   return saturate == inst->saturate &&
+          predicate == inst->predicate &&
+          conditional_mod == inst->conditional_mod &&
+          mlen == inst->mlen && base_mrf == inst->base_mrf &&
+          target == inst->target && eot == inst->eot &&
+          header_size == inst->header_size &&
+          shadow_compare == inst->shadow_compare &&
+          exec_size == inst->exec_size && offset == inst->offset;
+}
+
+bool
+fs_inst::is_send_from_grf() const
+{
+   switch (opcode) {
+   /* Opcodes for which the answer is always yes. */
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+      return true;
+   /* Opcodes for which it depends on one source living in a VGRF. */
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+      return src[1].file == VGRF;
+   case FS_OPCODE_FB_WRITE:
+   case FS_OPCODE_FB_READ:
+      return src[0].file == VGRF;
+   default:
+      /* Anything else qualifies only if is_tex() and src[0] is a VGRF. */
+      return is_tex() && src[0].file == VGRF;
+   }
+}
+
+/**
+ * Returns true if this instruction's sources and destinations cannot
+ * safely be the same register.
+ *
+ * In most cases, a register can be written over safely by the same
+ * instruction that is its last use.  For a single instruction, the
+ * sources are dereferenced before writing of the destination starts
+ * (naturally).
+ *
+ * However, there are a few cases where this can be problematic:
+ *
+ * - Virtual opcodes that translate to multiple instructions in the
+ *   code generator: if src == dst and one instruction writes the
+ *   destination before a later instruction reads the source, then
+ *   src will have been clobbered.
+ *
+ * - SIMD16 compressed instructions with certain regioning (see below).
+ *
+ * The register allocator uses this information to set up conflicts between
+ * GRF sources and the destination.
+ */
+bool
+fs_inst::has_source_and_destination_hazard() const
+{
+   /* Multiple partial writes to the destination */
+   if (opcode == FS_OPCODE_PACK_HALF_2x16_SPLIT)
+      return true;
+
+   /* The SIMD16 compressed instruction
+    *
+    * add(16)      g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+    *
+    * is actually decoded in hardware as:
+    *
+    * add(8)       g4<1>F      g4<8,8,1>F   g6<8,8,1>F
+    * add(8)       g5<1>F      g5<8,8,1>F   g7<8,8,1>F
+    *
+    * Which is safe.  However, if we have uniform accesses
+    * happening, we get into trouble:
+    *
+    * add(8)       g4<1>F      g4<0,1,0>F   g6<8,8,1>F
+    * add(8)       g5<1>F      g4<0,1,0>F   g7<8,8,1>F
+    *
+    * Now our destination for the first instruction overwrote the
+    * second instruction's src0, and we get garbage for those 8
+    * pixels.  There's a similar issue for the pre-gen6
+    * pixel_x/pixel_y, which are registers of 16-bit values and thus
+    * would get stomped by the first decode as well.
+    */
+   if (exec_size != 16)
+      return false;
+
+   for (int i = 0; i < sources; i++) {
+      const fs_reg &s = src[i];
+      const bool w_or_b_type =
+         s.type == BRW_REGISTER_TYPE_UW || s.type == BRW_REGISTER_TYPE_W ||
+         s.type == BRW_REGISTER_TYPE_UB || s.type == BRW_REGISTER_TYPE_B;
+
+      if (s.file == VGRF && (s.stride == 0 || w_or_b_type))
+         return true;
+   }
+
+   return false;
+}
+
+bool
+fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
+{
+   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+   /* src[0] must sit at offset 0 of a unit-stride VGRF whose whole
+    * allocation is exactly what this instruction writes.
+    */
+   fs_reg expected = this->src[0];
+   const bool starts_vgrf = expected.file == VGRF && expected.offset == 0 &&
+                            expected.stride == 1;
+   if (!starts_vgrf ||
+       grf_alloc.sizes[expected.nr] * REG_SIZE != this->size_written)
+      return false;
+
+   /* Every source must be exactly where walking that VGRF predicts. */
+   for (int i = 0; i < this->sources; i++) {
+      expected.type = this->src[i].type;
+      if (!this->src[i].equals(expected))
+         return false;
+      if (i < this->header_size)
+         expected.offset += REG_SIZE;
+      else
+         expected = horiz_offset(expected, this->exec_size);
+   }
+   return true;
+}
+
+bool
+fs_inst::can_do_source_mods(const struct gen_device_info *devinfo)
+{
+   /* Three things rule out source modifiers, tested in this order:
+    *  - a math instruction on gen 6,
+    *  - an instruction for which is_send_from_grf() holds,
+    *  - a refusal from the generic backend_instruction check.
+    */
+   const bool gen6_math = devinfo->gen == 6 && is_math();
+
+   return !gen6_math &&
+          !is_send_from_grf() &&
+          backend_instruction::can_do_source_mods();
+}
+
+/**
+ * Return true for a MOV, or a predicated SEL, whose destination type matches
+ * its source type(s) and that uses neither saturate nor abs/negate source
+ * modifiers.
+ */
+bool
+fs_inst::can_change_types() const
+{
+   if (dst.type != src[0].type)
+      return false;
+
+   if (src[0].abs || src[0].negate || saturate)
+      return false;
+
+   if (opcode == BRW_OPCODE_MOV)
+      return true;
+
+   /* A SEL additionally needs a predicate and a matching, unmodified
+    * second source.
+    */
+   return opcode == BRW_OPCODE_SEL &&
+          dst.type == src[1].type &&
+          predicate != BRW_PREDICATE_NONE &&
+          !src[1].abs && !src[1].negate;
+}
+
+/**
+ * An instruction with the eot flag set always counts as having side effects;
+ * otherwise the generic backend_instruction answer is returned.
+ */
+bool
+fs_inst::has_side_effects() const
+{
+   if (this->eot)
+      return true;
+
+   return backend_instruction::has_side_effects();
+}
+
+/** Zero every byte of this register, then select a stride of 1. */
+void
+fs_reg::init()
+{
+   memset(this, 0, sizeof(fs_reg));
+   this->stride = 1;
+}
+
+/** Default constructor: an initialized register placed in BAD_FILE. */
+fs_reg::fs_reg()
+{
+   init();
+   file = BAD_FILE;
+}
+
+/**
+ * Wrap a hardware brw_reg.
+ *
+ * The offset starts at zero.  The stride is 1, except for immediates that
+ * are not of a vector type (V, UV or VF), which get a stride of 0.
+ */
+fs_reg::fs_reg(struct ::brw_reg reg) :
+   backend_reg(reg)
+{
+   const bool vector_type = this->type == BRW_REGISTER_TYPE_V ||
+                            this->type == BRW_REGISTER_TYPE_UV ||
+                            this->type == BRW_REGISTER_TYPE_VF;
+
+   this->offset = 0;
+   this->stride = (this->file == IMM && !vector_type) ? 0 : 1;
+}
+
+/**
+ * Two fs_regs are equal when their backend_reg parts compare equal and they
+ * share the same stride.
+ */
+bool
+fs_reg::equals(const fs_reg &r) const
+{
+   if (!this->backend_reg::equals(r))
+      return false;
+
+   return stride == r.stride;
+}
+
+/** A register counts as contiguous exactly when its stride is 1. */
+bool
+fs_reg::is_contiguous() const
+{
+   const bool unit_stride = (this->stride == 1);
+   return unit_stride;
+}
+
+/**
+ * Return the size of one component of this register for the given width:
+ * max(width * stride, 1) elements, each type_sz(type) large.
+ *
+ * For ARF and FIXED_GRF registers the stride is derived from hstride
+ * (0 stays 0, otherwise 1 << (hstride - 1)); every other file uses the
+ * fs_reg stride field directly.
+ */
+unsigned
+fs_reg::component_size(unsigned width) const
+{
+   unsigned elem_stride;
+
+   if (file != ARF && file != FIXED_GRF)
+      elem_stride = this->stride;
+   else if (hstride == 0)
+      elem_stride = 0;
+   else
+      elem_stride = 1 << (hstride - 1);
+
+   return MAX2(width * elem_stride, 1) * type_sz(type);
+}
+
+/**
+ * Return the size of a GLSL type, counted in scalar slots.
+ *
+ * Arrays and structs are sized recursively.  DOUBLE, UINT64 and INT64 types
+ * count two slots per component.
+ */
+extern "C" int
+type_size_scalar(const struct glsl_type *type)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+      return type->components();
+
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
+      return type->components() * 2;
+
+   case GLSL_TYPE_ARRAY:
+      return type_size_scalar(type->fields.array) * type->length;
+
+   case GLSL_TYPE_STRUCT: {
+      /* Sum of the sizes of all members. */
+      unsigned int total = 0;
+      for (unsigned int member = 0; member < type->length; member++)
+         total += type_size_scalar(type->fields.structure[member].type);
+      return total;
+   }
+
+   case GLSL_TYPE_SAMPLER:
+      /* Samplers take up no register space, since they're baked in at
+       * link time.
+       */
+   case GLSL_TYPE_ATOMIC_UINT:
+      return 0;
+
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+
+   case GLSL_TYPE_IMAGE:
+      return BRW_IMAGE_PARAM_SIZE;
+
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("not reached");
+   }
+
+   return 0;
+}
+
+/**
+ * Read the timestamp architecture register into a freshly allocated VGRF.
+ *
+ * A 4-wide, exec_all MOV from the timestamp register is emitted through
+ * \p bld, and the UD-typed destination VGRF is returned.  Asserts gen >= 7.
+ */
+fs_reg
+fs_visitor::get_timestamp(const fs_builder &bld)
+{
+   assert(devinfo->gen >= 7);
+
+   const struct brw_reg arf_timestamp =
+      retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+                          BRW_ARF_TIMESTAMP,
+                          0),
+             BRW_REGISTER_TYPE_UD);
+   fs_reg timestamp_src = fs_reg(arf_timestamp);
+
+   fs_reg result = fs_reg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+
+   /* We want to read the 3 fields we care about even if it's not enabled in
+    * the dispatch.
+    */
+   const fs_builder ubld = bld.group(4, 0).exec_all();
+   ubld.MOV(result, timestamp_src);
+
+   return result;
+}
+
+/**
+ * Read the timestamp at the start of the shader and remember its first
+ * component in shader_start_time.
+ */
+void
+fs_visitor::emit_shader_time_begin()
+{
+   const fs_builder abld = bld.annotate("shader time start");
+   const fs_reg timestamp = get_timestamp(abld);
+
+   /* We want only the low 32 bits of the timestamp.  Since it's running
+    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
+    * which is plenty of time for our purposes.  It is identical across the
+    * EUs, but since it's tracking GPU core speed it will increment at a
+    * varying rate as render P-states change.
+    */
+   shader_start_time = component(timestamp, 0);
+}
+
+/**
+ * Emit the end-of-shader timing code just before the final EOT instruction.
+ *
+ * A second timestamp is read.  If bit 0 of its component 2 is clear, the
+ * elapsed time (end - start - 2) is added to shader-time subindex 0 and
+ * subindex 1 is incremented by one; otherwise subindex 2 is incremented by
+ * one instead.
+ */
+void
+fs_visitor::emit_shader_time_end()
+{
+   /* Insert our code just before the final SEND with EOT. */
+   exec_node *end = this->instructions.get_tail();
+   assert(end && ((fs_inst *) end)->eot);
+   const fs_builder ibld = bld.annotate("shader time end")
+                              .exec_all().at(NULL, end);
+   const fs_reg timestamp = get_timestamp(ibld);
+
+   /* We only use the low 32 bits of the timestamp - see
+    * emit_shader_time_begin().
+    *
+    * We could also check if render P-states have changed (or anything
+    * else that might disrupt timing) by setting smear to 2 and checking if
+    * that field is != 0.
+    */
+   const fs_reg shader_end_time = component(timestamp, 0);
+
+   /* Check that there weren't any timestamp reset events (assuming these
+    * were the only two timestamp reads that happened).
+    */
+   const fs_reg reset = component(timestamp, 2);
+   /* The Z conditional modifier makes the following IF take its first
+    * branch when (reset & 1) == 0.
+    */
+   set_condmod(BRW_CONDITIONAL_Z,
+               ibld.AND(ibld.null_reg_ud(), reset, brw_imm_ud(1u)));
+   ibld.IF(BRW_PREDICATE_NORMAL);
+
+   /* diff = end + (-start), computed with a single-channel builder. */
+   fs_reg start = shader_start_time;
+   start.negate = true;
+   const fs_reg diff = component(fs_reg(VGRF, alloc.allocate(1),
+                                        BRW_REGISTER_TYPE_UD),
+                                 0);
+   const fs_builder cbld = ibld.group(1, 0);
+   cbld.group(1, 0).ADD(diff, start, shader_end_time);
+
+   /* If there were no instructions between the two timestamp gets, the diff
+    * is 2 cycles.  Remove that overhead, so I can forget about that when
+    * trying to determine the time taken for single instructions.
+    */
+   cbld.ADD(diff, diff, brw_imm_ud(-2u));
+   SHADER_TIME_ADD(cbld, 0, diff);
+   SHADER_TIME_ADD(cbld, 1, brw_imm_ud(1u));
+   ibld.emit(BRW_OPCODE_ELSE);
+   /* Reset bit was set: only count the event in subindex 2. */
+   SHADER_TIME_ADD(cbld, 2, brw_imm_ud(1u));
+   ibld.emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Emit a SHADER_OPCODE_SHADER_TIME_ADD instruction that adds \p value to one
+ * of this shader's three shader-time slots.
+ *
+ * \param bld                   builder used to emit the instruction.
+ * \param shader_time_subindex  which of the three slots of shader_time_index
+ *                              to update.
+ * \param value                 amount to add.
+ */
+void
+fs_visitor::SHADER_TIME_ADD(const fs_builder &bld,
+                            int shader_time_subindex,
+                            fs_reg value)
+{
+   const int slot = shader_time_index * 3 + shader_time_subindex;
+   struct brw_reg offset = brw_imm_d(slot * BRW_SHADER_TIME_STRIDE);
+
+   /* The payload VGRF is typed uvec2 for SIMD8 and uint otherwise. */
+   const glsl_type *const payload_type =
+      dispatch_width == 8 ? glsl_type::uvec2_type : glsl_type::uint_type;
+   fs_reg payload = vgrf(payload_type);
+
+   bld.emit(SHADER_OPCODE_SHADER_TIME_ADD, fs_reg(), payload, offset, value);
+}
+
+/**
+ * Record a compile failure.
+ *
+ * Only the first failure is kept: the formatted message is stored in
+ * fail_msg and, when debug_enabled is set, also printed to stderr.
+ *
+ * \param format  printf-style format string.
+ * \param va      arguments for \p format.
+ */
+void
+fs_visitor::vfail(const char *format, va_list va)
+{
+   if (failed)
+      return;
+   failed = true;
+
+   char *const reason = ralloc_vasprintf(mem_ctx, format, va);
+   char *const report = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
+                                        stage_abbrev, reason);
+   this->fail_msg = report;
+
+   if (debug_enabled)
+      fprintf(stderr, "%s",  report);
+}
+
+/** Variadic front end for vfail(): record a printf-style failure message. */
+void
+fs_visitor::fail(const char *format, ...)
+{
+   va_list args;
+
+   va_start(args, format);
+   vfail(format, args);
+   va_end(args);
+}
+
+/**
+ * Mark this program as impossible to compile with dispatch width greater
+ * than n.
+ *
+ * When the current compile is already wider than n, this is reported through
+ * fail().  Otherwise max_dispatch_width is lowered to n and the reason is
+ * written to the shader performance log, so that wider compiles (which
+ * happen after the SIMD8 one) can be skipped altogether.
+ */
+void
+fs_visitor::limit_dispatch_width(unsigned n, const char *msg)
+{
+   if (dispatch_width > n) {
+      fail("%s", msg);
+      return;
+   }
+
+   max_dispatch_width = n;
+   compiler->shader_perf_log(log_data,
+                             "Shader dispatch width limited to SIMD%d: %s",
+                             n, msg);
+}
+
+/**
+ * Returns true if the instruction might leave part of its destination
+ * register untouched.
+ *
+ * Passes such as dead code elimination and live variable analysis use this
+ * to decide whether a write screens off the values previously stored in the
+ * destination.
+ */
+bool
+fs_inst::is_partial_write() const
+{
+   /* Predicated instructions other than SEL. */
+   if (this->predicate && this->opcode != BRW_OPCODE_SEL)
+      return true;
+
+   /* Writes of fewer than 32 bytes. */
+   if ((this->exec_size * type_sz(this->dst.type)) < 32)
+      return true;
+
+   /* Strided destinations. */
+   if (!this->dst.is_contiguous())
+      return true;
+
+   /* Destinations that do not start on a REG_SIZE boundary. */
+   return this->dst.offset % REG_SIZE != 0;
+}
+
+/**
+ * Return how many components this instruction reads from source \p i.
+ *
+ * Absent sources (BAD_FILE) read zero components.  The opcodes listed below
+ * have per-source counts, several of which are taken from immediate sources
+ * of the instruction itself; every other opcode reads one component per
+ * source.
+ */
+unsigned
+fs_inst::components_read(unsigned i) const
+{
+   /* Return zero if the source is not present. */
+   if (src[i].file == BAD_FILE)
+      return 0;
+
+   switch (opcode) {
+   case FS_OPCODE_LINTERP:
+      if (i == 0)
+         return 2;
+      else
+         return 1;
+
+   case FS_OPCODE_PIXEL_X:
+   case FS_OPCODE_PIXEL_Y:
+      assert(i == 0);
+      return 2;
+
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      assert(src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
+      /* First/second FB write color. */
+      if (i < 2)
+         return src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+      /* The coordinate and gradient component counts are carried as
+       * immediate sources of the instruction.
+       */
+      assert(src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM &&
+             src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+      /* Texture coordinates. */
+      if (i == TEX_LOGICAL_SRC_COORDINATE)
+         return src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+      /* Texture derivatives. */
+      else if ((i == TEX_LOGICAL_SRC_LOD || i == TEX_LOGICAL_SRC_LOD2) &&
+               opcode == SHADER_OPCODE_TXD_LOGICAL)
+         return src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
+      /* Texture offset. */
+      else if (i == TEX_LOGICAL_SRC_TG4_OFFSET)
+         return 2;
+      /* MCS */
+      else if (i == TEX_LOGICAL_SRC_MCS && opcode == SHADER_OPCODE_TXF_CMS_W_LOGICAL)
+         return 2;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      /* src[3] is an immediate holding the coordinate component count. */
+      assert(src[3].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].ud;
+      /* Surface operation source (ignored for reads). */
+      else if (i == 1)
+         return 0;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      /* src[3] and src[4] are immediates holding the component counts of
+       * the coordinates and of the data source respectively.
+       */
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].ud;
+      /* Surface operation source. */
+      else if (i == 1)
+         return src[4].ud;
+      else
+         return 1;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL: {
+      assert(src[3].file == IMM &&
+             src[4].file == IMM);
+      /* For atomics src[4] holds the atomic operation, not a count. */
+      const unsigned op = src[4].ud;
+      /* Surface coordinates. */
+      if (i == 0)
+         return src[3].ud;
+      /* Surface operation source. */
+      else if (i == 1 && op == BRW_AOP_CMPWR)
+         return 2;
+      else if (i == 1 && (op == BRW_AOP_INC || op == BRW_AOP_DEC ||
+                          op == BRW_AOP_PREDEC))
+         return 0;
+      else
+         return 1;
+   }
+
+   default:
+      return 1;
+   }
+}
+
+/**
+ * Return the size of the region this instruction reads through source
+ * \p arg.
+ *
+ * The first switch handles opcodes whose read size is fixed by the message
+ * length (mlen), by a constant, or by an immediate source.  Anything that
+ * falls out of it is sized generically from components_read() and the
+ * register file of the source.
+ */
+unsigned
+fs_inst::size_read(int arg) const
+{
+   switch (opcode) {
+   case FS_OPCODE_FB_WRITE:
+   case FS_OPCODE_FB_READ:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      /* Source 0 is the message payload: mlen registers. */
+      if (arg == 0)
+         return mlen * REG_SIZE;
+      break;
+
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+      /* The payload is actually stored in src1 */
+      if (arg == 1)
+         return mlen * REG_SIZE;
+      break;
+
+   case FS_OPCODE_LINTERP:
+      if (arg == 1)
+         return 16;
+      break;
+
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+      /* Header sources are read as one whole register each. */
+      if (arg < this->header_size)
+         return REG_SIZE;
+      break;
+
+   case CS_OPCODE_CS_TERMINATE:
+   case SHADER_OPCODE_BARRIER:
+      return REG_SIZE;
+
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* The size read from source 0 is given by the immediate in src[2]. */
+      if (arg == 0) {
+         assert(src[2].file == IMM);
+         return src[2].ud;
+      }
+      break;
+
+   default:
+      if (is_tex() && arg == 0 && src[0].file == VGRF)
+         return mlen * REG_SIZE;
+      break;
+   }
+
+   /* Generic case: UNIFORM and IMM sources are sized per component without
+    * regard to exec_size; the other files scale with exec_size through
+    * component_size().
+    */
+   switch (src[arg].file) {
+   case UNIFORM:
+   case IMM:
+      return components_read(arg) * type_sz(src[arg].type);
+   case BAD_FILE:
+   case ARF:
+   case FIXED_GRF:
+   case VGRF:
+   case ATTR:
+      return components_read(arg) * src[arg].component_size(exec_size);
+   case MRF:
+      unreachable("MRF registers are not allowed as sources");
+   }
+   return 0;
+}
+
+namespace {
+   /* Return the subset of flag registers that an instruction could
+    * potentially read or write based on the execution controls and flag
+    * subregister number of the instruction.
+    *
+    * Each bit of the result stands for a group of 8 flag bits; the
+    * instruction covers flag bits [first_bit, end_bit).
+    */
+   unsigned
+   flag_mask(const fs_inst *inst)
+   {
+      const unsigned first_bit = inst->flag_subreg * 16 + inst->group;
+      const unsigned end_bit = first_bit + inst->exec_size;
+
+      /* All groups below the (rounded up) end ... */
+      const unsigned below_end = (1 << DIV_ROUND_UP(end_bit, 8)) - 1;
+      /* ... minus all groups entirely below the start. */
+      const unsigned below_start = (1 << (first_bit / 8)) - 1;
+
+      return below_end & ~below_start;
+   }
+}
+
+/**
+ * Return the mask of flag register groups (see flag_mask()) that this
+ * instruction reads because of its predicate; zero when unpredicated.
+ */
+unsigned
+fs_inst::flags_read(const gen_device_info *devinfo) const
+{
+   /* XXX - This doesn't consider explicit uses of the flag register as source
+    *       region.
+    */
+   if (!predicate)
+      return 0;
+
+   const unsigned mask = flag_mask(this);
+
+   if (predicate == BRW_PREDICATE_ALIGN1_ANYV ||
+       predicate == BRW_PREDICATE_ALIGN1_ALLV) {
+      /* The vertical predication modes combine corresponding bits from
+       * f0.0 and f1.0 on Gen7+, and f0.0 and f0.1 on older hardware.
+       */
+      const unsigned shift = devinfo->gen >= 7 ? 4 : 2;
+      return mask << shift | mask;
+   }
+
+   return mask;
+}
+
+/**
+ * Return the mask of flag register groups (see flag_mask()) that this
+ * instruction writes.
+ *
+ * That is the case for FS_OPCODE_MOV_DISPATCH_TO_FLAGS and for any
+ * instruction with a conditional modifier other than SEL, IF and WHILE.
+ */
+unsigned
+fs_inst::flags_written() const
+{
+   /* XXX - This doesn't consider explicit uses of the flag register as
+    *       destination region.
+    */
+   const bool cmod_sets_flag = conditional_mod &&
+                               opcode != BRW_OPCODE_SEL &&
+                               opcode != BRW_OPCODE_IF &&
+                               opcode != BRW_OPCODE_WHILE;
+
+   if (cmod_sets_flag || opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS)
+      return flag_mask(this);
+
+   return 0;
+}
+
+/**
+ * Returns how many MRFs an FS opcode will write over.
+ *
+ * This is not the 0 or 1 implied writes of an actual gen instruction: the FS
+ * opcodes often expand to additional MOVs.  Instructions without a message
+ * (mlen == 0) or without a base MRF (base_mrf == -1) write none.
+ */
+int
+fs_visitor::implied_mrf_writes(fs_inst *inst)
+{
+   if (inst->mlen == 0 || inst->base_mrf == -1)
+      return 0;
+
+   switch (inst->opcode) {
+   /* One-operand math. */
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      return dispatch_width / 8;
+
+   /* Two-operand math. */
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      return 2 * dispatch_width / 8;
+
+   case SHADER_OPCODE_TEX:
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_LOD:
+   case SHADER_OPCODE_SAMPLEINFO:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+      return 1;
+
+   case FS_OPCODE_FB_WRITE:
+      return 2;
+
+   /* These overwrite their whole message. */
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+      return inst->mlen;
+
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Allocate a new VGRF for a value of the given GLSL type.
+ *
+ * The allocation is type_size_scalar(type) scaled by dispatch_width / 8, and
+ * the returned register carries the hardware type matching the GLSL base
+ * type.
+ */
+fs_reg
+fs_visitor::vgrf(const glsl_type *const type)
+{
+   const int regs_per_slot = dispatch_width / 8;
+   const unsigned nr = alloc.allocate(type_size_scalar(type) * regs_per_slot);
+
+   return fs_reg(VGRF, nr, brw_type_for_base_type(type));
+}
+
+/**
+ * Construct a float-typed register number \p nr in the given file.
+ * UNIFORM registers get a stride of 0, all others a stride of 1.
+ */
+fs_reg::fs_reg(enum brw_reg_file file, int nr)
+{
+   init();
+   this->file = file;
+   this->nr = nr;
+   this->type = BRW_REGISTER_TYPE_F;
+
+   if (file == UNIFORM)
+      this->stride = 0;
+   else
+      this->stride = 1;
+}
+
+/**
+ * Construct register number \p nr in the given file with an explicit type.
+ * UNIFORM registers get a stride of 0, all others a stride of 1.
+ */
+fs_reg::fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type)
+{
+   init();
+   this->file = file;
+   this->nr = nr;
+   this->type = type;
+
+   if (file == UNIFORM)
+      this->stride = 0;
+   else
+      this->stride = 1;
+}
+
+/* The SIMD16 compile has to follow the uniform setup chosen by the SIMD8
+ * compile.  Copy the uniform count and the push/pull constant location
+ * tables from that visitor.
+ */
+void
+fs_visitor::import_uniforms(fs_visitor *v)
+{
+   this->uniforms = v->uniforms;
+   this->push_constant_loc = v->push_constant_loc;
+   this->pull_constant_loc = v->pull_constant_loc;
+}
+
+/**
+ * Emit the instructions that fill the four components of gl_FragCoord.
+ *
+ * \param wpos  destination of the first component; it is advanced by one
+ *              component with offset() after each of x, y and z.
+ */
+void
+fs_visitor::emit_fragcoord_interpolation(fs_reg wpos)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+
+   /* gl_FragCoord.x */
+   bld.MOV(wpos, this->pixel_x);
+   wpos = offset(wpos, bld, 1);
+
+   /* gl_FragCoord.y */
+   bld.MOV(wpos, this->pixel_y);
+   wpos = offset(wpos, bld, 1);
+
+   /* gl_FragCoord.z: copied from the source depth payload register on
+    * gen6+, interpolated with LINTERP on older hardware.
+    */
+   if (devinfo->gen >= 6) {
+      bld.MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)));
+   } else {
+      bld.emit(FS_OPCODE_LINTERP, wpos,
+           this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL],
+           interp_reg(VARYING_SLOT_POS, 2));
+   }
+   wpos = offset(wpos, bld, 1);
+
+   /* gl_FragCoord.w: Already set up in emit_interpolation */
+   bld.MOV(wpos, this->wpos_w);
+}
+
+/**
+ * Select the barycentric mode for an interpolation qualifier and a
+ * barycentric-loading NIR intrinsic.
+ *
+ * The intrinsic picks the PIXEL, CENTROID or SAMPLE perspective mode;
+ * INTERP_MODE_NOPERSPECTIVE then moves the result three enumerators further.
+ * Flat interpolation is rejected by assertion.
+ */
+enum brw_barycentric_mode
+brw_barycentric_mode(enum glsl_interp_mode mode, nir_intrinsic_op op)
+{
+   /* Barycentric modes don't make sense for flat inputs. */
+   assert(mode != INTERP_MODE_FLAT);
+
+   const unsigned noperspective_bias =
+      mode == INTERP_MODE_NOPERSPECTIVE ? 3 : 0;
+
+   unsigned perspective_mode;
+   switch (op) {
+   case nir_intrinsic_load_barycentric_pixel:
+   case nir_intrinsic_load_barycentric_at_offset:
+      perspective_mode = BRW_BARYCENTRIC_PERSPECTIVE_PIXEL;
+      break;
+   case nir_intrinsic_load_barycentric_centroid:
+      perspective_mode = BRW_BARYCENTRIC_PERSPECTIVE_CENTROID;
+      break;
+   case nir_intrinsic_load_barycentric_sample:
+   case nir_intrinsic_load_barycentric_at_sample:
+      perspective_mode = BRW_BARYCENTRIC_PERSPECTIVE_SAMPLE;
+      break;
+   default:
+      unreachable("invalid intrinsic");
+   }
+
+   return (enum brw_barycentric_mode) (perspective_mode + noperspective_bias);
+}
+
+/**
+ * Map one of the two CENTROID barycentric modes to a PIXEL mode.
+ *
+ * The result is simply the enumerator preceding the CENTROID one.
+ */
+static enum brw_barycentric_mode
+centroid_to_pixel(enum brw_barycentric_mode bary)
+{
+   assert(bary == BRW_BARYCENTRIC_PERSPECTIVE_CENTROID ||
+          bary == BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
+
+   const unsigned pixel_mode = (unsigned) bary - 1;
+   return (enum brw_barycentric_mode) pixel_mode;
+}
+
+/**
+ * Emit the computation of the front-facing boolean.
+ *
+ * A bool-typed VGRF is allocated and filled by a single ASR that reads the
+ * negated facing bit from the payload (g0.0 on gen6+, g1.6 before).
+ *
+ * \return pointer to the result register, allocated from mem_ctx.
+ */
+fs_reg *
+fs_visitor::emit_frontfacing_interpolation()
+{
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
+
+   if (devinfo->gen >= 6) {
+      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
+       * a boolean result from this (~0/true or 0/false).
+       *
+       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
+       * this task in only one instruction:
+       *    - a negation source modifier will flip the bit; and
+       *    - a W -> D type conversion will sign extend the bit into the high
+       *      word of the destination.
+       *
+       * An ASR 15 fills the low word of the destination.
+       */
+      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+      g0.negate = true;
+
+      bld.ASR(*reg, g0, brw_imm_d(15));
+   } else {
+      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
+       * a boolean result from this (1/true or 0/false).
+       *
+       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
+       * the negation source modifier to flip it. Unfortunately the SHR
+       * instruction only operates on UD (or D with an abs source modifier)
+       * sources without negation.
+       *
+       * Instead, use ASR (which will give ~0/true or 0/false).
+       */
+      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+      g1_6.negate = true;
+
+      bld.ASR(*reg, g1_6, brw_imm_d(31));
+   }
+
+   return reg;
+}
+
+/**
+ * Emit the computation of one gl_SamplePosition coordinate.
+ *
+ * \param dst             float destination register.
+ * \param int_sample_pos  integer sample position; it is only read when
+ *                        per-sample dispatch is enabled.
+ */
+void
+fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+
+   if (!wm_prog_data->persample_dispatch) {
+      /* From ARB_sample_shading specification:
+       * "When rendering to a non-multisample buffer, or if multisample
+       *  rasterization is disabled, gl_SamplePosition will always be
+       *  (0.5, 0.5)."
+       */
+      bld.MOV(dst, brw_imm_f(0.5f));
+      return;
+   }
+
+   /* Convert int_sample_pos to floating point, then scale it to the
+    * range [0, 1].
+    */
+   bld.MOV(dst, int_sample_pos);
+   bld.MUL(dst, dst, brw_imm_f(1 / 16.0f));
+}
+
+/**
+ * Emit the computation of gl_SamplePosition.
+ *
+ * The X and Y sample offsets are read as interleaved bytes from the payload
+ * register payload.sample_pos_reg, moved into integer temporaries and then
+ * converted by compute_sample_position().
+ *
+ * \return pointer to a vec2 VGRF holding the result, allocated from mem_ctx.
+ */
+fs_reg *
+fs_visitor::emit_samplepos_setup()
+{
+   assert(devinfo->gen >= 6);
+
+   const fs_builder abld = bld.annotate("compute sample position");
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
+   fs_reg pos = *reg;
+   fs_reg int_sample_x = vgrf(glsl_type::int_type);
+   fs_reg int_sample_y = vgrf(glsl_type::int_type);
+
+   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
+    * mode will be enabled.
+    *
+    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
+    * R31.1:0         Position Offset X/Y for Slot[3:0]
+    * R31.3:2         Position Offset X/Y for Slot[7:4]
+    * .....
+    *
+    * The X, Y sample positions come in as bytes in  thread payload. So, read
+    * the positions using vstride=16, width=8, hstride=2.
+    */
+   struct brw_reg sample_pos_reg =
+      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
+                    BRW_REGISTER_TYPE_B), 16, 8, 2);
+
+   /* X offsets: one MOV for SIMD8, otherwise one MOV per half, with the
+    * second half reading 16 bytes further into the payload register.
+    */
+   if (dispatch_width == 8) {
+      abld.MOV(int_sample_x, fs_reg(sample_pos_reg));
+   } else {
+      abld.half(0).MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg));
+      abld.half(1).MOV(half(int_sample_x, 1),
+                       fs_reg(suboffset(sample_pos_reg, 16)));
+   }
+   /* Compute gl_SamplePosition.x */
+   compute_sample_position(pos, int_sample_x);
+   pos = offset(pos, abld, 1);
+   /* Y offsets: same reads as for X, shifted by one byte. */
+   if (dispatch_width == 8) {
+      abld.MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)));
+   } else {
+      abld.half(0).MOV(half(int_sample_y, 0),
+                       fs_reg(suboffset(sample_pos_reg, 1)));
+      abld.half(1).MOV(half(int_sample_y, 1),
+                       fs_reg(suboffset(sample_pos_reg, 17)));
+   }
+   /* Compute gl_SamplePosition.y */
+   compute_sample_position(pos, int_sample_y);
+   return reg;
+}
+
+/**
+ * Emit the computation of gl_SampleID.
+ *
+ * Three cases are handled: a constant zero when the key says the
+ * framebuffer is not multisampled, a two-instruction decode of g1.0 on
+ * gen8+, and a computation based on the Starting Sample Pair Index in R0.0
+ * on older hardware.
+ *
+ * \return pointer to an int VGRF holding the result, allocated from mem_ctx.
+ */
+fs_reg *
+fs_visitor::emit_sampleid_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   assert(devinfo->gen >= 6);
+
+   const fs_builder abld = bld.annotate("compute sample id");
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+
+   if (!key->multisample_fbo) {
+      /* As per GL_ARB_sample_shading specification:
+       * "When rendering to a non-multisample buffer, or if multisample
+       *  rasterization is disabled, gl_SampleID will always be zero."
+       */
+      abld.MOV(*reg, brw_imm_d(0));
+   } else if (devinfo->gen >= 8) {
+      /* Sample ID comes in as 4-bit numbers in g1.0:
+       *
+       *    15:12 Slot 3 SampleID (only used in SIMD16)
+       *     11:8 Slot 2 SampleID (only used in SIMD16)
+       *      7:4 Slot 1 SampleID
+       *      3:0 Slot 0 SampleID
+       *
+       * Each slot corresponds to four channels, so we want to replicate each
+       * half-byte value to 4 channels in a row:
+       *
+       *    dst+0:    .7    .6    .5    .4    .3    .2    .1    .0
+       *             7:4   7:4   7:4   7:4   3:0   3:0   3:0   3:0
+       *
+       *    dst+1:    .7    .6    .5    .4    .3    .2    .1    .0  (if SIMD16)
+       *           15:12 15:12 15:12 15:12  11:8  11:8  11:8  11:8
+       *
+       * First, we read g1.0 with a <1,8,0>UB region, causing the first 8
+       * channels to read the first byte (7:0), and the second group of 8
+       * channels to read the second byte (15:8).  Then, we shift right by
+       * a vector immediate of <4, 4, 4, 4, 0, 0, 0, 0>, moving the slot 1 / 3
+       * values into place.  Finally, we AND with 0xf to keep the low nibble.
+       *
+       *    shr(16) tmp<1>W g1.0<1,8,0>B 0x44440000:V
+       *    and(16) dst<1>D tmp<8,8,1>W  0xf:W
+       *
+       * TODO: These payload bits exist on Gen7 too, but they appear to always
+       *       be zero, so this code fails to work.  We should find out why.
+       */
+      fs_reg tmp(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+
+      abld.SHR(tmp, fs_reg(stride(retype(brw_vec1_grf(1, 0),
+                                         BRW_REGISTER_TYPE_B), 1, 8, 0)),
+                    brw_imm_v(0x44440000));
+      abld.AND(*reg, tmp, brw_imm_w(0xf));
+   } else {
+      const fs_reg t1 = component(fs_reg(VGRF, alloc.allocate(1),
+                                         BRW_REGISTER_TYPE_D), 0);
+      const fs_reg t2(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_W);
+
+      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
+       * 8x multisampling, subspan 0 will represent sample N (where N
+       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
+       * 7. We can find the value of N by looking at R0.0 bits 7:6
+       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
+       * (since samples are always delivered in pairs). That is, we
+       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
+       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
+       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
+       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
+       * populating a temporary variable with the sequence (0, 1, 2, 3),
+       * and then reading from it using vstride=1, width=4, hstride=0.
+       * These computations hold good for 4x multisampling as well.
+       *
+       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
+       * the first four slots are sample 0 of subspan 0; the next four
+       * are sample 1 of subspan 0; the third group is sample 0 of
+       * subspan 1, and finally sample 1 of subspan 1.
+       */
+
+      /* SKL+ has an extra bit for the Starting Sample Pair Index to
+       * accommodate 16x MSAA.
+       *
+       * NOTE(review): this branch is only reached when gen < 8 and the
+       * mask below is the two-bit 0xc0, so the remark above does not
+       * describe the code that follows -- confirm whether it is stale.
+       */
+      abld.exec_all().group(1, 0)
+          .AND(t1, fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
+               brw_imm_ud(0xc0));
+      abld.exec_all().group(1, 0).SHR(t1, t1, brw_imm_d(5));
+
+      /* This works for both SIMD8 and SIMD16 */
+      abld.exec_all().group(4, 0).MOV(t2, brw_imm_v(0x3210));
+
+      /* This special instruction takes care of setting vstride=1,
+       * width=4, hstride=0 of t2 during an ADD instruction.
+       */
+      abld.emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
+   }
+
+   return reg;
+}
+
+/**
+ * Emit the computation of gl_SampleMaskIn.
+ *
+ * In per-pixel mode the result is the payload coverage mask itself.  With
+ * per-sample dispatch it is the coverage mask ANDed with (1 << sample id),
+ * setting up the sample id system value first if that has not happened yet.
+ *
+ * \return pointer to the result register, allocated from mem_ctx.
+ */
+fs_reg *
+fs_visitor::emit_samplemaskin_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   assert(devinfo->gen >= 6);
+
+   fs_reg *result = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
+
+   fs_reg coverage_mask(retype(brw_vec8_grf(payload.sample_mask_in_reg, 0),
+                               BRW_REGISTER_TYPE_D));
+
+   if (!wm_prog_data->persample_dispatch) {
+      /* In per-pixel mode, the coverage mask is sufficient. */
+      *result = coverage_mask;
+      return result;
+   }
+
+   /* gl_SampleMaskIn[] comes from two sources: the input coverage mask,
+    * and a mask representing which sample is being processed by the
+    * current shader invocation.
+    *
+    * From the OES_sample_variables specification:
+    * "When per-sample shading is active due to the use of a fragment input
+    *  qualified by "sample" or due to the use of the gl_SampleID or
+    *  gl_SamplePosition variables, only the bit for the current sample is
+    *  set in gl_SampleMaskIn."
+    */
+   const fs_builder abld = bld.annotate("compute gl_SampleMaskIn");
+
+   if (nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+
+   fs_reg one = vgrf(glsl_type::int_type);
+   fs_reg sample_bit = vgrf(glsl_type::int_type);
+   abld.MOV(one, brw_imm_d(1));
+   abld.SHL(sample_bit, one, nir_system_values[SYSTEM_VALUE_SAMPLE_ID]);
+   abld.AND(*result, sample_bit, coverage_mask);
+
+   return result;
+}
+
+/**
+ * Return a register holding the value of \p src with its abs/negate
+ * modifiers applied.
+ *
+ * A source without modifiers is returned unchanged; otherwise it is copied
+ * with a MOV into a new VGRF of the same type, which is returned instead.
+ */
+fs_reg
+fs_visitor::resolve_source_modifiers(const fs_reg &src)
+{
+   const bool has_modifiers = src.abs || src.negate;
+
+   if (has_modifiers) {
+      fs_reg resolved = bld.vgrf(src.type);
+      bld.MOV(resolved, src);
+      return resolved;
+   }
+
+   return src;
+}
+
+/**
+ * Emit the FS_OPCODE_DISCARD_JUMP that follows a discard.
+ *
+ * The jump uses flag subregister 1 with an inverted ANY4H predicate.
+ * Asserts that the program data has uses_kill set.
+ */
+void
+fs_visitor::emit_discard_jump()
+{
+   assert(brw_wm_prog_data(this->prog_data)->uses_kill);
+
+   /* For performance, after a discard, jump to the end of the
+    * shader if all relevant channels have been discarded.
+    */
+   fs_inst *jump = bld.emit(FS_OPCODE_DISCARD_JUMP);
+
+   jump->predicate = BRW_PREDICATE_ALIGN1_ANY4H;
+   jump->predicate_inverse = true;
+   jump->flag_subreg = 1;
+}
+
+/**
+ * Emit the instruction sequence that ends a geometry shader thread.
+ *
+ * Any pending control data bits are flushed first.  With a static vertex
+ * count, the last URB write found at the tail of the program is reused as
+ * the EOT message when nothing with control flow or side effects follows
+ * it; otherwise a new one-register URB write is emitted.  With a dynamic
+ * vertex count a two-register URB write carrying final_gs_vertex_count is
+ * emitted.
+ */
+void
+fs_visitor::emit_gs_thread_end()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   if (gs_compile->control_data_header_size_bits > 0) {
+      emit_gs_control_data_bits(this->final_gs_vertex_count);
+   }
+
+   const fs_builder abld = bld.annotate("thread end");
+   fs_inst *inst;
+
+   if (gs_prog_data->static_vertex_count != -1) {
+      /* Walk backwards looking for a URB write that can carry EOT. */
+      foreach_in_list_reverse(fs_inst, prev, &this->instructions) {
+         if (prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8 ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+             prev->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT) {
+            prev->eot = true;
+
+            /* Delete now dead instructions. */
+            foreach_in_list_reverse_safe(exec_node, dead, &this->instructions) {
+               if (dead == prev)
+                  break;
+               dead->remove();
+            }
+            return;
+         } else if (prev->is_control_flow() || prev->has_side_effects()) {
+            /* Such an instruction must not be skipped over; stop the
+             * search and emit a dedicated EOT write below.
+             */
+            break;
+         }
+      }
+      /* One-register message: just a copy of g1. */
+      fs_reg hdr = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      abld.MOV(hdr, fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD)));
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, hdr);
+      inst->mlen = 1;
+   } else {
+      /* Two-register message: a copy of g1 followed by the vertex count. */
+      fs_reg payload = abld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      fs_reg *sources = ralloc_array(mem_ctx, fs_reg, 2);
+      sources[0] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+      sources[1] = this->final_gs_vertex_count;
+      abld.LOAD_PAYLOAD(payload, sources, 2, 2);
+      inst = abld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      inst->mlen = 2;
+   }
+   inst->eot = true;
+   inst->offset = 0;
+}
+
+void
+fs_visitor::assign_curb_setup()
+{
+   /* Choose the push constant read length (nr_params rounded up to whole
+    * groups of 8) and point UNIFORM-file sources at the constants' GRFs.
+    */
+   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
+
+   /* Every UNIFORM source becomes a scalar region of a fixed GRF. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      for (unsigned int s = 0; s < inst->sources; s++) {
+         fs_reg &src = inst->src[s];
+         if (src.file != UNIFORM)
+            continue;
+
+         /* Section 5.11 of the OpenGL 4.1 spec says:
+          * "Out-of-bounds reads return undefined values, which include
+          *  values from other variables of the active program or zero."
+          * So an out-of-range index simply reads the first push constant.
+          */
+         const int uniform_nr = src.nr + src.offset / 4;
+         const bool in_range =
+            uniform_nr >= 0 && uniform_nr < (int) uniforms;
+         const int constant_nr = in_range ? push_constant_loc[uniform_nr] : 0;
+
+         /* Push constants sit right after the payload, 8 per register. */
+         struct brw_reg hw_reg = brw_vec1_grf(payload.num_regs + constant_nr / 8,
+                                              constant_nr % 8);
+         hw_reg.abs = src.abs;
+         hw_reg.negate = src.negate;
+
+         assert(src.stride == 0);
+         src = byte_offset(retype(hw_reg, src.type), src.offset % 4);
+      }
+   }
+
+   /* This may be updated in assign_urb_setup or assign_vs_urb_setup. */
+   this->first_non_payload_grf = payload.num_regs + prog_data->curb_read_length;
+}
+
+void
+fs_visitor::calculate_urb_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+   /* Inputs the shader reads, limited to BRW_FS_VARYING_INPUT_MASK. */
+   const uint64_t fs_inputs =
+      nir->info->inputs_read & BRW_FS_VARYING_INPUT_MASK;
+
+   /* Start with every varying unassigned (-1). */
+   memset(prog_data->urb_setup, -1,
+          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
+
+   /* Figure out where each of the incoming setup attributes lands. */
+   int next_slot = 0;
+   if (devinfo->gen < 6) {
+      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
+      for (unsigned int v = 0; v < VARYING_SLOT_MAX; v++) {
+         /* Point size is packed into the header, not as a general attribute */
+         if (v == VARYING_SLOT_PSIZ)
+            continue;
+
+         if (!(key->input_slots_valid & BITFIELD64_BIT(v)))
+            continue;
+
+         /* The back color slot is skipped when the front color is also
+          * written to.  In addition, some slots can be written in the
+          * vertex shader and not read in the fragment shader.  So the
+          * register number must always be incremented, mapped or not.
+          */
+         if (_mesa_varying_slot_in_fs((gl_varying_slot) v))
+            prog_data->urb_setup[v] = next_slot;
+         next_slot++;
+      }
+
+      /*
+       * It's a FS only attribute, and we did interpolation for this attribute
+       * in SF thread. So, count it here, too.
+       *
+       * See compile_sf_prog() for more info.
+       */
+      if (nir->info->inputs_read & BITFIELD64_BIT(VARYING_SLOT_PNTC))
+         prog_data->urb_setup[VARYING_SLOT_PNTC] = next_slot++;
+   } else if (_mesa_bitcount_64(fs_inputs) <= 16) {
+      /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
+       * first 16 varying inputs, so we can put them wherever we want.
+       * Just put them in order.
+       *
+       * This is useful because it means that (a) inputs not used by the
+       * fragment shader won't take up valuable register space, and (b) we
+       * won't have to recompile the fragment shader if it gets paired with
+       * a different vertex (or geometry) shader.
+       */
+      for (unsigned int v = 0; v < VARYING_SLOT_MAX; v++) {
+         if (fs_inputs & BITFIELD64_BIT(v))
+            prog_data->urb_setup[v] = next_slot++;
+      }
+   } else {
+      /* We have enough input varyings that the SF/SBE pipeline stage can't
+       * arbitrarily rearrange them to suit our whim; we have to put them
+       * in an order that matches the output of the previous pipeline stage
+       * (geometry or vertex shader).
+       */
+      struct brw_vue_map prev_stage_vue_map;
+      brw_compute_vue_map(devinfo, &prev_stage_vue_map,
+                          key->input_slots_valid,
+                          nir->info->separate_shader);
+
+      /* Start at slot 0 only when the layer or viewport varying is read. */
+      const bool include_vue_header =
+         nir->info->inputs_read & (VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+      const int first_slot =
+         include_vue_header ? 0 : 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
+
+      assert(prev_stage_vue_map.num_slots <= first_slot + 32);
+      for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
+           slot++) {
+         const int varying = prev_stage_vue_map.slot_to_varying[slot];
+         if (varying == BRW_VARYING_SLOT_PAD)
+            continue;
+         if (fs_inputs & BITFIELD64_BIT(varying))
+            prog_data->urb_setup[varying] = slot - first_slot;
+      }
+      next_slot = prev_stage_vue_map.num_slots - first_slot;
+   }
+
+   prog_data->num_varying_inputs = next_slot;
+}
+
+void
+fs_visitor::assign_urb_setup()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+   const int urb_start = payload.num_regs + prog_data->base.curb_read_length;
+
+   /* Now that the location of the constants has been chosen, rebase the
+    * setup register referenced by each interpolation instruction.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      fs_reg *setup_reg = NULL;
+      if (inst->opcode == FS_OPCODE_LINTERP)
+         setup_reg = &inst->src[1];
+      else if (inst->opcode == FS_OPCODE_CINTERP)
+         setup_reg = &inst->src[0];
+      if (setup_reg != NULL) {
+         assert(setup_reg->file == FIXED_GRF);
+         setup_reg->nr += urb_start;
+      }
+   }
+
+   /* Each attribute is 4 setup channels, each of which is half a reg. */
+   this->first_non_payload_grf += prog_data->num_varying_inputs * 2;
+}
+
+void
+fs_visitor::convert_attr_sources_to_hw_regs(fs_inst *inst)
+{
+   for (int s = 0; s < inst->sources; s++) {
+      fs_reg &src = inst->src[s];
+      if (src.file != ATTR)
+         continue;
+
+      /* ATTR registers are counted from the end of payload + constants. */
+      const int grf = payload.num_regs +
+                      prog_data->curb_read_length +
+                      src.nr +
+                      src.offset / REG_SIZE;
+
+      /* As explained at brw_reg_from_fs_reg, From the Haswell PRM:
+       *
+       * VertStride must be used to cross GRF register boundaries. This
+       * rule implies that elements within a 'Width' cannot cross GRF
+       * boundaries.
+       *
+       * So, for registers that are large enough, we have to split the exec
+       * size in two and trust the compression state to sort it out.
+       */
+      const unsigned total_size =
+         inst->exec_size * src.stride * type_sz(src.type);
+      assert(total_size <= 2 * REG_SIZE);
+
+      unsigned exec_size = inst->exec_size;
+      if (total_size > REG_SIZE)
+         exec_size /= 2;
+
+      const unsigned width = (src.stride != 0) ? exec_size : 1;
+      struct brw_reg reg = retype(brw_vec8_grf(grf, 0), src.type);
+      reg = byte_offset(reg, src.offset % REG_SIZE);
+      reg = stride(reg, exec_size * src.stride, width, src.stride);
+      reg.abs = src.abs;
+      reg.negate = src.negate;
+      src = reg;
+   }
+}
+
+void
+fs_visitor::assign_vs_urb_setup()
+{
+   /* Reserve the VS attribute registers and lower ATTR-file sources. */
+   assert(stage == MESA_SHADER_VERTEX);
+
+   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
+   assert(vs_prog_data->base.urb_read_length <= 15);
+
+   /* Each attribute is 4 regs. */
+   this->first_non_payload_grf += 4 * vs_prog_data->nr_attribute_slots;
+
+   /* Point every ATTR-file source at the hardware GRF it lands in. */
+   foreach_block_and_inst(bb, fs_inst, cur, cfg) {
+      convert_attr_sources_to_hw_regs(cur);
+   }
+}
+
+void
+fs_visitor::assign_tcs_single_patch_urb_setup()
+{
+   /* Only valid while compiling a tessellation control shader. */
+   assert(stage == MESA_SHADER_TESS_CTRL);
+   /* Lower each ATTR-file source to the hardware register backing it. */
+   foreach_block_and_inst(bb, fs_inst, cur, cfg) {
+      convert_attr_sources_to_hw_regs(cur);
+   }
+}
+
+void
+fs_visitor::assign_tes_urb_setup()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   /* Reserve 8 registers per unit of URB read length for the inputs. */
+   struct brw_vue_prog_data *vue = brw_vue_prog_data(prog_data);
+   this->first_non_payload_grf += 8 * vue->urb_read_length;
+
+   /* Lower each ATTR-file source to the hardware register backing it. */
+   foreach_block_and_inst(bb, fs_inst, cur, cfg) {
+      convert_attr_sources_to_hw_regs(cur);
+   }
+}
+
+void
+fs_visitor::assign_gs_urb_setup()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   /* Inputs take 8 registers per unit of URB read length, per vertex. */
+   struct brw_vue_prog_data *vue = brw_vue_prog_data(prog_data);
+   this->first_non_payload_grf +=
+      8 * vue->urb_read_length * nir->info->gs.vertices_in;
+
+   /* Lower each ATTR-file source to the GRF backing it. */
+   foreach_block_and_inst(bb, fs_inst, cur, cfg) {
+      convert_attr_sources_to_hw_regs(cur);
+   }
+}
+
+
+/**
+ * Split large virtual GRFs into separate components if we can.
+ *
+ * This is mostly duplicated with what brw_fs_vector_splitting does,
+ * but that's really conservative because it's afraid of doing
+ * splitting that doesn't result in real progress after the rest of
+ * the optimization phases, which would cause infinite looping in
+ * optimization.  We can do it once here, safely.  This also has the
+ * opportunity to split interpolated values, or maybe even uniforms,
+ * which we don't have at the IR level.
+ *
+ * We want to split, because virtual GRFs are what we register
+ * allocate and spill (due to contiguousness requirements for some
+ * instructions), and they're what we naturally generate in the
+ * codegen process, but most virtual GRFs don't actually need to be
+ * contiguous sets of GRFs.  If we split, we'll end up with reduced
+ * live intervals and better dead code elimination and coalescing.
+ */
+void
+fs_visitor::split_virtual_grfs()
+{
+   /* Compact the register file first so dead VGRFs are eliminated.  The
+    * code below only defines split points for registers that instructions
+    * reference, so oversized dead registers would hit assertions later.
+    */
+   compact_virtual_grfs();
+
+   int num_vars = this->alloc.count;
+
+   /* vgrf_to_reg[i] is the first flat slot of VGRF i; reg_count the total. */
+   int reg_count = 0;
+   int vgrf_to_reg[num_vars];
+   for (int i = 0; i < num_vars; i++) {
+      vgrf_to_reg[i] = reg_count;
+      reg_count += alloc.sizes[i];
+   }
+
+   /* An array of "split points".  For each register slot, this indicates
+    * if this slot can be separated from the previous slot.  Every time an
+    * instruction uses multiple elements of a register (as a source or
+    * destination), we mark the used slots as inseparable.  Then we go
+    * through and split the registers into the smallest pieces we can.
+    */
+   bool split_points[reg_count];
+   memset(split_points, 0, sizeof(split_points));
+
+   /* Mark every slot but the first of each referenced VGRF as splittable. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF) {
+         int reg = vgrf_to_reg[inst->dst.nr];
+         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.nr]; j++)
+            split_points[reg + j] = true;
+      }
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF) {
+            int reg = vgrf_to_reg[inst->src[i].nr];
+            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].nr]; j++)
+               split_points[reg + j] = true;
+         }
+      }
+   }
+   /* Then glue back together the slots that one access reads or writes. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF) {
+         int reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+         for (unsigned j = 1; j < regs_written(inst); j++)
+            split_points[reg + j] = false;
+      }
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF) {
+            int reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+            for (unsigned j = 1; j < regs_read(inst, i); j++)
+               split_points[reg + j] = false;
+         }
+      }
+   }
+
+   int new_virtual_grf[reg_count]; /* flat slot -> VGRF number after split */
+   int new_reg_offset[reg_count];  /* flat slot -> register offset within it */
+
+   int reg = 0;
+   for (int i = 0; i < num_vars; i++) {
+      /* Slot 0 of a VGRF is never a split point; quick sanity check. */
+      assert(split_points[reg] == false);
+
+      /* j = 0 case: the first slot always starts a piece at offset 0 */
+      new_reg_offset[reg] = 0;
+      reg++;
+      int offset = 1;
+
+      /* j > 0 case: `offset` counts the slots in the piece being built */
+      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
+         /* If this is a split point, allocate a new virtual GRF covering
+          * the previous `offset` registers and reset the offset to 0 so
+          * this slot starts the next piece.
+          */
+         if (split_points[reg]) {
+            assert(offset <= MAX_VGRF_SIZE);
+            int grf = alloc.allocate(offset);
+            for (int k = reg - offset; k < reg; k++)
+               new_virtual_grf[k] = grf;
+            offset = 0;
+         }
+         new_reg_offset[reg] = offset;
+         offset++;
+         reg++;
+      }
+
+      /* The trailing piece keeps the original VGRF number, resized to fit. */
+      assert(offset <= MAX_VGRF_SIZE);
+      alloc.sizes[i] = offset;
+      for (int k = reg - offset; k < reg; k++)
+         new_virtual_grf[k] = i;
+   }
+   assert(reg == reg_count);
+   /* Finally, rewrite every VGRF reference to its new number and offset. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF) {
+         reg = vgrf_to_reg[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+         inst->dst.nr = new_virtual_grf[reg];
+         inst->dst.offset = new_reg_offset[reg] * REG_SIZE +
+                            inst->dst.offset % REG_SIZE;
+         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
+      }
+      for (int i = 0; i < inst->sources; i++) {
+	 if (inst->src[i].file == VGRF) {
+            reg = vgrf_to_reg[inst->src[i].nr] + inst->src[i].offset / REG_SIZE;
+            inst->src[i].nr = new_virtual_grf[reg];
+            inst->src[i].offset = new_reg_offset[reg] * REG_SIZE +
+                                  inst->src[i].offset % REG_SIZE;
+            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
+         }
+      }
+   }
+   invalidate_live_intervals();
+}
+
+/**
+ * Remove unused virtual GRFs and compact the allocator's size array.
+ *
+ * During code generation, we create tons of temporary variables, many of
+ * which get immediately killed and are never used again.  Yet, in later
+ * optimization and analysis passes, such as compute_live_intervals, we need
+ * to loop over all the virtual GRFs.  Compacting them can save a lot of
+ * overhead.  Returns true if at least one unused VGRF was dropped.
+ */
+bool
+fs_visitor::compact_virtual_grfs()
+{
+   bool progress = false;
+   int remap_table[this->alloc.count];
+   memset(remap_table, -1, sizeof(remap_table));
+
+   /* Mark which virtual GRFs are used (0 = used, -1 = never referenced). */
+   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF)
+         remap_table[inst->dst.nr] = 0;
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF)
+            remap_table[inst->src[i].nr] = 0;
+      }
+   }
+
+   /* Compact the GRF arrays. */
+   int new_index = 0;
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      if (remap_table[i] == -1) {
+         /* An unused register: we are actually going to compact something. */
+         progress = true;
+      } else {
+         remap_table[i] = new_index;
+         alloc.sizes[new_index] = alloc.sizes[i];
+         ++new_index;
+      }
+   }
+
+   /* Invalidate once instead of once per surviving register. */
+   if (new_index > 0)
+      invalidate_live_intervals();
+   this->alloc.count = new_index;
+
+   /* Patch all the instructions to use the newly renumbered registers */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF)
+         inst->dst.nr = remap_table[inst->dst.nr];
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF)
+            inst->src[i].nr = remap_table[inst->src[i].nr];
+      }
+   }
+
+   /* Patch all the references to delta_xy, since they're used in register
+    * allocation.  If they're unused, switch them to BAD_FILE so we don't
+    * think some random VGRF is delta_xy.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+      if (delta_xy[i].file == VGRF) {
+         if (remap_table[delta_xy[i].nr] != -1) {
+            delta_xy[i].nr = remap_table[delta_xy[i].nr];
+         } else {
+            delta_xy[i].file = BAD_FILE;
+         }
+      }
+   }
+
+   return progress;
+}
+
+static void
+set_push_pull_constant_loc(unsigned uniform, int *chunk_start,
+                           unsigned *max_chunk_bitsize,
+                           bool contiguous, unsigned bitsize,
+                           const unsigned target_bitsize,
+                           int *push_constant_loc, int *pull_constant_loc,
+                           unsigned *num_push_constants,
+                           unsigned *num_pull_constants,
+                           const unsigned max_push_components,
+                           const unsigned max_chunk_size,
+                           struct brw_stage_prog_data *stage_prog_data)
+{
+   /* A negative chunk_start means no chunk is open yet: open one here. */
+   if (*chunk_start < 0)
+      *chunk_start = uniform;
+
+   /* Remember the widest access seen among the chunk's uniforms. */
+   *max_chunk_bitsize = MAX2(*max_chunk_bitsize, bitsize);
+
+   /* The chunk stays open while this element must remain adjacent to the
+    * next one; nothing gets a location until the chunk is complete.
+    */
+   if (contiguous)
+      return;
+
+   /* Uniforms *chunk_start..uniform now form one chunk.  It is placed only
+    * if its widest access matches the size handled by the current pass.
+    */
+   if (*max_chunk_bitsize == target_bitsize) {
+      const unsigned first = *chunk_start;
+      const unsigned chunk_size = uniform - first + 1;
+
+      /* Decide whether we should push or pull this parameter.  In the
+       * Vulkan driver, push constants are explicitly exposed via the API
+       * so we push everything.  In GL, we only push small arrays.
+       */
+      const bool push =
+         stage_prog_data->pull_param == NULL ||
+         (*num_push_constants + chunk_size <= max_push_components &&
+          chunk_size <= max_chunk_size);
+      assert(!push || *num_push_constants + chunk_size <= max_push_components);
+      int *const loc = push ? push_constant_loc : pull_constant_loc;
+      unsigned *const next = push ? num_push_constants : num_pull_constants;
+      for (unsigned j = first; j <= uniform; j++)
+         loc[j] = (*next)++;
+   } else {
+      /* FIXME: right now we only support 32 and 64-bit accesses */
+      assert(*max_chunk_bitsize == 4 || *max_chunk_bitsize == 8);
+   }
+
+   /* Either way the chunk is finished; reset the running state. */
+   *max_chunk_bitsize = 0;
+   *chunk_start = -1;
+}
+
+/**
+ * Assign UNIFORM file registers to either push constants or pull constants.
+ *
+ * We allow a fragment shader to have more than the specified minimum
+ * maximum number of fragment shader uniform components (64).  If
+ * there are too many of these, they'd fill up all of register space.
+ * So, this assigns some of them to the pull constant buffer instead;
+ * lower_constant_loads() then updates the program to load them.
+ */
+void
+fs_visitor::assign_constant_locations()
+{
+   /* Only the first compile gets to decide on locations. */
+   if (dispatch_width != min_dispatch_width)
+      return;
+
+   bool is_live[uniforms];
+   memset(is_live, 0, sizeof(is_live));
+   unsigned bitsize_access[uniforms]; /* widest type_sz() of any access */
+   memset(bitsize_access, 0, sizeof(bitsize_access));
+
+   /* For each uniform slot, a value of true indicates that the given slot and
+    * the next slot must remain contiguous.  This is used to keep us from
+    * splitting arrays apart.
+    */
+   bool contiguous[uniforms];
+   memset(contiguous, 0, sizeof(contiguous));
+
+   int thread_local_id_index =
+      (stage == MESA_SHADER_COMPUTE) ?
+      brw_cs_prog_data(stage_prog_data)->thread_local_id_index : -1;
+
+   /* First, we walk through the instructions and do two things:
+    *
+    *  1) Figure out which uniforms are live.
+    *
+    *  2) Mark any indirectly used ranges of registers as contiguous.
+    *
+    * Note that we don't move constant-indexed accesses to arrays.  No
+    * testing has been done of the performance impact of this choice.
+    */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      for (int i = 0 ; i < inst->sources; i++) {
+         if (inst->src[i].file != UNIFORM)
+            continue;
+
+         int constant_nr = inst->src[i].nr + inst->src[i].offset / 4;
+         const unsigned sz = type_sz(inst->src[i].type);
+
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0) {
+            assert(inst->src[2].ud % 4 == 0);
+            unsigned last = constant_nr + (inst->src[2].ud / 4) - 1;
+            assert(last < uniforms);
+
+            for (unsigned j = constant_nr; j < last; j++) {
+               is_live[j] = true;
+               contiguous[j] = true;
+               bitsize_access[j] = MAX2(bitsize_access[j], sz);
+            }
+            is_live[last] = true;
+            bitsize_access[last] = MAX2(bitsize_access[last], sz);
+         } else if (constant_nr >= 0 && constant_nr < (int) uniforms) {
+            int regs_read = inst->components_read(i) * sz / 4;
+            /* A read can start in bounds yet extend past the last uniform;
+             * never mark slots beyond the end of the arrays.
+             */
+            for (int j = constant_nr; j < constant_nr + regs_read &&
+                                      j < (int) uniforms; j++) {
+               is_live[j] = true;
+               bitsize_access[j] = MAX2(bitsize_access[j], sz);
+            }
+         }
+      }
+   }
+
+   if (thread_local_id_index >= 0 && !is_live[thread_local_id_index])
+      thread_local_id_index = -1;
+
+   /* Only allow 16 registers (128 uniform components) as push constants.
+    *
+    * Just demote the end of the list.  We could probably do better
+    * here, demoting things that are rarely used in the program first.
+    *
+    * If changing this value, note the limitation about total_regs in
+    * brw_curbe.c.
+    */
+   unsigned int max_push_components = 16 * 8;
+   if (thread_local_id_index >= 0)
+      max_push_components--; /* Save a slot for the thread ID */
+
+   /* We push small arrays, but no bigger than 16 floats.  This is big enough
+    * for a vec4 but hopefully not large enough to push out other stuff.  We
+    * should probably use a better heuristic at some point.
+    */
+   const unsigned int max_chunk_size = 16;
+
+   unsigned int num_push_constants = 0;
+   unsigned int num_pull_constants = 0;
+
+   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
+
+   /* Default to -1 meaning no location */
+   memset(push_constant_loc, -1, uniforms * sizeof(*push_constant_loc));
+   memset(pull_constant_loc, -1, uniforms * sizeof(*pull_constant_loc));
+
+   int chunk_start = -1;
+   unsigned max_chunk_bitsize = 0;
+
+   /* First push 64-bit uniforms to ensure they are properly aligned */
+   const unsigned uniform_64_bit_size = type_sz(BRW_REGISTER_TYPE_DF);
+   for (unsigned u = 0; u < uniforms; u++) {
+      if (!is_live[u])
+         continue;
+
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_64_bit_size,
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components, max_chunk_size,
+                                 stage_prog_data);
+   }
+
+   /* Then push the rest of uniforms */
+   const unsigned uniform_32_bit_size = type_sz(BRW_REGISTER_TYPE_F);
+   for (unsigned u = 0; u < uniforms; u++) {
+      if (!is_live[u])
+         continue;
+
+      /* Skip thread_local_id_index to put it in the last push register. */
+      if (thread_local_id_index == (int)u)
+         continue;
+
+      set_push_pull_constant_loc(u, &chunk_start, &max_chunk_bitsize,
+                                 contiguous[u], bitsize_access[u],
+                                 uniform_32_bit_size,
+                                 push_constant_loc, pull_constant_loc,
+                                 &num_push_constants, &num_pull_constants,
+                                 max_push_components, max_chunk_size,
+                                 stage_prog_data);
+   }
+
+   /* Add the CS local thread ID uniform at the end of the push constants */
+   if (thread_local_id_index >= 0)
+      push_constant_loc[thread_local_id_index] = num_push_constants++;
+
+   /* The uniforms get reordered, so read from a temporary copy of param[].
+    * NOTE(review): assumes the old nr_params >= uniforms -- confirm.
+    */
+   gl_constant_value **param = ralloc_array(NULL, gl_constant_value*,
+                                            stage_prog_data->nr_params);
+   memcpy(param, stage_prog_data->param,
+          sizeof(gl_constant_value*) * stage_prog_data->nr_params);
+   stage_prog_data->nr_params = num_push_constants;
+   stage_prog_data->nr_pull_params = num_pull_constants;
+
+   /* Up until now, the param[] array has been indexed by reg + offset
+    * of UNIFORM registers.  Move pull constants into pull_param[] and
+    * condense param[] to only contain the uniforms we chose to push.
+    *
+    * NOTE: 64-bit uniforms are placed first and the thread ID last, so
+    * push_constant_loc[i] may be greater than i.  That is why the values
+    * are read from the temporary copy rather than from param[] itself.
+    */
+   int new_thread_local_id_index = -1;
+   for (unsigned int i = 0; i < uniforms; i++) {
+      const gl_constant_value *value = param[i];
+
+      if (pull_constant_loc[i] != -1) {
+         stage_prog_data->pull_param[pull_constant_loc[i]] = value;
+      } else if (push_constant_loc[i] != -1) {
+         stage_prog_data->param[push_constant_loc[i]] = value;
+         if (thread_local_id_index == (int)i)
+            new_thread_local_id_index = push_constant_loc[i];
+      }
+   }
+   ralloc_free(param);
+
+   if (stage == MESA_SHADER_COMPUTE)
+      brw_cs_prog_data(stage_prog_data)->thread_local_id_index =
+         new_thread_local_id_index;
+}
+
+/**
+ * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
+ * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
+ */
+void
+fs_visitor::lower_constant_loads()
+{
+   const unsigned index = stage_prog_data->binding_table.pull_constants_start;
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      const fs_builder ibld(this, block, inst);
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file != UNIFORM)
+            continue;
+
+         /* The indirect MOV source is lowered after this loop. */
+         if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT && i == 0)
+            continue;
+
+         unsigned location = inst->src[i].nr + inst->src[i].offset / 4;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+            continue; /* Not a pull constant; leave the source alone. */
+
+         assert(inst->src[i].stride == 0);
+
+         /* Load the whole aligned block that contains the constant. */
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = ibld.exec_all().group(block_sz / 4, 0);
+         const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+         const unsigned base = pull_index * 4;
+
+         ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                   dst, brw_imm_ud(index), brw_imm_ud(base & ~(block_sz - 1)));
+
+         /* Rewrite the instruction to use the temporary VGRF. */
+         inst->src[i].file = VGRF;
+         inst->src[i].nr = dst.nr;
+         inst->src[i].offset = (base & (block_sz - 1)) +
+                               inst->src[i].offset % 4;
+
+         brw_mark_surface_used(prog_data, index);
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+
+         unsigned location = inst->src[0].nr + inst->src[0].offset / 4;
+         if (location >= uniforms)
+            continue; /* Out of bounds access */
+
+         int pull_index = pull_constant_loc[location];
+
+         if (pull_index == -1)
+            continue; /* Not a pull constant; keep the MOV_INDIRECT. */
+
+         VARYING_PULL_CONSTANT_LOAD(ibld, inst->dst,
+                                    brw_imm_ud(index),
+                                    inst->src[1],
+                                    pull_index * 4);
+         inst->remove(block);
+
+         brw_mark_surface_used(prog_data, index);
+      }
+   }
+   invalidate_live_intervals();
+}
+
+bool
+fs_visitor::opt_algebraic()
+{
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         if (inst->src[0].file != IMM)
+            break;
+
+         if (inst->saturate) {
+            if (inst->dst.type != inst->src[0].type)
+               assert(!"unimplemented: saturate mixed types");
+
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].as_brw_reg())) {
+               inst->saturate = false;
+               progress = true;
+            }
+         }
+         break;
+
+      case BRW_OPCODE_MUL:
+	 if (inst->src[1].file != IMM)
+	    continue;
+
+	 /* a * 1.0 = a */
+	 if (inst->src[1].is_one()) {
+	    inst->opcode = BRW_OPCODE_MOV;
+	    inst->src[1] = reg_undef;
+	    progress = true;
+	    break;
+	 }
+
+         /* a * -1.0 = -a */
+         if (inst->src[1].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         /* a * 0.0 = 0.0 */
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].f *= inst->src[1].f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+	 break;
+      case BRW_OPCODE_ADD:
+         if (inst->src[1].file != IMM)
+            continue;
+
+         /* a + 0.0 = a */
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+
+         if (inst->src[0].file == IMM) {
+            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].f += inst->src[1].f;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_OR:
+         if (inst->src[0].equals(inst->src[1])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_LRP:
+         if (inst->src[1].equals(inst->src[2])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = reg_undef;
+            inst->src[2] = reg_undef;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_CMP:
+         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
+             inst->src[0].abs &&
+             inst->src[0].negate &&
+             inst->src[1].is_zero()) {
+            inst->src[0].abs = false;
+            inst->src[0].negate = false;
+            inst->conditional_mod = BRW_CONDITIONAL_Z;
+            progress = true;
+            break;
+         }
+         break;
+      case BRW_OPCODE_SEL:
+         if (inst->src[0].equals(inst->src[1])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            inst->predicate = BRW_PREDICATE_NONE;
+            inst->predicate_inverse = false;
+            progress = true;
+         } else if (inst->saturate && inst->src[1].file == IMM) {
+            switch (inst->conditional_mod) {
+            case BRW_CONDITIONAL_LE:
+            case BRW_CONDITIONAL_L:
+               switch (inst->src[1].type) {
+               case BRW_REGISTER_TYPE_F:
+                  if (inst->src[1].f >= 1.0f) {
+                     inst->opcode = BRW_OPCODE_MOV;
+                     inst->src[1] = reg_undef;
+                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
+                     progress = true;
+                  }
+                  break;
+               default:
+                  break;
+               }
+               break;
+            case BRW_CONDITIONAL_GE:
+            case BRW_CONDITIONAL_G:
+               switch (inst->src[1].type) {
+               case BRW_REGISTER_TYPE_F:
+                  if (inst->src[1].f <= 0.0f) {
+                     inst->opcode = BRW_OPCODE_MOV;
+                     inst->src[1] = reg_undef;
+                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
+                     progress = true;
+                  }
+                  break;
+               default:
+                  break;
+               }
+            default:
+               break;
+            }
+         }
+         break;
+      case BRW_OPCODE_MAD:
+         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = reg_undef;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[0].is_zero()) {
+            inst->opcode = BRW_OPCODE_MUL;
+            inst->src[0] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1] = inst->src[2];
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[2].is_one()) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[2] = reg_undef;
+            progress = true;
+         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
+            inst->opcode = BRW_OPCODE_ADD;
+            inst->src[1].f *= inst->src[2].f;
+            inst->src[2] = reg_undef;
+            progress = true;
+         }
+         break;
+      case SHADER_OPCODE_BROADCAST:
+         if (is_uniform(inst->src[0])) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->sources = 1;
+            inst->force_writemask_all = true;
+            progress = true;
+         } else if (inst->src[1].file == IMM) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0] = component(inst->src[0],
+                                     inst->src[1].ud);
+            inst->sources = 1;
+            inst->force_writemask_all = true;
+            progress = true;
+         }
+         break;
+
+      default:
+	 break;
+      }
+
+      /* Swap if src[0] is immediate. */
+      if (progress && inst->is_commutative()) {
+         if (inst->src[0].file == IMM) {
+            fs_reg tmp = inst->src[1];
+            inst->src[1] = inst->src[0];
+            inst->src[0] = tmp;
+         }
+      }
+   }
+   return progress;
+}
+
+/**
+ * Shorten sample messages whose trailing texture coordinates are constant
+ * zero.
+ *
+ * Parameters that are not sent with a sample message default to zero, so a
+ * trailing zero does not need a payload register of its own: reducing the
+ * message length is enough.  Once the message no longer covers that register
+ * the dead code eliminator can remove the MOV that would otherwise be emitted
+ * to set up the zero value.
+ *
+ * \return true if the length of any message was reduced.
+ */
+bool
+fs_visitor::opt_zero_samples()
+{
+   /* The message length can't be changed on Gen4, where the texturing opcode
+    * is inferred from it.
+    */
+   if (devinfo->gen < 5)
+      return false;
+
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (!inst->is_tex())
+         continue;
+
+      /* Only handle messages whose payload is assembled by a LOAD_PAYLOAD
+       * sitting right in front of them.
+       */
+      fs_inst *const payload = (fs_inst *) inst->prev;
+
+      if (payload->is_head_sentinel())
+         continue;
+      if (payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+         continue;
+
+      /* Drop one trailing parameter per iteration, but never the message
+       * header or the first parameter.  Removing the first parameter is not
+       * allowed, see the Haswell PRM volume 7, page 149:
+       *
+       *     "Parameter 0 is required except for the sampleinfo message, which
+       *      has no parameter 0"
+       */
+      for (;;) {
+         if (!(inst->mlen > inst->header_size + inst->exec_size / 8))
+            break;
+
+         /* Index of the LOAD_PAYLOAD source feeding the last parameter that
+          * is still part of the message.
+          */
+         const int last_src = (inst->mlen - inst->header_size) /
+                              (inst->exec_size / 8) +
+                              inst->header_size - 1;
+
+         if (!payload->src[last_src].is_zero())
+            break;
+
+         inst->mlen -= inst->exec_size / 8;
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Fold the final render target write into the texturing instruction that
+ * produces its color.
+ *
+ * CHV and GEN9+ allow a texturing SEND to be marked EOT, in which case its
+ * results are sent directly to the framebuffer, bypassing the EU.  When the
+ * last FB write of the program does nothing but forward the result of the
+ * texturing instruction right in front of it, that instruction is marked EOT,
+ * the render target index is stored in bits 24 and up of its offset, and the
+ * FB write is removed.
+ *
+ * \return true if the program was changed.
+ */
+bool
+fs_visitor::opt_sampler_eot()
+{
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+   if (stage != MESA_SHADER_FRAGMENT)
+      return false;
+
+   if (!(devinfo->gen >= 9 || devinfo->is_cherryview))
+      return false;
+
+   /* FINISHME: It should be possible to implement this optimization when there
+    * are multiple drawbuffers.
+    */
+   if (key->nr_color_regions != 1)
+      return false;
+
+   /* Clamping the color payload requires a bunch of saturating MOV
+    * instructions to be emitted during logical send lowering, which the
+    * sampler unit isn't going to do for us.
+    */
+   if (key->clamp_fragment_color)
+      return false;
+
+   /* The candidate is whatever sits immediately before the final FB_WRITE,
+    * which is expected to be the last instruction of the last block.
+    */
+   bblock_t *const last_block = cfg->blocks[cfg->num_blocks - 1];
+   fs_inst *const fb_write = (fs_inst *) last_block->end();
+   assert(fb_write->eot);
+   assert(fb_write->opcode == FS_OPCODE_FB_WRITE_LOGICAL);
+
+   /* Nothing precedes the FB write; nothing to do. */
+   if (unlikely(fb_write->prev->is_head_sentinel()))
+      return false;
+
+   fs_inst *const tex_inst = (fs_inst *) fb_write->prev;
+
+   /* 3D Sampler >> Messages >> Message Format
+    *
+    * "Response Length of zero is allowed on all SIMD8* and SIMD16* sampler
+    *  messages except sample+killpix, resinfo, sampleinfo, LOD, and gather4*"
+    */
+   switch (tex_inst->opcode) {
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+      break;
+   default:
+      return false;
+   }
+
+   /* XXX - This shouldn't be necessary. */
+   if (tex_inst->prev->is_head_sentinel())
+      return false;
+
+   /* The texturing instruction alone has to provide everything the FB write
+    * reads: COLOR0 must be exactly its destination, and every other source
+    * apart from COMPONENTS must be unused.
+    */
+   for (unsigned i = 0; i < FB_WRITE_LOGICAL_NUM_SRCS; i++) {
+      if (i == FB_WRITE_LOGICAL_SRC_COLOR0) {
+         const bool fed_by_tex =
+            fb_write->src[i].equals(tex_inst->dst) &&
+            fb_write->size_read(i) == tex_inst->size_written;
+
+         if (!fed_by_tex)
+            return false;
+      } else if (i != FB_WRITE_LOGICAL_SRC_COMPONENTS &&
+                 fb_write->src[i].file != BAD_FILE) {
+         return false;
+      }
+   }
+
+   assert(!tex_inst->eot); /* We can't get here twice */
+   assert((tex_inst->offset & (0xff << 24)) == 0);
+
+   const fs_builder ibld(this, last_block, tex_inst);
+
+   tex_inst->offset |= fb_write->target << 24;
+   tex_inst->eot = true;
+   tex_inst->dst = ibld.null_reg_ud();
+   tex_inst->size_written = 0;
+   fb_write->remove(last_block);
+
+   /* Setting EOT is all that is needed here: lower_logical_sends() will
+    * notice the flag and submit a header together with the sampler message
+    * as required by the hardware.
+    */
+   invalidate_live_intervals();
+   return true;
+}
+
+/**
+ * Give repeated complete redefinitions of a VGRF their own register number.
+ *
+ * A write qualifies when it happens outside of IF/DO ... ENDIF/WHILE nesting,
+ * covers the whole allocation of its destination VGRF and is not a partial
+ * write.  The first qualifying write to a VGRF keeps its number; each later
+ * one gets a freshly allocated VGRF.  Sources and non-qualifying writes seen
+ * afterwards are redirected to the most recent number.
+ *
+ * \return true if any register number was changed.
+ */
+bool
+fs_visitor::opt_register_renaming()
+{
+   bool progress = false;
+   int cf_depth = 0;
+
+   /* remap[nr] is -1 until a qualifying write to VGRF nr has been seen, and
+    * the number currently standing in for nr afterwards.  Only the registers
+    * that exist on entry have a slot.
+    */
+   int remap[alloc.count];
+   memset(remap, -1, sizeof(int) * alloc.count);
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_IF:
+      case BRW_OPCODE_DO:
+         cf_depth++;
+         break;
+      case BRW_OPCODE_ENDIF:
+      case BRW_OPCODE_WHILE:
+         cf_depth--;
+         break;
+      default:
+         break;
+      }
+
+      /* Redirect the sources to whatever their register was renamed to. */
+      for (int s = 0; s < inst->sources; s++) {
+         if (inst->src[s].file != VGRF)
+            continue;
+
+         const int renamed = remap[inst->src[s].nr];
+         if (renamed == -1 || renamed == inst->src[s].nr)
+            continue;
+
+         inst->src[s].nr = renamed;
+         progress = true;
+      }
+
+      const int dst = inst->dst.nr;
+
+      if (inst->dst.file != VGRF)
+         continue;
+
+      const bool full_redefinition =
+         cf_depth == 0 &&
+         alloc.sizes[inst->dst.nr] * REG_SIZE == inst->size_written &&
+         !inst->is_partial_write();
+
+      if (full_redefinition) {
+         if (remap[dst] != -1) {
+            /* Not the first complete write: move it to a new register. */
+            remap[dst] = alloc.allocate(regs_written(inst));
+            inst->dst.nr = remap[dst];
+            progress = true;
+         } else {
+            /* First complete write: the register keeps its number. */
+            remap[dst] = dst;
+         }
+      } else if (remap[dst] != -1 && remap[dst] != dst) {
+         /* Any other write follows the current name of the register. */
+         inst->dst.nr = remap[dst];
+         progress = true;
+      }
+   }
+
+   if (!progress)
+      return false;
+
+   invalidate_live_intervals();
+
+   /* delta_xy lives outside of the instruction stream, fix it up too. */
+   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
+      if (delta_xy[i].file == VGRF && remap[delta_xy[i].nr] != -1)
+         delta_xy[i].nr = remap[delta_xy[i].nr];
+   }
+
+   return true;
+}
+
+/**
+ * Remove discard jumps that sit directly in front of the placeholder halt.
+ *
+ * In a sequence like
+ *
+ * discard-jump       (redundant with the next jump)
+ * discard-jump       (useless; jumps to the next instruction)
+ * placeholder-halt
+ *
+ * both jumps can be eliminated.  Only the last block of the program is
+ * examined.
+ *
+ * \return true if any jump was removed.
+ */
+bool
+fs_visitor::opt_redundant_discard_jumps()
+{
+   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];
+
+   /* Locate the placeholder halt closest to the end of the last block. */
+   fs_inst *placeholder_halt = NULL;
+   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
+      if (inst->opcode != FS_OPCODE_PLACEHOLDER_HALT)
+         continue;
+
+      placeholder_halt = inst;
+      break;
+   }
+
+   if (placeholder_halt == NULL)
+      return false;
+
+   bool progress = false;
+
+   /* Keep deleting the instruction in front of the placeholder halt for as
+    * long as it is a discard jump.
+    */
+   for (;;) {
+      fs_inst *candidate = (fs_inst *) placeholder_halt->prev;
+
+      if (candidate->is_head_sentinel() ||
+          candidate->opcode != FS_OPCODE_DISCARD_JUMP)
+         break;
+
+      candidate->remove(last_bblock);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Compute a bitmask with GRF granularity in which bit i stands for the i-th
+ * GRF counted from \p r.offset, with a bit set for every GRF overlapped by
+ * the region that starts at \p s.offset and spans \p ds bytes.
+ *
+ * Both registers must live in the same register space and \p s must not
+ * start before \p r (checked by assertion).
+ */
+static inline unsigned
+mask_relative_to(const fs_reg &r, const fs_reg &s, unsigned ds)
+{
+   /* Distance in bytes from the start of r to the start of s. */
+   const int delta = reg_offset(s) - reg_offset(r);
+   /* Index of the first GRF of r touched by the region. */
+   const int first_bit = delta / REG_SIZE;
+   /* Number of GRFs the region touches. */
+   const unsigned num_bits = DIV_ROUND_UP(delta % REG_SIZE + ds, REG_SIZE);
+
+   assert(reg_space(r) == reg_space(s) &&
+          first_bit >= 0 && first_bit < int(8 * sizeof(unsigned)));
+
+   const int low_mask = (1 << num_bits) - 1;
+   return low_mask << first_bit;
+}
+
+/**
+ * Turn "compute into a GRF, then MOV the GRF to an MRF" sequences into
+ * instructions that write the MRF directly.
+ *
+ * A MOV is a candidate when it is a complete (non-partial) write from a VGRF
+ * to an MRF with matching types, no source modifiers, a contiguous and
+ * GRF-aligned source, and the live interval of the source VGRF does not
+ * extend past the MOV.  For each candidate the preceding instructions of the
+ * same block are scanned backwards for the instructions that generated every
+ * GRF of the source region.  The scan gives up on partial writes, writes not
+ * contained in the source region, SENDs, Gen6 math, reads of the source GRF,
+ * writes to the destination MRF and SENDs whose payload overlaps it.  If all
+ * generating instructions are found they are retargeted to the MRF and the
+ * MOV is removed.
+ *
+ * Does nothing on Gen7+, which has no MRFs.
+ *
+ * \return true if any MOV was eliminated.
+ */
+bool
+fs_visitor::compute_to_mrf()
+{
+   bool progress = false;
+   int next_ip = 0;
+
+   /* No MRFs on Gen >= 7. */
+   if (devinfo->gen >= 7)
+      return false;
+
+   calculate_live_intervals();
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      /* Running index of the instruction; it is compared against
+       * virtual_grf_end below, so it presumably matches the numbering used
+       * by calculate_live_intervals().
+       */
+      int ip = next_ip;
+      next_ip++;
+
+      if (inst->opcode != BRW_OPCODE_MOV ||
+	  inst->is_partial_write() ||
+	  inst->dst.file != MRF || inst->src[0].file != VGRF ||
+	  inst->dst.type != inst->src[0].type ||
+	  inst->src[0].abs || inst->src[0].negate ||
+          !inst->src[0].is_contiguous() ||
+          inst->src[0].offset % REG_SIZE != 0)
+	 continue;
+
+      /* Can't compute-to-MRF this GRF if someone else was going to
+       * read it later.
+       */
+      if (this->virtual_grf_end[inst->src[0].nr] > ip)
+	 continue;
+
+      /* Found a move of a GRF to a MRF.  Let's see if we can go rewrite the
+       * things that computed the value of all GRFs of the source region.  The
+       * regs_left bitset keeps track of the registers we haven't yet found a
+       * generating instruction for.
+       */
+      unsigned regs_left = (1 << regs_read(inst, 0)) - 1;
+
+      /* First pass: only check that the rewrite is possible, nothing is
+       * modified yet.
+       */
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->src[0], inst->size_read(0))) {
+	    /* Found the last thing to write our reg we want to turn
+	     * into a compute-to-MRF.
+	     */
+
+	    /* If this one instruction didn't populate all the
+	     * channels, bail.  We might be able to rewrite everything
+	     * that writes that reg, but it would require smarter
+	     * tracking.
+	     */
+	    if (scan_inst->is_partial_write())
+	       break;
+
+            /* Handling things not fully contained in the source of the copy
+             * would need us to understand coalescing out more than one MOV at
+             * a time.
+             */
+            if (!region_contained_in(scan_inst->dst, scan_inst->size_written,
+                                     inst->src[0], inst->size_read(0)))
+               break;
+
+	    /* SEND instructions can't have MRF as a destination. */
+	    if (scan_inst->mlen)
+	       break;
+
+	    if (devinfo->gen == 6) {
+	       /* gen6 math instructions must have the destination be
+		* GRF, so no compute-to-MRF for them.
+		*/
+	       if (scan_inst->is_math()) {
+		  break;
+	       }
+	    }
+
+            /* Clear the bits for any registers this instruction overwrites. */
+            regs_left &= ~mask_relative_to(
+               inst->src[0], scan_inst->dst, scan_inst->size_written);
+            if (!regs_left)
+               break;
+	 }
+
+	 /* We don't handle control flow here.  Most computation of
+	  * values that end up in MRFs are shortly before the MRF
+	  * write anyway.
+	  */
+	 if (block->start() == scan_inst)
+	    break;
+
+	 /* You can't read from an MRF, so if someone else reads our
+	  * MRF's source GRF that we wanted to rewrite, that stops us.
+	  */
+	 bool interfered = false;
+	 for (int i = 0; i < scan_inst->sources; i++) {
+            if (regions_overlap(scan_inst->src[i], scan_inst->size_read(i),
+                                inst->src[0], inst->size_read(0))) {
+	       interfered = true;
+	    }
+	 }
+	 if (interfered)
+	    break;
+
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->dst, inst->size_written)) {
+	    /* If somebody else writes our MRF here, we can't
+	     * compute-to-MRF before that.
+	     */
+            break;
+         }
+
+         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1 &&
+             regions_overlap(fs_reg(MRF, scan_inst->base_mrf), scan_inst->mlen * REG_SIZE,
+                             inst->dst, inst->size_written)) {
+	    /* Found a SEND instruction, which means that there are
+	     * live values in MRFs from base_mrf to base_mrf +
+	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
+	     * above it.
+	     */
+            break;
+         }
+      }
+
+      /* Some GRF of the source region has no usable generating instruction:
+       * leave this MOV alone.
+       */
+      if (regs_left)
+         continue;
+
+      /* Found all generating instructions of our MRF's source value, so it
+       * should be safe to rewrite them to point to the MRF directly.
+       */
+      regs_left = (1 << regs_read(inst, 0)) - 1;
+
+      /* Second pass: walk the same instructions again and retarget them. */
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->src[0], inst->size_read(0))) {
+            /* Clear the bits for any registers this instruction overwrites. */
+            regs_left &= ~mask_relative_to(
+               inst->src[0], scan_inst->dst, scan_inst->size_written);
+
+            /* Byte offset of this write within the source region of the MOV. */
+            const unsigned rel_offset = reg_offset(scan_inst->dst) -
+                                        reg_offset(inst->src[0]);
+
+            if (inst->dst.nr & BRW_MRF_COMPR4) {
+               /* Apply the same address transformation done by the hardware
+                * for COMPR4 MRF writes.
+                */
+               assert(rel_offset < 2 * REG_SIZE);
+               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE * 4;
+
+               /* Clear the COMPR4 bit if the generating instruction is not
+                * compressed.
+                */
+               if (scan_inst->size_written < 2 * REG_SIZE)
+                  scan_inst->dst.nr &= ~BRW_MRF_COMPR4;
+
+            } else {
+               /* Calculate the MRF number the result of this instruction is
+                * ultimately written to.
+                */
+               scan_inst->dst.nr = inst->dst.nr + rel_offset / REG_SIZE;
+            }
+
+            scan_inst->dst.file = MRF;
+            scan_inst->dst.offset = inst->dst.offset + rel_offset % REG_SIZE;
+            /* The saturate of the removed MOV is carried over to the
+             * generating instruction.
+             */
+            scan_inst->saturate |= inst->saturate;
+            if (!regs_left)
+               break;
+         }
+      }
+
+      assert(!regs_left);
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Replace FIND_LIVE_CHANNEL instructions that occur outside of any control
+ * flow with a MOV of the constant zero.  Some form of divergence analysis
+ * would probably allow us to do better here.
+ *
+ * The pass stops at the first discard jump it meets.
+ *
+ * \return true if any instruction was replaced.
+ */
+bool
+fs_visitor::eliminate_find_live_channel()
+{
+   /* Everything below relies on channel zero being live on thread dispatch,
+    * which may not be the case if the fixed function dispatches threads
+    * sparsely.
+    */
+   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data))
+      return false;
+
+   bool progress = false;
+   unsigned depth = 0;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode == BRW_OPCODE_IF ||
+          inst->opcode == BRW_OPCODE_DO) {
+         depth++;
+      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
+                 inst->opcode == BRW_OPCODE_WHILE) {
+         depth--;
+      } else if (inst->opcode == FS_OPCODE_DISCARD_JUMP) {
+         /* From here on control flow can potentially be non-uniform until
+          * the end of the program.
+          */
+         return progress;
+      } else if (inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL &&
+                 depth == 0) {
+         inst->opcode = BRW_OPCODE_MOV;
+         inst->src[0] = brw_imm_ud(0u);
+         inst->sources = 1;
+         inst->force_writemask_all = true;
+         progress = true;
+      }
+   }
+
+   return progress;
+}
+
+/**
+ * Emit the body of a shader made of FS_OPCODE_REP_FB_WRITE instructions.
+ *
+ * A single 4-wide MOV places the color in the MRF at color_mrf, reading it
+ * from uniform 0 when the shader has uniforms and from a hard-coded general
+ * register region otherwise.  One FS_OPCODE_REP_FB_WRITE is then emitted per
+ * color region and the last one is marked EOT.  Finally the CFG is computed
+ * and constant locations and the CURB setup are assigned.
+ */
+void
+fs_visitor::emit_repclear_shader()
+{
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   int base_mrf = 0;
+   int color_mrf = base_mrf + 2;
+   fs_inst *mov;
+
+   if (uniforms > 0) {
+      mov = bld.exec_all().group(4, 0)
+               .MOV(brw_message_reg(color_mrf),
+                    fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F));
+   } else {
+      struct brw_reg reg =
+         brw_reg(BRW_GENERAL_REGISTER_FILE, 2, 3, 0, 0, BRW_REGISTER_TYPE_F,
+                 BRW_VERTICAL_STRIDE_8, BRW_WIDTH_2, BRW_HORIZONTAL_STRIDE_4,
+                 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+      mov = bld.exec_all().group(4, 0)
+               .MOV(vec4(brw_message_reg(color_mrf)), fs_reg(reg));
+   }
+
+   fs_inst *write;
+   if (key->nr_color_regions == 1) {
+      /* Single render target: headerless message, the payload is just the
+       * color register.
+       */
+      write = bld.emit(FS_OPCODE_REP_FB_WRITE);
+      write->saturate = key->clamp_fragment_color;
+      write->base_mrf = color_mrf;
+      write->target = 0;
+      write->header_size = 0;
+      write->mlen = 1;
+   } else {
+      /* Several render targets: one write per target, each with a two
+       * register header starting at base_mrf followed by the color.  The
+       * assume() guarantees the loop runs at least once, so 'write' is set
+       * before it is used below.
+       */
+      assume(key->nr_color_regions > 0);
+      for (int i = 0; i < key->nr_color_regions; ++i) {
+         write = bld.emit(FS_OPCODE_REP_FB_WRITE);
+         write->saturate = key->clamp_fragment_color;
+         write->base_mrf = base_mrf;
+         write->target = i;
+         write->header_size = 2;
+         write->mlen = 3;
+      }
+   }
+   /* Only the last write emitted terminates the thread. */
+   write->eot = true;
+
+   calculate_cfg();
+
+   assign_constant_locations();
+   assign_curb_setup();
+
+   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
+   if (uniforms > 0) {
+      assert(mov->src[0].file == FIXED_GRF);
+      mov->src[0] = brw_vec4_grf(mov->src[0].nr, 0);
+   }
+}
+
+/**
+ * Walks through basic blocks, looking for repeated MRF writes and
+ * removing the later ones.
+ *
+ * For every MRF the last complete MOV that wrote it is remembered.  A later
+ * MOV to the same MRF that equals() the remembered one is removed.  Records
+ * are dropped at control flow instructions, when the MRF is overwritten
+ * (directly or through the implied writes of a SEND) and when the source of
+ * the remembered MOV is overwritten.
+ *
+ * Nothing is done for dispatch widths of 16 and above.
+ *
+ * \return true if any MOV was removed.
+ */
+bool
+fs_visitor::remove_duplicate_mrf_writes()
+{
+   /* last_mrf_move[n] is the most recent MOV known to have set MRF n, or
+    * NULL if there is none that can still be relied on.
+    */
+   fs_inst *last_mrf_move[BRW_MAX_MRF(devinfo->gen)];
+   bool progress = false;
+
+   /* Need to update the MRF tracking for compressed instructions. */
+   if (dispatch_width >= 16)
+      return false;
+
+   memset(last_mrf_move, 0, sizeof(last_mrf_move));
+
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      /* Records are not carried across control flow instructions. */
+      if (inst->is_control_flow()) {
+	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
+      }
+
+      if (inst->opcode == BRW_OPCODE_MOV &&
+	  inst->dst.file == MRF) {
+         fs_inst *prev_inst = last_mrf_move[inst->dst.nr];
+	 if (prev_inst && inst->equals(prev_inst)) {
+	    inst->remove(block);
+	    progress = true;
+	    continue;
+	 }
+      }
+
+      /* Clear out the last-write records for MRFs that were overwritten. */
+      if (inst->dst.file == MRF) {
+         last_mrf_move[inst->dst.nr] = NULL;
+      }
+
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
+	 /* Found a SEND instruction, which will include two or fewer
+	  * implied MRF writes.  We could do better here.
+	  */
+	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
+	    last_mrf_move[inst->base_mrf + i] = NULL;
+	 }
+      }
+
+      /* Clear out any MRF move records whose sources got overwritten. */
+      for (unsigned i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
+         if (last_mrf_move[i] &&
+             regions_overlap(inst->dst, inst->size_written,
+                             last_mrf_move[i]->src[0],
+                             last_mrf_move[i]->size_read(0))) {
+            last_mrf_move[i] = NULL;
+         }
+      }
+
+      /* Remember this instruction if it is a complete MOV into an MRF whose
+       * source is not an ARF.  This is done last so that the invalidation
+       * above does not apply to the new record.
+       */
+      if (inst->opcode == BRW_OPCODE_MOV &&
+	  inst->dst.file == MRF &&
+	  inst->src[0].file != ARF &&
+	  !inst->is_partial_write()) {
+         last_mrf_move[inst->dst.nr] = inst;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Clear the dependency flag of every tracked GRF that \p inst reads.
+ *
+ * \p deps holds one flag per GRF of the range of \p grf_len registers that
+ * starts at \p first_grf.  For 16-wide instructions the flag of the register
+ * following the one named by the source is cleared as well.
+ */
+static void
+clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
+{
+   for (int i = 0; i < inst->sources; i++) {
+      /* Only GRF sources can be one of the tracked registers. */
+      if (inst->src[i].file != VGRF && inst->src[i].file != FIXED_GRF)
+         continue;
+
+      const int grf = inst->src[i].nr;
+
+      if (grf < first_grf || grf >= first_grf + grf_len)
+         continue;
+
+      /* The register actually got read (as expected). */
+      const int slot = grf - first_grf;
+      deps[slot] = false;
+      if (inst->exec_size == 16)
+         deps[slot + 1] = false;
+   }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
+ *      check for post destination dependencies on this instruction, software
+ *      must ensure that there is no destination hazard for the case of ‘write
+ *      followed by a posted write’ shown in the following example.
+ *
+ *      1. mov r3 0
+ *      2. send r3.xy <rest of send instruction>
+ *      3. mov r2 r3
+ *
+ *      Due to no post-destination dependency check on the ‘send’, the above
+ *      code sequence could have two instructions (1 and 2) in flight at the
+ *      same time that both consider ‘r3’ as the target of their final writes."
+ *
+ * The instructions preceding \p inst in \p block are scanned backwards for
+ * writes to the registers \p inst writes that have not been read since.  A
+ * DEP_RESOLVE_MOV is emitted in front of \p inst for each such register.
+ *
+ * \param block  the block containing \p inst
+ * \param inst   the instruction whose destination registers are checked; the
+ *               caller in this file passes instructions with a non-zero mlen
+ *               and a VGRF destination
+ */
+void
+fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
+                                                        fs_inst *inst)
+{
+   int write_len = regs_written(inst);
+   int first_write_grf = inst->dst.nr;
+   /* needs_dep[i] is true while register first_write_grf + i may still have
+    * an outstanding write that nothing has read.
+    */
+   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
+   assert(write_len < (int)sizeof(needs_dep) - 1);
+
+   memset(needs_dep, false, sizeof(needs_dep));
+   memset(needs_dep, true, write_len);
+
+   /* Registers read by the instruction itself need no extra read. */
+   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);
+
+   /* Walk backwards looking for writes to registers we're writing which
+    * aren't read since being written.  If we hit the start of the program,
+    * we assume that there are no outstanding dependencies on entry to the
+    * program.
+    */
+   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+      /* If we hit control flow, assume that there *are* outstanding
+       * dependencies, and force their cleanup before our instruction.
+       */
+      if (block->start() == scan_inst && block->num != 0) {
+         for (int i = 0; i < write_len; i++) {
+            if (needs_dep[i])
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst),
+                               first_write_grf + i);
+         }
+         return;
+      }
+
+      /* We insert our reads as late as possible on the assumption that any
+       * instruction but a MOV that might have left us an outstanding
+       * dependency has more latency than a MOV.
+       */
+      if (scan_inst->dst.file == VGRF) {
+         for (unsigned i = 0; i < regs_written(scan_inst); i++) {
+            int reg = scan_inst->dst.nr + i;
+
+            if (reg >= first_write_grf &&
+                reg < first_write_grf + write_len &&
+                needs_dep[reg - first_write_grf]) {
+               DEP_RESOLVE_MOV(fs_builder(this, block, inst), reg);
+               needs_dep[reg - first_write_grf] = false;
+               if (scan_inst->exec_size == 16)
+                  needs_dep[reg - first_write_grf + 1] = false;
+            }
+         }
+      }
+
+      /* Clear the flag for registers that actually got read (as expected). */
+      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
+
+      /* Continue the loop only if we haven't resolved all the dependencies */
+      int i;
+      for (i = 0; i < write_len; i++) {
+         if (needs_dep[i])
+            break;
+      }
+      if (i == write_len)
+         return;
+   }
+}
+
+/**
+ * Implements this workaround for the original 965:
+ *
+ *     "[DevBW, DevCL] Errata: A destination register from a send can not be
+ *      used as a destination register until after it has been sourced by an
+ *      instruction with a different destination register."
+ *
+ * The instructions following \p inst in \p block are scanned for writes to
+ * the registers \p inst writes that happen before the register has been
+ * read.  A DEP_RESOLVE_MOV is emitted in front of each such write.
+ *
+ * \param block  the block containing \p inst
+ * \param inst   the instruction whose destination registers are checked; the
+ *               caller in this file passes instructions with a non-zero mlen
+ *               and a VGRF destination
+ */
+void
+fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
+{
+   int write_len = regs_written(inst);
+   int first_write_grf = inst->dst.nr;
+   /* needs_dep[i] is true while register first_write_grf + i has not been
+    * read since inst wrote it.
+    */
+   bool needs_dep[BRW_MAX_MRF(devinfo->gen)];
+   assert(write_len < (int)sizeof(needs_dep) - 1);
+
+   memset(needs_dep, false, sizeof(needs_dep));
+   memset(needs_dep, true, write_len);
+   /* Walk forwards looking for writes to registers we're writing which aren't
+    * read before being written.
+    */
+   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst) {
+      /* If we hit control flow, force resolve all remaining dependencies. */
+      if (block->end() == scan_inst && block->num != cfg->num_blocks - 1) {
+         for (int i = 0; i < write_len; i++) {
+            if (needs_dep[i])
+               DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                               first_write_grf + i);
+         }
+         return;
+      }
+
+      /* Clear the flag for registers that actually got read (as expected). */
+      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);
+
+      /* We insert our reads as late as possible since they're reading the
+       * result of a SEND, which has massive latency.
+       */
+      if (scan_inst->dst.file == VGRF &&
+          scan_inst->dst.nr >= first_write_grf &&
+          scan_inst->dst.nr < first_write_grf + write_len &&
+          needs_dep[scan_inst->dst.nr - first_write_grf]) {
+         DEP_RESOLVE_MOV(fs_builder(this, block, scan_inst),
+                         scan_inst->dst.nr);
+         needs_dep[scan_inst->dst.nr - first_write_grf] = false;
+      }
+
+      /* Continue the loop only if we haven't resolved all the dependencies */
+      int i;
+      for (i = 0; i < write_len; i++) {
+         if (needs_dep[i])
+            break;
+      }
+      if (i == write_len)
+         return;
+   }
+}
+
+/**
+ * Apply the pre- and post-send dependency workarounds to every instruction
+ * that has a message length and writes a VGRF.
+ *
+ * Only Gen4 parts other than G4X are affected; nothing is done elsewhere.
+ */
+void
+fs_visitor::insert_gen4_send_dependency_workarounds()
+{
+   const bool needs_workaround = devinfo->gen == 4 && !devinfo->is_g4x;
+   if (!needs_workaround)
+      return;
+
+   bool progress = false;
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->mlen == 0 || inst->dst.file != VGRF)
+         continue;
+
+      insert_gen4_pre_send_dependency_workarounds(block, inst);
+      insert_gen4_post_send_dependency_workarounds(block, inst);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+}
+
+/**
+ * Lowers the generic expression-style uniform pull constant load into the
+ * hardware-specific form used to actually load a pull constant.
+ *
+ * Keeping the expression style up to this point lets the CSE pass that runs
+ * earlier remove repeated loads from the same offset and leaves the
+ * pre-register-allocation scheduler full flexibility, while the native form
+ * gives the post-register-allocation scheduler the best information
+ * possible.
+ *
+ * Execution masking for the setup of pull constant loads is special: the
+ * channels that have to be written are unrelated to the current execution
+ * mask, because a later instruction will use one of the result channels as a
+ * source operand for all 8 or 16 of its channels.
+ */
+void
+fs_visitor::lower_uniform_pull_constant_loads()
+{
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
+         continue;
+
+      if (devinfo->gen < 7) {
+         /* Before register allocation, we didn't tell the scheduler about the
+          * MRF we use.  It is safe to use this MRF because nothing else does
+          * except for register spill/unspill, which generates and uses its
+          * MRF within a single IR instruction.
+          */
+         inst->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
+         inst->mlen = 1;
+         continue;
+      }
+
+      /* Gen7+: build a one register message header in a new VGRF, starting
+       * from a copy of g0 and storing the offset in units of 16 bytes in its
+       * component 2.
+       */
+      const fs_builder ubld = fs_builder(this, block, inst).exec_all();
+      const fs_builder hdr_bld = ubld.group(8, 0);
+      const fs_reg payload = hdr_bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      hdr_bld.MOV(payload,
+                  retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+      ubld.group(1, 0).MOV(component(payload, 2),
+                           brw_imm_ud(inst->src[1].ud / 16));
+
+      inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
+      inst->src[1] = payload;
+      inst->header_size = 1;
+      inst->mlen = 1;
+
+      invalidate_live_intervals();
+   }
+}
+
+bool
+fs_visitor::lower_load_payload()
+{
+   bool progress = false;
+   /* Expand each LOAD_PAYLOAD into MOVs; returns whether any was found. */
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+         continue;
+
+      assert(inst->dst.file == MRF || inst->dst.file == VGRF);
+      assert(inst->saturate == false);
+      fs_reg dst = inst->dst;
+
+      /* Get rid of COMPR4.  We'll add it back in if we need it */
+      if (dst.file == MRF)
+         dst.nr = dst.nr & ~BRW_MRF_COMPR4;
+
+      const fs_builder ibld(this, block, inst);
+      const fs_builder hbld = ibld.exec_all().group(8, 0);
+      /* Header sources are copied as raw UD data with the 8-wide hbld. */
+      for (uint8_t i = 0; i < inst->header_size; i++) {
+         if (inst->src[i].file != BAD_FILE) {
+            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
+            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
+            hbld.MOV(mov_dst, mov_src);
+         }
+         dst = offset(dst, hbld, 1); /* also steps over BAD_FILE sources */
+      }
+
+      if (inst->dst.file == MRF && (inst->dst.nr & BRW_MRF_COMPR4) &&
+          inst->exec_size > 8) {
+         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
+          * a straightforward copy.  Instead, the result of the
+          * LOAD_PAYLOAD is treated as interleaved and the first four
+          * non-header sources are unpacked as:
+          *
+          * m + 0: r0
+          * m + 1: g0
+          * m + 2: b0
+          * m + 3: a0
+          * m + 4: r1
+          * m + 5: g1
+          * m + 6: b1
+          * m + 7: a1
+          *
+          * This is used for gen <= 5 fb writes.
+          */
+         assert(inst->exec_size == 16);
+         assert(inst->header_size + 4 <= inst->sources);
+         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
+            if (inst->src[i].file != BAD_FILE) {
+               if (devinfo->has_compr4) {
+                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
+                  compr4_dst.nr |= BRW_MRF_COMPR4;
+                  ibld.MOV(compr4_dst, inst->src[i]);
+               } else {
+                  /* Platform doesn't have COMPR4.  We have to fake it */
+                  fs_reg mov_dst = retype(dst, inst->src[i].type);
+                  ibld.half(0).MOV(mov_dst, half(inst->src[i], 0));
+                  mov_dst.nr += 4;
+                  ibld.half(1).MOV(mov_dst, half(inst->src[i], 1));
+               }
+            }
+
+            dst.nr++;
+         }
+
+         /* The loop above only ever incremented us through the first set
+          * of 4 registers.  However, thanks to the magic of COMPR4, we
+          * actually wrote to the first 8 registers, so we need to take
+          * that into account now.
+          */
+         dst.nr += 4;
+
+         /* The COMPR4 code took care of the first 4 sources.  We'll let
+          * the regular path handle any remaining sources.  Yes, we are
+          * modifying the instruction but we're about to delete it so
+          * this really doesn't hurt anything.
+          */
+         inst->header_size += 4;
+      }
+      /* Remaining sources are copied through ibld using their own types. */
+      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
+         if (inst->src[i].file != BAD_FILE)
+            ibld.MOV(retype(dst, inst->src[i].type), inst->src[i]);
+         dst = offset(dst, ibld, 1);
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+bool
+fs_visitor::lower_integer_multiplication()
+{
+   bool progress = false;
+   /* Lowers D/UD BRW_OPCODE_MUL and SHADER_OPCODE_MULH; true on progress. */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      if (inst->opcode == BRW_OPCODE_MUL) {
+         if (inst->dst.is_accumulator() ||
+             (inst->dst.type != BRW_REGISTER_TYPE_D &&
+              inst->dst.type != BRW_REGISTER_TYPE_UD))
+            continue;
+
+         /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit
+          * operation directly, but CHV/BXT cannot.
+          */
+         if (devinfo->gen >= 8 &&
+             !devinfo->is_cherryview && !devinfo->is_broxton)
+            continue;
+
+         if (inst->src[1].file == IMM &&
+             inst->src[1].ud < (1 << 16)) {
+            /* The MUL instruction isn't commutative. On Gen <= 6, only the low
+             * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
+             * src1 are used.
+             *
+             * If multiplying by an immediate value that fits in 16-bits, do a
+             * single MUL instruction with that value in the proper location.
+             */
+            if (devinfo->gen < 7) {
+               fs_reg imm(VGRF, alloc.allocate(dispatch_width / 8),
+                          inst->dst.type);
+               ibld.MOV(imm, inst->src[1]);
+               ibld.MUL(inst->dst, imm, inst->src[0]);
+            } else {
+               /* The immediate is known to lie in [0, 0xffff], so always emit
+                * it as UW: a W immediate would sign-extend 0x8000..0xffff.
+                */
+               ibld.MUL(inst->dst, inst->src[0], brw_imm_uw(inst->src[1].ud));
+            }
+         } else {
+            /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
+             * do 32-bit integer multiplication in one instruction, but instead
+             * must do a sequence (which actually calculates a 64-bit result):
+             *
+             *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
+             *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
+             *    mov(8)  g2<1>D     acc0<8,8,1>D
+             *
+             * But on Gen > 6, the ability to use second accumulator register
+             * (acc1) for non-float data types was removed, preventing a simple
+             * implementation in SIMD16. A 16-channel result can be calculated by
+             * executing the three instructions twice in SIMD8, once with quarter
+             * control of 1Q for the first eight channels and again with 2Q for
+             * the second eight channels.
+             *
+             * Which accumulator register is implicitly accessed (by AccWrEnable
+             * for instance) is determined by the quarter control. Unfortunately
+             * Ivybridge (and presumably Baytrail) has a hardware bug in which an
+             * implicit accumulator access by an instruction with 2Q will access
+             * acc1 regardless of whether the data type is usable in acc1.
+             *
+             * Specifically, the 2Q mach(8) writes acc1 which does not exist for
+             * integer data types.
+             *
+             * Since we only want the low 32-bits of the result, we can do two
+             * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
+             * adjust the high result and add them (like the mach is doing):
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
+             *    shl(8)  g9<1>D     g8<8,8,1>D      16D
+             *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
+             *
+             * We avoid the shl instruction by realizing that we only want to add
+             * the low 16-bits of the "high" result to the high 16-bits of the
+             * "low" result and using proper regioning on the add:
+             *
+             *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
+             *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
+             *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
+             *
+             * Since it does not use the (single) accumulator register, we can
+             * schedule multi-component multiplications much better.
+             */
+
+            fs_reg orig_dst = inst->dst;
+            if (orig_dst.is_null() || orig_dst.file == MRF) {
+               inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
+                                  inst->dst.type);
+            }
+            fs_reg low = inst->dst;
+            fs_reg high(VGRF, alloc.allocate(dispatch_width / 8),
+                        inst->dst.type);
+            /* NOTE(review): dst/source overlap looks unhandled here. */
+            if (devinfo->gen >= 7) {
+               if (inst->src[1].file == IMM) {
+                  ibld.MUL(low, inst->src[0],
+                           brw_imm_uw(inst->src[1].ud & 0xffff));
+                  ibld.MUL(high, inst->src[0],
+                           brw_imm_uw(inst->src[1].ud >> 16));
+               } else {
+                  ibld.MUL(low, inst->src[0],
+                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 0));
+                  ibld.MUL(high, inst->src[0],
+                           subscript(inst->src[1], BRW_REGISTER_TYPE_UW, 1));
+               }
+            } else {
+               ibld.MUL(low, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 0),
+                        inst->src[1]);
+               ibld.MUL(high, subscript(inst->src[0], BRW_REGISTER_TYPE_UW, 1),
+                        inst->src[1]);
+            }
+
+            ibld.ADD(subscript(inst->dst, BRW_REGISTER_TYPE_UW, 1),
+                     subscript(low, BRW_REGISTER_TYPE_UW, 1),
+                     subscript(high, BRW_REGISTER_TYPE_UW, 0));
+
+            if (inst->conditional_mod || orig_dst.file == MRF) {
+               set_condmod(inst->conditional_mod,
+                           ibld.MOV(orig_dst, inst->dst));
+            }
+         }
+
+      } else if (inst->opcode == SHADER_OPCODE_MULH) {
+         /* Should have been lowered to 8-wide. */
+         assert(inst->exec_size <= get_lowered_simd_width(devinfo, inst));
+         const fs_reg acc = retype(brw_acc_reg(inst->exec_size),
+                                   inst->dst.type);
+         fs_inst *mul = ibld.MUL(acc, inst->src[0], inst->src[1]);
+         fs_inst *mach = ibld.MACH(inst->dst, inst->src[0], inst->src[1]);
+
+         if (devinfo->gen >= 8) {
+            /* Until Gen8, integer multiplies read 32-bits from one source,
+             * and 16-bits from the other, and relying on the MACH instruction
+             * to generate the high bits of the result.
+             *
+             * On Gen8, the multiply instruction does a full 32x32-bit
+             * multiply, but in order to do a 64-bit multiply we can simulate
+             * the previous behavior and then use a MACH instruction.
+             *
+             * FINISHME: Don't use source modifiers on src1.
+             */
+            assert(mul->src[1].type == BRW_REGISTER_TYPE_D ||
+                   mul->src[1].type == BRW_REGISTER_TYPE_UD);
+            mul->src[1].type = BRW_REGISTER_TYPE_UW;
+            mul->src[1].stride *= 2;
+
+         } else if (devinfo->gen == 7 && !devinfo->is_haswell &&
+                    inst->group > 0) {
+            /* Among other things the quarter control bits influence which
+             * accumulator register is used by the hardware for instructions
+             * that access the accumulator implicitly (e.g. MACH).  A
+             * second-half instruction would normally map to acc1, which
+             * doesn't exist on Gen7 and up (the hardware does emulate it for
+             * floating-point instructions *only* by taking advantage of the
+             * extra precision of acc0 not normally used for floating point
+             * arithmetic).
+             *
+             * HSW and up are careful enough not to try to access an
+             * accumulator register that doesn't exist, but on earlier Gen7
+             * hardware we need to make sure that the quarter control bits are
+             * zero to avoid non-deterministic behaviour and emit an extra MOV
+             * to get the result masked correctly according to the current
+             * channel enables.
+             */
+            mach->group = 0;
+            mach->force_writemask_all = true;
+            mach->dst = ibld.vgrf(inst->dst.type);
+            ibld.MOV(inst->dst, mach->dst);
+         }
+      } else {
+         continue;
+      }
+
+      inst->remove(block);
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+bool
+fs_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool lowered_any = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder before(this, block, inst);
+      /* Only unpredicated SELs get split into a CMP plus a predicated SEL. */
+      if (inst->opcode != BRW_OPCODE_SEL ||
+          inst->predicate != BRW_PREDICATE_NONE)
+         continue;
+
+      /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+       *        the original SEL.L/GE instruction
+       */
+      before.CMP(before.null_reg_d(), inst->src[0], inst->src[1],
+                 inst->conditional_mod);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_NONE;
+      lowered_any = true;
+   }
+
+   if (lowered_any)
+      invalidate_live_intervals();
+
+   return lowered_any;
+}
+
+static void
+setup_color_payload(const fs_builder &bld, const brw_wm_prog_key *key,
+                    fs_reg *dst, fs_reg color, unsigned components)
+{
+   if (key->clamp_fragment_color) {
+      /* Hand out a saturated copy of the color instead of the original. */
+      const fs_reg sat = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+      assert(color.type == BRW_REGISTER_TYPE_F);
+      for (unsigned c = 0; c < components; c++) {
+         fs_inst *mov = bld.MOV(offset(sat, bld, c), offset(color, bld, c));
+         set_saturate(true, mov);
+      }
+      color = sat;
+   }
+
+   for (unsigned c = 0; c < components; c++)
+      dst[c] = offset(color, bld, c);
+}
+
+static void
+lower_fb_write_logical_send(const fs_builder &bld, fs_inst *inst,
+                            const struct brw_wm_prog_data *prog_data,
+                            const brw_wm_prog_key *key,
+                            const fs_visitor::thread_payload &payload)
+{
+   assert(inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].file == IMM);
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   const fs_reg &color0 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR0];
+   const fs_reg &color1 = inst->src[FB_WRITE_LOGICAL_SRC_COLOR1];
+   const fs_reg &src0_alpha = inst->src[FB_WRITE_LOGICAL_SRC_SRC0_ALPHA];
+   const fs_reg &src_depth = inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH];
+   const fs_reg &dst_depth = inst->src[FB_WRITE_LOGICAL_SRC_DST_DEPTH];
+   const fs_reg &src_stencil = inst->src[FB_WRITE_LOGICAL_SRC_SRC_STENCIL];
+   fs_reg sample_mask = inst->src[FB_WRITE_LOGICAL_SRC_OMASK];
+   const unsigned components =
+      inst->src[FB_WRITE_LOGICAL_SRC_COMPONENTS].ud;
+   /* Build the FB write payload and turn inst into FS_OPCODE_FB_WRITE. */
+   /* We can potentially have a message length of up to 15, so we have to set
+    * base_mrf to either 0 or 1 in order to fit in m0..m15.
+    */
+   fs_reg sources[15];
+   int header_size = 2, payload_header_size;
+   unsigned length = 0;
+
+   /* From the Sandy Bridge PRM, volume 4, page 198:
+    *
+    *     "Dispatched Pixel Enables. One bit per pixel indicating
+    *      which pixels were originally enabled when the thread was
+    *      dispatched. This field is only required for the end-of-
+    *      thread message and on all dual-source messages."
+    */
+   if (devinfo->gen >= 6 &&
+       (devinfo->is_haswell || devinfo->gen >= 8 || !prog_data->uses_kill) &&
+       color1.file == BAD_FILE &&
+       key->nr_color_regions == 1) {
+      header_size = 0;
+   }
+
+   if (header_size != 0) {
+      assert(header_size == 2);
+      /* Allocate 2 registers for a header */
+      length += 2;
+   }
+   /* Optional header-like sources: AA/stencil payload register and oMask. */
+   if (payload.aa_dest_stencil_reg) {
+      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1));
+      bld.group(8, 0).exec_all().annotate("FB write stencil/AA alpha")
+         .MOV(sources[length],
+              fs_reg(brw_vec8_grf(payload.aa_dest_stencil_reg, 0)));
+      length++;
+   }
+
+   if (sample_mask.file != BAD_FILE) {
+      sources[length] = fs_reg(VGRF, bld.shader->alloc.allocate(1),
+                               BRW_REGISTER_TYPE_UD);
+
+      /* Hand over gl_SampleMask.  Only the lower 16 bits of each channel are
+       * relevant.  Since it's unsigned single words one vgrf is always
+       * 16-wide, but only the lower or higher 8 channels will be used by the
+       * hardware when doing a SIMD8 write depending on whether we have
+       * selected the subspans for the first or second half respectively.
+       */
+      assert(sample_mask.file != BAD_FILE && type_sz(sample_mask.type) == 4);
+      sample_mask.type = BRW_REGISTER_TYPE_UW;
+      sample_mask.stride *= 2;
+
+      bld.exec_all().annotate("FB write oMask")
+         .MOV(horiz_offset(retype(sources[length], BRW_REGISTER_TYPE_UW),
+                           inst->group),
+              sample_mask);
+      length++;
+   }
+   /* Everything up to here is passed to LOAD_PAYLOAD as header sources. */
+   payload_header_size = length;
+
+   if (src0_alpha.file != BAD_FILE) {
+      /* FIXME: This is being passed at the wrong location in the payload and
+       * doesn't work when gl_SampleMask and MRTs are used simultaneously.
+       * It's supposed to be immediately before oMask but there seems to be no
+       * reasonable way to pass them in the correct order because LOAD_PAYLOAD
+       * requires header sources to form a contiguous segment at the beginning
+       * of the message and src0_alpha has per-channel semantics.
+       */
+      setup_color_payload(bld, key, &sources[length], src0_alpha, 1);
+      length++;
+   } else if (key->replicate_alpha && inst->target != 0) {
+      /* Handle the case when fragment shader doesn't write to draw buffer
+       * zero. No need to call setup_color_payload() for src0_alpha because
+       * alpha value will be undefined.
+       */
+      length++;
+   }
+   /* color0 always takes four slots, whatever the value of components. */
+   setup_color_payload(bld, key, &sources[length], color0, components);
+   length += 4;
+
+   if (color1.file != BAD_FILE) {
+      setup_color_payload(bld, key, &sources[length], color1, components);
+      length += 4;
+   }
+
+   if (src_depth.file != BAD_FILE) {
+      sources[length] = src_depth;
+      length++;
+   }
+
+   if (dst_depth.file != BAD_FILE) {
+      sources[length] = dst_depth;
+      length++;
+   }
+
+   if (src_stencil.file != BAD_FILE) {
+      assert(devinfo->gen >= 9);
+      assert(bld.dispatch_width() != 16);
+
+      /* XXX: src_stencil is only available on gen9+. dst_depth is never
+       * available on gen9+. As such it's impossible to have both enabled at the
+       * same time and therefore length cannot overrun the array.
+       */
+      assert(length < 15);
+
+      sources[length] = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.exec_all().annotate("FB write OS")
+         .MOV(retype(sources[length], BRW_REGISTER_TYPE_UB),
+              subscript(src_stencil, BRW_REGISTER_TYPE_UB, 0));
+      length++;
+   }
+   /* Emit the LOAD_PAYLOAD: into a fresh VGRF on Gen7+, into MRFs otherwise. */
+   fs_inst *load;
+   if (devinfo->gen >= 7) {
+      /* Send from the GRF */
+      fs_reg payload = fs_reg(VGRF, -1, BRW_REGISTER_TYPE_F);
+      load = bld.LOAD_PAYLOAD(payload, sources, length, payload_header_size);
+      payload.nr = bld.shader->alloc.allocate(regs_written(load));
+      load->dst = payload;
+
+      inst->src[0] = payload;
+      inst->resize_sources(1);
+   } else {
+      /* Send from the MRF */
+      load = bld.LOAD_PAYLOAD(fs_reg(MRF, 1, BRW_REGISTER_TYPE_F),
+                              sources, length, payload_header_size);
+
+      /* On pre-SNB, we have to interlace the color values.  LOAD_PAYLOAD
+       * will do this for us if we just give it a COMPR4 destination.
+       */
+      if (devinfo->gen < 6 && bld.dispatch_width() == 16)
+         load->dst.nr |= BRW_MRF_COMPR4;
+
+      inst->resize_sources(0);
+      inst->base_mrf = 1;
+   }
+   /* Finally rewrite the logical instruction into the actual FB write. */
+   inst->opcode = FS_OPCODE_FB_WRITE;
+   inst->mlen = regs_written(load);
+   inst->header_size = header_size;
+}
+
+static void
+lower_fb_read_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   /* The message is a two-register header filled by one 16-wide MOV at g0. */
+   const unsigned header_regs = 2;
+   const fs_builder all = bld.exec_all();
+   const fs_reg hdr = all.group(8, 0).vgrf(BRW_REGISTER_TYPE_UD, header_regs);
+   const fs_builder copy16 = all.group(16, 0);
+   copy16.MOV(hdr, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   inst->resize_sources(1);
+   inst->src[0] = hdr;
+   inst->opcode = FS_OPCODE_FB_READ;
+   inst->mlen = header_regs;
+   inst->header_size = header_regs;
+}
+
+static void
+lower_sampler_logical_send_gen4(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const bool has_lod = (op == SHADER_OPCODE_TXL || op == FS_OPCODE_TXB ||
+                         op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS);
+   fs_reg msg_begin(MRF, 1, BRW_REGISTER_TYPE_F);
+   fs_reg msg_end = msg_begin;
+   /* Build the sampler message in MRFs starting at m1 and rewrite inst into
+    * the hardware opcode op.  The first register is the g0 header. */
+   msg_end = offset(msg_end, bld.group(8, 0), 1);
+
+   for (unsigned i = 0; i < coord_components; i++)
+      bld.MOV(retype(offset(msg_end, bld, i), coordinate.type),
+              offset(coordinate, bld, i));
+
+   msg_end = offset(msg_end, bld, coord_components);
+
+   /* Messages other than SAMPLE and RESINFO in SIMD16 and TXD in SIMD8
+    * require all three components to be present and zero if they are unused.
+    */
+   if (coord_components > 0 &&
+       (has_lod || shadow_c.file != BAD_FILE ||
+        (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8))) {
+      /* msg_end already points past the coordinates, so pad relative to it. */
+      for (unsigned i = coord_components; i < 3; i++)
+         bld.MOV(offset(msg_end, bld, i - coord_components), brw_imm_f(0.0f));
+      msg_end = offset(msg_end, bld, 3 - coord_components);
+   }
+
+   if (op == SHADER_OPCODE_TXD) {
+      /* TXD unsupported in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* the slots for u and v are always present, but r is optional */
+      if (coord_components < 2)
+         msg_end = offset(msg_end, bld, 2 - coord_components);
+
+      /*  P   = u, v, r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * 1-arg: Does not exist.
+       *
+       * 2-arg: dudx   dvdx   dudy   dvdy
+       *        dPdx.x dPdx.y dPdy.x dPdy.y
+       *        m4     m5     m6     m7
+       *
+       * 3-arg: dudx   dvdx   drdx   dudy   dvdy   drdy
+       *        dPdx.x dPdx.y dPdx.z dPdy.x dPdy.y dPdy.z
+       *        m5     m6     m7     m8     m9     m10
+       */
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+
+      for (unsigned i = 0; i < grad_components; i++)
+         bld.MOV(offset(msg_end, bld, i), offset(lod2, bld, i));
+
+      msg_end = offset(msg_end, bld, MAX2(grad_components, 2));
+   }
+
+   if (has_lod) {
+      /* Bias/LOD with shadow comparator is unsupported in SIMD16 -- *Without*
+       * shadow comparator (including RESINFO) it's unsupported in SIMD8 mode.
+       */
+      assert(shadow_c.file != BAD_FILE ? bld.dispatch_width() == 8 :
+             bld.dispatch_width() == 16);
+
+      const brw_reg_type type =
+         (op == SHADER_OPCODE_TXF || op == SHADER_OPCODE_TXS ?
+          BRW_REGISTER_TYPE_UD : BRW_REGISTER_TYPE_F);
+      bld.MOV(retype(msg_end, type), lod);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      if (op == SHADER_OPCODE_TEX && bld.dispatch_width() == 8) {
+         /* There's no plain shadow compare message, so we use shadow
+          * compare with a bias of 0.0.
+          */
+         bld.MOV(msg_end, brw_imm_f(0.0f));
+         msg_end = offset(msg_end, bld, 1);
+      }
+
+      bld.MOV(msg_end, shadow_c);
+      msg_end = offset(msg_end, bld, 1);
+   }
+
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
+   inst->base_mrf = msg_begin.nr;
+   inst->mlen = msg_end.nr - msg_begin.nr;
+   inst->header_size = 1;
+}
+
+static void
+lower_sampler_logical_send_gen5(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                const fs_reg &lod, const fs_reg &lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   fs_reg message(MRF, 2, BRW_REGISTER_TYPE_F);
+   fs_reg msg_coords = message;
+   unsigned header_size = 0;
+   /* Assemble the sampler message in MRFs and rewrite inst into opcode op. */
+   if (inst->offset != 0) {
+      /* The offsets set up by the visitor are in the m1 header, so we can't
+       * go headerless.
+       */
+      header_size = 1;
+      message.nr--;
+   }
+
+   for (unsigned i = 0; i < coord_components; i++)
+      bld.MOV(retype(offset(msg_coords, bld, i), coordinate.type),
+              offset(coordinate, bld, i));
+   /* msg_lod defaults to the slot four coordinate slots into the message. */
+   fs_reg msg_end = offset(msg_coords, bld, coord_components);
+   fs_reg msg_lod = offset(msg_coords, bld, 4);
+
+   if (shadow_c.file != BAD_FILE) {
+      fs_reg msg_shadow = msg_lod;
+      bld.MOV(msg_shadow, shadow_c);
+      msg_lod = offset(msg_shadow, bld, 1);
+      msg_end = msg_lod;
+   }
+
+   switch (op) {
+   case SHADER_OPCODE_TXL:
+   case FS_OPCODE_TXB:
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXD:
+      /**
+       *  P   =  u,    v,    r
+       * dPdx = dudx, dvdx, drdx
+       * dPdy = dudy, dvdy, drdy
+       *
+       * Load up these values:
+       * - dudx   dudy   dvdx   dvdy   drdx   drdy
+       * - dPdx.x dPdy.x dPdx.y dPdy.y dPdx.z dPdy.z
+       */
+      msg_end = msg_lod;
+      for (unsigned i = 0; i < grad_components; i++) {
+         bld.MOV(msg_end, offset(lod, bld, i));
+         msg_end = offset(msg_end, bld, 1);
+
+         bld.MOV(msg_end, offset(lod2, bld, i));
+         msg_end = offset(msg_end, bld, 1);
+      }
+      break;
+   case SHADER_OPCODE_TXS:
+      msg_lod = retype(msg_end, BRW_REGISTER_TYPE_UD);
+      bld.MOV(msg_lod, lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF:
+      msg_lod = offset(msg_coords, bld, 3);
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), lod);
+      msg_end = offset(msg_lod, bld, 1);
+      break;
+   case SHADER_OPCODE_TXF_CMS:
+      msg_lod = offset(msg_coords, bld, 3);
+      /* lod */
+      bld.MOV(retype(msg_lod, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+      /* sample index */
+      bld.MOV(retype(offset(msg_lod, bld, 1), BRW_REGISTER_TYPE_UD), sample_index);
+      msg_end = offset(msg_lod, bld, 2);
+      break;
+   default:
+      break;
+   }
+   /* mlen is measured from message, which includes m1 when a header is used. */
+   inst->opcode = op;
+   inst->src[0] = reg_undef;
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
+   inst->base_mrf = message.nr;
+   inst->mlen = msg_end.nr - message.nr;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static bool
+is_high_sampler(const struct gen_device_info *devinfo, const fs_reg &sampler)
+{
+   /* Haswell or Gen8+ only: any sampler other than an immediate below 16. */
+   const bool hsw_plus = devinfo->is_haswell || devinfo->gen >= 8;
+   const bool low_imm = sampler.file == IMM && sampler.ud < 16;
+   return hsw_plus && !low_imm;
+}
+
+static void
+lower_sampler_logical_send_gen7(const fs_builder &bld, fs_inst *inst, opcode op,
+                                const fs_reg &coordinate,
+                                const fs_reg &shadow_c,
+                                fs_reg lod, const fs_reg &lod2,
+                                const fs_reg &sample_index,
+                                const fs_reg &mcs,
+                                const fs_reg &surface,
+                                const fs_reg &sampler,
+                                const fs_reg &tg4_offset,
+                                unsigned coord_components,
+                                unsigned grad_components)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   unsigned reg_width = bld.dispatch_width() / 8;
+   unsigned header_size = 0, length = 0;
+   fs_reg sources[MAX_SAMPLER_MESSAGE_SIZE];
+   for (unsigned i = 0; i < ARRAY_SIZE(sources); i++)
+      sources[i] = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+   if (op == SHADER_OPCODE_TG4 || op == SHADER_OPCODE_TG4_OFFSET ||
+       inst->offset != 0 || inst->eot ||
+       op == SHADER_OPCODE_SAMPLEINFO ||
+       is_high_sampler(devinfo, sampler)) {
+      /* For general texture offsets (no txf workaround), we need a header to
+       * put them in.  Note that we're only reserving space for it in the
+       * message payload as it will be initialized implicitly by the
+       * generator.
+       *
+       * TG4 needs to place its channel select in the header, for interaction
+       * with ARB_texture_swizzle.  The sampler index is only 4-bits, so for
+       * larger sampler numbers we need to offset the Sampler State Pointer in
+       * the header.
+       */
+      header_size = 1;
+      sources[0] = fs_reg();
+      length++;
+
+      /* If we're requesting fewer than four channels worth of response,
+       * and we have an explicit header, we need to set up the sampler
+       * writemask.  It's reversed from normal: 1 means "don't write".
+       */
+      if (!inst->eot && regs_written(inst) != 4 * reg_width) {
+         assert(regs_written(inst) % reg_width == 0);
+         unsigned mask = ~((1 << (regs_written(inst) / reg_width)) - 1) & 0xf;
+         inst->offset |= mask << 12;
+      }
+   }
+
+   if (shadow_c.file != BAD_FILE) {
+      bld.MOV(sources[length], shadow_c);
+      length++;
+   }
+
+   bool coordinate_done = false;
+
+   /* Set up the LOD info */
+   switch (op) {
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXL:
+      if (devinfo->gen >= 9 && op == SHADER_OPCODE_TXL && lod.is_zero()) {
+         op = SHADER_OPCODE_TXL_LZ;
+         break;
+      }
+      bld.MOV(sources[length], lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXD:
+      /* TXD should have been lowered in SIMD16 mode. */
+      assert(bld.dispatch_width() == 8);
+
+      /* Load dPdx and the coordinate together:
+       * [hdr], [ref], x, dPdx.x, dPdy.x, y, dPdx.y, dPdy.y, z, dPdx.z, dPdy.z
+       */
+      for (unsigned i = 0; i < coord_components; i++) {
+         bld.MOV(sources[length++], offset(coordinate, bld, i));
+
+         /* For cube map array, the coordinate is (u,v,r,ai) but there are
+          * only derivatives for (u, v, r).
+          */
+         if (i < grad_components) {
+            bld.MOV(sources[length++], offset(lod, bld, i));
+            bld.MOV(sources[length++], offset(lod2, bld, i));
+         }
+      }
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TXS:
+      bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), lod);
+      length++;
+      break;
+   case SHADER_OPCODE_TXF:
+      /* Unfortunately, the parameters for LD are intermixed: u, lod, v, r.
+       * On Gen9 they are u, v, lod, r
+       */
+      bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D), coordinate);
+
+      if (devinfo->gen >= 9) {
+         if (coord_components >= 2) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D),
+                    offset(coordinate, bld, 1));
+         } else {
+            sources[length] = brw_imm_d(0);
+         }
+         length++;
+      }
+
+      if (devinfo->gen >= 9 && lod.is_zero()) {
+         op = SHADER_OPCODE_TXF_LZ;
+      } else {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_D), lod);
+         length++;
+      }
+
+      for (unsigned i = devinfo->gen >= 9 ? 2 : 1; i < coord_components; i++)
+         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+                 offset(coordinate, bld, i));
+
+      coordinate_done = true;
+      break;
+
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+      if (op == SHADER_OPCODE_TXF_UMS ||
+          op == SHADER_OPCODE_TXF_CMS ||
+          op == SHADER_OPCODE_TXF_CMS_W) {
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), sample_index);
+         length++;
+      }
+
+      if (op == SHADER_OPCODE_TXF_CMS || op == SHADER_OPCODE_TXF_CMS_W) {
+         /* Data from the multisample control surface. */
+         bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD), mcs);
+         length++;
+
+         /* On Gen9+ we'll use ld2dms_w instead which has two registers for
+          * the MCS data.
+          */
+         if (op == SHADER_OPCODE_TXF_CMS_W) {
+            bld.MOV(retype(sources[length], BRW_REGISTER_TYPE_UD),
+                    mcs.file == IMM ?
+                    mcs :
+                    offset(mcs, bld, 1));
+            length++;
+         }
+      }
+
+      /* There is no offsetting for this message; just copy in the integer
+       * texture coordinates.
+       */
+      for (unsigned i = 0; i < coord_components; i++)
+         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+                 offset(coordinate, bld, i));
+
+      coordinate_done = true;
+      break;
+   case SHADER_OPCODE_TG4_OFFSET:
+      /* More crazy intermixing */
+      for (unsigned i = 0; i < 2; i++) /* u, v */
+         bld.MOV(sources[length++], offset(coordinate, bld, i));
+
+      for (unsigned i = 0; i < 2; i++) /* offu, offv */
+         bld.MOV(retype(sources[length++], BRW_REGISTER_TYPE_D),
+                 offset(tg4_offset, bld, i));
+
+      if (coord_components == 3) /* r if present */
+         bld.MOV(sources[length++], offset(coordinate, bld, 2));
+
+      coordinate_done = true;
+      break;
+   default:
+      break;
+   }
+
+   /* Set up the coordinate (except for cases where it was done above) */
+   if (!coordinate_done) {
+      for (unsigned i = 0; i < coord_components; i++)
+         bld.MOV(sources[length++], offset(coordinate, bld, i));
+   }
+
+   int mlen;
+   if (reg_width == 2)
+      mlen = length * reg_width - header_size;
+   else
+      mlen = length * reg_width;
+
+   const fs_reg src_payload = fs_reg(VGRF, bld.shader->alloc.allocate(mlen),
+                                     BRW_REGISTER_TYPE_F);
+   bld.LOAD_PAYLOAD(src_payload, sources, length, header_size);
+
+   /* Generate the SEND. */
+   inst->opcode = op;
+   inst->src[0] = src_payload;
+   inst->src[1] = surface;
+   inst->src[2] = sampler;
+   inst->resize_sources(3);
+   inst->mlen = mlen;
+   inst->header_size = header_size;
+
+   /* Message length > MAX_SAMPLER_MESSAGE_SIZE disallowed by hardware. */
+   assert(inst->mlen <= MAX_SAMPLER_MESSAGE_SIZE);
+}
+
+static void
+lower_sampler_logical_send(const fs_builder &bld, fs_inst *inst, opcode op)
+{
+   /* Unpack the logical sources and hand them to the per-gen lowering. */
+   assert(inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].file == IMM);
+   assert(inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].file == IMM);
+   const unsigned coord_components = inst->src[TEX_LOGICAL_SRC_COORD_COMPONENTS].ud;
+   const unsigned grad_components = inst->src[TEX_LOGICAL_SRC_GRAD_COMPONENTS].ud;
+
+   const fs_reg &coordinate = inst->src[TEX_LOGICAL_SRC_COORDINATE];
+   const fs_reg &shadow_c = inst->src[TEX_LOGICAL_SRC_SHADOW_C];
+   const fs_reg &lod = inst->src[TEX_LOGICAL_SRC_LOD];
+   const fs_reg &lod2 = inst->src[TEX_LOGICAL_SRC_LOD2];
+   const fs_reg &sample_index = inst->src[TEX_LOGICAL_SRC_SAMPLE_INDEX];
+   const fs_reg &mcs = inst->src[TEX_LOGICAL_SRC_MCS];
+   const fs_reg &surface = inst->src[TEX_LOGICAL_SRC_SURFACE];
+   const fs_reg &sampler = inst->src[TEX_LOGICAL_SRC_SAMPLER];
+   const fs_reg &tg4_offset = inst->src[TEX_LOGICAL_SRC_TG4_OFFSET];
+
+   if (bld.shader->devinfo->gen >= 7) {
+      lower_sampler_logical_send_gen7(bld, inst, op, coordinate, shadow_c,
+                                      lod, lod2, sample_index, mcs, surface,
+                                      sampler, tg4_offset, coord_components,
+                                      grad_components);
+   } else if (bld.shader->devinfo->gen >= 5) {
+      lower_sampler_logical_send_gen5(bld, inst, op, coordinate, shadow_c,
+                                      lod, lod2, sample_index, surface,
+                                      sampler, coord_components,
+                                      grad_components);
+   } else {
+      lower_sampler_logical_send_gen4(bld, inst, op, coordinate, shadow_c,
+                                      lod, lod2, surface, sampler,
+                                      coord_components, grad_components);
+   }
+}
+
+/**
+ * Build the header used by some typed and untyped surface messages: an
+ * all-zero SIMD8 UD register with \p sample_mask stored in component 7.
+ */
+static fs_reg
+emit_surface_header(const fs_builder &bld, const fs_reg &sample_mask)
+{
+   const fs_builder hbld = bld.exec_all().group(8, 0);
+   const fs_reg header = hbld.vgrf(BRW_REGISTER_TYPE_UD);
+   hbld.MOV(header, brw_imm_d(0));
+   hbld.MOV(component(header, 7), sample_mask);
+   return header;
+}
+
+static void
+lower_surface_logical_send(const fs_builder &bld, fs_inst *inst, opcode op,
+                           const fs_reg &sample_mask)
+{
+   /* Name the five sources of the logical instruction. */
+   const fs_reg &addr = inst->src[0];
+   const fs_reg &src = inst->src[1];
+   const fs_reg &surface = inst->src[2];
+   const UNUSED fs_reg &dims = inst->src[3];
+   const fs_reg &arg = inst->src[4];
+
+   /* A header is emitted only if the caller supplied a sample mask. */
+   const unsigned header_sz = (sample_mask.file != BAD_FILE ? 1 : 0);
+   const unsigned addr_sz = inst->components_read(0);
+   const unsigned src_sz = inst->components_read(1);
+   const unsigned total_sz = header_sz + addr_sz + src_sz;
+
+   /* Scratch list of payload sources and the register that receives them. */
+   fs_reg *const parts = new fs_reg[total_sz];
+   const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, total_sz);
+   fs_reg *next = parts;
+
+   /* Payload layout: [header], address components, data components. */
+   if (header_sz != 0)
+      *next++ = emit_surface_header(bld, sample_mask);
+
+   for (unsigned c = 0; c != addr_sz; ++c)
+      *next++ = offset(addr, bld, c);
+
+   for (unsigned c = 0; c != src_sz; ++c)
+      *next++ = offset(src, bld, c);
+
+   bld.LOAD_PAYLOAD(payload, parts, total_sz, header_sz);
+
+   /* Rewrite the instruction in place as the lowered message. */
+   inst->opcode = op;
+   inst->mlen = header_sz + (addr_sz + src_sz) * inst->exec_size / 8;
+   inst->header_size = header_sz;
+   /* surface/arg alias src[2]/src[4]: keep these stores in ascending order. */
+   inst->src[0] = payload;
+   inst->src[1] = surface;
+   inst->src[2] = arg;
+   inst->resize_sources(3);
+
+   delete[] parts;
+}
+
+static void
+lower_varying_pull_constant_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+
+   if (devinfo->gen < 7) {
+      /* MRF-based message: the offset lands one register into the payload. */
+      const fs_reg msg(MRF, FIRST_PULL_LOAD_MRF(devinfo->gen),
+                       BRW_REGISTER_TYPE_UD);
+
+      bld.MOV(byte_offset(msg, REG_SIZE), inst->src[1]);
+
+      inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4;
+      inst->resize_sources(1);
+      inst->base_mrf = msg.nr;
+      inst->header_size = 1;
+      inst->mlen = 1 + inst->exec_size / 8;
+      return;
+   }
+
+   /* The instruction turns from an ALU-like instruction into a
+    * send-from-grf one.  Sends can handle neither strides nor source
+    * modifiers, so the offset source is copied into a fresh register.
+    */
+   const fs_reg offset_copy = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(offset_copy, inst->src[1]);
+   inst->src[1] = offset_copy;
+   inst->opcode = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
+}
+
+static void
+lower_math_logical_send(const fs_builder &bld, fs_inst *inst)
+{
+   assert(bld.shader->devinfo->gen < 6);
+
+   inst->base_mrf = 2;
+   inst->mlen = inst->sources * inst->exec_size / 8;
+
+   if (inst->sources <= 1)
+      return;
+
+   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+    * "Message Payload":
+    *
+    * "Operand0[7].  For the INT DIV functions, this operand is the
+    *  denominator."
+    *  ...
+    * "Operand1[7].  For the INT DIV functions, this operand is the
+    *  numerator."
+    */
+   const bool swap_operands = inst->opcode != SHADER_OPCODE_POW;
+   const fs_reg operand0 = inst->src[swap_operands ? 1 : 0];
+   const fs_reg operand1 = inst->src[swap_operands ? 0 : 1];
+
+   inst->resize_sources(1);
+   inst->src[0] = operand0;
+   assert(inst->exec_size == 8);
+   bld.MOV(fs_reg(MRF, inst->base_mrf + 1, operand1.type), operand1);
+}
+
+bool
+fs_visitor::lower_logical_sends()
+{
+   bool progress = false;
+   /* Visit every instruction and lower the logical send-like ones in place. */
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const fs_builder ibld(this, block, inst);
+
+      switch (inst->opcode) {
+      case FS_OPCODE_FB_WRITE_LOGICAL:
+         assert(stage == MESA_SHADER_FRAGMENT);
+         lower_fb_write_logical_send(ibld, inst,
+                                     brw_wm_prog_data(prog_data),
+                                     (const brw_wm_prog_key *)key,
+                                     payload);
+         break;
+
+      case FS_OPCODE_FB_READ_LOGICAL:
+         lower_fb_read_logical_send(ibld, inst);
+         break;
+
+      case SHADER_OPCODE_TEX_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TEX);
+         break;
+
+      case SHADER_OPCODE_TXD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXD);
+         break;
+
+      case SHADER_OPCODE_TXF_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF);
+         break;
+
+      case SHADER_OPCODE_TXL_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXL);
+         break;
+
+      case SHADER_OPCODE_TXS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXS);
+         break;
+
+      case FS_OPCODE_TXB_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, FS_OPCODE_TXB);
+         break;
+
+      case SHADER_OPCODE_TXF_CMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS);
+         break;
+
+      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_CMS_W);
+         break;
+
+      case SHADER_OPCODE_TXF_UMS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_UMS);
+         break;
+
+      case SHADER_OPCODE_TXF_MCS_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TXF_MCS);
+         break;
+
+      case SHADER_OPCODE_LOD_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_LOD);
+         break;
+
+      case SHADER_OPCODE_TG4_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4);
+         break;
+
+      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_TG4_OFFSET);
+         break;
+
+      case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+         lower_sampler_logical_send(ibld, inst, SHADER_OPCODE_SAMPLEINFO);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_READ,
+                                    fs_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_UNTYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_READ,
+                                    brw_imm_d(0xffff)); /* constant mask */
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+         lower_surface_logical_send(ibld, inst,
+                                    SHADER_OPCODE_TYPED_ATOMIC,
+                                    ibld.sample_mask_reg());
+         break;
+
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+         lower_varying_pull_constant_logical_send(ibld, inst);
+         break;
+
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+      case SHADER_OPCODE_POW:
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+         /* The math opcodes are overloaded for the send-like and
+          * expression-like instructions which seems kind of icky.  Gen6+ has
+          * a native (but rather quirky) MATH instruction so we don't need to
+          * do anything here.  On Gen4-5 we'll have to lower the Gen6-like
+          * logical instructions (which we can easily recognize because they
+          * have mlen = 0) into send-like virtual instructions.
+          */
+         if (devinfo->gen < 6 && inst->mlen == 0) {
+            lower_math_logical_send(ibld, inst);
+            break;
+
+         } else {
+            continue; /* nothing lowered: skip the progress update below */
+         }
+
+      default:
+         continue; /* not a logical send: leave the instruction untouched */
+      }
+      /* Only reached via 'break', i.e. when inst was actually lowered. */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Compute the closest SIMD width permitted for instruction \p inst given a
+ * set of common regioning and execution control restrictions that apply to
+ * FPU instructions.  Instructions that don't go through the FPU pipeline
+ * (extended math, control flow, send messages) aren't necessarily subject
+ * to these restrictions.
+ *
+ * Virtual opcodes have to decide for themselves -- When a virtual
+ * instruction unrolls into a simple sequence of FPU instructions, enforcing
+ * the FPU-like regioning restrictions on the virtual instruction already
+ * may simplify its lowering, while in other cases (e.g. virtual send-like
+ * instructions) doing so may be excessively restrictive.  The result is
+ * always rounded down to a power of two.
+ */
+static unsigned
+get_fpu_lowered_simd_width(const struct gen_device_info *devinfo,
+                           const fs_inst *inst)
+{
+   /* Start from the largest execution size the instruction controls encode. */
+   unsigned width = MIN2(32, inst->exec_size);
+
+   /* According to the PRMs:
+    *  "A. In Direct Addressing mode, a source cannot span more than 2
+    *      adjacent GRF registers.
+    *   B. A destination cannot span more than 2 adjacent GRF registers."
+    *
+    * Find the largest register region among the destination and all the
+    * sources, since that is the one which is going to limit the overall
+    * execution size of the instruction due to this rule.
+    */
+   unsigned max_regs = DIV_ROUND_UP(inst->size_written, REG_SIZE);
+
+   for (unsigned s = 0; s < inst->sources; s++)
+      max_regs = MAX2(max_regs, DIV_ROUND_UP(inst->size_read(s), REG_SIZE));
+
+   /* Scale the execution size down by the factor by which the largest
+    * region exceeds the hardware limit of 2 GRFs.
+    */
+   if (max_regs > 2)
+      width = MIN2(width, inst->exec_size / DIV_ROUND_UP(max_regs, 2));
+
+   /* According to the IVB PRMs:
+    *  "When destination spans two registers, the source MUST span two
+    *   registers. The exception to the above rule:
+    *
+    *    - When source is scalar, the source registers are not incremented.
+    *    - When source is packed integer Word and destination is packed
+    *      integer DWord, the source register is not incremented but the
+    *      source sub register is incremented."
+    *
+    * The hardware specs from Gen4 to Gen7.5 mention similar regioning
+    * restrictions.  The code below intentionally doesn't check whether the
+    * destination type is integer because empirically the hardware doesn't
+    * seem to care what the actual type is as long as it's dword-aligned.
+    */
+   if (devinfo->gen < 8 && inst->size_written > REG_SIZE) {
+      const unsigned dst_regs = DIV_ROUND_UP(inst->size_written, REG_SIZE);
+      for (unsigned s = 0; s < inst->sources; s++) {
+         if (inst->size_read(s) == 0 || inst->size_read(s) > REG_SIZE ||
+             is_uniform(inst->src[s]))
+            continue;
+
+         if (!(type_sz(inst->dst.type) == 4 && inst->dst.stride == 1 &&
+               type_sz(inst->src[s].type) == 2 && inst->src[s].stride == 1))
+            width = MIN2(width, inst->exec_size / dst_regs);
+      }
+   }
+
+   /* From the IVB PRMs:
+    *  "When an instruction is SIMD32, the low 16 bits of the execution mask
+    *   are applied for both halves of the SIMD32 instruction. If different
+    *   execution mask channels are required, split the instruction into two
+    *   SIMD16 instructions."
+    *
+    * There is similar text in the HSW PRMs.  Gen4-6 don't even implement
+    * 32-wide control flow support in hardware and will behave similarly.
+    */
+   if (!inst->force_writemask_all && devinfo->gen < 8)
+      width = MIN2(width, 16);
+
+   /* From the IVB PRMs (applies to HSW too):
+    *  "Instructions with condition modifiers must not use SIMD32."
+    *
+    * From the BDW PRMs (applies to later hardware too):
+    *  "Ternary instruction with condition modifiers must not use SIMD32."
+    */
+   if (inst->conditional_mod && (devinfo->gen < 8 || inst->is_3src(devinfo)))
+      width = MIN2(width, 16);
+
+   /* From the IVB PRMs (applies to other devices that don't have the
+    * gen_device_info::supports_simd16_3src flag set):
+    *  "In Align16 access mode, SIMD16 is not allowed for DW operations and
+    *   SIMD8 is not allowed for DF operations."
+    */
+   if (inst->is_3src(devinfo) && !devinfo->supports_simd16_3src)
+      width = MIN2(width, inst->exec_size / max_regs);
+
+   /* Pre-Gen8 EUs are hardwired to use the QtrCtrl+1 (where QtrCtrl is
+    * the 8-bit quarter of the execution mask signals specified in the
+    * instruction control fields) for the second compressed half of any
+    * single-precision instruction (for double-precision instructions
+    * it's hardwired to use NibCtrl+1, at least on HSW), which means that
+    * the EU will apply the wrong execution controls for the second
+    * sequential GRF write if the number of channels per GRF is not exactly
+    * eight in single-precision mode (or four in double-float mode).
+    *
+    * In this situation we calculate the maximum size of the split
+    * instructions so they only ever write to a single register.
+    */
+   if (devinfo->gen < 8 && inst->size_written > REG_SIZE &&
+       !inst->force_writemask_all) {
+      unsigned exec_sz = 0;
+      for (unsigned s = 0; s < inst->sources; s++) {
+         if (inst->src[s].file != BAD_FILE)
+            exec_sz = MAX2(exec_sz, type_sz(inst->src[s].type));
+      }
+      assert(exec_sz);
+      /* The hardware shifts exactly 8 channels per compressed half of the
+       * instruction in single-precision mode and exactly 4 in double-precision.
+       */
+      const unsigned hw_chans_per_reg = (exec_sz == 8 ? 4 : 8);
+      const unsigned chans_per_reg = inst->exec_size /
+         DIV_ROUND_UP(inst->size_written, REG_SIZE);
+      if (chans_per_reg != hw_chans_per_reg)
+         width = MIN2(width, chans_per_reg);
+   }
+
+   /* The instruction control fields can only represent power-of-two
+    * execution sizes, so round the result down to one.
+    */
+   return 1 << _mesa_logbase2(width);
+}
+
+/**
+ * Get the maximum allowed SIMD width for instruction \p inst accounting for
+ * various payload size restrictions that apply to sampler message
+ * instructions.
+ *
+ * This is only intended to provide a maximum theoretical bound for the
+ * execution size of the message based on the number of argument components
+ * alone, which in most cases will determine whether the SIMD8 or SIMD16
+ * variant of the message can be used, though some messages may have
+ * additional restrictions not accounted for here (e.g. pre-ILK hardware uses
+ * the message length to determine the exact SIMD width and argument count,
+ * which makes a number of sampler message combinations impossible to
+ * represent).
+ */
+static unsigned
+get_sampler_lowered_simd_width(const struct gen_device_info *devinfo,
+                               const fs_inst *inst)
+{
+   /* Calculate the number of coordinate components that have to be present
+    * assuming that additional arguments follow the texel coordinates in the
+    * message payload.  On IVB+ there is no need for padding, on ILK-SNB we
+    * need to pad to four or three components depending on the message,
+    * pre-ILK we need to pad to at most three components.
+    */
+   const unsigned req_coord_components =
+      (devinfo->gen >= 7 ||
+       !inst->components_read(TEX_LOGICAL_SRC_COORDINATE)) ? 0 :
+      (devinfo->gen >= 5 && inst->opcode != SHADER_OPCODE_TXF_LOGICAL &&
+                            inst->opcode != SHADER_OPCODE_TXF_CMS_LOGICAL) ? 4 :
+      3;
+
+   /* On Gen9+ the LOD argument is for free if we're able to use the LZ
+    * variant of the TXL or TXF message.  \p inst is still a logical opcode
+    * at this point, so the check must use the _LOGICAL opcodes to ever hit.
+    */
+   const bool implicit_lod = devinfo->gen >= 9 &&
+                             (inst->opcode == SHADER_OPCODE_TXL_LOGICAL ||
+                              inst->opcode == SHADER_OPCODE_TXF_LOGICAL) &&
+                             inst->src[TEX_LOGICAL_SRC_LOD].is_zero();
+   /* Calculate the total number of argument components that need to be passed
+    * to the sampler unit.
+    */
+   const unsigned num_payload_components =
+      MAX2(inst->components_read(TEX_LOGICAL_SRC_COORDINATE),
+           req_coord_components) +
+      inst->components_read(TEX_LOGICAL_SRC_SHADOW_C) +
+      (implicit_lod ? 0 : inst->components_read(TEX_LOGICAL_SRC_LOD)) +
+      inst->components_read(TEX_LOGICAL_SRC_LOD2) +
+      inst->components_read(TEX_LOGICAL_SRC_SAMPLE_INDEX) +
+      (inst->opcode == SHADER_OPCODE_TG4_OFFSET_LOGICAL ?
+       inst->components_read(TEX_LOGICAL_SRC_TG4_OFFSET) : 0) +
+      inst->components_read(TEX_LOGICAL_SRC_MCS);
+
+   /* SIMD16 messages with more than five arguments exceed the maximum message
+    * size supported by the sampler, regardless of whether a header is
+    * provided or not.
+    */
+   return MIN2(inst->exec_size,
+               num_payload_components > MAX_SAMPLER_MESSAGE_SIZE / 2 ? 8 : 16);
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst, which may be wider than the original (e.g. SIMD16-only messages on
+ * Gen4).  The instruction will be left untouched by
+ * fs_visitor::lower_simd_width() if the returned value equals its exec_size.
+ */
+static unsigned
+get_lowered_simd_width(const struct gen_device_info *devinfo,
+                       const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_CSEL:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_BFREV:
+   case BRW_OPCODE_BFE:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_LZD:
+   case BRW_OPCODE_FBH:
+   case BRW_OPCODE_FBL:
+   case BRW_OPCODE_CBIT:
+   case BRW_OPCODE_SAD2:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case FS_OPCODE_PACK:
+      return get_fpu_lowered_simd_width(devinfo, inst);
+
+   case BRW_OPCODE_CMP: {
+      /* The Ivybridge/BayTrail WaCMPInstFlagDepClearedEarly workaround says that
+       * when the destination is a GRF the dependency-clear bit on the flag
+       * register is cleared early.
+       *
+       * Suggested workarounds are to disable coissuing CMP instructions
+       * or to split CMP(16) instructions into two CMP(8) instructions.
+       *
+       * We choose to split into CMP(8) instructions since disabling
+       * coissuing would affect CMP instructions not otherwise affected by
+       * the errata.
+       */
+      const unsigned max_width = (devinfo->gen == 7 && !devinfo->is_haswell &&
+                                  !inst->dst.is_null() ? 8 : ~0);
+      return MIN2(max_width, get_fpu_lowered_simd_width(devinfo, inst));
+   }
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+      /* The Haswell WaForceSIMD8ForBFIInstruction workaround says that we
+       * should
+       *  "Force BFI instructions to be executed always in SIMD8."
+       */
+      return MIN2(devinfo->is_haswell ? 8 : ~0u,
+                  get_fpu_lowered_simd_width(devinfo, inst));
+
+   case BRW_OPCODE_IF:
+      assert(inst->src[0].file == BAD_FILE || inst->exec_size <= 16);
+      return inst->exec_size;
+
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* Unary extended math instructions are limited to SIMD8 on Gen4
+       * (G4X excepted) and Gen6.
+       */
+      return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+              devinfo->gen == 5 || devinfo->is_g4x ? MIN2(16, inst->exec_size) :
+              MIN2(8, inst->exec_size));
+
+   case SHADER_OPCODE_POW:
+      /* SIMD16 is only allowed on Gen7+. */
+      return (devinfo->gen >= 7 ? MIN2(16, inst->exec_size) :
+              MIN2(8, inst->exec_size));
+
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      /* Integer division is limited to SIMD8 on all generations. */
+      return MIN2(8, inst->exec_size);
+
+   case FS_OPCODE_LINTERP:
+   case FS_OPCODE_GET_BUFFER_SIZE:
+   case FS_OPCODE_DDX_COARSE:
+   case FS_OPCODE_DDX_FINE:
+   case FS_OPCODE_DDY_COARSE:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+   case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+   case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      return MIN2(16, inst->exec_size);
+
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+      /* Pre-ILK hardware doesn't have a SIMD8 variant of the texel fetch
+       * message used to implement varying pull constant loads, so expand it
+       * to SIMD16.  An alternative with longer message payload length but
+       * shorter return payload would be to use the SIMD8 sampler message that
+       * takes (header, u, v, r) as parameters instead of (header, u).
+       */
+      return (devinfo->gen == 4 ? 16 : MIN2(16, inst->exec_size));
+
+   case FS_OPCODE_DDY_FINE:
+      /* The implementation of this virtual opcode may require emitting
+       * compressed Align16 instructions, which are severely limited on some
+       * generations.
+       *
+       * From the Ivy Bridge PRM, volume 4 part 3, section 3.3.9 (Register
+       * Region Restrictions):
+       *
+       *  "In Align16 access mode, SIMD16 is not allowed for DW operations
+       *   and SIMD8 is not allowed for DF operations."
+       *
+       * In this context, "DW operations" means "operations acting on 32-bit
+       * values", so it includes operations on floats.
+       *
+       * Gen4 has a similar restriction.  From the i965 PRM, section 11.5.3
+       * (Instruction Compression -> Rules and Restrictions):
+       *
+       *  "A compressed instruction must be in Align1 access mode. Align16
+       *   mode instructions cannot be compressed."
+       *
+       * Similar text exists in the g45 PRM.
+       *
+       * Empirically, compressed align16 instructions using odd register
+       * numbers don't appear to work on Sandybridge either.
+       */
+      return (devinfo->gen == 4 || devinfo->gen == 6 ||
+              (devinfo->gen == 7 && !devinfo->is_haswell) ?
+              MIN2(8, inst->exec_size) : MIN2(16, inst->exec_size));
+
+   case SHADER_OPCODE_MULH:
+      /* MULH is lowered to the MUL/MACH sequence using the accumulator, which
+       * is 8-wide on Gen7+.
+       */
+      return (devinfo->gen >= 7 ? 8 :
+              get_fpu_lowered_simd_width(devinfo, inst));
+
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      /* Gen6 doesn't support SIMD16 depth writes but we cannot handle them
+       * here.
+       */
+      assert(devinfo->gen != 6 ||
+             inst->src[FB_WRITE_LOGICAL_SRC_SRC_DEPTH].file == BAD_FILE ||
+             inst->exec_size == 8);
+      /* Dual-source FB writes are unsupported in SIMD16 mode. */
+      return (inst->src[FB_WRITE_LOGICAL_SRC_COLOR1].file != BAD_FILE ?
+              8 : MIN2(16, inst->exec_size));
+
+   case FS_OPCODE_FB_READ_LOGICAL:
+      return MIN2(16, inst->exec_size);
+
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      return get_sampler_lowered_simd_width(devinfo, inst);
+
+   case SHADER_OPCODE_TXD_LOGICAL:
+      /* TXD is unsupported in SIMD16 mode. */
+      return 8;
+
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+      /* Only one execution size is representable pre-ILK depending on whether
+       * the shadow reference argument is present.
+       */
+      if (devinfo->gen == 4)
+         return inst->src[TEX_LOGICAL_SRC_SHADOW_C].file == BAD_FILE ? 16 : 8;
+      else
+         return get_sampler_lowered_simd_width(devinfo, inst);
+
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+      /* Gen4 doesn't have SIMD8 variants for the RESINFO and LD-with-LOD
+       * messages.  Use SIMD16 instead.
+       */
+      if (devinfo->gen == 4)
+         return 16;
+      else
+         return get_sampler_lowered_simd_width(devinfo, inst);
+
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return 8; /* fixed SIMD8, independent of the original exec_size */
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+      return MIN2(16, inst->exec_size);
+
+   case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+      return MIN2(8, inst->exec_size);
+
+   case SHADER_OPCODE_MOV_INDIRECT:
+      /* Pre-BDW has only 8 address subregisters; dst is capped at 2 regs. */
+      return MIN3(devinfo->gen >= 8 ? 16 : 8,
+                  2 * REG_SIZE / (inst->dst.stride * type_sz(inst->dst.type)),
+                  inst->exec_size);
+
+   case SHADER_OPCODE_LOAD_PAYLOAD: {
+      const unsigned reg_count =
+         DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
+
+      if (reg_count > 2) {
+         /* Only LOAD_PAYLOAD instructions with per-channel destination region
+          * can be easily lowered (which excludes headers and heterogeneous
+          * types).
+          */
+         assert(!inst->header_size);
+         for (unsigned i = 0; i < inst->sources; i++)
+            assert(type_sz(inst->dst.type) == type_sz(inst->src[i].type) ||
+                   inst->src[i].file == BAD_FILE);
+
+         return inst->exec_size / DIV_ROUND_UP(reg_count, 2);
+      } else {
+         return inst->exec_size;
+      }
+   }
+   default:
+      return inst->exec_size; /* unchanged width: no lowering requested */
+   }
+}
+
+/**
+ * Tell whether the i-th source of \p inst has to be copied into a temporary
+ * when the channel group given by lbld.group() is split out of it, i.e.
+ * whether the lowered instruction cannot read the original region directly.
+ */
+static inline bool
+needs_src_copy(const fs_builder &lbld, const fs_inst *inst, unsigned i)
+{
+   return !is_periodic(inst->src[i], lbld.dispatch_width()) &&
+          (inst->components_read(i) != 1 ||
+           lbld.dispatch_width() > inst->exec_size);
+}
+
+/**
+ * Return the region the lowered instruction has to read for the i-th source
+ * of \p inst when only the channel group given by lbld.group() is executed.
+ * If needs_src_copy() holds, the data is packed into a new temporary by MOVs
+ * emitted before the given \p inst in \p block and the temporary is returned.
+ */
+static fs_reg
+emit_unzip(const fs_builder &lbld, bblock_t *block, fs_inst *inst,
+           unsigned i)
+{
+   /* Specified channel group from the source region. */
+   const fs_reg src = horiz_offset(inst->src[i], lbld.group());
+
+   if (needs_src_copy(lbld, inst, i)) {
+      /* Copy builder clamped to the smaller of the lowered and the original
+       * execution size, so no uninitialized data is copied when the lowered
+       * instruction is wider than the original one.
+       */
+      const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
+                                              inst->exec_size), 0);
+      const fs_reg tmp = lbld.vgrf(inst->src[i].type, inst->components_read(i));
+
+      for (unsigned k = 0; k < inst->components_read(i); ++k) /* per component */
+         cbld.at(block, inst)
+             .MOV(offset(tmp, lbld, k), offset(src, inst->exec_size, k));
+
+      return tmp;
+
+   } else if (is_periodic(inst->src[i], lbld.dispatch_width())) {
+      /* The source is invariant for all dispatch_width-wide groups of the
+       * original region, so the original region is returned unchanged.
+       */
+      return inst->src[i];
+
+   } else {
+      /* We can just point the lowered instruction at the right channel group
+       * from the original region.
+       */
+      return src;
+   }
+}
+
+/**
+ * Return true if the lowered instruction for the channel group given by
+ * lbld.group() must write to a temporary that is copied back into the
+ * destination region of \p inst: multi-component writes, a lowered width
+ * above the original, or a destination overlapping a non-equal source.
+ */
+static inline bool
+needs_dst_copy(const fs_builder &lbld, const fs_inst *inst)
+{
+   /* If the instruction writes more than one component we'll have to shuffle
+    * the results of multiple lowered instructions in order to make sure that
+    * they end up arranged correctly in the original destination region.
+    */
+   if (inst->size_written > inst->dst.component_size(inst->exec_size))
+      return true;
+
+   /* If the lowered execution size is larger than the original the result of
+    * the instruction won't fit in the original destination, so we'll have to
+    * allocate a temporary in any case.
+    */
+   if (lbld.dispatch_width() > inst->exec_size)
+      return true;
+
+   for (unsigned i = 0; i < inst->sources; i++) {
+      /* If we already made a copy of the source for other reasons there won't
+       * be any overlap with the destination.
+       */
+      if (needs_src_copy(lbld, inst, i))
+         continue;
+
+      /* In order to keep the logic simple we emit a copy whenever the
+       * destination region doesn't exactly match an overlapping source.  Such
+       * a mismatch may mean that source and destination are not aligned group
+       * by group, in which case one lowered instruction could overwrite data
+       * that another lowered instruction still has to read from that source.
+       */
+      if (regions_overlap(inst->dst, inst->size_written,
+                          inst->src[i], inst->size_read(i)) &&
+          !inst->dst.equals(inst->src[i]))
+        return true;
+   }
+
+   return false;   /* the original destination can be written in place */
+}
+
+/**
+ * Return the region the lowered instruction has to write for the channel
+ * group given by lbld.group() of \p inst.  If needs_dst_copy() holds this is
+ * a packed temporary and MOVs copying it into the original destination are
+ * emitted after \p inst in \p block (plus MOVs before it if predicated).
+ */
+static fs_reg
+emit_zip(const fs_builder &lbld, bblock_t *block, fs_inst *inst)
+{
+   /* Copy builder clamped to the smaller of the lowered and the original
+    * execution size, so no uninitialized data is copied when the lowered
+    * instruction is wider than the original one.
+    */
+   const fs_builder cbld = lbld.group(MIN2(lbld.dispatch_width(),
+                                           inst->exec_size), 0);
+
+   /* Specified channel group from the destination region. */
+   const fs_reg dst = horiz_offset(inst->dst, lbld.group());
+   const unsigned dst_size = inst->size_written /
+      inst->dst.component_size(inst->exec_size); /* components written */
+
+   if (needs_dst_copy(lbld, inst)) {
+      const fs_reg tmp = lbld.vgrf(inst->dst.type, dst_size);
+
+      if (inst->predicate) {
+         /* Handle predication by copying the original contents of
+          * the destination into the temporary before emitting the
+          * lowered instruction.
+          */
+         for (unsigned k = 0; k < dst_size; ++k)
+            cbld.at(block, inst)
+                .MOV(offset(tmp, lbld, k), offset(dst, inst->exec_size, k));
+      }
+
+      for (unsigned k = 0; k < dst_size; ++k) /* copy-back, placed after inst */
+         cbld.at(block, inst->next)
+             .MOV(offset(dst, inst->exec_size, k), offset(tmp, lbld, k));
+
+      return tmp;
+
+   } else {
+      /* No need to allocate a temporary for the lowered instruction, just
+       * take the right group of channels from the original region.
+       */
+      return dst;
+   }
+}
+
+bool
+fs_visitor::lower_simd_width()
+{  /* Split or widen instructions to the width get_lowered_simd_width() allows. */
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      const unsigned lower_width = get_lowered_simd_width(devinfo, inst);
+
+      if (lower_width != inst->exec_size) {
+         /* Builder matching the original instruction.  We may also need to
+          * emit an instruction of width larger than the original, set the
+          * execution size of the builder to the highest of both for now so
+          * we're sure that both cases can be handled.
+          */
+         const unsigned max_width = MAX2(inst->exec_size, lower_width);
+         const fs_builder ibld = bld.at(block, inst)
+                                    .exec_all(inst->force_writemask_all)
+                                    .group(max_width, inst->group / max_width);
+
+         /* Split the copies in chunks of the execution width of either the
+          * original or the lowered instruction, whichever is lower.
+          */
+         const unsigned n = DIV_ROUND_UP(inst->exec_size, lower_width);
+         const unsigned dst_size = inst->size_written /
+            inst->dst.component_size(inst->exec_size); /* components written */
+
+         assert(!inst->writes_accumulator && !inst->mlen);
+
+         for (unsigned i = 0; i < n; i++) {
+            /* Emit a copy of the original instruction with the lowered width.
+             * If the EOT flag was set throw it away except for the last
+             * instruction to avoid killing the thread prematurely.
+             */
+            fs_inst split_inst = *inst;
+            split_inst.exec_size = lower_width;
+            split_inst.eot = inst->eot && i == n - 1;
+
+            /* Select the correct channel enables for the i-th group, then
+             * transform the sources and destination and emit the lowered
+             * instruction.
+             */
+            const fs_builder lbld = ibld.group(lower_width, i);
+
+            for (unsigned j = 0; j < inst->sources; j++)
+               split_inst.src[j] = emit_unzip(lbld, block, inst, j);
+
+            split_inst.dst = emit_zip(lbld, block, inst);
+            split_inst.size_written =
+               split_inst.dst.component_size(lower_width) * dst_size;
+
+            lbld.emit(split_inst);
+         }
+
+         inst->remove(block);   /* the lowered copies replace the original */
+         progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;   /* true if at least one instruction was rewritten */
+}
+
+void
+fs_visitor::dump_instructions()
+{
+   dump_instructions(NULL);   /* a NULL name makes the dump go to stderr */
+}
+
+void
+fs_visitor::dump_instructions(const char *name)
+{  /* Dump the whole program to the file \p name, or to stderr. */
+   FILE *file = stderr;
+   if (name && geteuid() != 0) {   /* no file is opened when running as root */
+      file = fopen(name, "w");
+      if (!file)
+         file = stderr;   /* fall back to stderr if the file can't be opened */
+   }
+
+   if (cfg) {   /* with a CFG: prefix each line with the register pressure */
+      calculate_register_pressure();
+      int ip = 0, max_pressure = 0;
+      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
+         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
+         dump_instruction(inst, file);
+         ip++;
+      }
+      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
+   } else {   /* without a CFG: plain numbered listing of `instructions` */
+      int ip = 0;
+      foreach_in_list(backend_instruction, inst, &instructions) {
+         fprintf(file, "%4d: ", ip++);
+         dump_instruction(inst, file);
+      }
+   }
+
+   if (file != stderr) {   /* only close what was opened here */
+      fclose(file);
+   }
+}
+
+void
+fs_visitor::dump_instruction(backend_instruction *be_inst)
+{
+   dump_instruction(be_inst, stderr);   /* convenience overload: stderr */
+}
+
+void
+fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
+{  /* Print one instruction to \p file as a single text line ending in '\n'. */
+   fs_inst *inst = (fs_inst *)be_inst;   /* assumes be_inst is an fs_inst */
+
+   if (inst->predicate) {   /* "(+f0.N) " or "(-f0.N) " prefix */
+      fprintf(file, "(%cf0.%d) ",
+             inst->predicate_inverse ? '-' : '+',
+             inst->flag_subreg);
+   }
+
+   fprintf(file, "%s", brw_instruction_name(devinfo, inst->opcode));
+   if (inst->saturate)
+      fprintf(file, ".sat");
+   if (inst->conditional_mod) {
+      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
+      if (!inst->predicate &&
+          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+                              inst->opcode != BRW_OPCODE_IF &&
+                              inst->opcode != BRW_OPCODE_WHILE))) {
+         fprintf(file, ".f0.%d", inst->flag_subreg);
+      }
+   }
+   fprintf(file, "(%d) ", inst->exec_size);   /* execution size */
+
+   if (inst->mlen) {
+      fprintf(file, "(mlen: %d) ", inst->mlen);
+   }
+
+   if (inst->eot) {
+      fprintf(file, "(EOT) ");
+   }
+
+   switch (inst->dst.file) {   /* destination register name */
+   case VGRF:
+      fprintf(file, "vgrf%d", inst->dst.nr);
+      break;
+   case FIXED_GRF:
+      fprintf(file, "g%d", inst->dst.nr);
+      break;
+   case MRF:
+      fprintf(file, "m%d", inst->dst.nr);
+      break;
+   case BAD_FILE:
+      fprintf(file, "(null)");
+      break;
+   case UNIFORM:   /* NOTE(review): "***" presumably flags an unexpected file */
+      fprintf(file, "***u%d***", inst->dst.nr);
+      break;
+   case ATTR:
+      fprintf(file, "***attr%d***", inst->dst.nr);
+      break;
+   case ARF:
+      switch (inst->dst.nr) {
+      case BRW_ARF_NULL:
+         fprintf(file, "null");
+         break;
+      case BRW_ARF_ADDRESS:
+         fprintf(file, "a0.%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_ACCUMULATOR:
+         fprintf(file, "acc%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_FLAG:
+         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      default:
+         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      }
+      break;
+   case IMM:
+      unreachable("not reached");
+   }
+
+   if (inst->dst.offset ||   /* "+reg.byte" suffix, see the fprintf below */
+       (inst->dst.file == VGRF &&
+        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
+      const unsigned reg_size = (inst->dst.file == UNIFORM ? 4 : REG_SIZE);
+      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
+              inst->dst.offset % reg_size);
+   }
+
+   if (inst->dst.stride != 1)   /* only non-unit strides are printed */
+      fprintf(file, "<%u>", inst->dst.stride);
+   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
+
+   for (int i = 0; i < inst->sources; i++) {   /* one entry per source */
+      if (inst->src[i].negate)
+         fprintf(file, "-");
+      if (inst->src[i].abs)   /* opening '|' of the abs modifier */
+         fprintf(file, "|");
+      switch (inst->src[i].file) {
+      case VGRF:
+         fprintf(file, "vgrf%d", inst->src[i].nr);
+         break;
+      case FIXED_GRF:
+         fprintf(file, "g%d", inst->src[i].nr);
+         break;
+      case MRF:
+         fprintf(file, "***m%d***", inst->src[i].nr);
+         break;
+      case ATTR:
+         fprintf(file, "attr%d", inst->src[i].nr);
+         break;
+      case UNIFORM:
+         fprintf(file, "u%d", inst->src[i].nr);
+         break;
+      case BAD_FILE:
+         fprintf(file, "(null)");
+         break;
+      case IMM:   /* immediates are printed by value, formatted per type */
+         switch (inst->src[i].type) {
+         case BRW_REGISTER_TYPE_F:
+            fprintf(file, "%-gf", inst->src[i].f);
+            break;
+         case BRW_REGISTER_TYPE_DF:
+            fprintf(file, "%fdf", inst->src[i].df);
+            break;
+         case BRW_REGISTER_TYPE_W:
+         case BRW_REGISTER_TYPE_D:
+            fprintf(file, "%dd", inst->src[i].d);
+            break;
+         case BRW_REGISTER_TYPE_UW:
+         case BRW_REGISTER_TYPE_UD:
+            fprintf(file, "%uu", inst->src[i].ud);
+            break;
+         case BRW_REGISTER_TYPE_VF:   /* four packed 8-bit floats */
+            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
+                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
+            break;
+         default:
+            fprintf(file, "???");
+            break;
+         }
+         break;
+      case ARF:
+         switch (inst->src[i].nr) {
+         case BRW_ARF_NULL:
+            fprintf(file, "null");
+            break;
+         case BRW_ARF_ADDRESS:
+            fprintf(file, "a0.%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_ACCUMULATOR:
+            fprintf(file, "acc%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_FLAG:
+            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         default:
+            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         }
+         break;
+      }
+
+      if (inst->src[i].offset ||   /* same "+reg.byte" suffix as for dst */
+          (inst->src[i].file == VGRF &&
+           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
+         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 4 : REG_SIZE);
+         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
+                 inst->src[i].offset % reg_size);
+      }
+
+      if (inst->src[i].abs)   /* closing '|' of the abs modifier */
+         fprintf(file, "|");
+
+      if (inst->src[i].file != IMM) {   /* stride and type suffix */
+         unsigned stride;
+         if (inst->src[i].file == ARF || inst->src[i].file == FIXED_GRF) {
+            unsigned hstride = inst->src[i].hstride;
+            stride = (hstride == 0 ? 0 : (1 << (hstride - 1))); /* 0, 1, 2, 4... */
+         } else {
+            stride = inst->src[i].stride;
+         }
+         if (stride != 1)
+            fprintf(file, "<%u>", stride);
+
+         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
+      }
+
+      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
+         fprintf(file, ", ");   /* no separator before a BAD_FILE source */
+   }
+
+   fprintf(file, " ");
+
+   if (inst->force_writemask_all)
+      fprintf(file, "NoMask ");
+
+   if (inst->exec_size != dispatch_width)   /* group shown only in this case */
+      fprintf(file, "group%d ", inst->group);
+
+   fprintf(file, "\n");
+}
+
+/**
+ * Possibly returns the instruction that set up \p reg.
+ *
+ * Sometimes we want to take the result of some expression/variable
+ * dereference tree and rewrite the instruction generating the result
+ * of the tree.  When processing the tree, we know that the
+ * instructions generated are all writing temporaries that are dead
+ * outside of this tree.  So, if we have some instructions that write
+ * a temporary, we're free to point that temp write somewhere else.
+ *
+ * Note that this doesn't guarantee that the instruction writes only
+ * \p reg -- it might be the size=4 destination of a texture instruction.
+ */
+fs_inst *
+fs_visitor::get_instruction_generating_reg(fs_inst *start,
+					   fs_inst *end,
+					   const fs_reg &reg)
+{
+   /* Only \p end is examined: it qualifies when it differs from \p start,
+    * is not a partial write and its destination equals \p reg.
+    */
+   const bool end_writes_reg = end != start &&
+                               !end->is_partial_write() &&
+                               reg.equals(end->dst);
+   return end_writes_reg ? end : NULL;
+}
+
+void
+fs_visitor::setup_fs_payload_gen6()
+{  /* Assign fragment shader payload register numbers; order matters. */
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+   assert(devinfo->gen >= 6);
+
+   /* R0-1: masks, pixel X/Y coordinates. */
+   payload.num_regs = 2;
+   /* R2: only for 32-pixel dispatch (not reserved by this function). */
+
+   /* R3-26: barycentric interpolation coordinates.  These appear in the
+    * same order that they appear in the brw_barycentric_mode
+    * enum.  Each set of coordinates occupies 2 registers if dispatch width
+    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
+    * appear if they were enabled using the "Barycentric Interpolation
+    * Mode" bits in WM_STATE.
+    */
+   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+      if (prog_data->barycentric_interp_modes & (1 << i)) {
+         payload.barycentric_coord_reg[i] = payload.num_regs;
+         payload.num_regs += 2;
+         if (dispatch_width == 16) {
+            payload.num_regs += 2;
+         }
+      }
+   }
+
+   /* R27: interpolated depth if uses source depth */
+   prog_data->uses_src_depth =
+      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (prog_data->uses_src_depth) {
+      payload.source_depth_reg = payload.num_regs;
+      payload.num_regs++;
+      if (dispatch_width == 16) {
+         /* R28: interpolated depth if not SIMD8. */
+         payload.num_regs++;
+      }
+   }
+
+   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
+   prog_data->uses_src_w =
+      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0; /* as above */
+   if (prog_data->uses_src_w) {
+      payload.source_w_reg = payload.num_regs;
+      payload.num_regs++;
+      if (dispatch_width == 16) {
+         /* R30: interpolated W if not SIMD8. */
+         payload.num_regs++;
+      }
+   }
+
+   /* R31: MSAA position offsets. */
+   if (prog_data->persample_dispatch &&
+       (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_POS)) {
+      /* From the Ivy Bridge PRM documentation for 3DSTATE_PS:
+       *
+       *    "MSDISPMODE_PERSAMPLE is required in order to select
+       *    POSOFFSET_SAMPLE"
+       *
+       * So we can only really get sample positions if we are doing real
+       * per-sample dispatch.  If we need gl_SamplePosition and we don't have
+       * persample dispatch, we hard-code it to 0.5.
+       */
+      prog_data->uses_pos_offset = true;
+      payload.sample_pos_reg = payload.num_regs;
+      payload.num_regs++;   /* a single register for any dispatch width */
+   }
+
+   /* R32: MSAA input coverage mask */
+   prog_data->uses_sample_mask =
+      (nir->info->system_values_read & SYSTEM_BIT_SAMPLE_MASK_IN) != 0;
+   if (prog_data->uses_sample_mask) {
+      assert(devinfo->gen >= 7);
+      payload.sample_mask_in_reg = payload.num_regs;
+      payload.num_regs++;
+      if (dispatch_width == 16) {
+         /* R33: input coverage mask if not SIMD8. */
+         payload.num_regs++;
+      }
+   }
+
+   /* R34-: bary for 32-pixel (not reserved by this function). */
+   /* R58-59: interp W for 32-pixel (not reserved by this function). */
+
+   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+      source_depth_to_render_target = true;   /* shader writes its own depth */
+   }
+}
+
+void
+fs_visitor::setup_vs_payload()
+{  /* The vertex shader payload is a fixed two registers. */
+   /* R0: thread header, R1: urb handles */
+   payload.num_regs = 2;
+}
+
+void
+fs_visitor::setup_gs_payload()
+{  /* Size the geometry shader payload and choose push or pull inputs. */
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+   struct brw_vue_prog_data *vue_prog_data = brw_vue_prog_data(prog_data);
+
+   /* R0: thread header, R1: output URB handles */
+   payload.num_regs = 2;
+
+   if (gs_prog_data->include_primitive_id) {
+      /* R2: Primitive ID 0..7 */
+      payload.num_regs++;
+   }
+
+   /* Use a maximum of 24 registers for push-model inputs. */
+   const unsigned max_push_components = 24; /* NOTE(review): confirm the unit */
+
+   /* If pushing our inputs would take too many registers, reduce the URB read
+    * length (which is in HWords, or 8 registers), and resort to pulling.
+    *
+    * Note that the GS reads <URB Read Length> HWords for every vertex - so we
+    * have to multiply by VerticesIn to obtain the total storage requirement.
+    */
+   if (8 * vue_prog_data->urb_read_length * nir->info->gs.vertices_in >
+       max_push_components || gs_prog_data->invocations > 1) {
+      gs_prog_data->base.include_vue_handles = true;
+
+      /* R3..RN: ICP Handles for each incoming vertex (when using pull model) */
+      payload.num_regs += nir->info->gs.vertices_in;
+
+      vue_prog_data->urb_read_length =   /* shrink to what fits the limit */
+         ROUND_DOWN_TO(max_push_components / nir->info->gs.vertices_in, 8) / 8;
+   }
+}
+
+void
+fs_visitor::setup_cs_payload()
+{  /* The compute shader payload is a single register; requires gen7+. */
+   assert(devinfo->gen >= 7);
+   payload.num_regs = 1;
+}
+
+void
+fs_visitor::calculate_register_pressure()
+{
+   /* Always recompute liveness from scratch before measuring pressure. */
+   invalidate_live_intervals();
+   calculate_live_intervals();
+
+   unsigned total_insts = 0;
+   foreach_block(blk, cfg)
+      total_insts += blk->instructions.length();
+
+   /* One counter per instruction, indexed by IP; every VGRF adds its size. */
+   regs_live_at_ip = rzalloc_array(mem_ctx, int, total_insts);
+   for (unsigned vgrf = 0; vgrf < alloc.count; ++vgrf)
+      for (int ip = virtual_grf_start[vgrf]; ip <= virtual_grf_end[vgrf]; ++ip)
+         regs_live_at_ip[ip] += alloc.sizes[vgrf];
+}
+
+/**
+ * Remove FS_OPCODE_MOV_DISPATCH_TO_FLAGS instructions that repeat an earlier
+ * one targeting the same flag subregister.
+ *
+ * The needs_unlit_centroid_workaround ends up producing one of these per
+ * channel of centroid input, so most of them are redundant.
+ *
+ * This assumes that nothing ever modifies the dispatched pixels value the
+ * opcode reads from, but the hardware dictates that anyway.
+ */
+bool
+fs_visitor::opt_drop_redundant_mov_to_flags()
+{
+   bool seen_mov[2] = { false, false };   /* indexed by flag subregister */
+   bool changed = false;
+
+   /* Without the workaround no such instruction can have been emitted. */
+   if (!devinfo->needs_unlit_centroid_workaround)
+      return false;
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->is_control_flow()) {
+         seen_mov[0] = seen_mov[1] = false;
+      } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
+         if (seen_mov[inst->flag_subreg]) {
+            inst->remove(block);
+            changed = true;
+         } else {
+            seen_mov[inst->flag_subreg] = true;
+         }
+      } else if (inst->flags_written()) {
+         seen_mov[inst->flag_subreg] = false;
+      }
+   }
+
+   return changed;
+}
+
+void
+fs_visitor::optimize()
+{  /* Run the optimization and lowering passes, validating in between. */
+   /* Start by validating the shader we currently have. */
+   validate();
+
+   /* bld is the common builder object pointing at the end of the program we
+    * used to translate it into i965 IR.  For the optimization and lowering
+    * passes coming next, any code added after the end of the program without
+    * having explicitly called fs_builder::at() clearly points at a mistake.
+    * Ideally optimization passes wouldn't be part of the visitor so they
+    * wouldn't have access to bld at all, but they do, so just in case some
+    * pass forgets to ask for a location explicitly set it to NULL here to
+    * make it trip.  The dispatch width is initialized to a bogus value to
+    * make sure that optimizations set the execution controls explicitly to
+    * match the code they are manipulating instead of relying on the defaults.
+    */
+   bld = fs_builder(this, 64);   /* 64 is the bogus width mentioned above */
+
+   assign_constant_locations();
+   lower_constant_loads();
+
+   validate();
+
+   split_virtual_grfs();
+   validate();
+
+#define OPT(pass, args...) ({                                           \
+      pass_num++;                                                       \
+      bool this_progress = pass(args);                                  \
+                                                                        \
+      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
+         char filename[64];                                             \
+         snprintf(filename, 64, "%s%d-%s-%02d-%02d-" #pass,              \
+                  stage_abbrev, dispatch_width, nir->info->name, iteration, pass_num); \
+                                                                        \
+         backend_shader::dump_instructions(filename);                   \
+      }                                                                 \
+                                                                        \
+      validate();                                                       \
+                                                                        \
+      progress = progress || this_progress;                             \
+      this_progress;                                                    \
+   })
+
+   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {   /* dump the initial IR */
+      char filename[64];
+      snprintf(filename, 64, "%s%d-%s-00-00-start",
+               stage_abbrev, dispatch_width, nir->info->name);
+
+      backend_shader::dump_instructions(filename);
+   }
+
+   bool progress = false;   /* accumulated by OPT() */
+   int iteration = 0;       /* loop round, used in the dump file names */
+   int pass_num = 0;        /* pass index in the round, incremented by OPT() */
+
+   OPT(opt_drop_redundant_mov_to_flags);   /* run once, before the loop */
+
+   do {
+      progress = false;
+      pass_num = 0;
+      iteration++;
+
+      OPT(remove_duplicate_mrf_writes);
+
+      OPT(opt_algebraic);
+      OPT(opt_cse);
+      OPT(opt_copy_propagation);
+      OPT(opt_predicated_break, this);
+      OPT(opt_cmod_propagation);
+      OPT(dead_code_eliminate);
+      OPT(opt_peephole_sel);
+      OPT(dead_control_flow_eliminate, this);
+      OPT(opt_register_renaming);
+      OPT(opt_saturate_propagation);
+      OPT(register_coalesce);
+      OPT(compute_to_mrf);
+      OPT(eliminate_find_live_channel);
+
+      OPT(compact_virtual_grfs);
+   } while (progress);   /* repeat until a full round makes no progress */
+
+   progress = false;   /* from here on tracks the lowering passes below */
+   pass_num = 0;
+
+   if (OPT(lower_pack)) {
+      OPT(register_coalesce);
+      OPT(dead_code_eliminate);
+   }
+
+   if (OPT(lower_d2x)) {
+      OPT(opt_copy_propagation);
+      OPT(dead_code_eliminate);
+   }
+
+   OPT(lower_simd_width);
+
+   /* After SIMD lowering just in case we had to unroll the EOT send. */
+   OPT(opt_sampler_eot);
+
+   OPT(lower_logical_sends);
+
+   if (progress) {   /* some pass since the reset above changed the IR */
+      OPT(opt_copy_propagation);
+      /* Only run after logical send lowering because it's easier to implement
+       * in terms of physical sends.
+       */
+      if (OPT(opt_zero_samples))
+         OPT(opt_copy_propagation);
+      /* Run after logical send lowering to give it a chance to CSE the
+       * LOAD_PAYLOAD instructions created to construct the payloads of
+       * e.g. texturing messages in cases where it wasn't possible to CSE the
+       * whole logical instruction.
+       */
+      OPT(opt_cse);
+      OPT(register_coalesce);
+      OPT(compute_to_mrf);
+      OPT(dead_code_eliminate);
+      OPT(remove_duplicate_mrf_writes);
+      OPT(opt_peephole_sel);
+   }
+
+   OPT(opt_redundant_discard_jumps);
+
+   if (OPT(lower_load_payload)) {
+      split_virtual_grfs();
+      OPT(register_coalesce);
+      OPT(compute_to_mrf);
+      OPT(dead_code_eliminate);
+   }
+
+   OPT(opt_combine_constants);
+   OPT(lower_integer_multiplication);
+
+   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+      OPT(opt_cmod_propagation);
+      OPT(opt_cse);
+      OPT(opt_copy_propagation);
+      OPT(dead_code_eliminate);
+   }
+
+   lower_uniform_pull_constant_loads();   /* not an OPT(): validated below */
+
+   validate();
+}
+
+/**
+ * Three-source instructions need a GRF/MRF destination; ARF NULL is not
+ * allowed.  Replace any null destination with a freshly allocated VGRF.
+ */
+void
+fs_visitor::fixup_3src_null_dest()
+{
+   bool any_fixed = false;
+   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
+      const bool null_3src = inst->is_3src(devinfo) && inst->dst.is_null();
+      if (null_3src) {
+         inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8),
+                            inst->dst.type);
+         any_fixed = true;
+      }
+   }
+
+   if (any_fixed)
+      invalidate_live_intervals();
+}
+
+void
+fs_visitor::allocate_registers(bool allow_spilling)
+{  /* Schedule, register allocate (spilling if allowed), then size scratch. */
+   bool allocated_without_spills;
+
+   static const enum instruction_scheduler_mode pre_modes[] = {
+      SCHEDULE_PRE,
+      SCHEDULE_PRE_NON_LIFO,
+      SCHEDULE_PRE_LIFO,
+   };
+
+   bool spill_all = allow_spilling && (INTEL_DEBUG & DEBUG_SPILL_FS);
+
+   /* Try each scheduling heuristic to see if it can successfully register
+    * allocate without spilling.  They should be ordered by decreasing
+    * performance but increasing likelihood of allocating.
+    */
+   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
+      schedule_instructions(pre_modes[i]);
+
+      if (0) {   /* NOTE(review): dead branch, presumably a debugging switch */
+         assign_regs_trivial();
+         allocated_without_spills = true;
+      } else {
+         allocated_without_spills = assign_regs(false, spill_all);
+      }
+      if (allocated_without_spills)
+         break;
+   }
+
+   if (!allocated_without_spills) {
+      if (!allow_spilling)
+         fail("Failure to register allocate and spilling is not allowed.");
+
+      /* We assume that any spilling is worse than just dropping back to
+       * SIMD8.  There's probably actually some intermediate point where
+       * SIMD16 with a couple of spills is still better.
+       */
+      if (dispatch_width > min_dispatch_width) {
+         fail("Failure to register allocate.  Reduce number of "
+              "live scalar values to avoid this.");
+      } else {
+         compiler->shader_perf_log(log_data,
+                                   "%s shader triggered register spilling.  "
+                                   "Try reducing the number of live scalar "
+                                   "values to improve performance.\n",
+                                   stage_name);
+      }
+
+      /* Since we're out of heuristics, just go spill registers until we
+       * get an allocation.
+       */
+      while (!assign_regs(true, spill_all)) {
+         if (failed)   /* stop retrying once the compile has been failed */
+            break;
+      }
+   }
+
+   /* This must come after all optimization and register allocation, since
+    * it inserts dead code that happens to have side effects, and it does
+    * so based on the actual physical registers in use.
+    */
+   insert_gen4_send_dependency_workarounds();
+
+   if (failed)
+      return;
+
+   schedule_instructions(SCHEDULE_POST);
+
+   if (last_scratch > 0) {   /* scratch space is in use */
+      MAYBE_UNUSED unsigned max_scratch_size = 2 * 1024 * 1024;
+
+      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
+
+      if (stage == MESA_SHADER_COMPUTE) {
+         if (devinfo->is_haswell) {
+            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
+             * field documentation, Haswell supports a minimum of 2kB of
+             * scratch space for compute shaders, unlike every other stage
+             * and platform.
+             */
+            prog_data->total_scratch = MAX2(prog_data->total_scratch, 2048);
+         } else if (devinfo->gen <= 7) {
+            /* According to the MEDIA_VFE_STATE's "Per Thread Scratch Space"
+             * field documentation, platforms prior to Haswell measure scratch
+             * size linearly with a range of [1kB, 12kB] and 1kB granularity.
+             */
+            prog_data->total_scratch = ALIGN(last_scratch, 1024);
+            max_scratch_size = 12 * 1024;
+         }
+      }
+
+      /* We currently only support up to 2MB of scratch space.  If we
+       * need to support more eventually, the documentation suggests
+       * that we could allocate a larger buffer, and partition it out
+       * ourselves.  We'd just have to undo the hardware's address
+       * calculation by subtracting (FFTID * Per Thread Scratch Space)
+       * and then add FFTID * (Larger Per Thread Scratch Space).
+       *
+       * See 3D-Media-GPGPU Engine > Media GPGPU Pipeline >
+       * Thread Group Tracking > Local Memory/Scratch Space.
+       */
+      assert(prog_data->total_scratch < max_scratch_size); /* NOTE(review): strict '<'; confirm */
+   }
+}
+
+bool
+fs_visitor::run_vs(gl_clip_plane *clip_planes)
+{
+   assert(stage == MESA_SHADER_VERTEX);
+   /* Vertex shader driver: emit IR, optimize, then allocate registers. */
+   setup_vs_payload();
+
+   if (shader_time_index >= 0)   /* timing code only when an index is set */
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   if (failed)   /* bail out before emitting any output code */
+      return false;
+
+   compute_clip_distance(clip_planes);
+
+   emit_urb_writes();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_vs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);   /* true: register spilling is allowed */
+
+   return !failed;   /* false if any step above flagged a failure */
+}
+
+bool
+fs_visitor::run_tcs_single_patch()
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+
+   /* r1-r4 contain the ICP handles. */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   /* Initialize gl_InvocationID (UV immediate: presumably channels 0..7). */
+   fs_reg channels_uw = bld.vgrf(BRW_REGISTER_TYPE_UW);
+   fs_reg channels_ud = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.MOV(channels_uw, fs_reg(brw_imm_uv(0x76543210)));
+   bld.MOV(channels_ud, channels_uw);   /* widen UW -> UD */
+
+   if (tcs_prog_data->instances == 1) {
+      invocation_id = channels_ud;
+   } else {
+      invocation_id = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+      /* Get instance number from g0.2 bits 23:17, and multiply it by 8. */
+      fs_reg t = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg instance_times_8 = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.AND(t, fs_reg(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD)),
+              brw_imm_ud(INTEL_MASK(23, 17)));
+      bld.SHR(instance_times_8, t, brw_imm_ud(17 - 3));   /* >> 17, then << 3 */
+
+      bld.ADD(invocation_id, instance_times_8, channels_ud);
+   }
+
+   /* Fix the dispatch mask: skip invocations >= tcs_vertices_out. */
+   if (nir->info->tess.tcs_vertices_out % 8) {
+      bld.CMP(bld.null_reg_ud(), invocation_id,
+              brw_imm_ud(nir->info->tess.tcs_vertices_out), BRW_CONDITIONAL_L);
+      bld.IF(BRW_PREDICATE_NORMAL);
+   }
+
+   emit_nir_code();
+
+   if (nir->info->tess.tcs_vertices_out % 8) {
+      bld.emit(BRW_OPCODE_ENDIF);   /* closes the IF emitted above */
+   }
+
+   /* Emit EOT write; set TR DS Cache bit */
+   fs_reg srcs[3] = {
+      fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+      fs_reg(brw_imm_ud(WRITEMASK_X << 16)),
+      fs_reg(brw_imm_ud(0)),
+   };
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+   bld.LOAD_PAYLOAD(payload, srcs, 3, 2);
+
+   fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8_MASKED,
+                            bld.null_reg_ud(), payload);
+   inst->mlen = 3;   /* matches the 3-register payload built above */
+   inst->eot = true;
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)   /* checked only after the EOT write has been emitted */
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tcs_single_patch_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);   /* true: register spilling is allowed */
+
+   return !failed;
+}
+
+bool
+fs_visitor::run_tes()
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+
+   /* R0: thread header, R1-3: gl_TessCoord.xyz, R4: URB handles */
+   payload.num_regs = 5;
+
+   if (shader_time_index >= 0)   /* timing code only when an index is set */
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   if (failed)   /* bail out before emitting the URB writes */
+      return false;
+
+   emit_urb_writes();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_tes_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);   /* true: register spilling is allowed */
+
+   return !failed;   /* false if any step above flagged a failure */
+}
+
+bool
+fs_visitor::run_gs()
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   setup_gs_payload();
+
+   this->final_gs_vertex_count = vgrf(glsl_type::uint_type);
+
+   if (gs_compile->control_data_header_size_bits > 0) {
+      /* Create a VGRF to store accumulated control data bits. */
+      this->control_data_bits = vgrf(glsl_type::uint_type);
+
+      /* If we're outputting more than 32 control data bits, then EmitVertex()
+       * will set control_data_bits to 0 after emitting the first vertex.
+       * Otherwise, we need to initialize it to 0 here.
+       */
+      if (gs_compile->control_data_header_size_bits <= 32) {
+         const fs_builder abld = bld.annotate("initialize control data bits");
+         abld.MOV(this->control_data_bits, brw_imm_ud(0u));
+      }
+   }
+
+   if (shader_time_index >= 0)   /* timing code only when an index is set */
+      emit_shader_time_begin();
+
+   emit_nir_code();
+
+   emit_gs_thread_end();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   if (failed)   /* checked only after the thread-end code is emitted */
+      return false;
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+   assign_gs_urb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);   /* true: register spilling is allowed */
+
+   return !failed;   /* false if any step above flagged a failure */
+}
+
+bool
+fs_visitor::run_fs(bool allow_spilling, bool do_rep_send)
+{
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
+
+   assert(stage == MESA_SHADER_FRAGMENT);
+
+   if (devinfo->gen >= 6)
+      setup_fs_payload_gen6();
+   else
+      setup_fs_payload_gen4();
+
+   if (0) {   /* compiled-out path; never taken as written */
+      emit_dummy_fs();
+   } else if (do_rep_send) {
+      assert(dispatch_width == 16);   /* rep-send shaders must be SIMD16 */
+      emit_repclear_shader();
+   } else {
+      if (shader_time_index >= 0)
+         emit_shader_time_begin();
+
+      calculate_urb_setup();
+      if (nir->info->inputs_read > 0 ||
+          (nir->info->outputs_read > 0 && !wm_key->coherent_fb_fetch)) {
+         if (devinfo->gen < 6)
+            emit_interpolation_setup_gen4();
+         else
+            emit_interpolation_setup_gen6();
+      }
+
+      /* We handle discards by keeping track of the still-live pixels in f0.1.
+       * Initialize it with the dispatched pixels.
+       */
+      if (wm_prog_data->uses_kill) {
+         fs_inst *discard_init = bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+         discard_init->flag_subreg = 1;
+      }
+
+      /* Generate FS IR for main().  (the visitor only descends into
+       * functions called "main").
+       */
+      emit_nir_code();
+
+      if (failed)
+	 return false;
+
+      if (wm_prog_data->uses_kill)
+         bld.emit(FS_OPCODE_PLACEHOLDER_HALT);
+
+      if (wm_key->alpha_test_func)
+         emit_alpha_test();
+
+      emit_fb_writes();
+
+      if (shader_time_index >= 0)
+         emit_shader_time_end();
+
+      calculate_cfg();
+
+      optimize();
+
+      assign_curb_setup();
+      assign_urb_setup();
+
+      fixup_3src_null_dest();
+      allocate_registers(allow_spilling);   /* caller decides on spilling */
+
+      if (failed)   /* NOTE(review): redundant with the return below */
+         return false;
+   }
+
+   return !failed;   /* false if any step above flagged a failure */
+}
+
+bool
+fs_visitor::run_cs()
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   setup_cs_payload();
+
+   if (shader_time_index >= 0)   /* timing code only when an index is set */
+      emit_shader_time_begin();
+
+   if (devinfo->is_haswell && prog_data->total_shared > 0) {
+      /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */
+      const fs_builder abld = bld.exec_all().group(1, 0);
+      abld.MOV(retype(brw_sr0_reg(1), BRW_REGISTER_TYPE_UW),
+               suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1));
+   }
+
+   emit_nir_code();
+
+   if (failed)   /* bail out before emitting the terminate message */
+      return false;
+
+   emit_cs_terminate();
+
+   if (shader_time_index >= 0)
+      emit_shader_time_end();
+
+   calculate_cfg();
+
+   optimize();
+
+   assign_curb_setup();
+
+   fixup_3src_null_dest();
+   allocate_registers(true);   /* true: register spilling is allowed */
+
+   if (failed)   /* NOTE(review): redundant with the return below */
+      return false;
+
+   return !failed;
+}
+
+/**
+ * Return a bitfield where bit n is set if barycentric interpolation mode n
+ * (see enum brw_barycentric_mode) is needed by the fragment shader.
+ *
+ * We examine the load_barycentric intrinsics rather than looking at input
+ * variables so that we catch interpolateAtCentroid() messages too, which
+ * also need the BRW_BARYCENTRIC_[NON]PERSPECTIVE_CENTROID mode set up.
+ */
+static unsigned
+brw_compute_barycentric_interp_modes(const struct gen_device_info *devinfo,
+                                     const nir_shader *shader)
+{
+   unsigned barycentric_interp_modes = 0;
+
+   nir_foreach_function(f, shader) {
+      if (!f->impl)   /* nothing to scan without an implementation */
+         continue;
+
+      nir_foreach_block(block, f->impl) {
+         nir_foreach_instr(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+
+            /* Ignore WPOS; it doesn't require interpolation. */
+            if (nir_intrinsic_base(intrin) == VARYING_SLOT_POS)
+               continue;
+            /* intrin now refers to the barycentric producer of src[0]. */
+            intrin = nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+            enum glsl_interp_mode interp = (enum glsl_interp_mode)
+               nir_intrinsic_interp_mode(intrin);
+            nir_intrinsic_op bary_op = intrin->intrinsic;
+            enum brw_barycentric_mode bary =
+               brw_barycentric_mode(interp, bary_op);
+
+            barycentric_interp_modes |= 1 << bary;
+            /* With the workaround, centroid also requests the pixel mode. */
+            if (devinfo->needs_unlit_centroid_workaround &&
+                bary_op == nir_intrinsic_load_barycentric_centroid)
+               barycentric_interp_modes |= 1 << centroid_to_pixel(bary);
+         }
+      }
+   }
+
+   return barycentric_interp_modes;
+}
+
+static void
+brw_compute_flat_inputs(struct brw_wm_prog_data *prog_data,
+                        const nir_shader *shader)
+{
+   prog_data->flat_inputs = 0;   /* rebuilt from scratch on every call */
+
+   nir_foreach_variable(var, &shader->inputs) {
+      int input_index = prog_data->urb_setup[var->data.location];
+
+      if (input_index < 0)   /* negative urb_setup entries are skipped */
+	 continue;
+
+      /* Flat-shaded inputs get the bit for their urb_setup index set. */
+      if (var->data.interpolation == INTERP_MODE_FLAT)
+         prog_data->flat_inputs |= (1 << input_index);
+   }
+}
+
+static uint8_t
+computed_depth_mode(const nir_shader *shader)
+{
+   if (shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
+      switch (shader->info->fs.depth_layout) {   /* layout -> BRW_PSCDEPTH_* */
+      case FRAG_DEPTH_LAYOUT_NONE:
+      case FRAG_DEPTH_LAYOUT_ANY:
+         return BRW_PSCDEPTH_ON;
+      case FRAG_DEPTH_LAYOUT_GREATER:
+         return BRW_PSCDEPTH_ON_GE;
+      case FRAG_DEPTH_LAYOUT_LESS:
+         return BRW_PSCDEPTH_ON_LE;
+      case FRAG_DEPTH_LAYOUT_UNCHANGED:
+         return BRW_PSCDEPTH_OFF;
+      }
+   }
+   return BRW_PSCDEPTH_OFF;   /* depth not written, or layout not listed */
+}
+
+/**
+ * Move load_interpolated_input with simple (payload-based) barycentric modes
+ * to the top of the program so we don't emit multiple PLNs for the same input.
+ *
+ * This works around CSE not being able to handle non-dominating cases
+ * such as:
+ *
+ *    if (...) {
+ *       interpolate input
+ *    } else {
+ *       interpolate the same exact input
+ *    }
+ *
+ * This should be replaced by global value numbering someday.
+ */
+void
+move_interpolation_to_top(nir_shader *nir)
+{
+   nir_foreach_function(f, nir) {
+      if (!f->impl)
+         continue;
+
+      nir_block *top = nir_start_block(f->impl);
+      exec_node *cursor_node = NULL;   /* last instruction hoisted so far */
+
+      nir_foreach_block(block, f->impl) {
+         if (block == top)   /* instructions here are already at the top */
+            continue;
+
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_interpolated_input)
+               continue;
+            nir_intrinsic_instr *bary_intrinsic =
+               nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr);
+            nir_intrinsic_op op = bary_intrinsic->intrinsic;
+
+            /* Leave interpolateAtSample/Offset() where they are. */
+            if (op == nir_intrinsic_load_barycentric_at_sample ||
+                op == nir_intrinsic_load_barycentric_at_offset)
+               continue;
+            /* Hoist the load together with the producers of both sources. */
+            nir_instr *move[3] = {
+               &bary_intrinsic->instr,
+               intrin->src[1].ssa->parent_instr,
+               instr
+            };
+
+            for (unsigned i = 0; i < ARRAY_SIZE(move); i++) {
+               if (move[i]->block != top) {   /* skip what is already in top */
+                  move[i]->block = top;
+                  exec_node_remove(&move[i]->node);
+                  if (cursor_node) {   /* append after the previous hoist */
+                     exec_node_insert_after(cursor_node, &move[i]->node);
+                  } else {   /* first hoist goes to the head of the block */
+                     exec_list_push_head(&top->instr_list, &move[i]->node);
+                  }
+                  cursor_node = &move[i]->node;
+               }
+            }
+         }
+      }
+      nir_metadata_preserve(f->impl, (nir_metadata)
+                            ((unsigned) nir_metadata_block_index |
+                             (unsigned) nir_metadata_dominance));
+   }
+}
+
+/**
+ * Demote per-sample barycentric intrinsics to centroid.
+ *
+ * Useful when rendering to a non-multisampled buffer.
+ */
+static void
+demote_sample_qualifiers(nir_shader *nir)
+{
+   nir_foreach_function(f, nir) {
+      if (!f->impl)
+         continue;
+
+      nir_builder b;
+      nir_builder_init(&b, f->impl);
+
+      nir_foreach_block(block, f->impl) {
+         nir_foreach_instr_safe(instr, block) {
+            if (instr->type != nir_instr_type_intrinsic)
+               continue;
+
+            nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+            if (intrin->intrinsic != nir_intrinsic_load_barycentric_sample &&
+                intrin->intrinsic != nir_intrinsic_load_barycentric_at_sample)
+               continue;
+            /* Build a centroid load with the same interpolation mode. */
+            b.cursor = nir_before_instr(instr);
+            nir_ssa_def *centroid =
+               nir_load_barycentric(&b, nir_intrinsic_load_barycentric_centroid,
+                                    nir_intrinsic_interp_mode(intrin));
+            nir_ssa_def_rewrite_uses(&intrin->dest.ssa,
+                                     nir_src_for_ssa(centroid));
+            nir_instr_remove(instr);   /* its uses now read the centroid load */
+         }
+      }
+
+      nir_metadata_preserve(f->impl, (nir_metadata)
+                            ((unsigned) nir_metadata_block_index |
+                             (unsigned) nir_metadata_dominance));
+   }
+}
+
+/**
+ * Pre-gen6, the register file of the EUs was shared between threads,
+ * and each thread used some subset allocated on a 16-register block
+ * granularity.  The unit states wanted these block counts.
+ */
+static inline int
+brw_register_blocks(int reg_count)
+{
+   return ALIGN(reg_count, 16) / 16 - 1;   /* number of 16-reg blocks, minus 1 */
+}
+
+const unsigned *
+brw_compile_fs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_wm_prog_key *key,
+               struct brw_wm_prog_data *prog_data,
+               const nir_shader *src_shader,
+               struct gl_program *prog,
+               int shader_time_index8, int shader_time_index16,
+               bool allow_spilling,
+               bool use_rep_send, struct brw_vue_map *vue_map,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   /* All lowering below runs on a clone allocated from mem_ctx. */
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_fs_inputs(shader, devinfo, key);
+   brw_nir_lower_fs_outputs(shader);
+
+   if (devinfo->gen < 6) {
+      brw_setup_vue_interpolation(vue_map, shader, prog_data, devinfo);
+   }
+
+   if (!key->multisample_fbo)
+      NIR_PASS_V(shader, demote_sample_qualifiers);
+   NIR_PASS_V(shader, move_interpolation_to_top);
+   shader = brw_postprocess_nir(shader, compiler, true);
+
+   /* key->alpha_test_func means simulating alpha testing via discards,
+    * so the shader definitely kills pixels.
+    */
+   prog_data->uses_kill = shader->info->fs.uses_discard ||
+      key->alpha_test_func;
+   prog_data->uses_omask = key->multisample_fbo &&
+      shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK);
+   prog_data->computed_depth_mode = computed_depth_mode(shader);
+   prog_data->computed_stencil =
+      shader->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL);
+
+   prog_data->persample_dispatch =
+      key->multisample_fbo &&
+      (key->persample_interp ||
+       (shader->info->system_values_read & (SYSTEM_BIT_SAMPLE_ID |
+                                            SYSTEM_BIT_SAMPLE_POS)) ||
+       shader->info->fs.uses_sample_qualifier ||
+       shader->info->outputs_read);
+
+   prog_data->early_fragment_tests = shader->info->fs.early_fragment_tests;
+   prog_data->post_depth_coverage = shader->info->fs.post_depth_coverage;
+   prog_data->inner_coverage = shader->info->fs.inner_coverage;
+
+   prog_data->barycentric_interp_modes =
+      brw_compute_barycentric_interp_modes(compiler->devinfo, shader);
+
+   cfg_t *simd8_cfg = NULL, *simd16_cfg = NULL;
+   uint8_t simd8_grf_start = 0, simd16_grf_start = 0;
+   unsigned simd8_grf_used = 0, simd16_grf_used = 0;
+   /* The SIMD8 compile always runs, even if its result is dropped later. */
+   fs_visitor v8(compiler, log_data, mem_ctx, key,
+                 &prog_data->base, prog, shader, 8,
+                 shader_time_index8);
+   if (!v8.run_fs(allow_spilling, false /* do_rep_send */)) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, v8.fail_msg);
+
+      return NULL;   /* a SIMD8 failure fails the whole compile */
+   } else if (likely(!(INTEL_DEBUG & DEBUG_NO8))) {
+      simd8_cfg = v8.cfg;
+      simd8_grf_start = v8.payload.num_regs;
+      simd8_grf_used = v8.grf_used;
+   }
+
+   if (v8.max_dispatch_width >= 16 &&
+       likely(!(INTEL_DEBUG & DEBUG_NO16) || use_rep_send)) {
+      /* Try a SIMD16 compile; a failure here is only logged, not fatal. */
+      fs_visitor v16(compiler, log_data, mem_ctx, key,
+                     &prog_data->base, prog, shader, 16,
+                     shader_time_index16);
+      v16.import_uniforms(&v8);
+      if (!v16.run_fs(allow_spilling, use_rep_send)) {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD16 shader failed to compile: %s",
+                                   v16.fail_msg);
+      } else {
+         simd16_cfg = v16.cfg;
+         simd16_grf_start = v16.payload.num_regs;
+         simd16_grf_used = v16.grf_used;
+      }
+   }
+
+   /* When the caller requests a repclear shader, they want SIMD16-only */
+   if (use_rep_send)
+      simd8_cfg = NULL;
+
+   /* Prior to Iron Lake, the PS had a single shader offset with a jump table
+    * at the top to select the shader.  We've never implemented that.
+    * Instead, we just give them exactly one shader and we pick the widest one
+    * available.
+    */
+   if (compiler->devinfo->gen < 5 && simd16_cfg)
+      simd8_cfg = NULL;
+
+   if (prog_data->persample_dispatch) {
+      /* Starting with SandyBridge (where we first get MSAA), the different
+       * pixel dispatch combinations are grouped into classifications A
+       * through F (SNB PRM Vol. 2 Part 1 Section 7.7.1).  On all hardware
+       * generations, the only configurations supporting persample dispatch
+       * are those in which only one dispatch width is enabled.
+       *
+       * If computed depth is enabled, SNB only allows SIMD8 while IVB+
+       * allow SIMD8 or SIMD16 so we choose SIMD16 if available.
+       */
+      if (compiler->devinfo->gen == 6 &&
+          prog_data->computed_depth_mode != BRW_PSCDEPTH_OFF) {
+         simd16_cfg = NULL;
+      } else if (simd16_cfg) {
+         simd8_cfg = NULL;
+      }
+   }
+
+   /* We have to compute the flat inputs after the visitor is finished running
+    * because it relies on prog_data->urb_setup which is computed in
+    * fs_visitor::calculate_urb_setup().
+    */
+   brw_compute_flat_inputs(prog_data, shader);
+
+   fs_generator g(compiler, log_data, mem_ctx, (void *) key, &prog_data->base,
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_FRAGMENT);
+
+   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
+      g.enable_debug(ralloc_asprintf(mem_ctx, "%s fragment shader %s",
+                                     shader->info->label ?
+                                        shader->info->label : "unnamed",
+                                     shader->info->name));
+   }
+   /* Whichever program is generated first supplies the base start register. */
+   if (simd8_cfg) {
+      prog_data->dispatch_8 = true;
+      g.generate_code(simd8_cfg, 8);
+      prog_data->base.dispatch_grf_start_reg = simd8_grf_start;
+      prog_data->reg_blocks_0 = brw_register_blocks(simd8_grf_used);
+
+      if (simd16_cfg) {   /* second program: recorded in the *_2 fields */
+         prog_data->dispatch_16 = true;
+         prog_data->prog_offset_2 = g.generate_code(simd16_cfg, 16);
+         prog_data->dispatch_grf_start_reg_2 = simd16_grf_start;
+         prog_data->reg_blocks_2 = brw_register_blocks(simd16_grf_used);
+      }
+   } else if (simd16_cfg) {   /* SIMD16 is the only program generated */
+      prog_data->dispatch_16 = true;
+      g.generate_code(simd16_cfg, 16);
+      prog_data->base.dispatch_grf_start_reg = simd16_grf_start;
+      prog_data->reg_blocks_0 = brw_register_blocks(simd16_grf_used);
+   }
+
+   return g.get_assembly(final_assembly_size);
+}
+
+fs_reg *
+fs_visitor::emit_cs_work_group_id_setup()
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+   /* Returns a new uvec3 VGRF, allocated from mem_ctx, filled from r0. */
+   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::uvec3_type));
+
+   struct brw_reg r0_1(retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+   struct brw_reg r0_6(retype(brw_vec1_grf(0, 6), BRW_REGISTER_TYPE_UD));
+   struct brw_reg r0_7(retype(brw_vec1_grf(0, 7), BRW_REGISTER_TYPE_UD));
+
+   bld.MOV(*reg, r0_1);                   /* component 0 <- r0.1 */
+   bld.MOV(offset(*reg, bld, 1), r0_6);   /* component 1 <- r0.6 */
+   bld.MOV(offset(*reg, bld, 2), r0_7);   /* component 2 <- r0.7 */
+
+   return reg;
+}
+
+static void
+fill_push_const_block_info(struct brw_push_const_block *block, unsigned dwords)
+{
+   block->dwords = dwords;
+   block->regs = DIV_ROUND_UP(dwords, 8);   /* 8 dwords per register */
+   block->size = block->regs * 32;          /* 32 bytes per register */
+}
+
+static void
+cs_fill_push_const_info(const struct gen_device_info *devinfo,
+                        struct brw_cs_prog_data *cs_prog_data)
+{
+   const struct brw_stage_prog_data *prog_data = &cs_prog_data->base;
+   bool fill_thread_id =
+      cs_prog_data->thread_local_id_index >= 0 &&
+      cs_prog_data->thread_local_id_index < (int)prog_data->nr_params;
+   bool cross_thread_supported = devinfo->gen > 7 || devinfo->is_haswell;
+
+   /* The thread ID should be stored in the last param dword */
+   assert(prog_data->nr_params > 0 || !fill_thread_id);
+   assert(!fill_thread_id ||
+          cs_prog_data->thread_local_id_index ==
+             (int)prog_data->nr_params - 1);
+
+   unsigned cross_thread_dwords, per_thread_dwords;
+   if (!cross_thread_supported) {   /* everything is pushed per thread */
+      cross_thread_dwords = 0u;
+      per_thread_dwords = prog_data->nr_params;
+   } else if (fill_thread_id) {
+      /* Fill all but the last register with cross-thread payload */
+      cross_thread_dwords = 8 * (cs_prog_data->thread_local_id_index / 8);
+      per_thread_dwords = prog_data->nr_params - cross_thread_dwords;
+      assert(per_thread_dwords > 0 && per_thread_dwords <= 8);
+   } else {
+      /* Fill all data using cross-thread payload */
+      cross_thread_dwords = prog_data->nr_params;
+      per_thread_dwords = 0u;
+   }
+
+   fill_push_const_block_info(&cs_prog_data->push.cross_thread, cross_thread_dwords);
+   fill_push_const_block_info(&cs_prog_data->push.per_thread, per_thread_dwords);
+   /* Total: one per-thread block per thread plus one cross-thread block. */
+   unsigned total_dwords =
+      (cs_prog_data->push.per_thread.size * cs_prog_data->threads +
+       cs_prog_data->push.cross_thread.size) / 4;
+   fill_push_const_block_info(&cs_prog_data->push.total, total_dwords);
+
+   assert(cs_prog_data->push.cross_thread.dwords % 8 == 0 ||
+          cs_prog_data->push.per_thread.size == 0);
+   assert(cs_prog_data->push.cross_thread.dwords +
+          cs_prog_data->push.per_thread.dwords ==
+             prog_data->nr_params);
+}
+
+static void
+cs_set_simd_size(struct brw_cs_prog_data *cs_prog_data, unsigned size)
+{
+   cs_prog_data->simd_size = size;
+   unsigned group_size = cs_prog_data->local_size[0] *
+      cs_prog_data->local_size[1] * cs_prog_data->local_size[2];
+   cs_prog_data->threads = (group_size + size - 1) / size;   /* rounds up */
+}
+
+const unsigned *
+brw_compile_cs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_cs_prog_key *key,
+               struct brw_cs_prog_data *prog_data,
+               const nir_shader *src_shader,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, true);
+   brw_nir_lower_cs_shared(shader);
+   prog_data->base.total_shared += shader->num_shared;
+
+   /* Now that we cloned the nir_shader, we can update num_uniforms based on
+    * the thread_local_id_index.
+    */
+   assert(prog_data->thread_local_id_index >= 0);
+   shader->num_uniforms =
+      MAX2(shader->num_uniforms,
+           (unsigned)4 * (prog_data->thread_local_id_index + 1));
+
+   brw_nir_lower_intrinsics(shader, &prog_data->base);
+   shader = brw_postprocess_nir(shader, compiler, true);
+
+   prog_data->local_size[0] = shader->info->cs.local_size[0];
+   prog_data->local_size[1] = shader->info->cs.local_size[1];
+   prog_data->local_size[2] = shader->info->cs.local_size[2];
+   unsigned local_workgroup_size =
+      shader->info->cs.local_size[0] * shader->info->cs.local_size[1] *
+      shader->info->cs.local_size[2];
+
+   unsigned max_cs_threads = compiler->devinfo->max_cs_threads;
+   unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads);
+   /* simd_required: invocations per thread needed to fit max_cs_threads. */
+   cfg_t *cfg = NULL;
+   const char *fail_msg = NULL;
+
+   /* Now the main event: Visit the shader IR and generate our CS IR for it.
+    */
+   fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base,
+                 NULL, /* Never used in core profile */
+                 shader, 8, shader_time_index);
+   if (simd_required <= 8) {
+      if (!v8.run_cs()) {
+         fail_msg = v8.fail_msg;   /* blocks the wider attempts below */
+      } else {
+         cfg = v8.cfg;
+         cs_set_simd_size(prog_data, 8);
+         cs_fill_push_const_info(compiler->devinfo, prog_data);
+         prog_data->base.dispatch_grf_start_reg = v8.payload.num_regs;
+      }
+   }
+
+   fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base,
+                 NULL, /* Never used in core profile */
+                 shader, 16, shader_time_index);
+   if (likely(!(INTEL_DEBUG & DEBUG_NO16)) &&
+       !fail_msg && v8.max_dispatch_width >= 16 &&
+       simd_required <= 16) {
+      /* Try a SIMD16 compile */
+      if (simd_required <= 8)   /* v8 only ran in this case */
+         v16.import_uniforms(&v8);
+      if (!v16.run_cs()) {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD16 shader failed to compile: %s",
+                                   v16.fail_msg);
+         if (!cfg) {   /* fatal only when there is no SIMD8 program */
+            fail_msg =
+               "Couldn't generate SIMD16 program and not "
+               "enough threads for SIMD8";
+         }
+      } else {   /* a successful wider program replaces the narrower one */
+         cfg = v16.cfg;
+         cs_set_simd_size(prog_data, 16);
+         cs_fill_push_const_info(compiler->devinfo, prog_data);
+         prog_data->dispatch_grf_start_reg_16 = v16.payload.num_regs;
+      }
+   }
+
+   fs_visitor v32(compiler, log_data, mem_ctx, key, &prog_data->base,
+                 NULL, /* Never used in core profile */
+                 shader, 32, shader_time_index);
+   if (!fail_msg && v8.max_dispatch_width >= 32 &&
+       (simd_required > 16 || (INTEL_DEBUG & DEBUG_DO32))) {
+      /* Try a SIMD32 compile */
+      if (simd_required <= 8)
+         v32.import_uniforms(&v8);
+      else if (simd_required <= 16)
+         v32.import_uniforms(&v16);
+
+      if (!v32.run_cs()) {
+         compiler->shader_perf_log(log_data,
+                                   "SIMD32 shader failed to compile: %s",
+                                   v32.fail_msg);   /* the SIMD32 visitor's message */
+         if (!cfg) {   /* fatal only when no narrower program exists */
+            fail_msg =
+               "Couldn't generate SIMD32 program and not "
+               "enough threads for SIMD16";
+         }
+      } else {
+         cfg = v32.cfg;
+         cs_set_simd_size(prog_data, 32);
+         cs_fill_push_const_info(compiler->devinfo, prog_data);
+      }
+   }
+
+   if (unlikely(cfg == NULL)) {   /* no width compiled: report and bail */
+      assert(fail_msg);
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, fail_msg);
+
+      return NULL;
+   }
+
+   fs_generator g(compiler, log_data, mem_ctx, (void*) key, &prog_data->base,
+                  v8.promoted_constants, v8.runtime_check_aads_emit,
+                  MESA_SHADER_COMPUTE);
+   if (INTEL_DEBUG & DEBUG_CS) {
+      char *name = ralloc_asprintf(mem_ctx, "%s compute shader %s",
+                                   shader->info->label ? shader->info->label :
+                                                        "unnamed",
+                                   shader->info->name);
+      g.enable_debug(name);
+   }
+   /* Only the widest program that compiled (cfg) is turned into code. */
+   g.generate_code(cfg, prog_data->simd_size);
+
+   return g.get_assembly(final_assembly_size);
+}
+
+/**
+ * Test the dispatch mask packing assumptions of
+ * brw_stage_has_packed_dispatch().  Call this from e.g. the top of
+ * fs_visitor::emit_nir_code() to cause a GPU hang if any shader invocation is
+ * executed with an unexpected dispatch mask.
+ */
+static UNUSED void
+brw_fs_test_dispatch_packing(const fs_builder &bld)
+{
+   const gl_shader_stage stage = bld.shader->stage;
+
+   if (brw_stage_has_packed_dispatch(bld.shader->devinfo, stage,
+                                     bld.shader->stage_prog_data)) {
+      const fs_builder ubld = bld.exec_all().group(1, 0);
+      const fs_reg tmp = component(bld.vgrf(BRW_REGISTER_TYPE_UD), 0);
+      const fs_reg mask = (stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
+                           brw_dmask_reg());
+
+      ubld.ADD(tmp, mask, brw_imm_ud(1));   /* tmp = mask + 1 */
+      ubld.AND(tmp, mask, tmp);             /* tmp = mask & (mask + 1) */
+
+      /* This will loop forever if the dispatch mask doesn't have the expected
+       * form '2^n-1', in which case tmp will be non-zero.
+       */
+      bld.emit(BRW_OPCODE_DO);
+      bld.CMP(bld.null_reg_ud(), tmp, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
+      set_predicate(BRW_PREDICATE_NORMAL, bld.emit(BRW_OPCODE_WHILE));
+   }
+}
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
new file mode 100644
index 00000000000..00861ce5dad
--- /dev/null
+++ b/src/intel/compiler/brw_fs.h
@@ -0,0 +1,500 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#pragma once
+
+#include "brw_shader.h"
+#include "brw_ir_fs.h"
+#include "brw_fs_builder.h"
+#include "compiler/nir/nir.h"
+
+struct bblock_t;
+namespace {
+   struct acp_entry;
+}
+
+namespace brw {
+   class fs_live_variables;
+}
+
+struct brw_gs_compile;
+
+static inline fs_reg
+offset(const fs_reg &reg, const brw::fs_builder &bld, unsigned delta)
+{
+   return offset(reg, bld.dispatch_width(), delta); /* width taken from bld */
+}
+
+/**
+ * The FS IR front-end, with run_*() entry points for FS/VS/TCS/TES/GS/CS.
+ *
+ * Translates the nir_shader given at construction into FS IR (nir_emit_*()).
+ */
+class fs_visitor : public backend_shader
+{
+public:
+   fs_visitor(const struct brw_compiler *compiler, void *log_data,
+              void *mem_ctx,
+              const void *key,
+              struct brw_stage_prog_data *prog_data,
+              struct gl_program *prog,
+              const nir_shader *shader,
+              unsigned dispatch_width,
+              int shader_time_index,
+              const struct brw_vue_map *input_vue_map = NULL);
+   fs_visitor(const struct brw_compiler *compiler, void *log_data,
+              void *mem_ctx,
+              struct brw_gs_compile *gs_compile,
+              struct brw_gs_prog_data *prog_data,
+              const nir_shader *shader,
+              int shader_time_index);
+   void init();
+   ~fs_visitor();
+
+   fs_reg vgrf(const glsl_type *const type);
+   void import_uniforms(fs_visitor *v);
+   void setup_uniform_clipplane_values(gl_clip_plane *clip_planes);
+   void compute_clip_distance(gl_clip_plane *clip_planes);
+
+   fs_inst *get_instruction_generating_reg(fs_inst *start,
+					   fs_inst *end,
+					   const fs_reg &reg);
+
+   void VARYING_PULL_CONSTANT_LOAD(const brw::fs_builder &bld,
+                                   const fs_reg &dst,
+                                   const fs_reg &surf_index,
+                                   const fs_reg &varying_offset,
+                                   uint32_t const_offset);
+   void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
+
+   bool run_fs(bool allow_spilling, bool do_rep_send);
+   bool run_vs(gl_clip_plane *clip_planes);
+   bool run_tcs_single_patch();
+   bool run_tes();
+   bool run_gs();
+   bool run_cs();
+   void optimize();
+   void allocate_registers(bool allow_spilling);
+   void setup_fs_payload_gen4();
+   void setup_fs_payload_gen6();
+   void setup_vs_payload();
+   void setup_gs_payload();
+   void setup_cs_payload();
+   void fixup_3src_null_dest();
+   void assign_curb_setup();
+   void calculate_urb_setup();
+   void assign_urb_setup();
+   void convert_attr_sources_to_hw_regs(fs_inst *inst);
+   void assign_vs_urb_setup();
+   void assign_tcs_single_patch_urb_setup();
+   void assign_tes_urb_setup();
+   void assign_gs_urb_setup();
+   bool assign_regs(bool allow_spilling, bool spill_all);
+   void assign_regs_trivial();
+   void calculate_payload_ranges(int payload_node_count,
+                                 int *payload_last_use_ip);
+   void setup_payload_interference(struct ra_graph *g, int payload_reg_count,
+                                   int first_payload_node);
+   int choose_spill_reg(struct ra_graph *g);
+   void spill_reg(int spill_reg);
+   void split_virtual_grfs();
+   bool compact_virtual_grfs();
+   void assign_constant_locations();
+   void lower_constant_loads();
+   void invalidate_live_intervals();
+   void calculate_live_intervals();
+   void calculate_register_pressure();
+   void validate();
+   bool opt_algebraic();
+   bool opt_redundant_discard_jumps();
+   bool opt_cse();
+   bool opt_cse_local(bblock_t *block);
+   bool opt_copy_propagation();
+   bool try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry);
+   bool try_constant_propagate(fs_inst *inst, acp_entry *entry);
+   bool opt_copy_propagation_local(void *mem_ctx, bblock_t *block,
+                                   exec_list *acp);
+   bool opt_drop_redundant_mov_to_flags();
+   bool opt_register_renaming();
+   bool register_coalesce();
+   bool compute_to_mrf();
+   bool eliminate_find_live_channel();
+   bool dead_code_eliminate();
+   bool remove_duplicate_mrf_writes();
+
+   bool opt_sampler_eot();
+   bool virtual_grf_interferes(int a, int b);
+   void schedule_instructions(instruction_scheduler_mode mode);
+   void insert_gen4_send_dependency_workarounds();
+   void insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
+                                                    fs_inst *inst);
+   void insert_gen4_post_send_dependency_workarounds(bblock_t *block,
+                                                     fs_inst *inst);
+   void vfail(const char *msg, va_list args);
+   void fail(const char *msg, ...);
+   void limit_dispatch_width(unsigned n, const char *msg);
+   void lower_uniform_pull_constant_loads();
+   bool lower_load_payload();
+   bool lower_pack();
+   bool lower_d2x();
+   bool lower_logical_sends();
+   bool lower_integer_multiplication();
+   bool lower_minmax();
+   bool lower_simd_width();
+   bool opt_combine_constants();
+
+   void emit_dummy_fs();
+   void emit_repclear_shader();
+   void emit_fragcoord_interpolation(fs_reg wpos);
+   fs_reg *emit_frontfacing_interpolation();
+   fs_reg *emit_samplepos_setup();
+   fs_reg *emit_sampleid_setup();
+   fs_reg *emit_samplemaskin_setup();
+   fs_reg *emit_vs_system_value(int location);
+   void emit_interpolation_setup_gen4();
+   void emit_interpolation_setup_gen6();
+   void compute_sample_position(fs_reg dst, fs_reg int_sample_pos);
+   fs_reg emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                         const fs_reg &sampler);
+   void emit_gen6_gather_wa(uint8_t wa, fs_reg dst);
+   fs_reg resolve_source_modifiers(const fs_reg &src);
+   void emit_discard_jump();
+   bool opt_peephole_sel();
+   bool opt_peephole_predicated_break();
+   bool opt_saturate_propagation();
+   bool opt_cmod_propagation();
+   bool opt_zero_samples();
+
+   void emit_nir_code();
+   void nir_setup_outputs();
+   void nir_setup_uniforms();
+   void nir_emit_system_values();
+   void nir_emit_impl(nir_function_impl *impl);
+   void nir_emit_cf_list(exec_list *list);
+   void nir_emit_if(nir_if *if_stmt);
+   void nir_emit_loop(nir_loop *loop);
+   void nir_emit_block(nir_block *block);
+   void nir_emit_instr(nir_instr *instr);
+   void nir_emit_alu(const brw::fs_builder &bld, nir_alu_instr *instr);
+   void nir_emit_load_const(const brw::fs_builder &bld,
+                            nir_load_const_instr *instr);
+   void nir_emit_vs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_tcs_intrinsic(const brw::fs_builder &bld,
+                               nir_intrinsic_instr *instr);
+   void nir_emit_gs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_fs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_cs_intrinsic(const brw::fs_builder &bld,
+                              nir_intrinsic_instr *instr);
+   void nir_emit_intrinsic(const brw::fs_builder &bld,
+                           nir_intrinsic_instr *instr);
+   void nir_emit_tes_intrinsic(const brw::fs_builder &bld,
+                               nir_intrinsic_instr *instr);
+   void nir_emit_ssbo_atomic(const brw::fs_builder &bld,
+                             int op, nir_intrinsic_instr *instr);
+   void nir_emit_shared_atomic(const brw::fs_builder &bld,
+                               int op, nir_intrinsic_instr *instr);
+   void nir_emit_texture(const brw::fs_builder &bld,
+                         nir_tex_instr *instr);
+   void nir_emit_jump(const brw::fs_builder &bld,
+                      nir_jump_instr *instr);
+   fs_reg get_nir_src(const nir_src &src);
+   fs_reg get_nir_src_imm(const nir_src &src);
+   fs_reg get_nir_dest(const nir_dest &dest);
+   fs_reg get_nir_image_deref(const nir_deref_var *deref);
+   fs_reg get_indirect_offset(nir_intrinsic_instr *instr);
+   void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst,
+                     unsigned wr_mask);
+
+   bool optimize_extract_to_float(nir_alu_instr *instr,
+                                  const fs_reg &result);
+   bool optimize_frontfacing_ternary(nir_alu_instr *instr,
+                                     const fs_reg &result);
+
+   void emit_alpha_test();
+   fs_inst *emit_single_fb_write(const brw::fs_builder &bld,
+                                 fs_reg color1, fs_reg color2,
+                                 fs_reg src0_alpha, unsigned components);
+   void emit_fb_writes();
+   fs_inst *emit_non_coherent_fb_read(const brw::fs_builder &bld,
+                                      const fs_reg &dst, unsigned target);
+   void emit_urb_writes(const fs_reg &gs_vertex_count = fs_reg());
+   void set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+                                        unsigned stream_id);
+   void emit_gs_control_data_bits(const fs_reg &vertex_count);
+   void emit_gs_end_primitive(const nir_src &vertex_count_nir_src);
+   void emit_gs_vertex(const nir_src &vertex_count_nir_src,
+                       unsigned stream_id);
+   void emit_gs_thread_end();
+   void emit_gs_input_load(const fs_reg &dst, const nir_src &vertex_src,
+                           unsigned base_offset, const nir_src &offset_src,
+                           unsigned num_components, unsigned first_component);
+   void emit_cs_terminate();
+   fs_reg *emit_cs_work_group_id_setup();
+
+   void emit_barrier();
+
+   void emit_shader_time_begin();
+   void emit_shader_time_end();
+   void SHADER_TIME_ADD(const brw::fs_builder &bld,
+                        int shader_time_subindex,
+                        fs_reg value);
+
+   fs_reg get_timestamp(const brw::fs_builder &bld);
+
+   struct brw_reg interp_reg(int location, int channel);
+
+   int implied_mrf_writes(fs_inst *inst);
+
+   virtual void dump_instructions();
+   virtual void dump_instructions(const char *name);
+   void dump_instruction(backend_instruction *inst);
+   void dump_instruction(backend_instruction *inst, FILE *file);
+
+   const void *const key;
+   const struct brw_sampler_prog_key_data *key_tex;
+
+   struct brw_gs_compile *gs_compile;
+
+   struct brw_stage_prog_data *prog_data;
+   struct gl_program *prog;
+
+   const struct brw_vue_map *input_vue_map;
+
+   int *virtual_grf_start;
+   int *virtual_grf_end;
+   brw::fs_live_variables *live_intervals;
+
+   int *regs_live_at_ip;
+
+   /** Number of uniform variable components visited. */
+   unsigned uniforms;
+
+   /** Byte-offset for the next available spot in the scratch space buffer. */
+   unsigned last_scratch;
+
+   /**
+    * Array mapping UNIFORM register numbers to the pull parameter index,
+    * or -1 if this uniform register isn't being uploaded as a pull constant.
+    */
+   int *pull_constant_loc;
+
+   /**
+    * Array mapping UNIFORM register numbers to the push parameter index,
+    * or -1 if this uniform register isn't being uploaded as a push constant.
+    */
+   int *push_constant_loc;
+
+   fs_reg frag_depth;
+   fs_reg frag_stencil;
+   fs_reg sample_mask;
+   fs_reg outputs[VARYING_SLOT_MAX];
+   fs_reg dual_src_output;
+   int first_non_payload_grf;
+   /** Either BRW_MAX_GRF or GEN7_MRF_HACK_START */
+   unsigned max_grf;
+
+   fs_reg *nir_locals;
+   fs_reg *nir_ssa_values;
+   fs_reg *nir_system_values;
+
+   bool failed; /**< NOTE(review): presumably set via fail()/vfail(); confirm */
+   char *fail_msg;
+
+   /** Register numbers for thread payload fields. */
+   struct thread_payload {
+      uint8_t source_depth_reg;
+      uint8_t source_w_reg;
+      uint8_t aa_dest_stencil_reg;
+      uint8_t dest_depth_reg;
+      uint8_t sample_pos_reg;
+      uint8_t sample_mask_in_reg;
+      uint8_t barycentric_coord_reg[BRW_BARYCENTRIC_MODE_COUNT];
+      uint8_t local_invocation_id_reg;
+
+      /** The number of thread payload registers the hardware will supply. */
+      uint8_t num_regs;
+   } payload;
+
+   bool source_depth_to_render_target;
+   bool runtime_check_aads_emit;
+
+   fs_reg pixel_x;
+   fs_reg pixel_y;
+   fs_reg wpos_w;
+   fs_reg pixel_w;
+   fs_reg delta_xy[BRW_BARYCENTRIC_MODE_COUNT];
+   fs_reg shader_start_time;
+   fs_reg userplane[MAX_CLIP_PLANES];
+   fs_reg final_gs_vertex_count;
+   fs_reg control_data_bits;
+   fs_reg invocation_id;
+
+   unsigned grf_used;
+   bool spilled_any_registers;
+
+   const unsigned dispatch_width; /**< 8, 16 or 32 */
+   unsigned min_dispatch_width;
+   unsigned max_dispatch_width;
+
+   int shader_time_index;
+
+   unsigned promoted_constants;
+   brw::fs_builder bld;
+};
+
+/**
+ * The FS IR code generator; the shader stage is a constructor argument.
+ *
+ * Translates FS IR (a cfg_t, see generate_code()) to i965 assembly code.
+ */
+class fs_generator
+{
+public:
+   fs_generator(const struct brw_compiler *compiler, void *log_data,
+                void *mem_ctx,
+                const void *key,
+                struct brw_stage_prog_data *prog_data,
+                unsigned promoted_constants,
+                bool runtime_check_aads_emit,
+                gl_shader_stage stage);
+   ~fs_generator();
+
+   void enable_debug(const char *shader_name);
+   int generate_code(const cfg_t *cfg, int dispatch_width);
+   const unsigned *get_assembly(unsigned int *assembly_size);
+
+private:
+   void fire_fb_write(fs_inst *inst,
+                      struct brw_reg payload,
+                      struct brw_reg implied_header,
+                      GLuint nr);
+   void generate_fb_write(fs_inst *inst, struct brw_reg payload);
+   void generate_fb_read(fs_inst *inst, struct brw_reg dst,
+                         struct brw_reg payload);
+   void generate_urb_read(fs_inst *inst, struct brw_reg dst, struct brw_reg payload);
+   void generate_urb_write(fs_inst *inst, struct brw_reg payload);
+   void generate_cs_terminate(fs_inst *inst, struct brw_reg payload);
+   void generate_barrier(fs_inst *inst, struct brw_reg src);
+   void generate_linterp(fs_inst *inst, struct brw_reg dst,
+			 struct brw_reg *src);
+   void generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                     struct brw_reg surface_index,
+                     struct brw_reg sampler_index);
+   void generate_get_buffer_size(fs_inst *inst, struct brw_reg dst,
+                                 struct brw_reg src,
+                                 struct brw_reg surf_index);
+   void generate_ddx(enum opcode op, struct brw_reg dst, struct brw_reg src);
+   void generate_ddy(enum opcode op, struct brw_reg dst, struct brw_reg src);
+   void generate_scratch_write(fs_inst *inst, struct brw_reg src);
+   void generate_scratch_read(fs_inst *inst, struct brw_reg dst);
+   void generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst);
+   void generate_uniform_pull_constant_load(fs_inst *inst, struct brw_reg dst,
+                                            struct brw_reg index,
+                                            struct brw_reg offset);
+   void generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                 struct brw_reg dst,
+                                                 struct brw_reg surf_index,
+                                                 struct brw_reg payload);
+   void generate_varying_pull_constant_load_gen4(fs_inst *inst,
+                                                 struct brw_reg dst,
+                                                 struct brw_reg index);
+   void generate_varying_pull_constant_load_gen7(fs_inst *inst,
+                                                 struct brw_reg dst,
+                                                 struct brw_reg index,
+                                                 struct brw_reg offset);
+   void generate_mov_dispatch_to_flags(fs_inst *inst);
+
+   void generate_pixel_interpolator_query(fs_inst *inst,
+                                          struct brw_reg dst,
+                                          struct brw_reg src,
+                                          struct brw_reg msg_data,
+                                          unsigned msg_type);
+
+   void generate_set_sample_id(fs_inst *inst,
+                               struct brw_reg dst,
+                               struct brw_reg src0,
+                               struct brw_reg src1);
+
+   void generate_discard_jump(fs_inst *inst);
+
+   void generate_pack_half_2x16_split(fs_inst *inst,
+                                      struct brw_reg dst,
+                                      struct brw_reg x,
+                                      struct brw_reg y);
+   void generate_unpack_half_2x16_split(fs_inst *inst,
+                                        struct brw_reg dst,
+                                        struct brw_reg src);
+
+   void generate_shader_time_add(fs_inst *inst,
+                                 struct brw_reg payload,
+                                 struct brw_reg offset,
+                                 struct brw_reg value);
+
+   void generate_mov_indirect(fs_inst *inst,
+                              struct brw_reg dst,
+                              struct brw_reg reg,
+                              struct brw_reg indirect_byte_offset);
+
+   bool patch_discard_jumps_to_fb_writes();
+
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
+   const struct gen_device_info *devinfo;
+
+   struct brw_codegen *p;
+   const void * const key;
+   struct brw_stage_prog_data * const prog_data;
+
+   unsigned dispatch_width; /**< 8, 16 or 32 */
+
+   exec_list discard_halt_patches;
+   unsigned promoted_constants;
+   bool runtime_check_aads_emit;
+   bool debug_flag;
+   const char *shader_name;
+   gl_shader_stage stage;
+   void *mem_ctx;
+};
+
+void shuffle_32bit_load_result_to_64bit_data(const brw::fs_builder &bld,
+                                             const fs_reg &dst,
+                                             const fs_reg &src,
+                                             uint32_t components);
+
+void shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld,
+                                        const fs_reg &dst,
+                                        const fs_reg &src,
+                                        uint32_t components);
+fs_reg setup_imm_df(const brw::fs_builder &bld,
+                    double v);
+
+enum brw_barycentric_mode brw_barycentric_mode(enum glsl_interp_mode mode,
+                                               nir_intrinsic_op op);
diff --git a/src/intel/compiler/brw_fs_builder.h b/src/intel/compiler/brw_fs_builder.h
new file mode 100644
index 00000000000..87394bc17b3
--- /dev/null
+++ b/src/intel/compiler/brw_fs_builder.h
@@ -0,0 +1,662 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_BUILDER_H
+#define BRW_FS_BUILDER_H
+
+#include "brw_ir_fs.h"
+#include "brw_shader.h"
+
+namespace brw {
+   /**
+    * Toolbox to assemble an FS IR program out of individual instructions.
+    *
+    * This object is meant to have an interface consistent with
+    * brw::vec4_builder.  They cannot be fully interchangeable because
+    * brw::fs_builder generates scalar code while brw::vec4_builder generates
+    * vector code.
+    */
+   class fs_builder {
+   public:
+      /** Type used in this IR to represent a source of an instruction. */
+      typedef fs_reg src_reg;
+
+      /** Type used in this IR to represent the destination of an instruction. */
+      typedef fs_reg dst_reg;
+
+      /** Type used in this IR to represent an instruction. */
+      typedef fs_inst instruction;
+
+      /**
+       * Construct an fs_builder for \p shader with \p dispatch_width as its
+       * SIMD width.  No insertion point is set: block and cursor start NULL.
+       */
+      fs_builder(backend_shader *shader,
+                 unsigned dispatch_width) :
+         shader(shader), block(NULL), cursor(NULL),
+         _dispatch_width(dispatch_width),
+         _group(0),
+         force_writemask_all(false),
+         annotation()
+      {
+      }
+
+      /**
+       * Construct an fs_builder that inserts instructions into \p shader
+       * before instruction \p inst in basic block \p block.  The SIMD width,
+       * channel group, force_writemask_all flag and debug annotation are all
+       * copied from \p inst, which must therefore be non-NULL.
+       */
+      fs_builder(backend_shader *shader, bblock_t *block, fs_inst *inst) :
+         shader(shader), block(block), cursor(inst),
+         _dispatch_width(inst->exec_size),
+         _group(inst->group),
+         force_writemask_all(inst->force_writemask_all)
+      {
+         annotation.str = inst->annotation;
+         annotation.ir = inst->ir;
+      }
+
+      /**
+       * Return a copy of this builder whose insertion point is moved to just
+       * before \p cursor in basic block \p block.  All other code generation
+       * parameters are carried over unchanged.
+       */
+      fs_builder
+      at(bblock_t *block, exec_node *cursor) const
+      {
+         fs_builder moved(*this);
+         moved.cursor = cursor;
+         moved.block = block;
+         return moved;
+      }
+
+      /**
+       * Construct an fs_builder that appends to the end of the shader's
+       * instruction list, inheriting everything else from this builder.
+       */
+      fs_builder
+      at_end() const
+      {
+         exec_node *tail = (exec_node *)&shader->instructions.tail_sentinel;
+         return at(NULL, tail);
+      }
+
+      /**
+       * Return a copy of this builder restricted to a SIMD width of \p n and
+       * to the \p i-th group of \p n channels, counted relative to the group
+       * already selected by this builder.
+       *
+       * Unless force_writemask_all is set, the requested group must lie
+       * within the current dispatch width.
+       */
+      fs_builder
+      group(unsigned n, unsigned i) const
+      {
+         assert(force_writemask_all ||
+                (n <= dispatch_width() && i < dispatch_width() / n));
+         fs_builder sub(*this);
+         sub._group = _group + i * n;
+         sub._dispatch_width = n;
+         return sub;
+      }
+
+      /**
+       * Shorthand for group(8, i): the \p i-th group of eight channels.
+       */
+      fs_builder
+      half(unsigned i) const
+      {
+         return group(8, i);
+      }
+
+      /**
+       * Return a copy of this builder with per-channel control flow execution
+       * masking disabled when \p b is true.  Masking that is already disabled
+       * stays disabled regardless of \p b.
+       */
+      fs_builder
+      exec_all(bool b = true) const
+      {
+         fs_builder unmasked(*this);
+         /* Only ever sets the flag; b == false leaves it as copied. */
+         unmasked.force_writemask_all |= b;
+         return unmasked;
+      }
+
+      /**
+       * Return a copy of this builder carrying the given debug annotation.
+       */
+      fs_builder
+      annotate(const char *str, const void *ir = NULL) const
+      {
+         fs_builder tagged(*this);
+         tagged.annotation.ir = ir;
+         tagged.annotation.str = str;
+         return tagged;
+      }
+
+      /**
+       * Get the SIMD width used for instructions created by this builder.
+       */
+      unsigned
+      dispatch_width() const
+      {
+         return _dispatch_width;
+      }
+
+      /**
+       * Get the channel group in use, as the index of its first channel.
+       */
+      unsigned
+      group() const
+      {
+         return _group;
+      }
+
+      /**
+       * Allocate a virtual register holding \p n logical components of
+       * \p type at the current SIMD width, rounded up to a whole number of
+       * REG_SIZE units.  A null register retyped to \p type is returned
+       * instead when \p n is zero.
+       */
+      dst_reg
+      vgrf(enum brw_reg_type type, unsigned n = 1) const
+      {
+         assert(dispatch_width() <= 32);
+
+         if (n == 0)
+            return retype(null_reg_ud(), type);
+
+         const unsigned size = n * type_sz(type) * dispatch_width();
+         return dst_reg(VGRF,
+                        shader->alloc.allocate(DIV_ROUND_UP(size, REG_SIZE)),
+                        type);
+      }
+
+      /**
+       * Create a null register of float type (F here, DF in null_reg_df()).
+       */
+      dst_reg
+      null_reg_f() const
+      {
+         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_F));
+      }
+
+      dst_reg
+      null_reg_df() const
+      {
+         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
+      }
+
+      /**
+       * Create a null register of signed integer type (BRW_REGISTER_TYPE_D).
+       */
+      dst_reg
+      null_reg_d() const
+      {
+         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+      }
+
+      /**
+       * Create a null register of unsigned integer type (BRW_REGISTER_TYPE_UD).
+       */
+      dst_reg
+      null_reg_ud() const
+      {
+         return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+      }
+
+      /**
+       * Get the mask of SIMD channels enabled by dispatch and not yet
+       * disabled by discard.  Outside fragment shaders this is all ones.
+       */
+      src_reg
+      sample_mask_reg() const
+      {
+         const bool is_fs = shader->stage == MESA_SHADER_FRAGMENT;
+         assert(!is_fs || group() + dispatch_width() <= 16);
+         if (!is_fs)
+            return brw_imm_d(0xffffffff);
+
+         if (brw_wm_prog_data(shader->stage_prog_data)->uses_kill)
+            return brw_flag_reg(0, 1);
+
+         return retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UD);
+      }
+
+      /**
+       * Insert a mem_ctx-allocated copy of \p inst into the program.
+       */
+      instruction *
+      emit(const instruction &inst) const
+      {
+         return emit(new(shader->mem_ctx) instruction(inst));
+      }
+
+      /**
+       * Create and insert a nullary control instruction of the current width.
+       */
+      instruction *
+      emit(enum opcode opcode) const
+      {
+         return emit(instruction(opcode, dispatch_width()));
+      }
+
+      /**
+       * Create and insert a source-less instruction writing \p dst.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst) const
+      {
+         return emit(instruction(opcode, dispatch_width(), dst));
+      }
+
+      /**
+       * Create and insert a unary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+      {
+         const bool is_math = (opcode == SHADER_OPCODE_RCP ||
+                               opcode == SHADER_OPCODE_RSQ ||
+                               opcode == SHADER_OPCODE_SQRT ||
+                               opcode == SHADER_OPCODE_EXP2 ||
+                               opcode == SHADER_OPCODE_LOG2 ||
+                               opcode == SHADER_OPCODE_SIN ||
+                               opcode == SHADER_OPCODE_COS);
+
+         /* Math opcodes get their source run through fix_math_operand(). */
+         if (is_math)
+            return emit(instruction(opcode, dispatch_width(), dst,
+                                    fix_math_operand(src0)));
+
+         return emit(instruction(opcode, dispatch_width(), dst, src0));
+      }
+
+      /**
+       * Create and insert a binary instruction into the program.  POW and
+       * the integer quotient/remainder opcodes count as math instructions.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1) const
+      {
+         const bool is_math = (opcode == SHADER_OPCODE_POW ||
+                               opcode == SHADER_OPCODE_INT_QUOTIENT ||
+                               opcode == SHADER_OPCODE_INT_REMAINDER);
+
+         /* Math opcodes get both sources run through fix_math_operand(). */
+         if (is_math)
+            return emit(instruction(opcode, dispatch_width(), dst,
+                                    fix_math_operand(src0),
+                                    fix_math_operand(src1)));
+
+         return emit(instruction(opcode, dispatch_width(), dst, src0, src1));
+      }
+
+      /**
+       * Create and insert a ternary instruction into the program.  BFE,
+       * BFI2, MAD and LRP have every source run through fix_3src_operand().
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1, const src_reg &src2) const
+      {
+         const bool fix_operands = (opcode == BRW_OPCODE_BFE ||
+                                    opcode == BRW_OPCODE_BFI2 ||
+                                    opcode == BRW_OPCODE_MAD ||
+                                    opcode == BRW_OPCODE_LRP);
+
+         if (fix_operands)
+            return emit(instruction(opcode, dispatch_width(), dst,
+                                    fix_3src_operand(src0),
+                                    fix_3src_operand(src1),
+                                    fix_3src_operand(src2)));
+
+         return emit(instruction(opcode, dispatch_width(), dst,
+                                 src0, src1, src2));
+      }
+
+      /**
+       * Create and insert an instruction taking the \p n sources stored in
+       * the array \p srcs into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg srcs[],
+           unsigned n) const
+      {
+         return emit(instruction(opcode, dispatch_width(), dst, srcs, n));
+      }
+
+      /**
+       * Insert a preallocated instruction into the program.
+       */
+      instruction *
+      emit(instruction *inst) const
+      {
+         assert(inst->exec_size <= 32);
+         assert(force_writemask_all || inst->exec_size == dispatch_width());
+
+         /* Stamp the builder's execution controls and debug info on inst. */
+         inst->annotation = annotation.str;
+         inst->ir = annotation.ir;
+         inst->group = _group;
+         inst->force_writemask_all = force_writemask_all;
+
+         if (!block)
+            cursor->insert_before(inst);
+         else
+            static_cast<instruction *>(cursor)->insert_before(block, inst);
+
+         return inst;
+      }
+
+      /**
+       * Emit a SEL with conditional mod \p mod: \p src0 is selected if the
+       * comparison of both sources evaluates to true, otherwise \p src1.
+       * Only GE (maximum) and L (minimum) are accepted for \p mod.
+       * Both sources are passed through fix_unsigned_negate() first.
+       */
+      instruction *
+      emit_minmax(const dst_reg &dst, const src_reg &src0,
+                  const src_reg &src1, brw_conditional_mod mod) const
+      {
+         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
+
+         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
+                                     fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Copy any live channel from \p src to the first channel of the result.
+       */
+      src_reg
+      emit_uniformize(const src_reg &src) const
+      {
+         /* FIXME: We use a vector chan_index and dst to allow constant and
+          * copy propagation to move result all the way into the consuming
+          * instruction (typically a surface index or sampler index for a
+          * send). This uses 1 or 3 extra hw registers in 16 or 32 wide
+          * dispatch. Once we teach const/copy propagation about scalars we
+          * should go back to scalar destinations here.
+          */
+         const fs_builder ubld = exec_all();
+         const dst_reg chan_index = vgrf(BRW_REGISTER_TYPE_UD);
+         const dst_reg dst = vgrf(src.type); /* result keeps the type of \p src */
+
+         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
+         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, component(chan_index, 0));
+
+         return src_reg(component(dst, 0)); /* callers get component 0 of dst only */
+      }
+
+      /**
+       * Assorted arithmetic ops: one method per opcode, generated by ALU* macros.
+       * @{
+       */
+#define ALU1(op)                                        \
+      instruction *                                     \
+      op(const dst_reg &dst, const src_reg &src0) const \
+      {                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0);       \
+      }
+
+#define ALU2(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
+      }
+      /* ALU2_ACC: same as ALU2, but also sets writes_accumulator on the result. */
+#define ALU2_ACC(op)                                                    \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
+         inst->writes_accumulator = true;                               \
+         return inst;                                                   \
+      }
+
+#define ALU3(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
+         const src_reg &src2) const                                     \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
+      }
+
+      ALU2(ADD)
+      ALU2_ACC(ADDC)
+      ALU2(AND)
+      ALU2(ASR)
+      ALU2(AVG)
+      ALU3(BFE)
+      ALU2(BFI1)
+      ALU3(BFI2)
+      ALU1(BFREV)
+      ALU1(CBIT)
+      ALU2(CMPN)
+      ALU3(CSEL)
+      ALU1(DIM)
+      ALU2(DP2)
+      ALU2(DP3)
+      ALU2(DP4)
+      ALU2(DPH)
+      ALU1(F16TO32)
+      ALU1(F32TO16)
+      ALU1(FBH)
+      ALU1(FBL)
+      ALU1(FRC)
+      ALU2(LINE)
+      ALU1(LZD)
+      ALU2(MAC)
+      ALU2_ACC(MACH)
+      ALU3(MAD)
+      ALU1(MOV)
+      ALU2(MUL)
+      ALU1(NOT)
+      ALU2(OR)
+      ALU2(PLN)
+      ALU1(RNDD)
+      ALU1(RNDE)
+      ALU1(RNDU)
+      ALU1(RNDZ)
+      ALU2(SAD2)
+      ALU2_ACC(SADA2)
+      ALU2(SEL)
+      ALU2(SHL)
+      ALU2(SHR)
+      ALU2_ACC(SUBB)
+      ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+      /** @} */
+
+      /**
+       * CMP: Sets the low bit of the destination channels with the result
+       * of the comparison, while the upper bits are undefined, and updates
+       * the flag register with the packed 16 bits of the result.
+       */
+      instruction *
+      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
+          brw_conditional_mod condition) const
+      {
+         /* Take the instruction:
+          *
+          * CMP null<d> src0<f> src1<f>
+          *
+          * Original gen4 does type conversion to the destination type
+          * before comparison, producing garbage results for floating
+          * point comparisons.
+          *
+          * The destination type doesn't matter on newer generations,
+          * so we set the type to match src0 so we can compact the
+          * instruction.
+          */
+         return set_condmod(condition,
+                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
+                                 fix_unsigned_negate(src0), /* negated UD sources go via a temp */
+                                 fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Gen4 predicated IF: a BRW_OPCODE_IF passed through set_predicate().
+       */
+      instruction *
+      IF(brw_predicate predicate) const
+      {
+         return set_predicate(predicate, emit(BRW_OPCODE_IF));
+      }
+
+      /**
+       * Emit a linear interpolation x*(1-a) + y*a (LRP on gen6+, else MUL/ADD).
+       */
+      instruction *
+      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+          const src_reg &a) const
+      {
+         if (shader->devinfo->gen >= 6) {
+            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
+             * we need to reorder the operands.
+             */
+            return emit(BRW_OPCODE_LRP, dst, a, y, x);
+
+         } else {
+            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
+            const dst_reg y_times_a = vgrf(dst.type); /* temporaries share dst's type */
+            const dst_reg one_minus_a = vgrf(dst.type);
+            const dst_reg x_times_one_minus_a = vgrf(dst.type);
+
+            MUL(y_times_a, y, a);
+            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
+            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
+            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a)); /* the final ADD is what gets returned */
+         }
+      }
+
+      /** Collect a number of registers in a contiguous range of registers and
+       * compute the instruction's size_written from \p header_size and sources.
+       */
+      instruction *
+      LOAD_PAYLOAD(const dst_reg &dst, const src_reg *src,
+                   unsigned sources, unsigned header_size) const
+      {
+         instruction *inst = emit(SHADER_OPCODE_LOAD_PAYLOAD, dst, src, sources);
+         inst->header_size = header_size;
+         inst->size_written = header_size * REG_SIZE; /* one REG_SIZE per header source */
+         for (unsigned i = header_size; i < sources; i++) { /* non-header sources */
+            inst->size_written +=
+               ALIGN(dispatch_width() * type_sz(src[i].type) * dst.stride,
+                     REG_SIZE); /* per-source size rounded up to REG_SIZE */
+         }
+
+         return inst;
+      }
+
+      backend_shader *shader;
+
+   private:
+      /**
+       * Workaround for negation of UD registers.  See comment in
+       * fs_generator::generate_code() for more details.
+       */
+      src_reg
+      fix_unsigned_negate(const src_reg &src) const
+      {
+         if (src.type == BRW_REGISTER_TYPE_UD &&
+             src.negate) {
+            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
+            MOV(temp, src); /* emits an extra MOV into a fresh UD temporary */
+            return src_reg(temp);
+         } else {
+            return src; /* any other source is returned untouched */
+         }
+      }
+
+      /**
+       * Workaround for source register modes not supported by the ternary
+       * instruction encoding.
+       */
+      src_reg
+      fix_3src_operand(const src_reg &src) const
+      {
+         if (src.file == VGRF || src.file == UNIFORM || src.stride > 1) {
+            return src; /* VGRF, UNIFORM or stride > 1: used as-is */
+         } else {
+            dst_reg expanded = vgrf(src.type); /* otherwise copy into a new VGRF of the same type */
+            MOV(expanded, src);
+            return expanded;
+         }
+      }
+
+      /**
+       * Workaround for source register modes not supported by the math
+       * instruction.
+       */
+      src_reg
+      fix_math_operand(const src_reg &src) const
+      {
+         /* Can't do hstride == 0 args on gen6 math, so expand it out. We
+          * might be able to do better by doing execsize = 1 math and then
+          * expanding that result out, but we would need to be careful with
+          * masking.
+          *
+          * Gen6 hardware ignores source modifiers (negate and abs) on math
+          * instructions, so we also move to a temp to set those up.
+          *
+          * Gen7 relaxes most of the above restrictions, but still can't use IMM
+          * operands to math
+          */
+         if ((shader->devinfo->gen == 6 &&
+              (src.file == IMM || src.file == UNIFORM ||
+               src.abs || src.negate)) ||
+             (shader->devinfo->gen == 7 && src.file == IMM)) {
+            const dst_reg tmp = vgrf(src.type); /* same type as the original source */
+            MOV(tmp, src);
+            return tmp;
+         } else {
+            return src; /* other generations and operands need no fix-up here */
+         }
+      }
+
+      bblock_t *block; /* when non-NULL, emit() inserts via insert_before(block, inst) */
+      exec_node *cursor; /* emit() inserts new instructions before this node */
+
+      unsigned _dispatch_width; /* NOTE(review): presumably backs dispatch_width(); accessor not in view */
+      unsigned _group; /* copied into inst->group by emit() */
+      bool force_writemask_all; /* copied into each instruction; relaxes emit()'s exec_size assert */
+
+      /** Debug annotation info. */
+      struct {
+         const char *str; /* becomes inst->annotation */
+         const void *ir; /* becomes inst->ir */
+      } annotation;
+   };
+}
+
+#endif
diff --git a/src/intel/compiler/brw_fs_cmod_propagation.cpp b/src/intel/compiler/brw_fs_cmod_propagation.cpp
new file mode 100644
index 00000000000..2d50c92e9e3
--- /dev/null
+++ b/src/intel/compiler/brw_fs_cmod_propagation.cpp
@@ -0,0 +1,183 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+/** @file brw_fs_cmod_propagation.cpp
+ *
+ * Implements a pass that propagates the conditional modifier from a CMP x 0.0
+ * instruction into the instruction that generated x. For instance, in this
+ * sequence
+ *
+ *    add(8)          g70<1>F    g69<8,8,1>F    4096F
+ *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
+ *
+ * we can do the comparison as part of the ADD instruction directly:
+ *
+ *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
+ *
+ * If there had been a use of the flag register and another CMP using g70
+ *
+ *    add.ge.f0(8)    g70<1>F    g69<8,8,1>F    4096F
+ *    (+f0) sel(8)    g71<1>F    g72<8,8,1>F    g73<8,8,1>F
+ *    cmp.ge.f0(8)    null       g70<8,8,1>F    0F
+ *
+ * we can recognize that the CMP is generating the flag value that already
+ * exists and therefore remove the instruction.
+ */
+/* Block-local cmod propagation; returns true if any instruction was removed. */
+static bool
+opt_cmod_propagation_local(const gen_device_info *devinfo, bblock_t *block)
+{
+   bool progress = false;
+   int ip = block->end_ip + 1; /* NOTE(review): only decremented below, never read */
+
+   foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+      ip--;
+
+      if ((inst->opcode != BRW_OPCODE_AND &&
+           inst->opcode != BRW_OPCODE_CMP &&
+           inst->opcode != BRW_OPCODE_MOV) ||
+          inst->predicate != BRW_PREDICATE_NONE ||
+          !inst->dst.is_null() ||
+          inst->src[0].file != VGRF ||
+          inst->src[0].abs)
+         continue; /* candidates: unpredicated AND/CMP/MOV, null dst, VGRF src0, no abs */
+
+      /* Only an AND.NZ can be propagated.  Many AND.Z instructions are
+       * generated (for ir_unop_not in fs_visitor::emit_bool_to_cond_code).
+       * Propagating those would require inverting the condition on the CMP.
+       * This changes both the flag value and the register destination of the
+       * CMP.  That result may be used elsewhere, so we can't change its value
+       * on a whim.
+       */
+      if (inst->opcode == BRW_OPCODE_AND &&
+          !(inst->src[1].is_one() &&
+            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+            !inst->src[0].negate))
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero())
+         continue; /* a CMP must compare against zero */
+
+      if (inst->opcode == BRW_OPCODE_MOV &&
+          inst->conditional_mod != BRW_CONDITIONAL_NZ)
+         continue; /* a MOV must carry the .nz conditional mod */
+
+      bool read_flag = false; /* set once a scanned instruction reads the flag */
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->src[0], inst->size_read(0))) {
+            if (scan_inst->is_partial_write() ||
+                scan_inst->dst.offset != inst->src[0].offset ||
+                scan_inst->exec_size != inst->exec_size)
+               break; /* writer must be a full write at the same offset and exec size */
+
+            /* CMP's result is the same regardless of dest type. */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                scan_inst->opcode == BRW_OPCODE_CMP &&
+                (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* If the AND wasn't handled by the previous case, it isn't safe
+             * to remove it.
+             */
+            if (inst->opcode == BRW_OPCODE_AND)
+               break;
+
+            /* Comparisons operate differently for ints and floats */
+            if (scan_inst->dst.type != inst->dst.type &&
+                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
+                 inst->dst.type == BRW_REGISTER_TYPE_F))
+               break;
+
+            /* If the instruction generating inst's source also wrote the
+             * flag, and inst is doing a simple .nz comparison, then inst
+             * is redundant - the appropriate value is already in the flag
+             * register.  Delete inst.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                !inst->src[0].negate &&
+                scan_inst->flags_written()) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* The conditional mod of the CMP/CMPN instructions behaves
+             * specially because the flag output is not calculated from the
+             * result of the instruction, but the other way around, which
+             * means that even if the condmod to propagate and the condmod
+             * from the CMP instruction are the same they will in general give
+             * different results because they are evaluated based on different
+             * inputs.
+             */
+            if (scan_inst->opcode == BRW_OPCODE_CMP ||
+                scan_inst->opcode == BRW_OPCODE_CMPN)
+               break;
+
+            /* Otherwise, try propagating the conditional. */
+            enum brw_conditional_mod cond =
+               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                   : inst->conditional_mod;
+
+            if (scan_inst->can_do_cmod() &&
+                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                 scan_inst->conditional_mod == cond)) {
+               scan_inst->conditional_mod = cond;
+               inst->remove(block);
+               progress = true;
+            }
+            break; /* the scan always ends at the first overlapping writer */
+         }
+
+         if (scan_inst->flags_written())
+            break; /* a non-overlapping instruction that writes the flag ends the scan */
+
+         read_flag = read_flag || scan_inst->flags_read(devinfo);
+      }
+   }
+
+   return progress;
+}
+/* Runs the block-local pass over every block of the CFG, in reverse order. */
+bool
+fs_visitor::opt_cmod_propagation()
+{
+   bool progress = false;
+
+   foreach_block_reverse(block, cfg) {
+      progress = opt_cmod_propagation_local(devinfo, block) || progress; /* call first: no short-circuit skip */
+   }
+
+   if (progress)
+      invalidate_live_intervals(); /* only when some instruction was removed */
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_combine_constants.cpp b/src/intel/compiler/brw_fs_combine_constants.cpp
new file mode 100644
index 00000000000..e0c95d379b8
--- /dev/null
+++ b/src/intel/compiler/brw_fs_combine_constants.cpp
@@ -0,0 +1,329 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_combine_constants.cpp
+ *
+ * This file contains the opt_combine_constants() pass that runs after the
+ * regular optimization loop. It passes over the instruction list and
+ * selectively promotes immediate values to registers by emitting a mov(1)
+ * instruction.
+ *
+ * This is useful on Gen 7 particularly, because a few instructions can be
+ * coissued (i.e., issued in the same cycle as another thread on the same EU
+ * issues an instruction) under some circumstances, one of which is that they
+ * cannot use immediate values.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+static const bool debug = false; /* true makes opt_combine_constants() printf its table */
+
+/* Returns whether an instruction could co-issue if its immediate source were
+ * replaced with a GRF source.
+ */
+static bool
+could_coissue(const struct gen_device_info *devinfo, const fs_inst *inst)
+{
+   if (devinfo->gen != 7)
+      return false; /* always false off Gen7 */
+
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+      return true; /* only these four opcodes qualify */
+   default:
+      return false;
+   }
+}
+
+/**
+ * Returns true for instructions that don't support immediate sources.
+ */
+static bool
+must_promote_imm(const struct gen_device_info *devinfo, const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   case SHADER_OPCODE_POW:
+      return devinfo->gen < 8; /* POW: only before Gen8 */
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+      return true; /* MAD and LRP: on every generation */
+   default:
+      return false;
+   }
+}
+
+/** A box for putting fs_regs in a linked list. */
+struct reg_link {
+   DECLARE_RALLOC_CXX_OPERATORS(reg_link)
+
+   reg_link(fs_reg *reg) : reg(reg) {}
+
+   struct exec_node link; /* list hook; link() returns its address */
+   fs_reg *reg; /* the boxed register, not owned by this node */
+};
+/* Boxes \p reg in a reg_link allocated from \p mem_ctx; returns its list node. */
+static struct exec_node *
+link(void *mem_ctx, fs_reg *reg)
+{
+   reg_link *l = new(mem_ctx) reg_link(reg);
+   return &l->link;
+}
+
+/**
+ * Information about an immediate value.
+ */
+struct imm {
+   /** The common ancestor of all blocks using this immediate value. */
+   bblock_t *block;
+
+   /**
+    * The instruction generating the immediate value, if all uses are contained
+    * within a single basic block. Otherwise, NULL.
+    */
+   fs_inst *inst;
+
+   /**
+    * A list of fs_regs that refer to this immediate.  If we promote it, we'll
+    * have to patch these up to refer to the new GRF.
+    */
+   exec_list *uses;
+
+   /** The immediate value.  We currently only handle floats. */
+   float val;
+
+   /**
+    * The GRF register and subregister number where we've decided to store the
+    * constant value.
+    */
+   uint8_t subreg_offset;
+   uint16_t nr;
+
+   /** The number of coissuable instructions using this immediate. */
+   uint16_t uses_by_coissue;
+
+   /**
+    * Whether this constant is used by an instruction that can't handle an
+    * immediate source (and already has to be promoted to a GRF).
+    */
+   bool must_promote;
+   /** Instruction counter values at the first and the most recent use. */
+   uint16_t first_use_ip;
+   uint16_t last_use_ip; /* NOTE(review): filled from an unsigned counter; values above 65535 wrap */
+};
+
+/** The working set of information about immediates. */
+struct table {
+   struct imm *imm; /* ralloc'ed array, regrown by new_imm() */
+   int size; /* allocated capacity of imm[], in entries */
+   int len; /* number of entries currently in use */
+};
+/* Linear search for \p val; returns NULL when absent.  Uses float ==. */
+static struct imm *
+find_imm(struct table *table, float val)
+{
+   for (int i = 0; i < table->len; i++) {
+      if (table->imm[i].val == val) { /* float ==: a NaN never matches, +0.0 matches -0.0 */
+         return &table->imm[i];
+      }
+   }
+   return NULL;
+}
+/* Returns the next unused slot (not initialized here), growing when full. */
+static struct imm *
+new_imm(struct table *table, void *mem_ctx)
+{
+   if (table->len == table->size) {
+      table->size *= 2; /* capacity doubles; earlier pointers into imm[] may go stale */
+      table->imm = reralloc(mem_ctx, table->imm, struct imm, table->size);
+   }
+   return &table->imm[table->len++];
+}
+
+/**
+ * Comparator used for sorting an array of imm structures.
+ *
+ * We sort by basic block number, then last use IP, then first use IP (least
+ * to greatest). This sorting causes immediates live in the same area to be
+ * allocated to the same register in the hopes that all values will be dead
+ * about the same time and the register can be reused.
+ */
+static int
+compare(const void *_a, const void *_b)
+{
+   const struct imm *a = (const struct imm *)_a,
+                    *b = (const struct imm *)_b;
+
+   int block_diff = a->block->num - b->block->num; /* primary key: block number */
+   if (block_diff)
+      return block_diff;
+
+   int end_diff = a->last_use_ip - b->last_use_ip; /* then last use */
+   if (end_diff)
+      return end_diff;
+
+   return a->first_use_ip - b->first_use_ip; /* uint16_t operands promote to int: no wraparound */
+}
+/* Promotes selected float immediates to GRFs; returns true if any were promoted. */
+bool
+fs_visitor::opt_combine_constants()
+{
+   void *const_ctx = ralloc_context(NULL); /* owns all scratch data; freed on both exits */
+
+   struct table table;
+   table.size = 8;
+   table.len = 0;
+   table.imm = ralloc_array(const_ctx, struct imm, table.size);
+
+   cfg->calculate_idom();
+   unsigned ip = -1; /* incremented before use, so the first instruction gets 0 */
+
+   /* Make a pass through all instructions and count the number of times each
+    * constant is used by coissuable instructions or instructions that cannot
+    * take immediate arguments.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      ip++;
+
+      if (!could_coissue(devinfo, inst) && !must_promote_imm(devinfo, inst))
+         continue;
+
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file != IMM ||
+             inst->src[i].type != BRW_REGISTER_TYPE_F)
+            continue; /* only float immediates are tracked */
+
+         float val = !inst->can_do_source_mods(devinfo) ? inst->src[i].f :
+                     fabs(inst->src[i].f); /* keyed on |value| when source mods are available */
+         struct imm *imm = find_imm(&table, val);
+
+         if (imm) {
+            bblock_t *intersection = cfg_t::intersect(block, imm->block);
+            if (intersection != imm->block)
+               imm->inst = NULL; /* recorded block changed: drop the anchor instruction */
+            imm->block = intersection;
+            imm->uses->push_tail(link(const_ctx, &inst->src[i]));
+            imm->uses_by_coissue += could_coissue(devinfo, inst);
+            imm->must_promote = imm->must_promote || must_promote_imm(devinfo, inst);
+            imm->last_use_ip = ip;
+         } else {
+            imm = new_imm(&table, const_ctx); /* fresh slot: every field is set below */
+            imm->block = block;
+            imm->inst = inst;
+            imm->uses = new(const_ctx) exec_list();
+            imm->uses->push_tail(link(const_ctx, &inst->src[i]));
+            imm->val = val;
+            imm->uses_by_coissue = could_coissue(devinfo, inst);
+            imm->must_promote = must_promote_imm(devinfo, inst);
+            imm->first_use_ip = ip;
+            imm->last_use_ip = ip;
+         }
+      }
+   }
+
+   /* Remove constants from the table that don't have enough uses to make them
+    * profitable to store in a register.
+    */
+   for (int i = 0; i < table.len;) {
+      struct imm *imm = &table.imm[i];
+
+      if (!imm->must_promote && imm->uses_by_coissue < 4) {
+         table.imm[i] = table.imm[table.len - 1]; /* swap-remove, then re-check slot i */
+         table.len--;
+         continue;
+      }
+      i++;
+   }
+   if (table.len == 0) {
+      ralloc_free(const_ctx);
+      return false; /* nothing worth promoting */
+   }
+   if (cfg->num_blocks != 1) /* the sort is skipped for single-block programs */
+      qsort(table.imm, table.len, sizeof(struct imm), compare);
+
+   /* Insert MOVs to load the constant values into GRFs. */
+   fs_reg reg(VGRF, alloc.allocate(1));
+   reg.stride = 0;
+   for (int i = 0; i < table.len; i++) {
+      struct imm *imm = &table.imm[i];
+      /* Insert it either before the instruction that generated the immediate
+       * or after the last non-control flow instruction of the common ancestor.
+       */
+      exec_node *n = (imm->inst ? imm->inst :
+                      imm->block->last_non_control_flow_inst()->next);
+      const fs_builder ibld = bld.at(imm->block, n).exec_all().group(1, 0);
+
+      ibld.MOV(reg, brw_imm_f(imm->val));
+      imm->nr = reg.nr;
+      imm->subreg_offset = reg.offset;
+
+      reg.offset += sizeof(float); /* next constant goes one float further */
+      if (reg.offset == 8 * sizeof(float)) { /* eight floats placed: move to a new register */
+         reg.nr = alloc.allocate(1);
+         reg.offset = 0;
+      }
+   }
+   promoted_constants = table.len;
+
+   /* Rewrite the immediate sources to refer to the new GRFs. */
+   for (int i = 0; i < table.len; i++) {
+      foreach_list_typed(reg_link, link, link, table.imm[i].uses) {
+         fs_reg *reg = link->reg;
+         reg->file = VGRF;
+         reg->nr = table.imm[i].nr;
+         reg->offset = table.imm[i].subreg_offset;
+         reg->stride = 0;
+         reg->negate = signbit(reg->f) != signbit(table.imm[i].val); /* sign differs from stored value: use negate */
+         assert((isnan(reg->f) && isnan(table.imm[i].val)) ||
+                fabsf(reg->f) == fabs(table.imm[i].val));
+      }
+   }
+
+   if (debug) { /* file-level constant, false by default */
+      for (int i = 0; i < table.len; i++) {
+         struct imm *imm = &table.imm[i];
+
+         printf("%.3fF - block %3d, reg %3d sub %2d, Uses: (%2d, %2d), "
+                "IP: %4d to %4d, length %4d\n",
+                imm->val,
+                imm->block->num,
+                imm->nr,
+                imm->subreg_offset,
+                imm->must_promote,
+                imm->uses_by_coissue,
+                imm->first_use_ip,
+                imm->last_use_ip,
+                imm->last_use_ip - imm->first_use_ip);
+      }
+   }
+
+   ralloc_free(const_ctx);
+   invalidate_live_intervals();
+
+   return true;
+}
diff --git a/src/intel/compiler/brw_fs_copy_propagation.cpp b/src/intel/compiler/brw_fs_copy_propagation.cpp
new file mode 100644
index 00000000000..cb117396089
--- /dev/null
+++ b/src/intel/compiler/brw_fs_copy_propagation.cpp
@@ -0,0 +1,869 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_copy_propagation.cpp
+ *
+ * Support for global copy propagation in two passes: A local pass that does
+ * intra-block copy (and constant) propagation, and a global pass that uses
+ * dataflow analysis on the copies available at the end of each block to re-do
+ * local copy propagation with more copies available.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 12.5 (p356).
+ */
+
+#define ACP_HASH_SIZE 16 /* ACP bucket count; bucket = dst.nr % ACP_HASH_SIZE */
+
+#include "util/bitset.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace { /* avoid conflict with opt_copy_propagation_elements */
+struct acp_entry : public exec_node {  /* One available copy: dst = src. */
+   fs_reg dst;              /* Region written by the copying instruction. */
+   fs_reg src;              /* Region (or immediate) the copy was read from. */
+   unsigned size_written;   /* Bytes of dst written; uint8_t wrapped at 256. */
+   unsigned size_read;      /* Bytes of src read; uint8_t wrapped at 256. */
+   enum opcode opcode;      /* Opcode of the copying instruction. */
+   bool saturate;           /* Whether the copying instruction saturates. */
+};
+
+struct block_data {
+   /**
+    * Entries of the fs_copy_prop_dataflow acp table created by this block
+    * that are still available when the block ends.  The constructor fills
+    * this in from the per-block ACPs of the first local copy prop pass.
+    */
+   BITSET_WORD *copy;
+
+   /**
+    * Entries of the fs_copy_prop_dataflow acp table invalidated by some
+    * instruction of this block because it overwrites their destination or
+    * their source, see setup_initial_values().
+    */
+   BITSET_WORD *kill;
+
+   /**
+    * Entries of the fs_copy_prop_dataflow acp table available on entry to
+    * this block.  This is the useful output of the analysis: the second
+    * local copy propagation pass is seeded with exactly these copies.
+    */
+   BITSET_WORD *livein;
+
+   /**
+    * Entries of the fs_copy_prop_dataflow acp table available at the end of
+    * this block.  Set to copy for blocks without parents, otherwise
+    * recomputed by run() as copy | (livein & ~kill).
+    */
+   BITSET_WORD *liveout;
+};
+
+class fs_copy_prop_dataflow
+{
+public:
+   /* Builds the acp table and the block sets, then runs the analysis. */
+   fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
+                         exec_list *out_acp[ACP_HASH_SIZE]);
+
+   void setup_initial_values();   /* kill sets, initial livein/liveout */
+   void run();                    /* iterate livein/liveout until stable */
+   void dump_block_data() const UNUSED;   /* print every set to stderr */
+
+   void *mem_ctx;       /* ralloc context passed to the constructor */
+   cfg_t *cfg;          /* control flow graph being analyzed */
+
+   acp_entry **acp;     /* flat table of every block's out_acp entries */
+   int num_acp;         /* number of entries in acp */
+   int bitset_words;    /* BITSET_WORDS(num_acp), length of each block set */
+
+   struct block_data *bd;   /* per-block sets, indexed by block->num */
+};
+} /* anonymous namespace */
+
+fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
+                                             exec_list *out_acp[ACP_HASH_SIZE])
+   : mem_ctx(mem_ctx), cfg(cfg)
+{
+   bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
+
+   /* Size the flat acp table: one slot per entry of every block's lists. */
+   num_acp = 0;
+   foreach_block (block, cfg) {
+      exec_list *const lists = out_acp[block->num];
+      for (int bucket = 0; bucket < ACP_HASH_SIZE; bucket++)
+         num_acp += lists[bucket].length();
+   }
+
+   acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp);
+   bitset_words = BITSET_WORDS(num_acp);
+
+   /* Give each block its four zeroed bitsets and number the entries in the
+    * order they are visited; an entry's index in acp is its bit position.
+    */
+   int slot = 0;
+   foreach_block (block, cfg) {
+      struct block_data &sets = bd[block->num];
+      sets.livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
+      sets.liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
+      sets.copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
+      sets.kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
+
+      for (int bucket = 0; bucket < ACP_HASH_SIZE; bucket++) {
+         foreach_in_list(acp_entry, entry, &out_acp[block->num][bucket]) {
+            /* opt_copy_propagation_local leaves in out_acp the copies made
+             * in a block that are still live at its end: the COPY set.
+             */
+            BITSET_SET(sets.copy, slot);
+            acp[slot++] = entry;
+         }
+      }
+   }
+
+   assert(slot == num_acp);
+
+   setup_initial_values();
+   run();
+}
+
+/**
+ * Compute the KILL set of every block and seed LIVEIN and LIVEOUT with the
+ * values the fixed-point iteration in run() starts from.
+ */
+void
+fs_copy_prop_dataflow::setup_initial_values()
+{
+   /* KILL: a write to a VGRF invalidates every copy whose destination or
+    * source region it overlaps, whichever block created the copy.
+    */
+   foreach_block (block, cfg) {
+      BITSET_WORD *const killed = bd[block->num].kill;
+
+      foreach_inst_in_block(fs_inst, inst, block) {
+         if (inst->dst.file != VGRF)
+            continue;
+
+         for (int e = 0; e < num_acp; e++) {
+            const acp_entry *const avail = acp[e];
+
+            if (regions_overlap(inst->dst, inst->size_written,
+                                avail->dst, avail->size_written) ||
+                regions_overlap(inst->dst, inst->size_written,
+                                avail->src, avail->size_read))
+               BITSET_SET(killed, e);
+         }
+      }
+   }
+
+   /* A block without parents is where the program starts: nothing is
+    * available on entry (livein = 0) and its own copies are available on
+    * exit (liveout = copy).  Every other block starts from livein = ~0
+    * (the universal set) and liveout = 0 (the empty set).
+    */
+   foreach_block (block, cfg) {
+      struct block_data &sets = bd[block->num];
+      const bool is_entry = block->parents.is_empty();
+
+      for (int w = 0; w < bitset_words; w++) {
+         sets.livein[w] = is_entry ? 0u : ~0u;
+         sets.liveout[w] = is_entry ? sets.copy[w] : 0u;
+      }
+   }
+}
+
+/**
+ * Iterate LIVEOUT and LIVEIN over all blocks with parents to a fixed point.
+ */
+void
+fs_copy_prop_dataflow::run()
+{
+   bool changed;
+
+   do {
+      changed = false;
+
+      /* liveout = copy | (livein & ~kill): a copy reaches the end of the
+       * block if it is in the COPY set, or is live in and not killed.
+       */
+      foreach_block (block, cfg) {
+         if (block->parents.is_empty())
+            continue;
+
+         struct block_data &sets = bd[block->num];
+         for (int w = 0; w < bitset_words; w++) {
+            const BITSET_WORD out = sets.copy[w] |
+                                    (sets.livein[w] & ~sets.kill[w]);
+
+            if (out != sets.liveout[w]) {
+               sets.liveout[w] = out;
+               changed = true;
+            }
+         }
+      }
+
+      /* livein = intersection of the parents' liveout: a copy is live
+       * coming in to a block only if it is live out of all parent blocks.
+       */
+      foreach_block (block, cfg) {
+         if (block->parents.is_empty())
+            continue;
+
+         struct block_data &sets = bd[block->num];
+         for (int w = 0; w < bitset_words; w++) {
+            BITSET_WORD in = ~0u;
+            foreach_list_typed(bblock_link, parent_link, link, &block->parents)
+               in &= bd[parent_link->block->num].liveout[w];
+
+            if (in != sets.livein[w]) {
+               sets.livein[w] = in;
+               changed = true;
+            }
+         }
+      }
+   } while (changed);
+}
+
+void
+fs_copy_prop_dataflow::dump_block_data() const
+{
+   /* Per block: number, IP range and parents, then the four sets in hex. */
+   foreach_block (block, cfg) {
+      const struct block_data &sets = bd[block->num];
+      const struct { const char *label; const BITSET_WORD *words; } rows[] = {
+         { "       livein = 0x", sets.livein },
+         { ", liveout = 0x", sets.liveout },
+         { ",\n       copy   = 0x", sets.copy },
+         { ", kill    = 0x", sets.kill },
+      };
+      fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
+             block->start_ip, block->end_ip);
+      foreach_list_typed(bblock_link, link, link, &block->parents)
+         fprintf(stderr, "%d ", link->block->num);
+      fprintf(stderr, "):\n");
+
+      for (unsigned r = 0; r < sizeof(rows) / sizeof(rows[0]); r++) {
+         fprintf(stderr, "%s", rows[r].label);
+         for (int w = 0; w < bitset_words; w++)
+            fprintf(stderr, "%08x", rows[r].words[w]);
+      }
+      fprintf(stderr, "\n");
+   }
+}
+
+static bool
+is_logic_op(enum opcode opcode)
+{
+   /* AND, OR, XOR and NOT are the opcodes treated as logic operations. */
+   const bool and_or = opcode == BRW_OPCODE_AND || opcode == BRW_OPCODE_OR;
+   const bool xor_not = opcode == BRW_OPCODE_XOR || opcode == BRW_OPCODE_NOT;
+   return and_or || xor_not;
+}
+
+static bool
+can_take_stride(fs_inst *inst, unsigned arg, unsigned stride,
+                const gen_device_info *devinfo)
+{
+   /* Whether source "arg" of "inst" may use the given horizontal stride. */
+   if (stride > 4)
+      return false;
+
+   /* 3-source instructions can only be Align16, which restricts what strides
+    * they can take: 1 (the usual case), or 0 through the special "repctrl"
+    * bit.  The repctrl bit doesn't work for 64-bit datatypes, so those are
+    * limited to a stride of 1.  From the Broadwell PRM, Volume 7 "3D Media
+    * GPGPU", page 944:
+    *
+    *    This is applicable to 32b datatypes and 16b datatype. 64b datatypes
+    *    cannot use the replicate control.
+    */
+   if (inst->is_3src(devinfo)) {
+      const bool can_replicate = type_sz(inst->src[arg].type) <= 4;
+      return stride == 1 || (can_replicate && stride == 0);
+   }
+
+   if (!inst->is_math())
+      return true;
+
+   /* From the Broadwell PRM, Volume 2a "Command Reference - Instructions",
+    * page 391 ("Extended Math Function"):
+    *
+    *     The following restrictions apply for align1 mode: Scalar source is
+    *     supported. Source and destination horizontal stride must be the
+    *     same.
+    *
+    * From the Haswell PRM Volume 2b "Command Reference - Instructions", page
+    * 134 ("Extended Math Function"):
+    *
+    *    Scalar source is supported. Source and destination horizontal stride
+    *    must be 1.
+    *
+    * and similar language exists for IVB and SNB. Pre-SNB, math instructions
+    * are sends, so the sources are moved to MRF's and there are no
+    * restrictions.
+    */
+   if (devinfo->gen >= 8)
+      return stride == inst->dst.stride || stride == 0;
+
+   if (devinfo->gen == 6 || devinfo->gen == 7) {
+      assert(inst->dst.stride == 1);
+      return stride == 1 || stride == 0;
+   }
+
+   return true;
+}
+
+/* Try to rewrite source "arg" of "inst" so that it reads the source of the
+ * copy described by "entry" instead of the copy's destination.  Returns true
+ * if inst was modified and false if the copy could not be propagated.
+ */
+bool
+fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
+{
+   if (inst->src[arg].file != VGRF)
+      return false;
+
+   if (entry->src.file == IMM)   /* immediates: see try_constant_propagate() */
+      return false;
+   assert(entry->src.file == VGRF || entry->src.file == UNIFORM ||
+          entry->src.file == ATTR);
+
+   if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
+       inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+
+   assert(entry->dst.file == VGRF);
+   if (inst->src[arg].nr != entry->dst.nr)
+      return false;
+
+   /* Bail unless inst only reads from within the range entry writes. */
+   if (!region_contained_in(inst->src[arg], inst->size_read(arg),
+                            entry->dst, entry->size_written))
+      return false;
+
+   /* We can't generally copy-propagate UD negations because we
+    * can end up accessing the resulting values as signed integers
+    * instead. See also resolve_ud_negate() and comment in
+    * fs_generator::generate_code.
+    */
+   if (entry->src.type == BRW_REGISTER_TYPE_UD &&
+       entry->src.negate)
+      return false;
+
+   bool has_source_modifiers = entry->src.abs || entry->src.negate;
+
+   if ((has_source_modifiers || entry->src.file == UNIFORM ||
+        !entry->src.is_contiguous()) &&
+       !inst->can_do_source_mods(devinfo))
+      return false;
+
+   if (has_source_modifiers &&
+       inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
+      return false;
+
+   /* Bail if the composed stride would exceed the hardware limit. */
+   if (!can_take_stride(inst, arg, entry->src.stride * inst->src[arg].stride,
+                        devinfo))
+      return false;
+
+   /* Bail if the instruction type is larger than the execution type of the
+    * copy, which implies that each channel is reading multiple channels of
+    * the destination of the copy, and simply replacing the sources would
+    * give a program with different semantics.
+    */
+   if (type_sz(entry->dst.type) < type_sz(inst->src[arg].type))
+      return false;
+
+   /* Bail if the result of composing both strides cannot be expressed
+    * as another stride. This avoids, for example, trying to transform
+    * this:
+    *
+    *     MOV (8) rX<1>UD rY<0;1,0>UD
+    *     FOO (8) ...     rX<8;8,1>UW
+    *
+    * into this:
+    *
+    *     FOO (8) ...     rY<0;1,0>UW
+    *
+    * Which would have different semantics.
+    */
+   if (entry->src.stride != 1 &&
+       (inst->src[arg].stride *
+        type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
+      return false;
+
+   /* Since semantics of source modifiers are type-dependent we need to
+    * ensure that the meaning of the instruction remains the same if we
+    * change the type. If the sizes of the types are different the new
+    * instruction will read a different amount of data than the original
+    * and the semantics will always be different.
+    */
+   if (has_source_modifiers &&
+       entry->dst.type != inst->src[arg].type &&
+       (!inst->can_change_types() ||
+        type_sz(entry->dst.type) != type_sz(inst->src[arg].type)))
+      return false;
+
+   if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) &&
+       is_logic_op(inst->opcode)) {
+      return false;
+   }
+
+   if (entry->saturate) {
+      switch(inst->opcode) {
+      case BRW_OPCODE_SEL:
+         if ((inst->conditional_mod != BRW_CONDITIONAL_GE &&
+              inst->conditional_mod != BRW_CONDITIONAL_L) ||
+             inst->src[1].file != IMM ||
+             inst->src[1].f < 0.0 ||
+             inst->src[1].f > 1.0) {
+            return false;
+         }
+         break;
+      default:
+         return false;
+      }
+   }
+
+   inst->src[arg].file = entry->src.file;
+   inst->src[arg].nr = entry->src.nr;
+   inst->src[arg].stride *= entry->src.stride;
+   inst->saturate = inst->saturate || entry->saturate;
+
+   /* Compute the offset of inst->src[arg] relative to entry->dst */
+   const unsigned rel_offset = inst->src[arg].offset - entry->dst.offset;
+
+   /* Compute the first component of the copy that the instruction is
+    * reading, and the base byte offset within that component.
+    */
+   assert(entry->dst.offset % REG_SIZE == 0 && entry->dst.stride == 1);
+   const unsigned component = rel_offset / type_sz(entry->dst.type);
+   const unsigned suboffset = rel_offset % type_sz(entry->dst.type);
+
+   /* Calculate the byte offset at the origin of the copy of the given
+    * component and suboffset.
+    */
+   inst->src[arg].offset = suboffset +
+      component * entry->src.stride * type_sz(entry->src.type) +
+      entry->src.offset;
+
+   if (has_source_modifiers) {
+      if (entry->dst.type != inst->src[arg].type) {
+         /* We are propagating source modifiers from a MOV with a different
+          * type.  If we got here, then we can just change the source and
+          * destination types of the instruction and keep going.
+          */
+         assert(inst->can_change_types());
+         for (int i = 0; i < inst->sources; i++) {
+            inst->src[i].type = entry->dst.type;
+         }
+         inst->dst.type = entry->dst.type;
+      }
+
+      if (!inst->src[arg].abs) {   /* an existing abs hides the copy's mods */
+         inst->src[arg].abs = entry->src.abs;
+         inst->src[arg].negate ^= entry->src.negate;
+      }
+   }
+
+   return true;
+}
+
+
+/* Try to replace the sources of "inst" that read the destination of the copy
+ * "entry" with the immediate that the copy loaded, commuting the operands of
+ * some opcodes so that the immediate ends up in src1.  Returns true if inst
+ * was modified. */
+bool
+fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
+{
+   bool progress = false;
+
+   if (entry->src.file != IMM)
+      return false;
+   if (type_sz(entry->src.type) > 4)
+      return false;
+   if (entry->saturate)
+      return false;
+
+   for (int i = inst->sources - 1; i >= 0; i--) {
+      if (inst->src[i].file != VGRF)
+         continue;
+
+      assert(entry->dst.file == VGRF);
+      if (inst->src[i].nr != entry->dst.nr)
+         continue;
+
+      /* Skip unless inst only reads from within the range entry writes. */
+      if (!region_contained_in(inst->src[i], inst->size_read(i),
+                               entry->dst, entry->size_written))
+         continue;
+
+      /* If the type sizes don't match each channel of the instruction is
+       * either extracting a portion of the constant (which could be handled
+       * with some effort but the code below doesn't) or reading multiple
+       * channels of the source at once.
+       */
+      if (type_sz(inst->src[i].type) != type_sz(entry->dst.type))
+         continue;
+
+      fs_reg val = entry->src;
+      val.type = inst->src[i].type;
+
+      if (inst->src[i].abs) {
+         if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
+             !brw_abs_immediate(val.type, &val.as_brw_reg())) {
+            continue;
+         }
+      }
+
+      if (inst->src[i].negate) {
+         if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
+             !brw_negate_immediate(val.type, &val.as_brw_reg())) {
+            continue;
+         }
+      }
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+      case SHADER_OPCODE_LOAD_PAYLOAD:
+      case FS_OPCODE_PACK:
+         inst->src[i] = val;
+         progress = true;
+         break;
+
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+         /* FINISHME: Promote non-float constants and remove this. */
+         if (devinfo->gen < 8)
+            break;
+         /* fallthrough */
+      case SHADER_OPCODE_POW:
+         /* Allow constant propagation into src1 (except on Gen 6 which
+          * doesn't support scalar source math), and let constant combining
+          * promote the constant on Gen < 8.
+          */
+         if (devinfo->gen == 6)
+            break;
+         /* fallthrough */
+      case BRW_OPCODE_BFI1:
+      case BRW_OPCODE_ASR:
+      case BRW_OPCODE_SHL:
+      case BRW_OPCODE_SHR:
+      case BRW_OPCODE_SUBB:
+         if (i == 1) {
+            inst->src[i] = val;
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_MACH:
+      case BRW_OPCODE_MUL:
+      case SHADER_OPCODE_MULH:
+      case BRW_OPCODE_ADD:
+      case BRW_OPCODE_OR:
+      case BRW_OPCODE_AND:
+      case BRW_OPCODE_XOR:
+      case BRW_OPCODE_ADDC:
+         if (i == 1) {
+            inst->src[i] = val;
+            progress = true;
+         } else if (i == 0 && inst->src[1].file != IMM) {
+            /* Fit this constant in by commuting the operands.
+             * Exception: we can't do this for 32-bit integer MUL/MACH
+             * because it's asymmetric.
+             *
+             * The BSpec says for Broadwell that
+             *
+             *    "When multiplying DW x DW, the dst cannot be accumulator."
+             *
+             * Integer MUL with a non-accumulator destination will be lowered
+             * by lower_integer_multiplication(), so don't restrict it.
+             */
+            if (((inst->opcode == BRW_OPCODE_MUL &&
+                  inst->dst.is_accumulator()) ||
+                 inst->opcode == BRW_OPCODE_MACH) &&
+                (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+                 inst->src[1].type == BRW_REGISTER_TYPE_UD))
+               break;
+            inst->src[0] = inst->src[1];
+            inst->src[1] = val;
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_CMP:
+      case BRW_OPCODE_IF:
+         if (i == 1) {
+            inst->src[i] = val;
+            progress = true;
+         } else if (i == 0 && inst->src[1].file != IMM) {
+            enum brw_conditional_mod new_cmod;
+
+            new_cmod = brw_swap_cmod(inst->conditional_mod);
+            if (new_cmod != BRW_CONDITIONAL_NONE) {
+               /* Fit this constant in by swapping the operands and
+                * flipping the test
+                */
+               inst->src[0] = inst->src[1];
+               inst->src[1] = val;
+               inst->conditional_mod = new_cmod;
+               progress = true;
+            }
+         }
+         break;
+
+      case BRW_OPCODE_SEL:
+         if (i == 1) {
+            inst->src[i] = val;
+            progress = true;
+         } else if (i == 0 && inst->src[1].file != IMM) {
+            inst->src[0] = inst->src[1];
+            inst->src[1] = val;
+
+            /* If this was predicated, flipping operands means
+             * we also need to flip the predicate.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+               inst->predicate_inverse =
+                  !inst->predicate_inverse;
+            }
+            progress = true;
+         }
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC:
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+      case SHADER_OPCODE_TYPED_ATOMIC:
+      case SHADER_OPCODE_TYPED_SURFACE_READ:
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+         /* Only the surface argument; the rest goes through LOAD_PAYLOAD. */
+         if (i == 1) {
+            inst->src[i] = val;
+            progress = true;
+         }
+         break;
+
+      case FS_OPCODE_FB_WRITE_LOGICAL:
+         /* The stencil and omask sources of FS_OPCODE_FB_WRITE_LOGICAL are
+          * bit-cast using a strided region so they cannot be immediates.
+          */
+         if (i != FB_WRITE_LOGICAL_SRC_SRC_STENCIL &&
+             i != FB_WRITE_LOGICAL_SRC_OMASK) {
+            inst->src[i] = val;
+            progress = true;
+         }
+         break;
+
+      case SHADER_OPCODE_TEX_LOGICAL:
+      case SHADER_OPCODE_TXD_LOGICAL:
+      case SHADER_OPCODE_TXF_LOGICAL:
+      case SHADER_OPCODE_TXL_LOGICAL:
+      case SHADER_OPCODE_TXS_LOGICAL:
+      case FS_OPCODE_TXB_LOGICAL:
+      case SHADER_OPCODE_TXF_CMS_LOGICAL:
+      case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+      case SHADER_OPCODE_TXF_UMS_LOGICAL:
+      case SHADER_OPCODE_TXF_MCS_LOGICAL:
+      case SHADER_OPCODE_LOD_LOGICAL:
+      case SHADER_OPCODE_TG4_LOGICAL:
+      case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+      case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+      case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+         inst->src[i] = val;
+         progress = true;
+         break;
+
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+      case SHADER_OPCODE_BROADCAST:
+         inst->src[i] = val;
+         progress = true;
+         break;
+
+      case BRW_OPCODE_MAD:
+      case BRW_OPCODE_LRP:
+         inst->src[i] = val;
+         progress = true;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   return progress;
+}
+
+static bool
+can_propagate_from(fs_inst *inst)
+{
+   /* Only complete, type-preserving MOVs into a VGRF from ATTR, UNIFORM, IMM
+    * or a VGRF region the MOV itself does not overwrite are usable copies. */
+   if (inst->opcode != BRW_OPCODE_MOV || inst->dst.file != VGRF)
+      return false;
+   const fs_reg &from = inst->src[0];
+   if (from.file == VGRF ? regions_overlap(inst->dst, inst->size_written,
+                                           from, inst->size_read(0))
+       : (from.file != ATTR && from.file != UNIFORM && from.file != IMM))
+      return false;
+   return from.type == inst->dst.type && !inst->is_partial_write();
+}
+
+/* Walks a basic block doing copy and constant propagation with the copies
+ * in "acp" (ACP_HASH_SIZE lists hashed by dst.nr), updating "acp" so that on
+ * return it holds the copies still available at the end of the block.
+ * Returns true if any instruction was changed. */
+bool
+fs_visitor::opt_copy_propagation_local(void *copy_prop_ctx, bblock_t *block,
+                                       exec_list *acp)
+{
+   bool progress = false;
+
+   foreach_inst_in_block(fs_inst, inst, block) {
+      /* Try propagating into this instruction. */
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file != VGRF)
+            continue;
+
+         foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) {
+            if (try_constant_propagate(inst, entry) ||
+                try_copy_propagate(inst, i, entry))
+               progress = true;
+         }
+      }
+
+      /* kill the destination from the ACP */
+      if (inst->dst.file == VGRF) {
+         foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.nr % ACP_HASH_SIZE]) {
+            if (regions_overlap(entry->dst, entry->size_written,
+                                inst->dst, inst->size_written))
+               entry->remove();
+         }
+
+         /* Oops, we only have the chaining hash based on the destination, not
+          * the source, so walk across the entire table.
+          */
+         for (int i = 0; i < ACP_HASH_SIZE; i++) {
+            foreach_in_list_safe(acp_entry, entry, &acp[i]) {
+               /* Make sure we kill the entry if this instruction overwrites
+                * _any_ of the registers that it reads
+                */
+               if (regions_overlap(entry->src, entry->size_read,
+                                   inst->dst, inst->size_written))
+                  entry->remove();
+            }
+         }
+      }
+
+      /* If this instruction's source could potentially be folded into the
+       * operand of another instruction, add it to the ACP.
+       */
+      if (can_propagate_from(inst)) {
+         acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
+         entry->dst = inst->dst;
+         entry->src = inst->src[0];
+         entry->size_written = inst->size_written;
+         entry->size_read = inst->size_read(0);
+         entry->opcode = inst->opcode;
+         entry->saturate = inst->saturate;
+         acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
+      } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
+                 inst->dst.file == VGRF) {
+         int offset = 0;   /* byte offset of source i within inst->dst */
+         for (int i = 0; i < inst->sources; i++) {
+            int effective_width = i < inst->header_size ? 8 : inst->exec_size;
+            assert(effective_width * type_sz(inst->src[i].type) % REG_SIZE == 0);
+            const unsigned size_written = effective_width *
+                                          type_sz(inst->src[i].type);
+            if (inst->src[i].file == VGRF) {
+               const fs_reg dst = byte_offset(inst->dst, offset);
+               /* A source copied onto itself is not recorded in the ACP. */
+               if (!dst.equals(inst->src[i])) {
+                  acp_entry *entry = rzalloc(copy_prop_ctx, acp_entry);
+                  entry->dst = dst;
+                  entry->src = inst->src[i];
+                  entry->size_written = size_written;
+                  entry->size_read = inst->size_read(i);
+                  entry->opcode = inst->opcode;
+                  acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
+               }
+            }
+            offset += size_written;
+         }
+      }
+   }
+
+   return progress;
+}
+
+bool
+fs_visitor::opt_copy_propagation()
+{
+   void *copy_prop_ctx = ralloc_context(NULL);
+   exec_list *out_acp[cfg->num_blocks];
+   bool progress = false;
+
+   for (int b = 0; b < cfg->num_blocks; b++)
+      out_acp[b] = new exec_list [ACP_HASH_SIZE];
+
+   /* Pass 1: propagate locally in every block, starting from an empty ACP;
+    * out_acp[n] receives the copies still available at the end of block n.
+    */
+   foreach_block (block, cfg) {
+      if (opt_copy_propagation_local(copy_prop_ctx, block,
+                                     out_acp[block->num]))
+         progress = true;
+   }
+
+   /* Do dataflow analysis for those available copies. */
+   fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp);
+
+   /* Pass 2: propagate locally again, this time seeding each block's ACP
+    * with the copies the dataflow analysis found live on entry to it.
+    */
+   foreach_block (block, cfg) {
+      const BITSET_WORD *const livein = dataflow.bd[block->num].livein;
+      exec_list in_acp[ACP_HASH_SIZE];
+      for (int e = 0; e < dataflow.num_acp; e++) {
+         struct acp_entry *entry = dataflow.acp[e];
+         if (BITSET_TEST(livein, e))
+            in_acp[entry->dst.nr % ACP_HASH_SIZE].push_tail(entry);
+      }
+
+      if (opt_copy_propagation_local(copy_prop_ctx, block, in_acp))
+         progress = true;
+   }
+
+   for (int b = 0; b < cfg->num_blocks; b++)
+      delete [] out_acp[b];
+   ralloc_free(copy_prop_ctx);
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_cse.cpp b/src/intel/compiler/brw_fs_cse.cpp
new file mode 100644
index 00000000000..48220efd730
--- /dev/null
+++ b/src/intel/compiler/brw_fs_cse.cpp
@@ -0,0 +1,380 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_cse.cpp
+ *
+ * Support for local common subexpression elimination.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 13.1 (p378).
+ */
+
+using namespace brw;
+
+namespace {
+struct aeb_entry : public exec_node {
+   /** Instruction whose result is the value of the tracked expression. */
+   fs_inst *generator;
+
+   /** Temporary holding that value; reg_undef until one gets allocated. */
+   fs_reg tmp;
+};
+} /* anonymous namespace */
+/* Returns true if inst computes a value that CSE may track and reuse. */
+static bool
+is_expression(const fs_visitor *v, const fs_inst *const inst)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:   /* Opcodes below are unconditionally accepted. */
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_LINE:
+   case BRW_OPCODE_PLN:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case FS_OPCODE_FB_READ_LOGICAL:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+   case FS_OPCODE_CINTERP:
+   case FS_OPCODE_LINTERP:
+   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+   case SHADER_OPCODE_BROADCAST:
+   case SHADER_OPCODE_MOV_INDIRECT:
+   case SHADER_OPCODE_TEX_LOGICAL:
+   case SHADER_OPCODE_TXD_LOGICAL:
+   case SHADER_OPCODE_TXF_LOGICAL:
+   case SHADER_OPCODE_TXL_LOGICAL:
+   case SHADER_OPCODE_TXS_LOGICAL:
+   case FS_OPCODE_TXB_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+   case SHADER_OPCODE_LOD_LOGICAL:
+   case SHADER_OPCODE_TG4_LOGICAL:
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+   case FS_OPCODE_PACK:
+      return true;
+   case SHADER_OPCODE_RCP:   /* Math opcodes: accepted only when mlen < 2. */
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      return inst->mlen < 2;
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+      return !inst->is_copy_payload(v->alloc);   /* Skip pure copy payloads. */
+   default:   /* Otherwise only sends from the GRF that are free of effects. */
+      return inst->is_send_from_grf() && !inst->has_side_effects() &&
+         !inst->is_volatile();
+   }
+}
+/* Source comparison for CSE; *negate is only written on the float MUL path. */
+static bool
+operands_match(const fs_inst *a, const fs_inst *b, bool *negate)
+{
+   fs_reg *xs = a->src;
+   fs_reg *ys = b->src;
+
+   if (a->opcode == BRW_OPCODE_MAD) {   /* MAD: sources 1 and 2 may swap. */
+      return xs[0].equals(ys[0]) &&
+             ((xs[1].equals(ys[1]) && xs[2].equals(ys[2])) ||
+              (xs[2].equals(ys[1]) && xs[1].equals(ys[2])));
+   } else if (a->opcode == BRW_OPCODE_MUL && a->dst.type == BRW_REGISTER_TYPE_F) {
+      bool xs0_negate = xs[0].negate;
+      bool xs1_negate = xs[1].file == IMM ? xs[1].f < 0.0f
+                                          : xs[1].negate;
+      bool ys0_negate = ys[0].negate;
+      bool ys1_negate = ys[1].file == IMM ? ys[1].f < 0.0f
+                                          : ys[1].negate;
+      float xs1_imm = xs[1].f;
+      float ys1_imm = ys[1].f;
+
+      /* Temporarily strip the signs so equals() only compares magnitudes. */
+      xs[0].negate = false;
+      xs[1].negate = false;
+      ys[0].negate = false;
+      ys[1].negate = false;
+      xs[1].f = fabsf(xs[1].f);
+      ys[1].f = fabsf(ys[1].f);
+
+      bool ret = (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
+                 (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
+
+      /* Put back the source modifiers and immediates modified above. */
+      xs[0].negate = xs0_negate;
+      xs[1].negate = xs[1].file == IMM ? false : xs1_negate;
+      ys[0].negate = ys0_negate;
+      ys[1].negate = ys[1].file == IMM ? false : ys1_negate;
+      xs[1].f = xs1_imm;
+      ys[1].f = ys1_imm;
+
+      /* The two products differ in sign iff their sign parities differ. */
+      *negate = (xs0_negate != xs1_negate) != (ys0_negate != ys1_negate);
+      if (*negate && (a->saturate || b->saturate))
+         return false;
+      return ret;
+   } else if (!a->is_commutative()) {
+      bool match = true;
+      for (int i = 0; i < a->sources; i++) {
+         if (!xs[i].equals(ys[i])) {
+            match = false;
+            break;
+         }
+      }
+      return match;
+   } else {
+      return (xs[0].equals(ys[0]) && xs[1].equals(ys[1])) ||
+             (xs[1].equals(ys[0]) && xs[0].equals(ys[1]));
+   }
+}
+/* Compares all fields first; operands_match() runs only if they agree. */
+static bool
+instructions_match(fs_inst *a, fs_inst *b, bool *negate)
+{
+   const bool same_fields =
+      a->opcode == b->opcode &&
+      a->force_writemask_all == b->force_writemask_all &&
+      a->exec_size == b->exec_size &&
+      a->group == b->group &&
+      a->saturate == b->saturate &&
+      a->predicate == b->predicate &&
+      a->predicate_inverse == b->predicate_inverse &&
+      a->conditional_mod == b->conditional_mod &&
+      a->flag_subreg == b->flag_subreg &&
+      a->dst.type == b->dst.type &&
+      a->offset == b->offset &&
+      a->mlen == b->mlen &&
+      a->size_written == b->size_written &&
+      a->base_mrf == b->base_mrf &&
+      a->eot == b->eot && a->header_size == b->header_size &&
+      a->shadow_compare == b->shadow_compare &&
+      a->pi_noperspective == b->pi_noperspective &&
+      a->target == b->target &&
+      a->sources == b->sources;
+   return same_fields && operands_match(a, b, negate);
+}
+/* Emits, via bld, a copy of src into inst->dst sized like inst's write. */
+static void
+create_copy_instr(const fs_builder &bld, fs_inst *inst, fs_reg src, bool negate)
+{
+   unsigned written = regs_written(inst);
+   unsigned dst_width =
+      DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE);
+   fs_inst *copy;
+   /* Payloads and writes not exactly dst_width registers use LOAD_PAYLOAD. */
+   if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD ||
+       written != dst_width) {
+      fs_reg *payload;
+      int sources, header_size;
+      if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+         sources = inst->sources;
+         header_size = inst->header_size;
+      } else {
+         assert(written % dst_width == 0);
+         sources = written / dst_width;
+         header_size = 0;
+      }
+      /* Header entries advance src by REG_SIZE bytes, the rest via offset(). */
+      assert(src.file == VGRF);
+      payload = ralloc_array(bld.shader->mem_ctx, fs_reg, sources);
+      for (int i = 0; i < header_size; i++) {
+         payload[i] = src;
+         src.offset += REG_SIZE;
+      }
+      for (int i = header_size; i < sources; i++) {
+         payload[i] = src;
+         src = offset(src, bld, 1);
+      }
+      copy = bld.LOAD_PAYLOAD(inst->dst, payload, sources, header_size);
+   } else {
+      copy = bld.MOV(inst->dst, src);
+      copy->group = inst->group;
+      copy->force_writemask_all = inst->force_writemask_all;
+      copy->src[0].negate = negate;   /* only honored on this MOV path */
+   }
+   assert(regs_written(copy) == written);
+}
+/* CSE within a single basic block; returns true if anything was replaced. */
+bool
+fs_visitor::opt_cse_local(bblock_t *block)
+{
+   bool progress = false;
+   exec_list aeb;
+
+   void *cse_ctx = ralloc_context(NULL);
+
+   int ip = block->start_ip;
+   foreach_inst_in_block(fs_inst, inst, block) {
+      /* Consider only full writes of expressions to non-fixed or null dsts. */
+      if (is_expression(this, inst) && !inst->is_partial_write() &&
+          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
+           inst->dst.is_null()))
+      {
+         bool found = false;
+         bool negate = false;
+
+         foreach_in_list_use_after(aeb_entry, entry, &aeb) {
+            /* Match current instruction's expression against those in AEB. */
+            if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
+                instructions_match(inst, entry->generator, &negate)) {
+               found = true;
+               progress = true;
+               break;
+            }
+         }
+         /* Of the MOVs, only those of VF immediates are ever recorded. */
+         if (!found) {
+            if (inst->opcode != BRW_OPCODE_MOV ||
+                (inst->opcode == BRW_OPCODE_MOV &&
+                 inst->src[0].file == IMM &&
+                 inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
+               /* Our first sighting of this expression.  Create an entry. */
+               aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
+               entry->tmp = reg_undef;
+               entry->generator = inst;
+               aeb.push_tail(entry);
+            }
+         } else {
+            /* This is at least our second sighting of this expression.
+             * If we don't have a temporary already, make one.
+             */
+            bool no_existing_temp = entry->tmp.file == BAD_FILE;
+            if (no_existing_temp && !entry->generator->dst.is_null()) {
+               const fs_builder ibld = fs_builder(this, block, entry->generator)
+                                       .at(block, entry->generator->next);
+               int written = regs_written(entry->generator);
+
+               entry->tmp = fs_reg(VGRF, alloc.allocate(written),
+                                   entry->generator->dst.type);
+
+               create_copy_instr(ibld, entry->generator, entry->tmp, false);
+
+               entry->generator->dst = entry->tmp;
+            }
+
+            /* dest <- temp */
+            if (!inst->dst.is_null()) {
+               assert(inst->size_written == entry->generator->size_written);
+               assert(inst->dst.type == entry->tmp.type);
+               const fs_builder ibld(this, block, inst);
+
+               create_copy_instr(ibld, inst, entry->tmp, negate);
+            }
+
+            /* Set our iterator so that next time through the loop inst->next
+             * will get the instruction in the basic block after the one we've
+             * removed.
+             */
+            fs_inst *prev = (fs_inst *)inst->prev;
+
+            inst->remove(block);
+            inst = prev;
+         }
+      }
+      /* Drop entries invalidated by inst (its predecessor after a removal). */
+      foreach_in_list_safe(aeb_entry, entry, &aeb) {
+         /* Kill all AEB entries that write a different value to or read from
+          * the flag register if we just wrote it.
+          */
+         if (inst->flags_written()) {
+            bool negate; /* dummy */
+            if (entry->generator->flags_read(devinfo) ||
+                (entry->generator->flags_written() &&
+                 !instructions_match(inst, entry->generator, &negate))) {
+               entry->remove();
+               ralloc_free(entry);
+               continue;
+            }
+         }
+
+         for (int i = 0; i < entry->generator->sources; i++) {
+            fs_reg *src_reg = &entry->generator->src[i];
+
+            /* Kill all AEB entries that use the destination we just
+             * overwrote.
+             */
+            if (regions_overlap(inst->dst, inst->size_written,
+                                entry->generator->src[i],
+                                entry->generator->size_read(i))) {
+               entry->remove();
+               ralloc_free(entry);
+               break;
+            }
+
+            /* Kill any AEB entries using registers that don't get reused any
+             * more -- a sure sign they'll fail operands_match().
+             */
+            if (src_reg->file == VGRF && virtual_grf_end[src_reg->nr] < ip) {
+               entry->remove();
+               ralloc_free(entry);
+               break;
+            }
+         }
+      }
+
+      ip++;
+   }
+
+   ralloc_free(cse_ctx);
+
+   return progress;
+}
+/* Runs local CSE on every block of the CFG; true if any block changed. */
+bool
+fs_visitor::opt_cse()
+{
+   bool any_change = false;
+   calculate_live_intervals();
+
+   foreach_block (block, cfg) {
+      if (opt_cse_local(block))
+         any_change = true;
+   }
+
+   if (any_change)
+      invalidate_live_intervals();
+
+   return any_change;
+}
diff --git a/src/intel/compiler/brw_fs_dead_code_eliminate.cpp b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
new file mode 100644
index 00000000000..7adb4278919
--- /dev/null
+++ b/src/intel/compiler/brw_fs_dead_code_eliminate.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_dead_code_eliminate.cpp
+ *
+ * Dataflow-aware dead code elimination.
+ *
+ * Walks the instruction list from the bottom, removing instructions that
+ * have results that both aren't used in later blocks and haven't been read
+ * yet in the tail end of this block.
+ */
+
+/**
+ * Whether inst may be removed outright: it must not be control flow, have
+ * side effects, write a flag that is still live, or write the accumulator.
+ */
+static bool
+can_eliminate(const fs_inst *inst, BITSET_WORD *flag_live)
+{
+    return !(inst->is_control_flow() || inst->has_side_effects() ||
+             (flag_live[0] & inst->flags_written()) ||
+             inst->writes_accumulator);
+}
+
+/**
+ * Whether the destination write of inst can be dropped by retargeting it to
+ * the null register while still executing the instruction.
+ */
+static bool
+can_omit_write(const fs_inst *inst)
+{
+   switch (inst->opcode) {
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+      return true;
+   default:
+      break;
+   }
+
+   /* Ordinary instructions (opcode below 128 with an mlen of zero) can have
+    * their destination write dropped, but not most SENDs; it might not be
+    * safe for other virtual opcodes either, so those are refused.
+    */
+   return inst->opcode < 128 && inst->mlen == 0;
+}
+/* Backward liveness-driven DCE over all blocks; true if anything changed. */
+bool
+fs_visitor::dead_code_eliminate()
+{
+   bool progress = false;
+
+   calculate_live_intervals();
+
+   int num_vars = live_intervals->num_vars;
+   BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
+   BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
+
+   foreach_block_reverse_safe(block, cfg) {
+      memcpy(live, live_intervals->block_data[block->num].liveout,
+             sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
+      memcpy(flag_live, live_intervals->block_data[block->num].flag_liveout,
+             sizeof(BITSET_WORD));
+
+      foreach_inst_in_block_reverse_safe(fs_inst, inst, block) {
+         if (inst->dst.file == VGRF) {
+            const unsigned var = live_intervals->var_from_reg(inst->dst);
+            bool result_live = false;
+
+            for (unsigned i = 0; i < regs_written(inst); i++)
+               result_live |= BITSET_TEST(live, var + i);
+            /* Dead result: retarget the write to the null register if allowed. */
+            if (!result_live &&
+                (can_omit_write(inst) || can_eliminate(inst, flag_live))) {
+               inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
+               progress = true;
+            }
+         }
+         /* Null destination and removable: turn into a NOP, deleted below. */
+         if (inst->dst.is_null() && can_eliminate(inst, flag_live)) {
+            inst->opcode = BRW_OPCODE_NOP;
+            progress = true;
+         }
+         /* A full write ends the liveness of every register it covers. */
+         if (inst->dst.file == VGRF) {
+            if (!inst->is_partial_write()) {
+               int var = live_intervals->var_from_reg(inst->dst);
+               for (unsigned i = 0; i < regs_written(inst); i++) {
+                  BITSET_CLEAR(live, var + i);
+               }
+            }
+         }
+         /* Only unpredicated writes with exec_size >= 8 clear live flag bits. */
+         if (!inst->predicate && inst->exec_size >= 8)
+            flag_live[0] &= ~inst->flags_written();
+
+         if (inst->opcode == BRW_OPCODE_NOP) {
+            inst->remove(block);
+            continue;
+         }
+         /* Everything a surviving instruction reads becomes live above it. */
+         for (int i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == VGRF) {
+               int var = live_intervals->var_from_reg(inst->src[i]);
+
+               for (unsigned j = 0; j < regs_read(inst, i); j++) {
+                  BITSET_SET(live, var + j);
+               }
+            }
+         }
+
+         flag_live[0] |= inst->flags_read(devinfo);
+      }
+   }
+
+   ralloc_free(live);
+   ralloc_free(flag_live);
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
new file mode 100644
index 00000000000..aeed6a11977
--- /dev/null
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -0,0 +1,2126 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_generator.cpp
+ *
+ * This file supports generating code from the FS LIR to the actual
+ * native instructions.
+ */
+
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+/* Maps the IR register file of reg to the matching brw_reg_file value. */
+static enum brw_reg_file
+brw_file_from_reg(fs_reg *reg)
+{
+   switch (reg->file) {
+   case ARF:
+      return BRW_ARCHITECTURE_REGISTER_FILE;
+   case FIXED_GRF:
+   case VGRF:
+      return BRW_GENERAL_REGISTER_FILE;
+   case MRF:
+      return BRW_MESSAGE_REGISTER_FILE;
+   case IMM:
+      return BRW_IMMEDIATE_VALUE;
+   case BAD_FILE:   /* The files below are not expected to reach this point. */
+   case ATTR:
+   case UNIFORM:
+      unreachable("not reached");
+   }
+   return BRW_ARCHITECTURE_REGISTER_FILE;   /* Fallback after the switch. */
+}
+/* Builds the brw_reg used for code generation from operand reg of inst. */
+static struct brw_reg
+brw_reg_from_fs_reg(fs_inst *inst, fs_reg *reg, unsigned gen, bool compressed)
+{
+   struct brw_reg brw_reg;
+
+   switch (reg->file) {
+   case MRF:
+      assert((reg->nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(gen));
+      /* Fallthrough */
+   case VGRF:
+      if (reg->stride == 0) {
+         brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->nr, 0);
+      } else {
+         /* From the Haswell PRM:
+          *
+          *  "VertStride must be used to cross GRF register boundaries. This
+          *   rule implies that elements within a 'Width' cannot cross GRF
+          *   boundaries."
+          *
+          * The maximum width value that could satisfy this restriction is:
+          */
+         const unsigned reg_width = REG_SIZE / (reg->stride * type_sz(reg->type));
+
+         /* Because the hardware can only split source regions at a whole
+          * multiple of width during decompression (i.e. vertically), clamp
+          * the value obtained above to the physical execution size of a
+          * single decompressed chunk of the instruction:
+          */
+         const unsigned phys_width = compressed ? inst->exec_size / 2 :
+                                     inst->exec_size;
+
+         /* XXX - The equation above is strictly speaking not correct on
+          *       hardware that supports unbalanced GRF writes -- On Gen9+
+          *       each decompressed chunk of the instruction may have a
+          *       different execution size when the number of components
+          *       written to each destination GRF is not the same.
+          */
+         const unsigned width = MIN2(reg_width, phys_width);
+         brw_reg = brw_vecn_reg(width, brw_file_from_reg(reg), reg->nr, 0);
+         brw_reg = stride(brw_reg, width * reg->stride, width, reg->stride);
+      }
+      /* Apply type, byte offset and source modifiers on top of the region. */
+      brw_reg = retype(brw_reg, reg->type);
+      brw_reg = byte_offset(brw_reg, reg->offset);
+      brw_reg.abs = reg->abs;
+      brw_reg.negate = reg->negate;
+      break;
+   case ARF:
+   case FIXED_GRF:
+   case IMM:
+      assert(reg->offset == 0);
+      brw_reg = reg->as_brw_reg();
+      break;
+   case BAD_FILE:
+      /* Probably unused. */
+      brw_reg = brw_null_reg();
+      break;
+   case ATTR:
+   case UNIFORM:
+      unreachable("not reached");
+   }
+
+   return brw_reg;
+}
+/* Stores the arguments and allocates and initializes p from mem_ctx. */
+fs_generator::fs_generator(const struct brw_compiler *compiler, void *log_data,
+                           void *mem_ctx,
+                           const void *key,
+                           struct brw_stage_prog_data *prog_data,
+                           unsigned promoted_constants,
+                           bool runtime_check_aads_emit,
+                           gl_shader_stage stage)
+
+   : compiler(compiler), log_data(log_data),
+     devinfo(compiler->devinfo), key(key),
+     prog_data(prog_data),
+     promoted_constants(promoted_constants),
+     runtime_check_aads_emit(runtime_check_aads_emit), debug_flag(false),
+     stage(stage), mem_ctx(mem_ctx)
+{
+   p = rzalloc(mem_ctx, struct brw_codegen);
+   brw_init_codegen(devinfo, p, mem_ctx);
+}
+/* Empty: p was allocated out of the mem_ctx handed to the constructor. */
+fs_generator::~fs_generator()
+{
+}
+
+class ip_record : public exec_node {
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(ip_record)
+
+   /* Remembers the index of an emitted instruction for later patching. */
+   ip_record(int ip) : ip(ip)
+   {
+   }
+
+   int ip;
+};
+/* Emit a final HALT and patch the UIP of each recorded discard HALT. */
+bool
+fs_generator::patch_discard_jumps_to_fb_writes()
+{
+   if (devinfo->gen < 6 || this->discard_halt_patches.is_empty())
+      return false;
+
+   int scale = brw_jump_scale(p->devinfo);
+
+   /* There is a somewhat strange undocumented requirement of using
+    * HALT, according to the simulator.  If some channel has HALTed to
+    * a particular UIP, then by the end of the program, every channel
+    * must have HALTed to that UIP.  Furthermore, the tracking is a
+    * stack, so you can't do the final halt of a UIP after starting
+    * halting to a new UIP.
+    *
+    * Symptoms of not emitting this instruction on actual hardware
+    * included GPU hangs and sparkly rendering on the piglit discard
+    * tests.
+    */
+   brw_inst *last_halt = gen6_HALT(p);
+   brw_inst_set_uip(p->devinfo, last_halt, 1 * scale);
+   brw_inst_set_jip(p->devinfo, last_halt, 1 * scale);
+   /* ip is sampled only after the final HALT above has been emitted. */
+   int ip = p->nr_insn;
+
+   foreach_in_list(ip_record, patch_ip, &discard_halt_patches) {
+      brw_inst *patch = &p->store[patch_ip->ip];
+
+      assert(brw_inst_opcode(p->devinfo, patch) == BRW_OPCODE_HALT);
+      /* HALT takes a half-instruction distance from the pre-incremented IP. */
+      brw_inst_set_uip(p->devinfo, patch, (ip - patch_ip->ip) * scale);
+   }
+
+   this->discard_halt_patches.make_empty();
+   return true;
+}
+/* Emits one brw_fb_WRITE for inst; callers pass nr derived from inst->mlen. */
+void
+fs_generator::fire_fb_write(fs_inst *inst,
+                            struct brw_reg payload,
+                            struct brw_reg implied_header,
+                            GLuint nr)
+{
+   uint32_t msg_control;
+   /* Local prog_data is the WM-typed view of the member of the same name. */
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+
+   if (devinfo->gen < 6) {
+      brw_push_insn_state(p);
+      brw_set_default_exec_size(p, BRW_EXECUTE_8);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_MOV(p, offset(payload, 1), brw_vec8_grf(1, 0));
+      brw_pop_insn_state(p);
+   }
+   /* Pick the message control from opcode, blend mode and exec size. */
+   if (inst->opcode == FS_OPCODE_REP_FB_WRITE)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE_REPLICATED;
+   else if (prog_data->dual_src_blend) {
+      if (!inst->group)
+         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
+      else
+         msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN23;
+   } else if (inst->exec_size == 16)
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
+   else
+      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
+
+   uint32_t surf_index =
+      prog_data->binding_table.render_target_start + inst->target;
+
+   bool last_render_target = inst->eot ||
+                             (prog_data->dual_src_blend && dispatch_width == 16);
+
+   /* Emit the write, then record that surf_index is referenced. */
+   brw_fb_WRITE(p,
+                payload,
+                implied_header,
+                msg_control,
+                surf_index,
+                nr,
+                0,
+                inst->eot,
+                last_render_target,
+                inst->header_size != 0);
+
+   brw_mark_surface_used(&prog_data->base, surf_index);
+}
+/* Builds the optional message header, then emits the FB write(s) for inst. */
+void
+fs_generator::generate_fb_write(fs_inst *inst, struct brw_reg payload)
+{
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   const brw_wm_prog_key * const key = (brw_wm_prog_key * const) this->key;
+   struct brw_reg implied_header;
+
+   if (devinfo->gen < 8 && !devinfo->is_haswell) {
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+   }
+
+   if (inst->base_mrf >= 0)
+      payload = brw_message_reg(inst->base_mrf);
+
+   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
+    * move, here's g1.
+    */
+   if (inst->header_size != 0) {
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_set_default_flag_reg(p, 0, 0);
+
+      /* On HSW, the GPU will use the predicate on SENDC, unless the header is
+       * present.
+       */
+      if (prog_data->uses_kill) {
+         struct brw_reg pixel_mask;
+
+         if (devinfo->gen >= 6)
+            pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
+         else
+            pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
+
+         brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
+      }
+
+      if (devinfo->gen >= 6) {
+         brw_push_insn_state(p);
+         brw_set_default_exec_size(p, BRW_EXECUTE_16);
+	 brw_set_default_compression_control(p, BRW_COMPRESSION_COMPRESSED);
+	 brw_MOV(p,
+		 retype(payload, BRW_REGISTER_TYPE_UD),
+		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+         brw_pop_insn_state(p);
+
+         if (inst->target > 0 && key->replicate_alpha) {
+            /* Set "Source0 Alpha Present to RenderTarget" bit in message
+             * header.
+             */
+            brw_OR(p,
+		   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
+		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+		   brw_imm_ud(0x1 << 11));
+         }
+
+	 if (inst->target > 0) {
+	    /* Set the render target index for choosing BLEND_STATE. */
+	    brw_MOV(p, retype(vec1(suboffset(payload, 2)),
+                              BRW_REGISTER_TYPE_UD),
+		    brw_imm_ud(inst->target));
+	 }
+
+         /* Set the computed-stencil-to-render-target bit (bit 14). */
+         if (prog_data->computed_stencil) {
+            brw_OR(p,
+                   vec1(retype(payload, BRW_REGISTER_TYPE_UD)),
+                   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
+                   brw_imm_ud(0x1 << 14));
+         }
+
+	 implied_header = brw_null_reg();
+      } else {
+	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+      }
+
+      brw_pop_insn_state(p);
+   } else {
+      implied_header = brw_null_reg();
+   }
+
+   if (!runtime_check_aads_emit) {
+      fire_fb_write(inst, payload, implied_header, inst->mlen);
+   } else {
+      /* This can only happen in gen < 6 */
+      assert(devinfo->gen < 6);
+
+      struct brw_reg v1_null_ud = vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+
+      /* Check runtime bit to detect if we have to send AA data or not */
+      brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+      brw_AND(p,
+              v1_null_ud,
+              retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(1<<26));
+      brw_inst_set_cond_modifier(p->devinfo, brw_last_inst, BRW_CONDITIONAL_NZ);
+
+      int jmp = brw_JMPI(p, brw_imm_ud(0), BRW_PREDICATE_NORMAL) - p->store;
+      brw_inst_set_exec_size(p->devinfo, brw_last_inst, BRW_EXECUTE_1);
+      {
+         /* Don't send AA data */
+         fire_fb_write(inst, offset(payload, 1), implied_header, inst->mlen-1);
+      }
+      brw_land_fwd_jump(p, jmp);
+      fire_fb_write(inst, payload, implied_header, inst->mlen);
+   }
+}
+/* Emits a gen9_fb_READ into dst for the surface of inst->target. */
+void
+fs_generator::generate_fb_read(fs_inst *inst, struct brw_reg dst,
+                               struct brw_reg payload)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   const unsigned rt_index =
+      wm_prog_data->binding_table.render_target_start + inst->target;
+
+   /* The response size is passed on in whole registers. */
+   gen9_fb_READ(p, dst, payload, rt_index, inst->header_size,
+                inst->size_written / REG_SIZE, wm_prog_data->persample_dispatch);
+   brw_mark_surface_used(&wm_prog_data->base, rt_index);
+}
+
+void
+fs_generator::generate_mov_indirect(fs_inst *inst,
+                                    struct brw_reg dst,
+                                    struct brw_reg reg,
+                                    struct brw_reg indirect_byte_offset)
+{
+   assert(indirect_byte_offset.type == BRW_REGISTER_TYPE_UD);
+   assert(indirect_byte_offset.file == BRW_GENERAL_REGISTER_FILE);
+   /* Constant part of the address: byte offset of reg (nr and subnr). */
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr;
+   /* NOTE(review): dead with asserts on; a GRF offset is asserted above. */
+   if (indirect_byte_offset.file == BRW_IMMEDIATE_VALUE) {
+      imm_byte_offset += indirect_byte_offset.ud;
+      /* Split the folded offset back into register and sub-register. */
+      reg.nr = imm_byte_offset / REG_SIZE;
+      reg.subnr = imm_byte_offset % REG_SIZE;
+      brw_MOV(p, dst, reg);
+   } else {
+      /* Prior to Broadwell, there are only 8 address registers. */
+      assert(inst->exec_size == 8 || devinfo->gen >= 8);
+
+      /* We use VxH indirect addressing, clobbering a0.0 through a0.7. */
+      struct brw_reg addr = vec8(brw_address_reg(0));
+
+      /* The destination stride of an instruction (in bytes) must be greater
+       * than or equal to the size of the rest of the instruction.  Since the
+       * address register is of type UW, we can't use a D-type instruction.
+       * In order to get around this, we retype to UW and use a stride.
+       */
+      indirect_byte_offset =
+         retype(spread(indirect_byte_offset, 2), BRW_REGISTER_TYPE_UW);
+
+      /* There are a number of reasons why we don't use the base offset here.
+       * One reason is that the field is only 9 bits which means we can only
+       * use it to access the first 16 GRFs.  Also, from the Haswell PRM
+       * section "Register Region Restrictions":
+       *
+       *    "The lower bits of the AddressImmediate must not overflow to
+       *    change the register address.  The lower 5 bits of Address
+       *    Immediate when added to lower 5 bits of address register gives
+       *    the sub-register offset. The upper bits of Address Immediate
+       *    when added to upper bits of address register gives the register
+       *    address. Any overflow from sub-register offset is dropped."
+       *
+       * Since the indirect may cause us to cross a register boundary, this
+       * makes the base offset almost useless.  We could try and do something
+       * clever where we use an actual base offset if base_offset % 32 == 0
+       * but that would mean we were generating different code depending on
+       * the base offset.  Instead, for the sake of consistency, we'll just do
+       * the add ourselves.  This restriction is only listed in the Haswell
+       * PRM but empirical testing indicates that it applies on all older
+       * generations and is lifted on Broadwell.
+       *
+       * In the end, while base_offset is nice to look at in the generated
+       * code, using it saves us 0 instructions and would require quite a bit
+       * of case-by-case work.  It's just not worth it.
+       */
+      brw_ADD(p, addr, indirect_byte_offset, brw_imm_uw(imm_byte_offset));
+      struct brw_reg ind_src = brw_VxH_indirect(0, 0);
+      /* Read the source through the addresses just written to a0. */
+      brw_inst *mov = brw_MOV(p, dst, retype(ind_src, reg.type));
+
+      if (devinfo->gen == 6 && dst.file == BRW_MESSAGE_REGISTER_FILE &&
+          !inst->get_next()->is_tail_sentinel() &&
+          ((fs_inst *)inst->get_next())->mlen > 0) {
+         /* From the Sandybridge PRM:
+          *
+          *    "[Errata: DevSNB(SNB)] If MRF register is updated by any
+          *    instruction that "indexed/indirect" source AND is followed by a
+          *    send, the instruction requires a "Switch". This is to avoid
+          *    race condition where send may dispatch before MRF is updated."
+          */
+         brw_inst_set_thread_control(devinfo, mov, BRW_THREAD_SWITCH);
+      }
+   }
+}
+
+void
+fs_generator::generate_urb_read(fs_inst *inst,
+                                struct brw_reg dst,
+                                struct brw_reg header)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+   const bool per_slot = inst->opcode == SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT;
+   const unsigned response_regs = inst->size_written / REG_SIZE;
+
+   /* Emit a SEND to the URB: operands first, then the message fields. */
+   brw_inst *urb_send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, urb_send, retype(dst, BRW_REGISTER_TYPE_UD));
+   brw_set_src0(p, urb_send, header);
+   brw_set_src1(p, urb_send, brw_imm_ud(0u));
+   brw_inst_set_sfid(p->devinfo, urb_send, BRW_SFID_URB);
+   brw_inst_set_urb_opcode(p->devinfo, urb_send, GEN8_URB_OPCODE_SIMD8_READ);
+   if (per_slot)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, urb_send, true);
+   brw_inst_set_mlen(p->devinfo, urb_send, inst->mlen);
+   brw_inst_set_rlen(p->devinfo, urb_send, response_regs);
+   brw_inst_set_header_present(p->devinfo, urb_send, true);
+   brw_inst_set_urb_global_offset(p->devinfo, urb_send, inst->offset);
+}
+
+void
+fs_generator::generate_urb_write(fs_inst *inst, struct brw_reg payload)
+{
+   /* Decode which optional message bits this URB write variant asks for. */
+   const bool per_slot =
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT ||
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+   const bool masked =
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED ||
+      inst->opcode == SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+
+   /* Nothing is returned: the destination is null and rlen is set to 0. */
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, send, brw_null_reg());
+   brw_set_src0(p, send, payload);
+   brw_set_src1(p, send, brw_imm_d(0));
+   brw_inst_set_sfid(p->devinfo, send, BRW_SFID_URB);
+   brw_inst_set_urb_opcode(p->devinfo, send, GEN8_URB_OPCODE_SIMD8_WRITE);
+   if (per_slot)
+      brw_inst_set_urb_per_slot_offset(p->devinfo, send, true);
+   if (masked)
+      brw_inst_set_urb_channel_mask_present(p->devinfo, send, true);
+   brw_inst_set_mlen(p->devinfo, send, inst->mlen);
+   brw_inst_set_rlen(p->devinfo, send, 0);
+   brw_inst_set_eot(p->devinfo, send, inst->eot);
+   brw_inst_set_header_present(p->devinfo, send, true);
+   brw_inst_set_urb_global_offset(p->devinfo, send, inst->offset);
+}
+
+void
+fs_generator::generate_cs_terminate(fs_inst *inst, struct brw_reg payload)
+{
+   /* A compute shader terminates by sending a message to the thread
+    * spawner: one payload register, no response and no header.
+    */
+   struct brw_inst *term = brw_next_insn(p, BRW_OPCODE_SEND);
+
+   brw_set_dest(p, term, retype(brw_null_reg(), BRW_REGISTER_TYPE_UW));
+   brw_set_src0(p, term, retype(payload, BRW_REGISTER_TYPE_UW));
+   brw_set_src1(p, term, brw_imm_d(0));
+
+   brw_inst_set_sfid(devinfo, term, BRW_SFID_THREAD_SPAWNER);
+   brw_inst_set_mlen(devinfo, term, 1);
+   brw_inst_set_rlen(devinfo, term, 0);
+   brw_inst_set_eot(devinfo, term, inst->eot);
+   brw_inst_set_header_present(devinfo, term, false);
+
+   /* Thread spawner message fields. */
+   brw_inst_set_ts_opcode(devinfo, term, 0); /* Dereference resource */
+   brw_inst_set_ts_request_type(devinfo, term, 0); /* Root thread */
+
+   /* The thread does have a URB resource associated with it, but that
+    * resource is managed (and freed automatically) by the fixed-function
+    * unit, so the "do not dereference URB" bit is set.
+    */
+   brw_inst_set_ts_resource_select(devinfo, term, 1); /* Do not dereference URB */
+
+   brw_inst_set_mask_control(devinfo, term, BRW_MASK_DISABLE);
+}
+
+void
+fs_generator::generate_barrier(fs_inst *inst, struct brw_reg src)
+{
+   brw_barrier(p, src); /* barrier request built from src; inst is unused */
+   brw_WAIT(p);         /* immediately followed by a WAIT */
+}
+
+void
+fs_generator::generate_linterp(fs_inst *inst,
+                               struct brw_reg dst, struct brw_reg *src)
+{
+   /* Delta layout.  PLN reads its deltas as interleaved x/y register pairs:
+    *                      /   in SIMD16   \
+    *    -----------------------------------
+    *   | src1+0 | src1+1 | src1+2 | src1+3 |
+    *   |-----------------------------------|
+    *   |(x0, x1)|(y0, y1)|(x2, x3)|(y2, y3)|
+    *    -----------------------------------
+    *
+    * while for the LINE/MAC pair the LINE reads the Xs and the MAC the Ys:
+    *    -----------------------------------
+    *   | src1+0 | src1+1 | src1+2 | src1+3 |
+    *   |-----------------------------------|
+    *   |(x0, x1)|(y0, y1)|        |        | in SIMD8
+    *   |-----------------------------------|
+    *   |(x0, x1)|(x2, x3)|(y0, y1)|(y2, y3)| in SIMD16
+    *    -----------------------------------
+    *
+    * See also: emit_interpolation_setup_gen4().
+    */
+   const struct brw_reg interp = src[1];
+   const struct brw_reg delta_x = src[0];
+   const struct brw_reg delta_y = offset(src[0], inst->exec_size / 8);
+   const bool use_pln =
+      devinfo->has_pln && (devinfo->gen >= 7 || (delta_x.nr & 1) == 0);
+
+   if (!use_pln) {
+      brw_LINE(p, brw_null_reg(), interp, delta_x);
+      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
+   } else {
+      brw_PLN(p, dst, interp, delta_x);
+   }
+}
+
+void
+fs_generator::generate_get_buffer_size(fs_inst *inst,
+                                       struct brw_reg dst,
+                                       struct brw_reg src,
+                                       struct brw_reg surf_index)
+{
+   assert(devinfo->gen >= 7);
+   assert(surf_index.file == BRW_IMMEDIATE_VALUE);
+
+   /* The response is 4 registers in SIMD8 and 8 (a 16-wide dst) in SIMD16. */
+   uint32_t simd_mode;
+   int response_regs;
+
+   switch (inst->exec_size) {
+   case 8:
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      response_regs = 4;
+      break;
+   case 16:
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      response_regs = 8;
+      dst = vec16(dst);
+      break;
+   default:
+      unreachable("Invalid width for texture instruction");
+   }
+
+   const unsigned surface = surf_index.ud;
+   brw_SAMPLE(p,
+              retype(dst, BRW_REGISTER_TYPE_UW),
+              inst->base_mrf,
+              src,
+              surface,
+              0, /* sampler */
+              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO,
+              response_regs,
+              inst->mlen,
+              inst->header_size > 0,
+              simd_mode,
+              BRW_SAMPLER_RETURN_FORMAT_SINT32);
+
+   brw_mark_surface_used(prog_data, surface);
+}
+
+void
+fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
+                           struct brw_reg surface_index,
+                           struct brw_reg sampler_index)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+   int msg_type = -1; /* -1: not chosen yet, asserted after the switches */
+   uint32_t simd_mode;
+   uint32_t return_format;
+   bool is_combined_send = inst->eot; /* EOT: becomes SENDC at the end */
+
+   switch (dst.type) {
+   case BRW_REGISTER_TYPE_D:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
+      break;
+   case BRW_REGISTER_TYPE_UD:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+      break;
+   default:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+      break;
+   }
+
+   /* Stomp the resinfo output type to UINT32.  On gens 4-5, the output type
+    * is set as part of the message descriptor.  On gen4, the PRM seems to
+    * allow UINT32 and FLOAT32 (i965 PRM, Vol. 4 Section 4.8.1.1), but on
+    * later gens UINT32 is required.  Once you hit Sandy Bridge, the bit is
+    * gone from the message descriptor entirely and you just get UINT32 all
+    * the time regardless.  Since we can really only do non-UINT32 on gen4,
+    * just stomp it to UINT32 all the time.
+    */
+   if (inst->opcode == SHADER_OPCODE_TXS)
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+   /* Default the SIMD mode from the width; Gen4 cases below may force SIMD16. */
+   switch (inst->exec_size) {
+   case 8:
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      break;
+   case 16:
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      break;
+   default:
+      unreachable("Invalid width for texture instruction");
+   }
+   /* Choose the sampler message type for this opcode and generation. */
+   if (devinfo->gen >= 5) {
+      switch (inst->opcode) {
+      case SHADER_OPCODE_TEX:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
+	 }
+	 break;
+      case FS_OPCODE_TXB:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
+	 }
+	 break;
+      case SHADER_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
+	 }
+	 break;
+      case SHADER_OPCODE_TXL_LZ:
+         assert(devinfo->gen >= 9);
+	 if (inst->shadow_compare) {
+            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_C_LZ;
+         } else {
+            msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LZ;
+         }
+         break;
+      case SHADER_OPCODE_TXS:
+	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
+	 break;
+      case SHADER_OPCODE_TXD:
+         if (inst->shadow_compare) {
+            /* Gen7.5+.  Otherwise, lowered in NIR */
+            assert(devinfo->gen >= 8 || devinfo->is_haswell);
+            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
+         } else {
+            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
+         }
+	 break;
+      case SHADER_OPCODE_TXF:
+	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+	 break;
+      case SHADER_OPCODE_TXF_LZ:
+         assert(devinfo->gen >= 9);
+         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD_LZ;
+         break;
+      case SHADER_OPCODE_TXF_CMS_W:
+         assert(devinfo->gen >= 9);
+         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+         break;
+      case SHADER_OPCODE_TXF_CMS:
+         if (devinfo->gen >= 7)
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
+         else
+            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+         break;
+      case SHADER_OPCODE_TXF_UMS:
+         assert(devinfo->gen >= 7);
+         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DSS;
+         break;
+      case SHADER_OPCODE_TXF_MCS:
+         assert(devinfo->gen >= 7);
+         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
+         break;
+      case SHADER_OPCODE_LOD:
+         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
+         break;
+      case SHADER_OPCODE_TG4:
+         if (inst->shadow_compare) {
+            assert(devinfo->gen >= 7);
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
+         } else {
+            assert(devinfo->gen >= 6);
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
+         }
+         break;
+      case SHADER_OPCODE_TG4_OFFSET:
+         assert(devinfo->gen >= 7);
+         if (inst->shadow_compare) {
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
+         } else {
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
+         }
+         break;
+      case SHADER_OPCODE_SAMPLEINFO:
+         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
+         break;
+      default:
+	 unreachable("not reached");
+      }
+   } else {
+      switch (inst->opcode) {
+      case SHADER_OPCODE_TEX:
+	 /* Note that G45 and older determines shadow compare and dispatch width
+	  * from message length for most messages.
+	  */
+         if (inst->exec_size == 8) {
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+            if (inst->shadow_compare) {
+               assert(inst->mlen == 6);
+            } else {
+               assert(inst->mlen <= 4);
+            }
+         } else {
+            if (inst->shadow_compare) {
+               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+               assert(inst->mlen == 9);
+            } else {
+               msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+               assert(inst->mlen <= 7 && inst->mlen % 2 == 1);
+            }
+         }
+	 break;
+      case FS_OPCODE_TXB:
+	 if (inst->shadow_compare) {
+            assert(inst->exec_size == 8);
+	    assert(inst->mlen == 6);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
+	 } else {
+	    assert(inst->mlen == 9);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+	 }
+	 break;
+      case SHADER_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+            assert(inst->exec_size == 8);
+	    assert(inst->mlen == 6);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
+	 } else {
+	    assert(inst->mlen == 9);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
+	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+	 }
+	 break;
+      case SHADER_OPCODE_TXD:
+	 /* There is no sample_d_c message; comparisons are done manually */
+         assert(inst->exec_size == 8);
+	 assert(inst->mlen == 7 || inst->mlen == 10);
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
+	 break;
+      case SHADER_OPCODE_TXF:
+         assert(inst->mlen <= 9 && inst->mlen % 2 == 1);
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+	 break;
+      case SHADER_OPCODE_TXS:
+	 assert(inst->mlen == 3);
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
+	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+	 break;
+      default:
+	 unreachable("not reached");
+      }
+   }
+   assert(msg_type != -1);
+   /* simd_mode may have been forced to SIMD16 above; widen dst to match. */
+   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
+      dst = vec16(dst);
+   }
+
+   assert(devinfo->gen < 7 || inst->header_size == 0 ||
+          src.file == BRW_GENERAL_REGISTER_FILE);
+
+   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
+
+   /* Load the message header if present.  If there's a texture offset,
+    * we need to set it up explicitly and load the offset bitfield.
+    * Otherwise, we can use an implied move from g0 to the first message reg.
+    */
+   if (inst->header_size != 0) {
+      if (devinfo->gen < 6 && !inst->offset) {
+         /* Set up an implied move from g0 to the MRF. */
+         src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
+      } else {
+         struct brw_reg header_reg;
+
+         if (devinfo->gen >= 7) {
+            header_reg = src;
+         } else {
+            assert(inst->base_mrf != -1);
+            header_reg = brw_message_reg(inst->base_mrf);
+         }
+
+         brw_push_insn_state(p);
+         brw_set_default_exec_size(p, BRW_EXECUTE_8);
+         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+         brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+         /* Explicitly set up the message header by copying g0 to the MRF. */
+         brw_MOV(p, header_reg, brw_vec8_grf(0, 0));
+
+         if (inst->offset) {
+            /* Set the offset bits in DWord 2. */
+            brw_MOV(p, get_element_ud(header_reg, 2),
+                       brw_imm_ud(inst->offset));
+         } else if (stage != MESA_SHADER_VERTEX &&
+                    stage != MESA_SHADER_FRAGMENT) {
+            /* The vertex and fragment stages have g0.2 set to 0, so
+             * header0.2 is 0 when g0 is copied. Other stages may not, so we
+             * must set it to 0 to avoid setting undesirable bits in the
+             * message.
+             */
+            brw_MOV(p, get_element_ud(header_reg, 2), brw_imm_ud(0));
+         }
+
+         brw_adjust_sampler_state_pointer(p, header_reg, sampler_index);
+         brw_pop_insn_state(p);
+      }
+   }
+   /* Gather (TG4) opcodes use their own section of the binding table. */
+   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
+         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+         ? prog_data->binding_table.gather_texture_start
+         : prog_data->binding_table.texture_start;
+
+   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+       sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t surface = surface_index.ud;
+      uint32_t sampler = sampler_index.ud;
+
+      brw_SAMPLE(p,
+                 retype(dst, BRW_REGISTER_TYPE_UW),
+                 inst->base_mrf,
+                 src,
+                 surface + base_binding_table_index,
+                 sampler % 16,
+                 msg_type,
+                 inst->size_written / REG_SIZE,
+                 inst->mlen,
+                 inst->header_size != 0,
+                 simd_mode,
+                 return_format);
+
+      brw_mark_surface_used(prog_data, surface + base_binding_table_index);
+   } else {
+      /* Non-const surface and/or sampler index */
+
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
+      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      /* a0.0 = (surface | (sampler << 8)) + table base, masked to 12 bits. */
+      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
+         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      } else {
+         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
+            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
+         } else {
+            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+            brw_OR(p, addr, addr, surface_reg);
+         }
+      }
+      if (base_binding_table_index)
+         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
+
+      brw_pop_insn_state(p);
+
+      /* dst = send(offset, a0.0 | <descriptor>) */
+      brw_inst *insn = brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, src, addr);
+      brw_set_sampler_message(p, insn,
+                              0 /* surface */,
+                              0 /* sampler */,
+                              msg_type,
+                              inst->size_written / REG_SIZE,
+                              inst->mlen /* mlen */,
+                              inst->header_size != 0 /* header */,
+                              simd_mode,
+                              return_format);
+
+      /* visitor knows more than we do about the surface limit required,
+       * so has already done marking.
+       */
+   }
+   /* An EOT sampler message is re-encoded as SENDC with the EOT bit set. */
+   if (is_combined_send) {
+      brw_inst_set_eot(p->devinfo, brw_last_inst, true);
+      brw_inst_set_opcode(p->devinfo, brw_last_inst, BRW_OPCODE_SENDC);
+   }
+}
+
+
+/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
+ * looking like:
+ *
+ * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
+ *
+ * Ideally, we want to produce:
+ *
+ *           DDX                     DDY
+ * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
+ *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
+ *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
+ *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
+ *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
+ *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
+ *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
+ *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
+ *
+ * and add another set of two more subspans if in 16-pixel dispatch mode.
+ *
+ * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
+ * for each pair, and vertstride = 2 jumps us 2 elements after processing a
+ * pair.  But the ideal approximation may impose a huge performance cost on
+ * sample_d.  On at least Haswell, sample_d instruction does some
+ * optimizations if the same LOD is used for all pixels in the subspan.
+ *
+ * For DDY, we need to use ALIGN16 mode since it's capable of doing the
+ * appropriate swizzling.
+ */
+void
+fs_generator::generate_ddx(enum opcode opcode,
+                           struct brw_reg dst, struct brw_reg src)
+{
+   /* FINE produces accurate derivatives (stride 2, width 2 regions);
+    * otherwise the derivative at the top-left pixel is replicated to the
+    * other pixels (stride 4, width 4 regions).
+    */
+   const bool fine = opcode == FS_OPCODE_DDX_FINE;
+   const unsigned vstride =
+      fine ? BRW_VERTICAL_STRIDE_2 : BRW_VERTICAL_STRIDE_4;
+   const unsigned width = fine ? BRW_WIDTH_2 : BRW_WIDTH_4;
+
+   /* Both operands use the same region of src and differ only in the
+    * third brw_reg() argument: 1 for the minuend, 0 for the subtrahend.
+    */
+   const struct brw_reg right = brw_reg(src.file, src.nr, 1,
+                                        src.negate, src.abs,
+                                        BRW_REGISTER_TYPE_F,
+                                        vstride, width,
+                                        BRW_HORIZONTAL_STRIDE_0,
+                                        BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+   const struct brw_reg left = brw_reg(src.file, src.nr, 0,
+                                       src.negate, src.abs,
+                                       BRW_REGISTER_TYPE_F,
+                                       vstride, width,
+                                       BRW_HORIZONTAL_STRIDE_0,
+                                       BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+
+   /* dst = right - left */
+   brw_ADD(p, dst, right, negate(left));
+}
+
+/* Vertical derivative: both variants emit dst = -first + second, where the
+ * operands are two views of the same source register.  No negate flag is
+ * taken here, so any origin-dependent sign flip is not applied in this code.
+ */
+void
+fs_generator::generate_ddy(enum opcode opcode,
+                           struct brw_reg dst, struct brw_reg src)
+{
+   if (opcode != FS_OPCODE_DDY_FINE) {
+      /* Coarse: replicate the derivative at the top-left pixel to the
+       * other pixels (horizontal stride 0 on both operands). */
+      const struct brw_reg first = brw_reg(src.file, src.nr, 0,
+                                           src.negate, src.abs,
+                                           BRW_REGISTER_TYPE_F,
+                                           BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4,
+                                           BRW_HORIZONTAL_STRIDE_0,
+                                           BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+      const struct brw_reg second = brw_reg(src.file, src.nr, 2,
+                                            src.negate, src.abs,
+                                            BRW_REGISTER_TYPE_F,
+                                            BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4,
+                                            BRW_HORIZONTAL_STRIDE_0,
+                                            BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+      brw_ADD(p, dst, negate(first), second);
+      return;
+   }
+
+   /* Fine: produce accurate derivatives.  The operands differ only in
+    * their swizzle (XYXY vs. ZWZW), so the ADD is emitted in Align16 mode.
+    */
+   const struct brw_reg first = brw_reg(src.file, src.nr, 0,
+                                        src.negate, src.abs,
+                                        BRW_REGISTER_TYPE_F,
+                                        BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4,
+                                        BRW_HORIZONTAL_STRIDE_1,
+                                        BRW_SWIZZLE_XYXY, WRITEMASK_XYZW);
+   const struct brw_reg second = brw_reg(src.file, src.nr, 0,
+                                         src.negate, src.abs,
+                                         BRW_REGISTER_TYPE_F,
+                                         BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4,
+                                         BRW_HORIZONTAL_STRIDE_1,
+                                         BRW_SWIZZLE_ZWZW, WRITEMASK_XYZW);
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+   brw_ADD(p, dst, negate(first), second);
+   brw_pop_insn_state(p);
+}
+
+void
+fs_generator::generate_discard_jump(fs_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+   /* Record where this HALT lands.  Its UIP is patched at FB write time to
+    * point at the end of the program, and brw_uip_jip() sets JIP to the end
+    * of the current block (or the program).
+    */
+   ip_record *halt_ip = new(mem_ctx) ip_record(p->nr_insn);
+   this->discard_halt_patches.push_tail(halt_ip);
+   gen6_HALT(p);
+}
+
+void
+fs_generator::generate_scratch_write(fs_inst *inst, struct brw_reg src)
+{
+   /* 32-wide messages only honour the first 16 channel enables and repeat
+    * them for the second group of 16 channels, so a write is only issued
+    * wider than 16 when it is marked force_writemask_all.  Anything else
+    * is split into chunks of at most 16 channels.
+    */
+   const unsigned chunk_width = inst->force_writemask_all ? inst->exec_size :
+                                MIN2(16, inst->exec_size);
+   const unsigned chunk_regs = 4 * chunk_width / REG_SIZE;
+   assert(inst->mlen != 0);
+   const unsigned num_chunks = inst->exec_size / chunk_width;
+
+   brw_push_insn_state(p);
+   brw_set_default_exec_size(p, cvt(chunk_width) - 1);
+   brw_set_default_compression(p, chunk_width > 8);
+
+   for (unsigned chunk = 0; chunk < num_chunks; chunk++) {
+      /* Stage this chunk's data in base_mrf + 1, then write it out. */
+      brw_set_default_group(p, inst->group + chunk_width * chunk);
+      brw_MOV(p, brw_uvec_mrf(chunk_width, inst->base_mrf + 1, 0),
+              retype(offset(src, chunk_regs * chunk), BRW_REGISTER_TYPE_UD));
+      brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf),
+                                    chunk_regs,
+                                    inst->offset + chunk_regs * REG_SIZE * chunk);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+void
+fs_generator::generate_scratch_read(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size <= 16 || inst->force_writemask_all);
+   assert(inst->mlen != 0);
+   const int num_regs = inst->exec_size / 8; /* presumably 8 channels/reg */
+   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf),
+                                num_regs, inst->offset);
+}
+
+void
+fs_generator::generate_scratch_read_gen7(fs_inst *inst, struct brw_reg dst)
+{
+   assert(inst->exec_size <= 16 || inst->force_writemask_all);
+   const int num_regs = inst->exec_size / 8; /* presumably 8 channels/reg */
+   gen7_block_read_scratch(p, dst, num_regs, inst->offset);
+}
+
+void
+fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
+                                                  struct brw_reg dst,
+                                                  struct brw_reg index,
+                                                  struct brw_reg offset)
+{
+   /* Requires a 4-byte destination type, a non-zero message length, and
+    * UD immediates for both the surface index and the read offset.
+    */
+   assert(type_sz(dst.type) == 4);
+   assert(inst->mlen != 0);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   assert(offset.file == BRW_IMMEDIATE_VALUE &&
+          offset.type == BRW_REGISTER_TYPE_UD);
+
+   const struct brw_reg header_mrf = brw_message_reg(inst->base_mrf);
+   brw_oword_block_read(p, dst, header_mrf, offset.ud, index.ud);
+}
+
+void
+fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg payload)
+{
+   assert(index.type == BRW_REGISTER_TYPE_UD);
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+   assert(type_sz(dst.type) == 4);
+   /* Immediate surface index: passed directly to the read message setup. */
+   if (index.file == BRW_IMMEDIATE_VALUE) {
+      const uint32_t surf_index = index.ud;
+      /* Only the SEND is created under the mask-disabled default state. */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_pop_insn_state(p);
+
+      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UD));
+      brw_set_src0(p, send, retype(payload, BRW_REGISTER_TYPE_UD));
+      brw_set_dp_read_message(p, send, surf_index,
+                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+
+   } else {
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      /* Dynamic surface index: route it through a0.0 to an indirect send. */
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      /* a0.0 = surf_index & 0xff */
+      brw_inst *insn_and = brw_next_insn(p, BRW_OPCODE_AND);
+      brw_inst_set_exec_size(p->devinfo, insn_and, BRW_EXECUTE_1);
+      brw_set_dest(p, insn_and, addr);
+      brw_set_src0(p, insn_and, vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+      brw_set_src1(p, insn_and, brw_imm_ud(0x0ff));
+
+      /* dst = send(payload, a0.0 | <descriptor>) */
+      brw_inst *insn = brw_send_indirect_message(
+         p, GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+         retype(dst, BRW_REGISTER_TYPE_UD),
+         retype(payload, BRW_REGISTER_TYPE_UD), addr);
+      brw_set_dp_read_message(p, insn, 0 /* surface */,
+                              BRW_DATAPORT_OWORD_BLOCK_DWORDS(inst->exec_size),
+                              GEN7_DATAPORT_DC_OWORD_BLOCK_READ,
+                              GEN6_SFID_DATAPORT_CONSTANT_CACHE,
+                              1, /* mlen */
+                              true, /* header */
+                              DIV_ROUND_UP(inst->size_written, REG_SIZE));
+
+      brw_pop_insn_state(p);
+   }
+}
+
+void
+fs_generator::generate_varying_pull_constant_load_gen4(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index)
+{
+   assert(devinfo->gen < 7); /* Should use the gen7 variant. */
+   assert(inst->header_size != 0);
+   assert(inst->mlen);
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+	  index.type == BRW_REGISTER_TYPE_UD);
+   uint32_t surf_index = index.ud;
+   /* Default the SIMD mode and response length from the execution size. */
+   uint32_t simd_mode, rlen, msg_type;
+   if (inst->exec_size == 16) {
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+      rlen = 8;
+   } else {
+      assert(inst->exec_size == 8);
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
+      rlen = 4;
+   }
+   /* Gen5+ uses the LD message; Gen4 overrides the defaults with SIMD16. */
+   if (devinfo->gen >= 5)
+      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+   else {
+      /* We always use the SIMD16 message so that we only have to load U, and
+       * not V or R.
+       */
+      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
+      assert(inst->mlen == 3);
+      assert(inst->size_written == 8 * REG_SIZE);
+      rlen = 8;
+      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+   }
+   /* The header starts as g0 and is resolved against base_mrf. */
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+   brw_inst *send = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_inst_set_compression(devinfo, send, false);
+   brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
+   brw_set_src0(p, send, header);
+   if (devinfo->gen < 6)
+      brw_inst_set_base_mrf(p->devinfo, send, inst->base_mrf);
+
+   /* Our surface is set up as floats, regardless of what actual data is
+    * stored in it.
+    */
+   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+   brw_set_sampler_message(p, send,
+                           surf_index,
+                           0, /* sampler (unused) */
+                           msg_type,
+                           rlen,
+                           inst->mlen,
+                           inst->header_size != 0,
+                           simd_mode,
+                           return_format);
+}
+
+/**
+ * Emit the headerless sampler LD message implementing a varying-offset pull
+ * constant load on gen7+.
+ *
+ * \param inst    IR instruction; only exec_size is consulted for the message
+ *                layout (mlen and header_size must both be zero).
+ * \param dst     Destination of the load (retyped to UW for the SEND).
+ * \param index   Surface index of type UD, either an immediate or a register.
+ * \param offset  Message payload, used directly as the SEND source.
+ */
+void
+fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
+                                                       struct brw_reg dst,
+                                                       struct brw_reg index,
+                                                       struct brw_reg offset)
+{
+   assert(devinfo->gen >= 7);
+   /* Varying-offset pull constant loads are treated as a normal expression on
+    * gen7, so the fact that it's a send message is hidden at the IR level.
+    */
+   assert(inst->header_size == 0);
+   assert(!inst->mlen);
+   assert(index.type == BRW_REGISTER_TYPE_UD);
+   assert(inst->exec_size == 8 || inst->exec_size == 16);
+
+   const bool simd16 = inst->exec_size == 16;
+   const uint32_t mlen = simd16 ? 2 : 1;
+   const uint32_t rlen = simd16 ? 8 : 4;
+   const uint32_t simd_mode = simd16 ? BRW_SAMPLER_SIMD_MODE_SIMD16
+                                     : BRW_SAMPLER_SIMD_MODE_SIMD8;
+
+   brw_inst *send;
+   uint32_t surface;
+
+   if (index.file != BRW_IMMEDIATE_VALUE) {
+      /* Non-immediate surface index: route it through the address register
+       * and let the indirect send combine it with the descriptor.
+       */
+      struct brw_reg addr =
+         vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      /* a0.0 = surf_index & 0xff */
+      brw_inst *mask_index = brw_next_insn(p, BRW_OPCODE_AND);
+      brw_inst_set_exec_size(p->devinfo, mask_index, BRW_EXECUTE_1);
+      brw_set_dest(p, mask_index, addr);
+      brw_set_src0(p, mask_index,
+                   vec1(retype(index, BRW_REGISTER_TYPE_UD)));
+      brw_set_src1(p, mask_index, brw_imm_ud(0x0ff));
+
+      brw_pop_insn_state(p);
+
+      /* dst = send(offset, a0.0 | <descriptor>) */
+      send = brw_send_indirect_message(p, BRW_SFID_SAMPLER,
+                                       retype(dst, BRW_REGISTER_TYPE_UW),
+                                       offset, addr);
+      surface = 0; /* supplied at run time through a0.0 */
+   } else {
+      surface = index.ud;
+
+      send = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, send, retype(dst, BRW_REGISTER_TYPE_UW));
+      brw_set_src0(p, send, offset);
+   }
+
+   brw_set_sampler_message(p, send,
+                           surface,
+                           0, /* LD message ignores sampler unit */
+                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                           rlen,
+                           mlen,
+                           false, /* no header */
+                           simd_mode,
+                           0);
+}
+
+/**
+ * Copy the current dispatch mask into the flag register f0.<flag_subreg>.
+ *
+ * On gen6+ the mask is read from g1.7 as a UW (the pixel/sample mask in
+ * R1.7 bits 15:0); on earlier hardware it is read from g0.0 as a UW.  The
+ * MOV is emitted with the execution mask disabled.
+ */
+void
+fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
+{
+   const struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
+   const struct brw_reg mask_grf =
+      devinfo->gen >= 6 ? brw_vec1_grf(1, 7) : brw_vec1_grf(0, 0);
+   const struct brw_reg dispatch_mask =
+      retype(mask_grf, BRW_REGISTER_TYPE_UW);
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, flags, dispatch_mask);
+   brw_pop_insn_state(p);
+}
+
+/**
+ * Emit a pixel interpolator query message.
+ *
+ * \param inst      IR instruction; supplies pi_noperspective, mlen and the
+ *                  response size (size_written, a whole number of registers).
+ * \param dst       Destination of the query (retyped to UW).
+ * \param src       Message source register.
+ * \param msg_data  Message data of type UD.
+ * \param msg_type  One of the GEN7_PIXEL_INTERPOLATOR_LOC_* message types.
+ */
+void
+fs_generator::generate_pixel_interpolator_query(fs_inst *inst,
+                                                struct brw_reg dst,
+                                                struct brw_reg src,
+                                                struct brw_reg msg_data,
+                                                unsigned msg_type)
+{
+   assert(inst->size_written % REG_SIZE == 0);
+   assert(msg_data.type == BRW_REGISTER_TYPE_UD);
+
+   const struct brw_reg response = retype(dst, BRW_REGISTER_TYPE_UW);
+   const unsigned response_length = inst->size_written / REG_SIZE;
+
+   brw_pixel_interpolator_query(p, response, src, inst->pi_noperspective,
+                                msg_type, msg_data, inst->mlen,
+                                response_length);
+}
+
+/* Emits dst = src0 + src1 with src1 read through a <1;4,0> region
+ * (vstride=1, width=4, hstride=0).
+ *
+ * On gen8+ or for 8-wide instructions a single ADD is emitted.  A 16-wide
+ * instruction on older hardware is split into two 8-wide ADDs, the second
+ * one reading src1 two elements further on.  Any other execution size on
+ * pre-gen8 hardware emits nothing.
+ */
+void
+fs_generator::generate_set_sample_id(fs_inst *inst,
+                                     struct brw_reg dst,
+                                     struct brw_reg src0,
+                                     struct brw_reg src1)
+{
+   assert(dst.type == BRW_REGISTER_TYPE_D ||
+          dst.type == BRW_REGISTER_TYPE_UD);
+   assert(src0.type == BRW_REGISTER_TYPE_D ||
+          src0.type == BRW_REGISTER_TYPE_UD);
+
+   const struct brw_reg sample_ids = stride(src1, 1, 4, 0);
+
+   if (devinfo->gen >= 8 || inst->exec_size == 8) {
+      brw_ADD(p, dst, src0, sample_ids);
+      return;
+   }
+
+   if (inst->exec_size != 16)
+      return;
+
+   brw_push_insn_state(p);
+   brw_set_default_exec_size(p, BRW_EXECUTE_8);
+
+   /* First eight channels. */
+   brw_set_default_compression_control(p, BRW_COMPRESSION_NONE);
+   brw_ADD(p, firsthalf(dst), firsthalf(src0), sample_ids);
+
+   /* Second eight channels. */
+   brw_set_default_compression_control(p, BRW_COMPRESSION_2NDHALF);
+   brw_ADD(p, sechalf(dst), sechalf(src0), suboffset(sample_ids, 2));
+
+   brw_pop_insn_state(p);
+}
+
+/**
+ * Pack two floats into one dword as a pair of half-floats, with \p x in the
+ * low word and \p y in the high word (packHalf2x16's output layout).
+ *
+ * \param inst  IR instruction (not consulted here).
+ * \param dst   Destination of type UD.
+ * \param x     Float source for the low 16 bits.
+ * \param y     Float source for the high 16 bits.
+ */
+void
+fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
+                                            struct brw_reg dst,
+                                            struct brw_reg x,
+                                            struct brw_reg y)
+{
+   assert(devinfo->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+   assert(x.type == BRW_REGISTER_TYPE_F);
+   assert(y.type == BRW_REGISTER_TYPE_F);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the destination data type must be Word (W).
+    *
+    *   The destination must be DWord-aligned and specify a horizontal stride
+    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
+    *   each destination channel and the upper word is not modified.
+    */
+   const struct brw_reg low_words =
+      spread(retype(dst, BRW_REGISTER_TYPE_W), 2);
+
+   /* Step 1: convert y into the low word.  Each dword of dst is now
+    * 0x....hhhh, where "." means unchanged.
+    */
+   brw_F32TO16(p, low_words, y);
+
+   /* Step 2: move it up into the high word, giving 0xhhhh0000. */
+   brw_SHL(p, dst, dst, brw_imm_ud(16u));
+
+   /* Step 3: convert x into the now-free low word, giving the final
+    * 0xhhhhllll.
+    */
+   brw_F32TO16(p, low_words, x);
+}
+
+/**
+ * Convert one half-float of each packed source dword into a float.
+ *
+ * The instruction's opcode selects the half: ..._SPLIT_X reads the low word
+ * and ..._SPLIT_Y the high word of each 0xhhhhllll source channel.
+ *
+ * \param inst  IR instruction; only its opcode is consulted.
+ * \param dst   Destination of type F.
+ * \param src   Source of type UD holding the packed pair.
+ */
+void
+fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
+                                              struct brw_reg dst,
+                                              struct brw_reg src)
+{
+   assert(devinfo->gen >= 7);
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src.type == BRW_REGISTER_TYPE_UD);
+
+   const bool upper_half =
+      inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y;
+   assert(upper_half ||
+          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    */
+   struct brw_reg half = spread(retype(src, BRW_REGISTER_TYPE_W), 2);
+
+   /* The Y component lives in the upper word of each channel, so step the
+    * region forward by one 16-bit element (two bytes).
+    */
+   if (upper_half)
+      half.subnr += 2;
+
+   brw_F16TO32(p, dst, half);
+}
+
+/**
+ * Emit the shader-time accumulation sequence: write \p offset and \p value
+ * into the two payload registers, then send the shader_time_add message to
+ * the shader-time surface.
+ *
+ * \param inst     IR instruction (not consulted here).
+ * \param payload  First of two consecutive GRFs used as message payload;
+ *                 payload.nr receives the offset, payload.nr + 1 the value.
+ * \param offset   Immediate offset operand.
+ * \param value    Value to add; either a GRF (read as a scalar) or an
+ *                 immediate.
+ */
+void
+fs_generator::generate_shader_time_add(fs_inst *inst,
+                                       struct brw_reg payload,
+                                       struct brw_reg offset,
+                                       struct brw_reg value)
+{
+   assert(devinfo->gen >= 7);
+   brw_push_insn_state(p);
+   /* Use the named constant, like the other generators in this file, rather
+    * than a bare `true`.
+    */
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
+   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
+                                          offset.type);
+   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
+                                         value.type);
+
+   assert(offset.file == BRW_IMMEDIATE_VALUE);
+   if (value.file == BRW_GENERAL_REGISTER_FILE) {
+      /* Read the value as a scalar: <0;1,0> region. */
+      value.width = BRW_WIDTH_1;
+      value.hstride = BRW_HORIZONTAL_STRIDE_0;
+      value.vstride = BRW_VERTICAL_STRIDE_0;
+   } else {
+      assert(value.file == BRW_IMMEDIATE_VALUE);
+   }
+
+   /* Trying to deal with setup of the params from the IR is crazy in the FS8
+    * case, and we don't really care about squeezing every bit of performance
+    * out of this path, so we just emit the MOVs from here.
+    */
+   brw_MOV(p, payload_offset, offset);
+   brw_MOV(p, payload_value, value);
+   brw_shader_time_add(p, payload,
+                       prog_data->binding_table.shader_time_start);
+   brw_pop_insn_state(p);
+
+   brw_mark_surface_used(prog_data,
+                         prog_data->binding_table.shader_time_start);
+}
+
+/**
+ * Turn on debug output for this generator and record the name that
+ * generate_code() prints for the shader.
+ */
+void
+fs_generator::enable_debug(const char *shader_name)
+{
+   this->shader_name = shader_name;
+   this->debug_flag = true;
+}
+
+/**
+ * Translate every instruction of \p cfg into native code appended to the
+ * program being assembled in \c p.
+ *
+ * The output is first padded with NOPs to a 64-byte boundary.  Each IR
+ * instruction is then lowered through the opcode switch below, after which
+ * jump targets are resolved (brw_set_uip_jip), the result is validated and
+ * compacted, and statistics are reported through shader_debug_log (and to
+ * stderr, together with a disassembly, when debug output is enabled).
+ *
+ * \param cfg             Control flow graph holding the instructions to emit.
+ * \param dispatch_width  SIMD width of the shader; stored in
+ *                        this->dispatch_width and reported in the statistics.
+ * \return Byte offset in the program store at which this shader starts.
+ */
+int
+fs_generator::generate_code(const cfg_t *cfg, int dispatch_width)
+{
+   /* align to 64 byte boundary. */
+   while (p->next_insn_offset % 64)
+      brw_NOP(p);
+
+   this->dispatch_width = dispatch_width;
+
+   int start_offset = p->next_insn_offset;
+   /* Statistics for the final report: scratch writes, scratch reads and
+    * WHILE instructions seen while emitting code.
+    */
+   int spill_count = 0, fill_count = 0;
+   int loop_count = 0;
+
+   struct annotation_info annotation;
+   memset(&annotation, 0, sizeof(annotation));
+
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      struct brw_reg src[3], dst;
+      unsigned int last_insn_offset = p->next_insn_offset;
+      /* NOTE(review): nothing in this function sets this to true, so the
+       * early `continue` after the switch is never taken here.
+       */
+      bool multiple_instructions_emitted = false;
+
+      /* From the Broadwell PRM, Volume 7, "3D-Media-GPGPU", in the
+       * "Register Region Restrictions" section: for BDW, SKL:
+       *
+       *    "A POW/FDIV operation must not be followed by an instruction
+       *     that requires two destination registers."
+       *
+       * The documentation is often lacking annotations for Atom parts,
+       * and empirically this affects CHV as well.
+       */
+      if (devinfo->gen >= 8 &&
+          p->nr_insn > 1 &&
+          brw_inst_opcode(devinfo, brw_last_inst) == BRW_OPCODE_MATH &&
+          brw_inst_math_function(devinfo, brw_last_inst) == BRW_MATH_FUNCTION_POW &&
+          inst->dst.component_size(inst->exec_size) > REG_SIZE) {
+         brw_NOP(p);
+         /* Skip past the NOP so the post-switch fixups below still address
+          * the instruction generated for this IR instruction.
+          */
+         last_insn_offset = p->next_insn_offset;
+      }
+
+      if (unlikely(debug_flag))
+         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
+
+      /* If the instruction writes to more than one register, it needs to be
+       * explicitly marked as compressed on Gen <= 5.  On Gen >= 6 the
+       * hardware figures out by itself what the right compression mode is,
+       * but we still need to know whether the instruction is compressed to
+       * set up the source register regions appropriately.
+       *
+       * XXX - This is wrong for instructions that write a single register but
+       *       read more than one which should strictly speaking be treated as
+       *       compressed.  For instructions that don't write any registers it
+       *       relies on the destination being a null register of the correct
+       *       type and regioning so the instruction is considered compressed
+       *       or not accordingly.
+       */
+      const bool compressed =
+           inst->dst.component_size(inst->exec_size) > REG_SIZE;
+      brw_set_default_compression(p, compressed);
+      brw_set_default_group(p, inst->group);
+
+      for (unsigned int i = 0; i < inst->sources; i++) {
+         src[i] = brw_reg_from_fs_reg(inst, &inst->src[i], devinfo->gen,
+                                      compressed);
+
+	 /* The accumulator result appears to get used for the
+	  * conditional modifier generation.  When negating a UD
+	  * value, there is a 33rd bit generated for the sign in the
+	  * accumulator value, so now you can't check, for example,
+	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
+	  */
+	 assert(!inst->conditional_mod ||
+		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
+		!inst->src[i].negate);
+      }
+      dst = brw_reg_from_fs_reg(inst, &inst->dst, devinfo->gen, compressed);
+
+      /* Load the per-instruction defaults that the brw_* emitters below pick
+       * up from the codegen state.
+       */
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_predicate_control(p, inst->predicate);
+      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+      brw_set_default_saturate(p, inst->saturate);
+      brw_set_default_mask_control(p, inst->force_writemask_all);
+      brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+
+      assert(inst->force_writemask_all || inst->exec_size >= 4);
+      assert(inst->force_writemask_all || inst->group % inst->exec_size == 0);
+      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
+      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
+
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+	 brw_MOV(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ADD:
+	 brw_ADD(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_MUL:
+	 brw_MUL(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_AVG:
+	 brw_AVG(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_MACH:
+	 brw_MACH(p, dst, src[0], src[1]);
+	 break;
+
+      case BRW_OPCODE_LINE:
+         brw_LINE(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_MAD:
+         assert(devinfo->gen >= 6);
+	 brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_MAD(p, dst, src[0], src[1], src[2]);
+	 break;
+
+      case BRW_OPCODE_LRP:
+         assert(devinfo->gen >= 6);
+	 brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_LRP(p, dst, src[0], src[1], src[2]);
+	 break;
+
+      case BRW_OPCODE_FRC:
+	 brw_FRC(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDD:
+	 brw_RNDD(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDE:
+	 brw_RNDE(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_RNDZ:
+	 brw_RNDZ(p, dst, src[0]);
+	 break;
+
+      case BRW_OPCODE_AND:
+	 brw_AND(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_OR:
+	 brw_OR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_XOR:
+	 brw_XOR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_NOT:
+	 brw_NOT(p, dst, src[0]);
+	 break;
+      case BRW_OPCODE_ASR:
+	 brw_ASR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHR:
+	 brw_SHR(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SHL:
+	 brw_SHL(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_F32TO16:
+         assert(devinfo->gen >= 7);
+         brw_F32TO16(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_F16TO32:
+         assert(devinfo->gen >= 7);
+         brw_F16TO32(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_CMP:
+         if (inst->exec_size >= 16 && devinfo->gen == 7 && !devinfo->is_haswell &&
+             dst.file == BRW_ARCHITECTURE_REGISTER_FILE) {
+            /* For unknown reasons the WaCMPInstFlagDepClearedEarly workaround
+             * implemented in the compiler is not sufficient. Overriding the
+             * type when the destination is the null register is necessary but
+             * not sufficient by itself.
+             */
+            assert(dst.nr == BRW_ARF_NULL);
+            dst.type = BRW_REGISTER_TYPE_D;
+         }
+         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_SEL:
+	 brw_SEL(p, dst, src[0], src[1]);
+	 break;
+      case BRW_OPCODE_BFREV:
+         assert(devinfo->gen >= 7);
+         /* BFREV only supports UD type for src and dst. */
+         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+                      retype(src[0], BRW_REGISTER_TYPE_UD));
+         break;
+      case BRW_OPCODE_FBH:
+         assert(devinfo->gen >= 7);
+         /* FBH only supports UD type for dst. */
+         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_FBL:
+         assert(devinfo->gen >= 7);
+         /* FBL only supports UD type for dst. */
+         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_LZD:
+         brw_LZD(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_CBIT:
+         assert(devinfo->gen >= 7);
+         /* CBIT only supports UD type for dst. */
+         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_ADDC:
+         assert(devinfo->gen >= 7);
+         brw_ADDC(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SUBB:
+         assert(devinfo->gen >= 7);
+         brw_SUBB(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MAC:
+         brw_MAC(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_BFE:
+         assert(devinfo->gen >= 7);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_BFE(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_BFI1:
+         assert(devinfo->gen >= 7);
+         brw_BFI1(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_BFI2:
+         assert(devinfo->gen >= 7);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         brw_BFI2(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_IF:
+	 if (inst->src[0].file != BAD_FILE) {
+	    /* The instruction has an embedded compare (only allowed on gen6) */
+	    assert(devinfo->gen == 6);
+	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+	 } else {
+	    brw_IF(p, brw_inst_exec_size(devinfo, p->current));
+	 }
+	 break;
+
+      case BRW_OPCODE_ELSE:
+	 brw_ELSE(p);
+	 break;
+      case BRW_OPCODE_ENDIF:
+	 brw_ENDIF(p);
+	 break;
+
+      case BRW_OPCODE_DO:
+	 brw_DO(p, brw_inst_exec_size(devinfo, p->current));
+	 break;
+
+      case BRW_OPCODE_BREAK:
+	 brw_BREAK(p);
+	 break;
+      case BRW_OPCODE_CONTINUE:
+         brw_CONT(p);
+	 break;
+
+      case BRW_OPCODE_WHILE:
+	 brw_WHILE(p);
+         loop_count++;
+	 break;
+
+      /* Single-operand math functions. */
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+	 if (devinfo->gen >= 6) {
+            assert(inst->mlen == 0);
+            assert(devinfo->gen >= 7 || inst->exec_size == 8);
+            gen6_math(p, dst, brw_math_function(inst->opcode),
+                      src[0], brw_null_reg());
+	 } else {
+            assert(inst->mlen >= 1);
+            assert(devinfo->gen == 5 || devinfo->is_g4x || inst->exec_size == 8);
+            gen4_math(p, dst,
+                      brw_math_function(inst->opcode),
+                      inst->base_mrf, src[0],
+                      BRW_MATH_PRECISION_FULL);
+	 }
+	 break;
+      /* Two-operand math functions. */
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+      case SHADER_OPCODE_POW:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (devinfo->gen >= 6) {
+            assert(inst->mlen == 0);
+            assert((devinfo->gen >= 7 && inst->opcode == SHADER_OPCODE_POW) ||
+                   inst->exec_size == 8);
+            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+         } else {
+            assert(inst->mlen >= 1);
+            assert(inst->exec_size == 8);
+            gen4_math(p, dst, brw_math_function(inst->opcode),
+                      inst->base_mrf, src[0],
+                      BRW_MATH_PRECISION_FULL);
+	 }
+	 break;
+      case FS_OPCODE_CINTERP:
+	 brw_MOV(p, dst, src[0]);
+	 break;
+      case FS_OPCODE_LINTERP:
+	 generate_linterp(inst, dst, src);
+	 break;
+      case FS_OPCODE_PIXEL_X:
+         assert(src[0].type == BRW_REGISTER_TYPE_UW);
+         src[0].subnr = 0 * type_sz(src[0].type);
+         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         break;
+      case FS_OPCODE_PIXEL_Y:
+         assert(src[0].type == BRW_REGISTER_TYPE_UW);
+         src[0].subnr = 4 * type_sz(src[0].type);
+         brw_MOV(p, dst, stride(src[0], 8, 4, 1));
+         break;
+      case FS_OPCODE_GET_BUFFER_SIZE:
+         generate_get_buffer_size(inst, dst, src[0], src[1]);
+         break;
+      case SHADER_OPCODE_TEX:
+      case FS_OPCODE_TXB:
+      case SHADER_OPCODE_TXD:
+      case SHADER_OPCODE_TXF:
+      case SHADER_OPCODE_TXF_LZ:
+      case SHADER_OPCODE_TXF_CMS:
+      case SHADER_OPCODE_TXF_CMS_W:
+      case SHADER_OPCODE_TXF_UMS:
+      case SHADER_OPCODE_TXF_MCS:
+      case SHADER_OPCODE_TXL:
+      case SHADER_OPCODE_TXL_LZ:
+      case SHADER_OPCODE_TXS:
+      case SHADER_OPCODE_LOD:
+      case SHADER_OPCODE_TG4:
+      case SHADER_OPCODE_TG4_OFFSET:
+      case SHADER_OPCODE_SAMPLEINFO:
+	 generate_tex(inst, dst, src[0], src[1], src[2]);
+	 break;
+      case FS_OPCODE_DDX_COARSE:
+      case FS_OPCODE_DDX_FINE:
+         generate_ddx(inst->opcode, dst, src[0]);
+         break;
+      case FS_OPCODE_DDY_COARSE:
+      case FS_OPCODE_DDY_FINE:
+         generate_ddy(inst->opcode, dst, src[0]);
+	 break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+	 generate_scratch_write(inst, src[0]);
+         spill_count++;
+	 break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_READ:
+	 generate_scratch_read(inst, dst);
+         fill_count++;
+	 break;
+
+      case SHADER_OPCODE_GEN7_SCRATCH_READ:
+	 generate_scratch_read_gen7(inst, dst);
+         fill_count++;
+	 break;
+
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(inst, dst, src[0], src[1]);
+         break;
+
+      case SHADER_OPCODE_URB_READ_SIMD8:
+      case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+         generate_urb_read(inst, dst, src[0]);
+         break;
+
+      case SHADER_OPCODE_URB_WRITE_SIMD8:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+	 generate_urb_write(inst, src[0]);
+	 break;
+
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+         assert(inst->force_writemask_all);
+	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
+	 break;
+
+      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+         assert(inst->force_writemask_all);
+	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+	 break;
+
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+	 generate_varying_pull_constant_load_gen4(inst, dst, src[0]);
+	 break;
+
+      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+	 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
+	 break;
+
+      case FS_OPCODE_REP_FB_WRITE:
+      case FS_OPCODE_FB_WRITE:
+	 generate_fb_write(inst, src[0]);
+	 break;
+
+      case FS_OPCODE_FB_READ:
+         generate_fb_read(inst, dst, src[0]);
+         break;
+
+      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+         generate_mov_dispatch_to_flags(inst);
+         break;
+
+      case FS_OPCODE_DISCARD_JUMP:
+         generate_discard_jump(inst);
+         break;
+
+      case SHADER_OPCODE_SHADER_TIME_ADD:
+         generate_shader_time_add(inst, src[0], src[1], src[2]);
+         break;
+
+      /* For the surface access opcodes below, src[2] must be an immediate;
+       * its value is handed to the emitter as a plain integer.
+       */
+      case SHADER_OPCODE_UNTYPED_ATOMIC:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud,
+                            inst->mlen, !inst->dst.is_null());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_surface_read(p, dst, src[0], src[1],
+                                  inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_surface_write(p, src[0], src[1],
+                                   inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_atomic(p, dst, src[0], src[1],
+                          src[2].ud, inst->mlen, !inst->dst.is_null());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_surface_read(p, dst, src[0], src[1],
+                                inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_surface_write(p, src[0], src[1], inst->mlen, src[2].ud);
+         break;
+
+      case SHADER_OPCODE_MEMORY_FENCE:
+         brw_memory_fence(p, dst);
+         break;
+
+      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+         /* Mask selection: an all-ones immediate when the stage has packed
+          * dispatch, otherwise the vmask register for fragment shaders and
+          * the dmask register for every other stage.
+          */
+         const struct brw_reg mask =
+            brw_stage_has_packed_dispatch(devinfo, stage,
+                                          prog_data) ? brw_imm_ud(~0u) :
+            stage == MESA_SHADER_FRAGMENT ? brw_vmask_reg() :
+            brw_dmask_reg();
+         brw_find_live_channel(p, dst, mask);
+         break;
+      }
+
+      case SHADER_OPCODE_BROADCAST:
+         assert(inst->force_writemask_all);
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_SET_SAMPLE_ID:
+         generate_set_sample_id(inst, dst, src[0], src[1]);
+         break;
+
+      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
+          break;
+
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+         generate_unpack_half_2x16_split(inst, dst, src[0]);
+         break;
+
+      case FS_OPCODE_PLACEHOLDER_HALT:
+         /* This is the place where the final HALT needs to be inserted if
+          * we've emitted any discards.  If not, this will emit no code.
+          */
+         if (!patch_discard_jumps_to_fb_writes()) {
+            /* NOTE(review): presumably this drops the annotation entry that
+             * annotate() recorded above for an instruction that ended up
+             * emitting nothing -- confirm against annotate().
+             */
+            if (unlikely(debug_flag)) {
+               annotation.ann_count--;
+            }
+         }
+         break;
+
+      case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+                                           GEN7_PIXEL_INTERPOLATOR_LOC_SAMPLE);
+         break;
+
+      case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+                                           GEN7_PIXEL_INTERPOLATOR_LOC_SHARED_OFFSET);
+         break;
+
+      case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+         generate_pixel_interpolator_query(inst, dst, src[0], src[1],
+                                           GEN7_PIXEL_INTERPOLATOR_LOC_PER_SLOT_OFFSET);
+         break;
+
+      case CS_OPCODE_CS_TERMINATE:
+         generate_cs_terminate(inst, src[0]);
+         break;
+
+      case SHADER_OPCODE_BARRIER:
+	 generate_barrier(inst, src[0]);
+	 break;
+
+      case BRW_OPCODE_DIM:
+         assert(devinfo->is_haswell);
+         assert(src[0].type == BRW_REGISTER_TYPE_DF);
+         assert(dst.type == BRW_REGISTER_TYPE_DF);
+         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
+         break;
+
+      default:
+         unreachable("Unsupported opcode");
+
+      case SHADER_OPCODE_LOAD_PAYLOAD:
+         unreachable("Should be lowered by lower_load_payload()");
+      }
+
+      if (multiple_instructions_emitted)
+         continue;
+
+      /* The conditional modifier and the dependency-control hints are patched
+       * onto the native instruction emitted for this IR instruction, which is
+       * located through its byte offset (16 bytes per uncompacted
+       * instruction).  This only works if exactly one instruction was
+       * emitted, hence the assert.
+       */
+      if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+         assert(p->next_insn_offset == last_insn_offset + 16 ||
+                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
+                 "emitting more than 1 instruction");
+
+         brw_inst *last = &p->store[last_insn_offset / 16];
+
+         if (inst->conditional_mod)
+            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
+         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+      }
+   }
+
+   brw_set_uip_jip(p, start_offset);
+   annotation_finalize(&annotation, p->next_insn_offset);
+
+   /* `validated` only exists in non-NDEBUG builds; the assert() further down
+    * that reads it compiles away under NDEBUG, where validation is only run
+    * when debug output is enabled.
+    */
+#ifndef NDEBUG
+   bool validated = brw_validate_instructions(p, start_offset, &annotation);
+#else
+   if (unlikely(debug_flag))
+      brw_validate_instructions(p, start_offset, &annotation);
+#endif
+
+   /* Sizes are in bytes; before_size / 16 below is the instruction count
+    * prior to compaction.
+    */
+   int before_size = p->next_insn_offset - start_offset;
+   brw_compact_instructions(p, start_offset, annotation.ann_count,
+                            annotation.ann);
+   int after_size = p->next_insn_offset - start_offset;
+
+   if (unlikely(debug_flag)) {
+      fprintf(stderr, "Native code for %s\n"
+              "SIMD%d shader: %d instructions. %d loops. %u cycles. %d:%d spills:fills. Promoted %u constants. Compacted %d to %d"
+              " bytes (%.0f%%)\n",
+              shader_name, dispatch_width, before_size / 16, loop_count, cfg->cycle_count,
+              spill_count, fill_count, promoted_constants, before_size, after_size,
+              100.0f * (before_size - after_size) / before_size);
+
+      dump_assembly(p->store, annotation.ann_count, annotation.ann,
+                    p->devinfo);
+      ralloc_free(annotation.mem_ctx);
+   }
+   /* Checked only after the disassembly above has been printed. */
+   assert(validated);
+
+   compiler->shader_debug_log(log_data,
+                              "%s SIMD%d shader: %d inst, %d loops, %u cycles, "
+                              "%d:%d spills:fills, Promoted %u constants, "
+                              "compacted %d to %d bytes.",
+                              _mesa_shader_stage_to_abbrev(stage),
+                              dispatch_width, before_size / 16,
+                              loop_count, cfg->cycle_count, spill_count,
+                              fill_count, promoted_constants, before_size,
+                              after_size);
+
+   return start_offset;
+}
+
+/**
+ * Return the program assembled so far, as produced by brw_get_program().
+ *
+ * \param assembly_size  Passed through to brw_get_program(), which reports
+ *                       the program size through it.
+ */
+const unsigned *
+fs_generator::get_assembly(unsigned int *assembly_size)
+{
+   const unsigned *program = brw_get_program(p, assembly_size);
+   return program;
+}
diff --git a/src/intel/compiler/brw_fs_live_variables.cpp b/src/intel/compiler/brw_fs_live_variables.cpp
new file mode 100644
index 00000000000..c449672a519
--- /dev/null
+++ b/src/intel/compiler/brw_fs_live_variables.cpp
@@ -0,0 +1,334 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_cfg.h"
+#include "brw_fs_live_variables.h"
+
+using namespace brw;
+
+#define MAX_INSTRUCTION (1 << 30)
+
+/** @file brw_fs_live_variables.cpp
+ *
+ * Support for calculating liveness information about virtual GRFs.
+ *
+ * This produces a live interval for each whole virtual GRF.  We could
+ * choose to expose per-component live intervals for VGRFs of size > 1,
+ * but we currently do not.  It is easier for the consumers of this
+ * information to work with whole VGRFs.
+ *
+ * However, we internally track use/def information at the per-GRF level for
+ * greater accuracy.  Large VGRFs may be accessed piecemeal over many
+ * (possibly non-adjacent) instructions.  In this case, examining a single
+ * instruction is insufficient to decide whether a whole VGRF is ultimately
+ * used or defined.  Tracking individual components allows us to easily
+ * assemble this information.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 14.1 (p444).
+ */
+
+void
+fs_live_variables::setup_one_read(struct block_data *bd, fs_inst *inst,
+                                  int ip, const fs_reg &reg)
+{  /* Record a read of the single var named by reg at instruction ip. */
+   int var = var_from_reg(reg);
+   assert(var < num_vars);
+
+   start[var] = MIN2(start[var], ip);   /* widen [start, end] to cover ip */
+   end[var] = MAX2(end[var], ip);
+
+   /* The use[] bitset marks when the block makes use of a variable (VGRF
+    * channel) without having completely defined that variable within the
+    * block, so the bit is only set if no def[] bit was recorded earlier.
+    */
+   if (!BITSET_TEST(bd->def, var))
+      BITSET_SET(bd->use, var);
+}
+
+void
+fs_live_variables::setup_one_write(struct block_data *bd, fs_inst *inst,
+                                   int ip, const fs_reg &reg)
+{  /* Record a write of the single var named by reg at instruction ip. */
+   int var = var_from_reg(reg);
+   assert(var < num_vars);
+
+   start[var] = MIN2(start[var], ip);   /* widen [start, end] to cover ip */
+   end[var] = MAX2(end[var], ip);
+
+   /* def[] marks a write that completely screens off previous updates of
+    * the variable (VGRF channel): non-partial, and not yet in use[] here.
+    */
+   if (inst->dst.file == VGRF && !inst->is_partial_write()) {
+      if (!BITSET_TEST(bd->use, var))
+         BITSET_SET(bd->def, var);
+   }
+}
+
+/**
+ * Sets up the use[] and def[] bitsets, plus flag_use[] and flag_def[].
+ *
+ * The basic-block-level live variable analysis needs to know which
+ * variables get used before they're completely defined, and which
+ * variables are completely defined before they're used.  Instructions are
+ * visited in order, with ip counting them across the whole cfg.
+ * These are tracked at the per-component level, rather than whole VGRFs.
+ */
+void
+fs_live_variables::setup_def_use()
+{
+   int ip = 0;
+
+   foreach_block (block, cfg) {
+      assert(ip == block->start_ip);   /* ip must match the cfg numbering */
+      if (block->num > 0)
+	 assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
+
+      struct block_data *bd = &block_data[block->num];
+
+      foreach_inst_in_block(fs_inst, inst, block) {
+	 /* Set use[] for this instruction */
+	 for (unsigned int i = 0; i < inst->sources; i++) {
+            fs_reg reg = inst->src[i];
+
+            if (reg.file != VGRF)
+               continue;
+
+            for (unsigned j = 0; j < regs_read(inst, i); j++) {
+               setup_one_read(bd, inst, ip, reg);
+               reg.offset += REG_SIZE;   /* next var of this source */
+            }
+	 }
+
+         bd->flag_use[0] |= inst->flags_read(v->devinfo) & ~bd->flag_def[0];
+
+         /* Set def[] for this instruction */
+         if (inst->dst.file == VGRF) {
+            fs_reg reg = inst->dst;
+            for (unsigned j = 0; j < regs_written(inst); j++) {
+               setup_one_write(bd, inst, ip, reg);
+               reg.offset += REG_SIZE;   /* next var of the destination */
+            }
+	 }
+
+         if (!inst->predicate && inst->exec_size >= 8)
+            bd->flag_def[0] |= inst->flags_written() & ~bd->flag_use[0];
+
+	 ip++;
+      }
+   }
+}
+
+/**
+ * The algorithm incrementally sets bits in liveout and livein,
+ * propagating it through control flow.  It will eventually terminate
+ * because it only ever adds bits, and stops when no bits are added in
+ * a pass.
+ */
+void
+fs_live_variables::compute_live_variables()
+{
+   bool cont = true;   /* set again whenever a pass adds a bit */
+
+   while (cont) {
+      cont = false;
+
+      foreach_block_reverse (block, cfg) {
+         struct block_data *bd = &block_data[block->num];
+
+	 /* Update liveout: OR in the livein of every child block */
+	 foreach_list_typed(bblock_link, child_link, link, &block->children) {
+            struct block_data *child_bd = &block_data[child_link->block->num];
+
+	    for (int i = 0; i < bitset_words; i++) {
+               BITSET_WORD new_liveout = (child_bd->livein[i] &
+                                          ~bd->liveout[i]);
+               if (new_liveout) {
+                  bd->liveout[i] |= new_liveout;
+                  cont = true;
+               }
+	    }
+            BITSET_WORD new_liveout = (child_bd->flag_livein[0] &
+                                       ~bd->flag_liveout[0]);
+            if (new_liveout) {
+               bd->flag_liveout[0] |= new_liveout;
+               cont = true;
+            }
+	 }
+
+         /* Update livein: use | (liveout & ~def) */
+         for (int i = 0; i < bitset_words; i++) {
+            BITSET_WORD new_livein = (bd->use[i] |
+                                      (bd->liveout[i] &
+                                       ~bd->def[i]));
+            if (new_livein & ~bd->livein[i]) {
+               bd->livein[i] |= new_livein;
+               cont = true;
+            }
+         }
+         BITSET_WORD new_livein = (bd->flag_use[0] |
+                                   (bd->flag_liveout[0] &
+                                    ~bd->flag_def[0]));
+         if (new_livein & ~bd->flag_livein[0]) {
+            bd->flag_livein[0] |= new_livein;
+            cont = true;
+         }
+      }
+   }
+}
+
+/**
+ * Extend the start/end ranges for each variable to account for the
+ * livein/liveout information calculated from control flow.
+ */
+void
+fs_live_variables::compute_start_end()
+{
+   foreach_block (block, cfg) {
+      struct block_data *bd = &block_data[block->num];
+
+      for (int i = 0; i < num_vars; i++) {
+         if (BITSET_TEST(bd->livein, i)) {   /* live on entry: cover start_ip */
+            start[i] = MIN2(start[i], block->start_ip);
+            end[i] = MAX2(end[i], block->start_ip);
+         }
+
+         if (BITSET_TEST(bd->liveout, i)) {   /* live on exit: cover end_ip */
+            start[i] = MIN2(start[i], block->end_ip);
+            end[i] = MAX2(end[i], block->end_ip);
+         }
+      }
+   }
+}
+
+fs_live_variables::fs_live_variables(fs_visitor *v, const cfg_t *cfg)
+   : v(v), cfg(cfg)
+{
+   mem_ctx = ralloc_context(NULL);   /* released again by the destructor */
+
+   num_vgrfs = v->alloc.count;
+   num_vars = 0;
+   var_from_vgrf = rzalloc_array(mem_ctx, int, num_vgrfs);
+   for (int i = 0; i < num_vgrfs; i++) {
+      var_from_vgrf[i] = num_vars;   /* first var index belonging to VGRF i */
+      num_vars += v->alloc.sizes[i];
+   }
+
+   vgrf_from_var = rzalloc_array(mem_ctx, int, num_vars);
+   for (int i = 0; i < num_vgrfs; i++) {
+      for (unsigned j = 0; j < v->alloc.sizes[i]; j++) {
+         vgrf_from_var[var_from_vgrf[i] + j] = i;
+      }
+   }
+
+   start = ralloc_array(mem_ctx, int, num_vars);
+   end = rzalloc_array(mem_ctx, int, num_vars);
+   for (int i = 0; i < num_vars; i++) {
+      start[i] = MAX_INSTRUCTION;   /* the MIN2/MAX2 updates begin from an */
+      end[i] = -1;                  /* empty [MAX_INSTRUCTION, -1] range */
+   }
+
+   block_data= rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
+
+   bitset_words = BITSET_WORDS(num_vars);
+   for (int i = 0; i < cfg->num_blocks; i++) {
+      block_data[i].def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      block_data[i].use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      block_data[i].livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      block_data[i].liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+
+      block_data[i].flag_def[0] = 0;
+      block_data[i].flag_use[0] = 0;
+      block_data[i].flag_livein[0] = 0;
+      block_data[i].flag_liveout[0] = 0;
+   }
+
+   setup_def_use();            /* per-block use/def sets */
+   compute_live_variables();   /* fixed-point livein/liveout */
+   compute_start_end();        /* fold liveness into start[]/end[] */
+}
+
+fs_live_variables::~fs_live_variables()
+{
+   ralloc_free(mem_ctx);   /* the constructor allocated every array from it */
+}
+
+void
+fs_visitor::invalidate_live_intervals()
+{
+   ralloc_free(live_intervals);
+   live_intervals = NULL;   /* calculate_live_intervals() recomputes on NULL */
+}
+
+/**
+ * Compute the live intervals for each virtual GRF (no-op if already done).
+ *
+ * This uses the per-component use/def data, but combines it to produce
+ * information about whole VGRFs.
+ */
+void
+fs_visitor::calculate_live_intervals()
+{
+   if (this->live_intervals)
+      return;   /* already computed and not invalidated since */
+
+   int num_vgrfs = this->alloc.count;
+   ralloc_free(this->virtual_grf_start);
+   ralloc_free(this->virtual_grf_end);
+   virtual_grf_start = ralloc_array(mem_ctx, int, num_vgrfs);
+   virtual_grf_end = ralloc_array(mem_ctx, int, num_vgrfs);
+
+   for (int i = 0; i < num_vgrfs; i++) {
+      virtual_grf_start[i] = MAX_INSTRUCTION;   /* empty range to start */
+      virtual_grf_end[i] = -1;
+   }
+
+   this->live_intervals = new(mem_ctx) fs_live_variables(this, cfg);
+
+   /* Merge the per-component live ranges to whole VGRF live ranges. */
+   for (int i = 0; i < live_intervals->num_vars; i++) {
+      int vgrf = live_intervals->vgrf_from_var[i];
+      virtual_grf_start[vgrf] = MIN2(virtual_grf_start[vgrf],
+                                     live_intervals->start[i]);
+      virtual_grf_end[vgrf] = MAX2(virtual_grf_end[vgrf],
+                                   live_intervals->end[i]);
+   }
+}
+
+bool
+fs_live_variables::vars_interfere(int a, int b)
+{
+   /* Interference: each range must end strictly after the other begins. */
+   return end[b] > start[a] && end[a] > start[b];
+}
+
+bool
+fs_visitor::virtual_grf_interferes(int a, int b)
+{  /* Ranges that merely touch (end == start) do not interfere. */
+   return virtual_grf_end[a] > virtual_grf_start[b] &&
+          virtual_grf_end[b] > virtual_grf_start[a];
+}
diff --git a/src/intel/compiler/brw_fs_live_variables.h b/src/intel/compiler/brw_fs_live_variables.h
new file mode 100644
index 00000000000..91d1e42cbc1
--- /dev/null
+++ b/src/intel/compiler/brw_fs_live_variables.h
@@ -0,0 +1,115 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_fs.h"
+#include "util/bitset.h"
+
+struct cfg_t;
+
+namespace brw {
+
+struct block_data {
+   /**
+    * Which variables are defined before being used in the block.
+    *
+    * Note that for our purposes, "defined" means unconditionally, completely
+    * defined.
+    */
+   BITSET_WORD *def;
+
+   /**
+    * Which variables are used before being defined in the block.
+    */
+   BITSET_WORD *use;
+
+   /** Which variables are live at the entry point of the block. */
+   BITSET_WORD *livein;
+
+   /** Which variables are live at the exit point of the block. */
+   BITSET_WORD *liveout;
+
+   BITSET_WORD flag_def[1];      /* single-word counterparts of the four */
+   BITSET_WORD flag_use[1];      /* sets above, fed from flags_read() and */
+   BITSET_WORD flag_livein[1];   /* flags_written() instead of VGRF */
+   BITSET_WORD flag_liveout[1];  /* accesses */
+};
+
+class fs_live_variables {
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(fs_live_variables)
+
+   fs_live_variables(fs_visitor *v, const cfg_t *cfg);
+   ~fs_live_variables();
+
+   bool vars_interfere(int a, int b);   /* a, b are var indices, not VGRFs */
+   int var_from_reg(const fs_reg &reg) const
+   {
+      return var_from_vgrf[reg.nr] + reg.offset / REG_SIZE;
+   }
+
+   /** Map from virtual GRF number to index in block_data arrays. */
+   int *var_from_vgrf;
+
+   /**
+    * Map from any index in block_data to the virtual GRF containing it.
+    *
+    * For alloc.sizes of [1, 2, 3], vgrf_from_var would contain
+    * [0, 1, 1, 2, 2, 2].
+    */
+   int *vgrf_from_var;
+
+   int num_vars;       /* sum of alloc.sizes[] over all VGRFs */
+   int num_vgrfs;      /* v->alloc.count at construction time */
+   int bitset_words;   /* BITSET_WORDS(num_vars) */
+
+   /** @{
+    * Final computed live ranges for each var (each component of each virtual
+    * GRF).
+    */
+   int *start;
+   int *end;
+   /** @} */
+
+   /** Per-basic-block information on live variables */
+   struct block_data *block_data;
+
+protected:
+   void setup_def_use();
+   void setup_one_read(struct block_data *bd, fs_inst *inst, int ip,
+                       const fs_reg &reg);
+   void setup_one_write(struct block_data *bd, fs_inst *inst, int ip,
+                        const fs_reg &reg);
+   void compute_live_variables();
+   void compute_start_end();
+
+   fs_visitor *v;
+   const cfg_t *cfg;
+   void *mem_ctx;   /* ralloc context that owns every array above */
+
+};
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_fs_lower_d2x.cpp b/src/intel/compiler/brw_fs_lower_d2x.cpp
new file mode 100644
index 00000000000..a2db1154615
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_d2x.cpp
@@ -0,0 +1,78 @@
+/*
+ * Copyright © 2015 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+bool
+fs_visitor::lower_d2x()
+{
+   bool progress = false;   /* true once any MOV has been rewritten */
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MOV)
+         continue;
+
+      if (inst->dst.type != BRW_REGISTER_TYPE_F &&
+          inst->dst.type != BRW_REGISTER_TYPE_D &&
+          inst->dst.type != BRW_REGISTER_TYPE_UD)
+         continue;   /* destination must be F, D or UD */
+
+      if (inst->src[0].type != BRW_REGISTER_TYPE_DF &&
+          inst->src[0].type != BRW_REGISTER_TYPE_UQ &&
+          inst->src[0].type != BRW_REGISTER_TYPE_Q)
+         continue;   /* source must be DF, UQ or Q */
+
+      assert(inst->dst.file == VGRF);
+      assert(inst->saturate == false);
+      fs_reg dst = inst->dst;
+
+      const fs_builder ibld(this, block, inst);
+
+      /* From the Broadwell PRM, 3D Media GPGPU, "Double Precision Float to
+       * Single Precision Float":
+       *
+       *    The upper Dword of every Qword will be written with undefined
+       *    value when converting DF to F.
+       *
+       * So we need to allocate a temporary that's two registers, and then do
+       * a strided MOV to get the lower DWord of every Qword that has the
+       * result.
+       */
+      fs_reg temp = ibld.vgrf(inst->src[0].type, 1);
+      fs_reg strided_temp = subscript(temp, inst->dst.type, 0);
+      ibld.MOV(strided_temp, inst->src[0]);   /* convert into the temp */
+      ibld.MOV(dst, strided_temp);            /* then copy out to dst */
+
+      inst->remove(block);   /* the two MOVs above replace the original */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_lower_pack.cpp b/src/intel/compiler/brw_fs_lower_pack.cpp
new file mode 100644
index 00000000000..7afaae095bd
--- /dev/null
+++ b/src/intel/compiler/brw_fs_lower_pack.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright © 2015 Connor Abbott
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_builder.h"
+
+using namespace brw;
+
+bool
+fs_visitor::lower_pack()
+{
+   bool progress = false;   /* true once any PACK has been expanded */
+
+   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
+      if (inst->opcode != FS_OPCODE_PACK)
+         continue;
+
+      assert(inst->dst.file == VGRF);
+      assert(inst->saturate == false);
+      fs_reg dst = inst->dst;
+
+      const fs_builder ibld(this, block, inst);
+      for (unsigned i = 0; i < inst->sources; i++)   /* one MOV per source */
+         ibld.MOV(subscript(dst, inst->src[i].type, i), inst->src[i]);
+
+      inst->remove(block);   /* the MOVs above replace the PACK */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
new file mode 100644
index 00000000000..d403dec5357
--- /dev/null
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -0,0 +1,4679 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/glsl/ir.h"
+#include "brw_fs.h"
+#include "brw_fs_surface_builder.h"
+#include "brw_nir.h"
+
+using namespace brw;
+using namespace brw::surface_access;
+
+void
+fs_visitor::emit_nir_code()
+{
+   /* emit the arrays used for inputs and outputs - load/store intrinsics will
+    * be converted to reads/writes of these arrays
+    */
+   nir_setup_outputs();
+   nir_setup_uniforms();
+   nir_emit_system_values();
+
+   /* get the main function and emit it */
+   nir_foreach_function(function, nir) {
+      assert(strcmp(function->name, "main") == 0);   /* only "main" expected */
+      assert(function->impl);
+      nir_emit_impl(function->impl);
+   }
+}
+
+void
+fs_visitor::nir_setup_outputs()
+{
+   if (stage == MESA_SHADER_TESS_CTRL || stage == MESA_SHADER_FRAGMENT)
+      return;   /* nothing is allocated here for these two stages */
+
+   nir_foreach_variable(var, &nir->outputs) {
+      const unsigned vec4s =
+         var->data.compact ? DIV_ROUND_UP(glsl_get_length(var->type), 4)
+                           : type_size_vec4(var->type);
+      fs_reg reg = bld.vgrf(BRW_REGISTER_TYPE_F, 4 * vec4s);
+      for (unsigned i = 0; i < vec4s; i++) {   /* fill only unset slots */
+         if (outputs[var->data.driver_location + i].file == BAD_FILE)
+            outputs[var->data.driver_location + i] = offset(reg, bld, 4 * i);
+      }
+   }
+}
+
+void
+fs_visitor::nir_setup_uniforms()
+{
+   /* Only the compile at min_dispatch_width records `uniforms`; every
+    * other dispatch width leaves the member untouched. */
+   if (dispatch_width == min_dispatch_width)
+      uniforms = nir->num_uniforms / 4;
+}
+
+static bool
+emit_system_values_block(nir_block *block, fs_visitor *v)
+{
+   fs_reg *reg;   /* slot in v->nir_system_values[] for the current case */
+
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      switch (intrin->intrinsic) {
+      case nir_intrinsic_load_vertex_id:
+         unreachable("should be lowered by lower_vertex_id().");
+
+      case nir_intrinsic_load_vertex_id_zero_base:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_VERTEX_ID_ZERO_BASE];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE);
+         break;
+
+      case nir_intrinsic_load_base_vertex:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_VERTEX];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_VERTEX);
+         break;
+
+      case nir_intrinsic_load_instance_id:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_INSTANCE_ID];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_INSTANCE_ID);
+         break;
+
+      case nir_intrinsic_load_base_instance:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_BASE_INSTANCE];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_BASE_INSTANCE);
+         break;
+
+      case nir_intrinsic_load_draw_id:
+         assert(v->stage == MESA_SHADER_VERTEX);
+         reg = &v->nir_system_values[SYSTEM_VALUE_DRAW_ID];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_vs_system_value(SYSTEM_VALUE_DRAW_ID);
+         break;
+
+      case nir_intrinsic_load_invocation_id:
+         if (v->stage == MESA_SHADER_TESS_CTRL)
+            break;   /* nothing is set up here for TCS */
+         assert(v->stage == MESA_SHADER_GEOMETRY);
+         reg = &v->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+         if (reg->file == BAD_FILE) {
+            const fs_builder abld = v->bld.annotate("gl_InvocationID", NULL);
+            fs_reg g1(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+            fs_reg iid = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            abld.SHR(iid, g1, brw_imm_ud(27u));   /* iid = g1 >> 27 */
+            *reg = iid;
+         }
+         break;
+
+      case nir_intrinsic_load_sample_pos:
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_samplepos_setup();
+         break;
+
+      case nir_intrinsic_load_sample_id:
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_sampleid_setup();
+         break;
+
+      case nir_intrinsic_load_sample_mask_in:
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         assert(v->devinfo->gen >= 7);
+         reg = &v->nir_system_values[SYSTEM_VALUE_SAMPLE_MASK_IN];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_samplemaskin_setup();
+         break;
+
+      case nir_intrinsic_load_work_group_id:
+         assert(v->stage == MESA_SHADER_COMPUTE);
+         reg = &v->nir_system_values[SYSTEM_VALUE_WORK_GROUP_ID];
+         if (reg->file == BAD_FILE)
+            *reg = *v->emit_cs_work_group_id_setup();
+         break;
+
+      case nir_intrinsic_load_helper_invocation:
+         assert(v->stage == MESA_SHADER_FRAGMENT);
+         reg = &v->nir_system_values[SYSTEM_VALUE_HELPER_INVOCATION];
+         if (reg->file == BAD_FILE) {
+            const fs_builder abld =
+               v->bld.annotate("gl_HelperInvocation", NULL);
+
+            /* On Gen6+ (gl_HelperInvocation is only exposed on Gen7+) the
+             * pixel mask is in g1.7 of the thread payload.
+             *
+             * We move the per-channel pixel enable bit to the low bit of each
+             * channel by shifting the byte containing the pixel mask by the
+             * vector immediate 0x76543210UV.
+             *
+             * The region of <1,8,0> reads only 1 byte (the pixel masks for
+             * subspans 0 and 1) in SIMD8 and an additional byte (the pixel
+             * masks for 2 and 3) in SIMD16.
+             */
+            fs_reg shifted = abld.vgrf(BRW_REGISTER_TYPE_UW, 1);
+            abld.SHR(shifted,
+                     stride(byte_offset(retype(brw_vec1_grf(1, 0),
+                                               BRW_REGISTER_TYPE_UB), 28),
+                            1, 8, 0),
+                     brw_imm_v(0x76543210));
+
+            /* A set bit in the pixel mask means the channel is enabled, but
+             * that is the opposite of gl_HelperInvocation so we need to invert
+             * the mask.
+             *
+             * The negate source-modifier bit of logical instructions on Gen8+
+             * performs 1's complement negation, so we can use that instead of
+             * a NOT instruction.
+             */
+            fs_reg inverted = negate(shifted);
+            if (v->devinfo->gen < 8) {   /* pre-Gen8: emit an explicit NOT */
+               inverted = abld.vgrf(BRW_REGISTER_TYPE_UW);
+               abld.NOT(inverted, shifted);
+            }
+
+            /* We then resolve the 0/1 result to 0/~0 boolean values by ANDing
+             * with 1 and negating.
+             */
+            fs_reg anded = abld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            abld.AND(anded, inverted, brw_imm_uw(1));
+
+            fs_reg dst = abld.vgrf(BRW_REGISTER_TYPE_D, 1);
+            abld.MOV(dst, negate(retype(anded, BRW_REGISTER_TYPE_D)));
+            *reg = dst;
+         }
+         break;
+
+      default:
+         break;   /* every other intrinsic is ignored by this scan */
+      }
+   }
+
+   return true;   /* unconditional; no path in this function returns false */
+}
+
+void
+fs_visitor::nir_emit_system_values()
+{
+   nir_system_values = ralloc_array(mem_ctx, fs_reg, SYSTEM_VALUE_MAX);
+   for (unsigned i = 0; i < SYSTEM_VALUE_MAX; i++) {
+      nir_system_values[i] = fs_reg();   /* presumably BAD_FILE, i.e. unset */
+   }
+
+   nir_foreach_function(function, nir) {
+      assert(strcmp(function->name, "main") == 0);   /* only "main" expected */
+      assert(function->impl);
+      nir_foreach_block(block, function->impl) {
+         emit_system_values_block(block, this);   /* result is ignored */
+      }
+   }
+}
+
+void
+fs_visitor::nir_emit_impl(nir_function_impl *impl)
+{
+   nir_locals = ralloc_array(mem_ctx, fs_reg, impl->reg_alloc);
+   for (unsigned i = 0; i < impl->reg_alloc; i++) {
+      nir_locals[i] = fs_reg();   /* default value until assigned below */
+   }
+
+   foreach_list_typed(nir_register, reg, node, &impl->registers) {
+      unsigned array_elems =
+         reg->num_array_elems == 0 ? 1 : reg->num_array_elems;
+      unsigned size = array_elems * reg->num_components;
+      const brw_reg_type reg_type =   /* any non-32-bit size becomes DF */
+         reg->bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+      nir_locals[reg->index] = bld.vgrf(reg_type, size);
+   }
+
+   nir_ssa_values = reralloc(mem_ctx, nir_ssa_values, fs_reg,
+                             impl->ssa_alloc);
+
+   nir_emit_cf_list(&impl->body);   /* emit the function body itself */
+}
+
+void
+fs_visitor::nir_emit_cf_list(exec_list *list)
+{
+   exec_list_validate(list);
+   foreach_list_typed(nir_cf_node, node, node, list) {
+      switch (node->type) {   /* dispatch on the kind of control-flow node */
+      case nir_cf_node_if:
+         nir_emit_if(nir_cf_node_as_if(node));
+         break;
+
+      case nir_cf_node_loop:
+         nir_emit_loop(nir_cf_node_as_loop(node));
+         break;
+
+      case nir_cf_node_block:
+         nir_emit_block(nir_cf_node_as_block(node));
+         break;
+
+      default:
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+void
+fs_visitor::nir_emit_if(nir_if *if_stmt)
+{
+   /* first, put the condition into f0: a MOV to null with cmod NZ */
+   fs_inst *inst = bld.MOV(bld.null_reg_d(),
+                            retype(get_nir_src(if_stmt->condition),
+                                   BRW_REGISTER_TYPE_D));
+   inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   bld.IF(BRW_PREDICATE_NORMAL);
+
+   nir_emit_cf_list(&if_stmt->then_list);
+
+   /* note: if the else is empty, dead CF elimination will remove it */
+   bld.emit(BRW_OPCODE_ELSE);   /* emitted even when else_list is empty */
+
+   nir_emit_cf_list(&if_stmt->else_list);
+
+   bld.emit(BRW_OPCODE_ENDIF);
+}
+
+void
+fs_visitor::nir_emit_loop(nir_loop *loop)
+{
+   bld.emit(BRW_OPCODE_DO);   /* the loop body is bracketed by DO ... WHILE */
+
+   nir_emit_cf_list(&loop->body);
+
+   bld.emit(BRW_OPCODE_WHILE);
+}
+
+void
+fs_visitor::nir_emit_block(nir_block *block)
+{
+   nir_foreach_instr(instr, block) {
+      nir_emit_instr(instr);   /* dispatches on instr->type */
+   }
+}
+
+void
+fs_visitor::nir_emit_instr(nir_instr *instr)
+{
+   const fs_builder abld = bld.annotate(NULL, instr);   /* tagged with instr */
+
+   switch (instr->type) {
+   case nir_instr_type_alu:
+      nir_emit_alu(abld, nir_instr_as_alu(instr));
+      break;
+
+   case nir_instr_type_intrinsic:
+      switch (stage) {   /* intrinsics are handled per shader stage */
+      case MESA_SHADER_VERTEX:
+         nir_emit_vs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_TESS_CTRL:
+         nir_emit_tcs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_TESS_EVAL:
+         nir_emit_tes_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_GEOMETRY:
+         nir_emit_gs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_FRAGMENT:
+         nir_emit_fs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      case MESA_SHADER_COMPUTE:
+         nir_emit_cs_intrinsic(abld, nir_instr_as_intrinsic(instr));
+         break;
+      default:
+         unreachable("unsupported shader stage");
+      }
+      break;
+
+   case nir_instr_type_tex:
+      nir_emit_texture(abld, nir_instr_as_tex(instr));
+      break;
+
+   case nir_instr_type_load_const:
+      nir_emit_load_const(abld, nir_instr_as_load_const(instr));
+      break;
+
+   case nir_instr_type_ssa_undef:
+      /* We create a new VGRF for undefs on every use (by handling
+       * them in get_nir_src()), rather than for each definition.
+       * This helps register coalescing eliminate MOVs from undef.
+       */
+      break;   /* so nothing is emitted at the definition */
+
+   case nir_instr_type_jump:
+      nir_emit_jump(abld, nir_instr_as_jump(instr));
+      break;
+
+   default:
+      unreachable("unknown instruction type");
+   }
+}
+
+/**
+ * Folds a nir_op_extract_* feeding the first source of \p instr into a
+ * single MOV that reads the extracted byte or word directly.
+ *
+ * \param instr   ALU instruction whose first source is inspected
+ * \param result  destination register for the emitted MOV
+ *
+ * \return true if the MOV was emitted (nothing else needs to be generated
+ *         for \p instr); false if the first source is not an SSA value
+ *         produced by an extract_u8/i8/u16/i16 ALU instruction.
+ */
+bool
+fs_visitor::optimize_extract_to_float(nir_alu_instr *instr,
+                                      const fs_reg &result)
+{
+   if (!instr->src[0].src.is_ssa)
+      return false;
+
+   nir_instr *parent = instr->src[0].src.ssa->parent_instr;
+   if (!parent || parent->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *extract = nir_instr_as_alu(parent);
+
+   /* Size in bytes and signedness of the element being extracted. */
+   unsigned elem_bytes;
+   bool elem_signed;
+   switch (extract->op) {
+   case nir_op_extract_u8:
+      elem_bytes = 1;
+      elem_signed = false;
+      break;
+   case nir_op_extract_i8:
+      elem_bytes = 1;
+      elem_signed = true;
+      break;
+   case nir_op_extract_u16:
+      elem_bytes = 2;
+      elem_signed = false;
+      break;
+   case nir_op_extract_i16:
+      elem_bytes = 2;
+      elem_signed = true;
+      break;
+   default:
+      return false;
+   }
+
+   /* The element index has to be a compile-time constant. */
+   nir_const_value *index = nir_src_as_const_value(extract->src[1].src);
+   assert(index != NULL);
+
+   const brw_reg_type elem_type = brw_int_type(elem_bytes, elem_signed);
+
+   /* Fetch the value the extract reads from, typed the way the extract
+    * opcode expects, and select the channel picked by its swizzle.
+    */
+   fs_reg extract_src = get_nir_src(extract->src[0].src);
+   const nir_alu_type extract_src_type =
+      (nir_alu_type)(nir_op_infos[extract->op].input_types[0] |
+                     nir_src_bit_size(extract->src[0].src));
+   extract_src.type = brw_type_for_nir_type(devinfo, extract_src_type);
+   extract_src = offset(extract_src, bld, extract->src[0].swizzle[0]);
+
+   fs_inst *mov =
+      bld.MOV(result, subscript(extract_src, elem_type, index->u32[0]));
+   set_saturate(instr->dest.saturate, mov);
+   return true;
+}
+
+/**
+ * Recognizes bcsel(load_front_face, +/-1.0, -/+1.0) and emits it as an
+ * OR/AND pair operating directly on the payload bit that carries the
+ * facing information.
+ *
+ * \param instr   the bcsel instruction to inspect
+ * \param result  destination register of \p instr
+ *
+ * \return true if the optimized sequence was emitted; false if the pattern
+ *         does not match and the caller must emit the generic code.
+ */
+bool
+fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr,
+                                         const fs_reg &result)
+{
+   if (!instr->src[0].src.is_ssa ||
+       instr->src[0].src.ssa->parent_instr->type != nir_instr_type_intrinsic)
+      return false;
+
+   nir_intrinsic_instr *src0 =
+      nir_instr_as_intrinsic(instr->src[0].src.ssa->parent_instr);
+
+   if (src0->intrinsic != nir_intrinsic_load_front_face)
+      return false;
+
+   nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+   if (!value1 || fabsf(value1->f32[0]) != 1.0f)
+      return false;
+
+   nir_const_value *value2 = nir_src_as_const_value(instr->src[2].src);
+   if (!value2 || fabsf(value2->f32[0]) != 1.0f)
+      return false;
+
+   /* The sequence below always produces values of opposite sign for the two
+    * facings, so it is only valid when the two constants are +1.0/-1.0 (in
+    * either order).  For bcsel(ff, a, a) fall back to the generic path
+    * rather than emit a wrong result.
+    */
+   if (value1->f32[0] == value2->f32[0])
+      return false;
+
+   fs_reg tmp = vgrf(glsl_type::int_type);
+
+   if (devinfo->gen >= 6) {
+      /* Bit 15 of g0.0 is 0 if the polygon is front facing. */
+      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp.1<2>W  g0.0<0,1,0>W  0x00003f80W
+       *    and(8) dst<1>D    tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g0.0<0,1,0>W for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1->f32[0] == -1.0f) {
+         g0.negate = true;
+      }
+
+      /* Only the high word of each tmp dword is written here; the low word
+       * is masked off by the AND below.
+       */
+      bld.OR(subscript(tmp, BRW_REGISTER_TYPE_W, 1),
+             g0, brw_imm_uw(0x3f80));
+   } else {
+      /* Bit 31 of g1.6 is 0 if the polygon is front facing. */
+      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
+
+      /* For (gl_FrontFacing ? 1.0 : -1.0), emit:
+       *
+       *    or(8)  tmp<1>D  g1.6<0,1,0>D  0x3f800000D
+       *    and(8) dst<1>D  tmp<8,8,1>D   0xbf800000D
+       *
+       * and negate g1.6<0,1,0>D for (gl_FrontFacing ? -1.0 : 1.0).
+       *
+       * This negation looks like it's safe in practice, because bits 0:4 will
+       * surely be TRIANGLES
+       */
+
+      if (value1->f32[0] == -1.0f) {
+         g1_6.negate = true;
+      }
+
+      bld.OR(tmp, g1_6, brw_imm_d(0x3f800000));
+   }
+
+   /* Keep only the sign bit and the bit pattern of 1.0f. */
+   bld.AND(retype(result, BRW_REGISTER_TYPE_D), tmp, brw_imm_d(0xbf800000));
+
+   return true;
+}
+
+/**
+ * Emits a findMSB() sequence built on the LZD instruction.
+ *
+ * \param bld        builder the instructions are emitted with
+ * \param result     destination register
+ * \param src        value whose most significant bit is searched for
+ * \param is_signed  true to apply the signed findMSB() rules, where the
+ *                   search on a negative value looks for the highest zero
+ *                   bit instead of the highest one bit
+ */
+static void
+emit_find_msb_using_lzd(const fs_builder &bld,
+                        const fs_reg &result,
+                        const fs_reg &src,
+                        bool is_signed)
+{
+   fs_reg lzd_src = src;
+
+   if (is_signed) {
+      /* Feeding LZD the absolute value of the source is almost right, but
+       * it gives the wrong answer for three kinds of input:
+       *
+       * * 0x80000000.  abs(0x80000000) == 0x80000000, so LZD returns 0,
+       *   whereas findMSB(int(0x80000000)) == 30.
+       *
+       * * 0xffffffff.  abs(0xffffffff) == 1, so LZD returns 31.  Section
+       *   8.8 (Integer Functions) of the GLSL 4.50 spec says:
+       *
+       *    For a value of zero or negative one, -1 will be returned.
+       *
+       * * Negative powers of two.  LZD(abs(-(1<<x))) returns x, but
+       *   findMSB(-(1<<x)) should return x-1.
+       *
+       * Using the logical-not of a negative value instead of its negation
+       * gives the correct LZD input in all of those cases.  ASR by 31
+       * yields all ones for a negative source and zero otherwise, so
+       * XORing that with the source is a conditional logical-not in two
+       * instructions.
+       */
+      lzd_src = bld.vgrf(BRW_REGISTER_TYPE_D);
+
+      bld.ASR(lzd_src, src, brw_imm_d(31));
+      bld.XOR(lzd_src, lzd_src, src);
+   }
+
+   const fs_reg result_ud = retype(result, BRW_REGISTER_TYPE_UD);
+   bld.LZD(result_ud, retype(lzd_src, BRW_REGISTER_TYPE_UD));
+
+   /* LZD counts leading zeros from the MSB end, but findMSB() wants a bit
+    * index counted from the LSB end, so compute 31 - LZD by adding 31 to
+    * the negated LZD result.  With no bits set LZD returns 32, and
+    * 31 - 32 = -1 is exactly what findMSB() has to return.
+    */
+   fs_inst *sub =
+      bld.ADD(result, retype(result, BRW_REGISTER_TYPE_D), brw_imm_d(31));
+   sub->src[0].negate = true;
+}
+
+/**
+ * Translate one NIR ALU instruction into back-end instructions.
+ *
+ * Moves and vecN are expanded one written channel at a time.  Every other
+ * opcode is expected to write a single channel; source and destination
+ * registers are offset to that channel before the per-opcode code runs.
+ *
+ * \param bld    builder the instructions are emitted with
+ * \param instr  the NIR ALU instruction to translate
+ */
+void
+fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr)
+{
+   struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key;
+   fs_inst *inst;
+
+   fs_reg result = get_nir_dest(instr->dest.dest);
+   result.type = brw_type_for_nir_type(devinfo,
+      (nir_alu_type)(nir_op_infos[instr->op].output_type |
+                     nir_dest_bit_size(instr->dest.dest)));
+
+   fs_reg op[4];
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      op[i] = get_nir_src(instr->src[i].src);
+      op[i].type = brw_type_for_nir_type(devinfo,
+         (nir_alu_type)(nir_op_infos[instr->op].input_types[i] |
+                        nir_src_bit_size(instr->src[i].src)));
+      op[i].abs = instr->src[i].abs;
+      op[i].negate = instr->src[i].negate;
+   }
+
+   /* We get a bunch of mov's out of the from_ssa pass and they may still
+    * be vectorized.  We'll handle them as a special-case.  We'll also
+    * handle vecN here because it's basically the same thing.
+    */
+   switch (instr->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4: {
+      fs_reg temp = result;
+      bool need_extra_copy = false;
+      for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+         if (!instr->src[i].src.is_ssa &&
+             instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) {
+            need_extra_copy = true;
+            temp = bld.vgrf(result.type, 4);
+            break;
+         }
+      }
+
+      for (unsigned i = 0; i < 4; i++) {
+         if (!(instr->dest.write_mask & (1 << i)))
+            continue;
+
+         if (instr->op == nir_op_imov || instr->op == nir_op_fmov) {
+            inst = bld.MOV(offset(temp, bld, i),
+                           offset(op[0], bld, instr->src[0].swizzle[i]));
+         } else {
+            inst = bld.MOV(offset(temp, bld, i),
+                           offset(op[i], bld, instr->src[i].swizzle[0]));
+         }
+         inst->saturate = instr->dest.saturate;
+      }
+
+      /* In this case the source and destination registers were the same,
+       * so we need to insert an extra set of moves in order to deal with
+       * any swizzling.
+       */
+      if (need_extra_copy) {
+         for (unsigned i = 0; i < 4; i++) {
+            if (!(instr->dest.write_mask & (1 << i)))
+               continue;
+
+            bld.MOV(offset(result, bld, i), offset(temp, bld, i));
+         }
+      }
+      return;
+   }
+   default:
+      break;
+   }
+
+   /* At this point, we have dealt with any instruction that operates on
+    * more than a single channel.  Therefore, we can just adjust the source
+    * and destination registers for that channel and emit the instruction.
+    */
+   unsigned channel = 0;
+   if (nir_op_infos[instr->op].output_size == 0) {
+      /* Since NIR is doing the scalarizing for us, we should only ever see
+       * vectorized operations with a single channel.
+       */
+      assert(_mesa_bitcount(instr->dest.write_mask) == 1);
+      channel = ffs(instr->dest.write_mask) - 1;
+
+      result = offset(result, bld, channel);
+   }
+
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      assert(nir_op_infos[instr->op].input_sizes[i] < 2);
+      op[i] = offset(op[i], bld, instr->src[i].swizzle[channel]);
+   }
+
+   switch (instr->op) {
+   case nir_op_i2f:
+   case nir_op_u2f:
+   case nir_op_i642d:
+   case nir_op_u642d:
+      if (optimize_extract_to_float(instr, result))
+         return;
+      inst = bld.MOV(result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_f2d:
+   case nir_op_i2d:
+   case nir_op_u2d:
+      /* CHV PRM, vol07, 3D Media GPGPU Engine, Register Region Restrictions:
+       *
+       *    "When source or destination is 64b (...), regioning in Align1
+       *     must follow these rules:
+       *
+       *     1. Source and destination horizontal stride must be aligned to
+       *        the same qword.
+       *     (...)"
+       *
+       * This means that 32-bit to 64-bit conversions need to have the 32-bit
+       * data elements aligned to 64-bit. This restriction does not apply to
+       * BDW and later.
+       */
+      if (nir_dest_bit_size(instr->dest.dest) == 64 &&
+          nir_src_bit_size(instr->src[0].src) == 32 &&
+          (devinfo->is_cherryview || devinfo->is_broxton)) {
+         fs_reg tmp = bld.vgrf(result.type, 1);
+         tmp = subscript(tmp, op[0].type, 0);
+         inst = bld.MOV(tmp, op[0]);
+         inst = bld.MOV(result, tmp);
+         inst->saturate = instr->dest.saturate;
+         break;
+      }
+      /* fallthrough */
+   case nir_op_f2i64:
+   case nir_op_f2u64:
+   case nir_op_i2i64:
+   case nir_op_i2u64:
+   case nir_op_u2i64:
+   case nir_op_u2u64:
+   case nir_op_b2i64:
+   case nir_op_d2f:
+   case nir_op_d2i:
+   case nir_op_d2u:
+   case nir_op_i642f:
+   case nir_op_u642f:
+   case nir_op_u2i32:
+   case nir_op_i2i32:
+   case nir_op_u2u32:
+   case nir_op_i2u32:
+      if (instr->op == nir_op_b2i64) {
+         bld.MOV(result, negate(op[0]));
+      } else {
+         inst = bld.MOV(result, op[0]);
+         inst->saturate = instr->dest.saturate;
+      }
+      break;
+
+   case nir_op_f2i:
+   case nir_op_f2u:
+      bld.MOV(result, op[0]);
+      break;
+
+   case nir_op_fsign: {
+      if (op[0].abs) {
+         /* Straightforward since the source can be assumed to be
+          * non-negative.
+          */
+         set_condmod(BRW_CONDITIONAL_NZ, bld.MOV(result, op[0]));
+         set_predicate(BRW_PREDICATE_NORMAL, bld.MOV(result, brw_imm_f(1.0f)));
+
+      } else if (type_sz(op[0].type) < 8) {
+         /* AND(val, 0x80000000) gives the sign bit.
+          *
+          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+          * zero.
+          */
+         bld.CMP(bld.null_reg_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+
+         fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
+         op[0].type = BRW_REGISTER_TYPE_UD;
+         result.type = BRW_REGISTER_TYPE_UD;
+         bld.AND(result_int, op[0], brw_imm_ud(0x80000000u));
+
+         inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         if (instr->dest.saturate) {
+            /* The bit pattern built above is a float, so the saturating
+             * MOV has to be done on a float-typed view of the register.
+             * With the UD type set above, saturation would be applied to
+             * an unsigned integer move and -1.0f would never be clamped
+             * to 0.0f.
+             */
+            fs_reg result_f = retype(result, BRW_REGISTER_TYPE_F);
+            inst = bld.MOV(result_f, result_f);
+            inst->saturate = true;
+         }
+      } else {
+         /* For doubles we do the same but we need to consider:
+          *
+          * - 2-src instructions can't operate with 64-bit immediates
+          * - The sign is encoded in the high 32-bit of each DF
+          * - CMP with DF requires special handling in SIMD16
+          * - We need to produce a DF result.
+          */
+
+         /* 2-src instructions can't have 64-bit immediates, so put 0.0 in
+          * a register and compare with that.
+          */
+         fs_reg tmp = vgrf(glsl_type::double_type);
+         bld.MOV(tmp, setup_imm_df(bld, 0.0));
+
+         /* A direct DF CMP using the flag register (null dst) won't work in
+          * SIMD16 because the CMP will be split in two by lower_simd_width,
+          * resulting in two CMP instructions with the same dst (NULL),
+          * leading to dead code elimination of the first one. In SIMD8,
+          * however, there is no need to split the CMP and we can save some
+          * work.
+          */
+         fs_reg dst_tmp = vgrf(glsl_type::double_type);
+         bld.CMP(dst_tmp, op[0], tmp, BRW_CONDITIONAL_NZ);
+
+         /* In SIMD16 we want to avoid using a NULL dst register with DF CMP,
+          * so we store the result of the comparison in a vgrf instead and
+          * then we generate a UD comparison from that that won't have to
+          * be split by lower_simd_width. This is what NIR does to handle
+          * double comparisons in the general case.
+          */
+         if (bld.dispatch_width() == 16 ) {
+            fs_reg dst_tmp_ud = retype(dst_tmp, BRW_REGISTER_TYPE_UD);
+            bld.MOV(dst_tmp_ud, subscript(dst_tmp, BRW_REGISTER_TYPE_UD, 0));
+            bld.CMP(bld.null_reg_ud(),
+                    dst_tmp_ud, brw_imm_ud(0), BRW_CONDITIONAL_NZ);
+         }
+
+         /* Get the high 32-bit of each double component where the sign is */
+         fs_reg result_int = retype(result, BRW_REGISTER_TYPE_UD);
+         bld.MOV(result_int, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+
+         /* Get the sign bit */
+         bld.AND(result_int, result_int, brw_imm_ud(0x80000000u));
+
+         /* Add 1.0 to the sign, predicated to skip the case of op[0] == 0.0 */
+         inst = bld.OR(result_int, result_int, brw_imm_ud(0x3f800000u));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+
+         /* Convert from 32-bit float to 64-bit double */
+         result.type = BRW_REGISTER_TYPE_DF;
+         inst = bld.MOV(result, retype(result_int, BRW_REGISTER_TYPE_F));
+
+         if (instr->dest.saturate) {
+            inst = bld.MOV(result, result);
+            inst->saturate = true;
+         }
+      }
+      break;
+   }
+
+   case nir_op_isign:
+      /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
+       *               -> non-negative val generates 0x00000000.
+       *  Predicated OR sets 1 if val is positive.
+       */
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G);
+      bld.ASR(result, op[0], brw_imm_d(31));
+      inst = bld.OR(result, result, brw_imm_d(1));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case nir_op_frcp:
+      inst = bld.emit(SHADER_OPCODE_RCP, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fexp2:
+      inst = bld.emit(SHADER_OPCODE_EXP2, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_flog2:
+      inst = bld.emit(SHADER_OPCODE_LOG2, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fsin:
+      inst = bld.emit(SHADER_OPCODE_SIN, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fcos:
+      inst = bld.emit(SHADER_OPCODE_COS, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fddx:
+      if (fs_key->high_quality_derivatives) {
+         inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
+      } else {
+         inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+      }
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_fddx_fine:
+      inst = bld.emit(FS_OPCODE_DDX_FINE, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_fddx_coarse:
+      inst = bld.emit(FS_OPCODE_DDX_COARSE, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_fddy:
+      if (fs_key->high_quality_derivatives) {
+         inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
+      } else {
+         inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
+      }
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_fddy_fine:
+      inst = bld.emit(FS_OPCODE_DDY_FINE, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_fddy_coarse:
+      inst = bld.emit(FS_OPCODE_DDY_COARSE, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_iadd:
+   case nir_op_fadd:
+      inst = bld.ADD(result, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fmul:
+      inst = bld.MUL(result, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_imul:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.MUL(result, op[0], op[1]);
+      break;
+
+   case nir_op_imul_high:
+   case nir_op_umul_high:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.emit(SHADER_OPCODE_MULH, result, op[0], op[1]);
+      break;
+
+   case nir_op_idiv:
+   case nir_op_udiv:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.emit(SHADER_OPCODE_INT_QUOTIENT, result, op[0], op[1]);
+      break;
+
+   case nir_op_uadd_carry:
+      unreachable("Should have been lowered by carry_to_arith().");
+
+   case nir_op_usub_borrow:
+      unreachable("Should have been lowered by borrow_to_arith().");
+
+   case nir_op_umod:
+   case nir_op_irem:
+      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+       * appears that our hardware just does the right thing for signed
+       * remainder.
+       */
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+      break;
+
+   case nir_op_imod: {
+      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
+      bld.emit(SHADER_OPCODE_INT_REMAINDER, result, op[0], op[1]);
+
+      /* Math instructions don't support conditional mod */
+      inst = bld.MOV(bld.null_reg_d(), result);
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if signs of the sources are different.
+       * When we XOR the sources, the top bit is 0 if they are the same and 1
+       * if they are different.  We can then use a conditional modifier to
+       * turn that into a predicate.  This leads us to an XOR.l instruction.
+       *
+       * Technically, according to the PRM, you're not allowed to use .l on a
+       * XOR instruction.  However, emperical experiments and Curro's reading
+       * of the simulator source both indicate that it's safe.
+       */
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_D);
+      inst = bld.XOR(tmp, op[0], op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      /* If the result of the initial remainder operation is non-zero and the
+       * two sources have different signs, add in a copy of op[1] to get the
+       * final integer modulus value.
+       */
+      inst = bld.ADD(result, result, op[1]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
+   case nir_op_flt:
+   case nir_op_fge:
+   case nir_op_feq:
+   case nir_op_fne: {
+      fs_reg dest = result;
+      if (nir_src_bit_size(instr->src[0].src) > 32) {
+         dest = bld.vgrf(BRW_REGISTER_TYPE_DF, 1);
+      }
+      brw_conditional_mod cond;
+      switch (instr->op) {
+      case nir_op_flt:
+         cond = BRW_CONDITIONAL_L;
+         break;
+      case nir_op_fge:
+         cond = BRW_CONDITIONAL_GE;
+         break;
+      case nir_op_feq:
+         cond = BRW_CONDITIONAL_Z;
+         break;
+      case nir_op_fne:
+         cond = BRW_CONDITIONAL_NZ;
+         break;
+      default:
+         unreachable("bad opcode");
+      }
+      bld.CMP(dest, op[0], op[1], cond);
+      if (nir_src_bit_size(instr->src[0].src) > 32) {
+         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      }
+      break;
+   }
+
+   case nir_op_ilt:
+   case nir_op_ult:
+   case nir_op_ige:
+   case nir_op_uge:
+   case nir_op_ieq:
+   case nir_op_ine: {
+      fs_reg dest = result;
+      if (nir_src_bit_size(instr->src[0].src) > 32) {
+         dest = bld.vgrf(BRW_REGISTER_TYPE_UQ, 1);
+      }
+
+      brw_conditional_mod cond;
+      switch (instr->op) {
+      case nir_op_ilt:
+      case nir_op_ult:
+         cond = BRW_CONDITIONAL_L;
+         break;
+      case nir_op_ige:
+      case nir_op_uge:
+         cond = BRW_CONDITIONAL_GE;
+         break;
+      case nir_op_ieq:
+         cond = BRW_CONDITIONAL_Z;
+         break;
+      case nir_op_ine:
+         cond = BRW_CONDITIONAL_NZ;
+         break;
+      default:
+         unreachable("bad opcode");
+      }
+      bld.CMP(dest, op[0], op[1], cond);
+      if (nir_src_bit_size(instr->src[0].src) > 32) {
+         bld.MOV(result, subscript(dest, BRW_REGISTER_TYPE_UD, 0));
+      }
+      break;
+   }
+
+   case nir_op_inot:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+      }
+      bld.NOT(result, op[0]);
+      break;
+   case nir_op_ixor:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      bld.XOR(result, op[0], op[1]);
+      break;
+   case nir_op_ior:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      bld.OR(result, op[0], op[1]);
+      break;
+   case nir_op_iand:
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      bld.AND(result, op[0], op[1]);
+      break;
+
+   case nir_op_fdot2:
+   case nir_op_fdot3:
+   case nir_op_fdot4:
+   case nir_op_ball_fequal2:
+   case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4:
+   case nir_op_ball_iequal4:
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4:
+   case nir_op_bany_inequal4:
+      unreachable("Lowered by nir_lower_alu_reductions");
+
+   case nir_op_fnoise1_1:
+   case nir_op_fnoise1_2:
+   case nir_op_fnoise1_3:
+   case nir_op_fnoise1_4:
+   case nir_op_fnoise2_1:
+   case nir_op_fnoise2_2:
+   case nir_op_fnoise2_3:
+   case nir_op_fnoise2_4:
+   case nir_op_fnoise3_1:
+   case nir_op_fnoise3_2:
+   case nir_op_fnoise3_3:
+   case nir_op_fnoise3_4:
+   case nir_op_fnoise4_1:
+   case nir_op_fnoise4_2:
+   case nir_op_fnoise4_3:
+   case nir_op_fnoise4_4:
+      unreachable("not reached: should be handled by lower_noise");
+
+   case nir_op_ldexp:
+      unreachable("not reached: should be handled by ldexp_to_arith()");
+
+   case nir_op_fsqrt:
+      inst = bld.emit(SHADER_OPCODE_SQRT, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_frsq:
+      inst = bld.emit(SHADER_OPCODE_RSQ, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_b2i:
+   case nir_op_b2f:
+      bld.MOV(result, negate(op[0]));
+      break;
+
+   case nir_op_f2b:
+      bld.CMP(result, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ);
+      break;
+
+   case nir_op_i642b:
+   case nir_op_d2b: {
+      /* two-argument instructions can't take 64-bit immediates */
+      fs_reg zero;
+      fs_reg tmp;
+
+      if (instr->op == nir_op_d2b) {
+         zero = vgrf(glsl_type::double_type);
+         tmp = vgrf(glsl_type::double_type);
+      } else {
+         zero = vgrf(glsl_type::int64_t_type);
+         tmp = vgrf(glsl_type::int64_t_type);
+      }
+
+      bld.MOV(zero, setup_imm_df(bld, 0.0));
+      /* A SIMD16 execution needs to be split in two instructions, so use
+       * a vgrf instead of the flag register as dst so instruction splitting
+       * works
+       */
+      bld.CMP(tmp, op[0], zero, BRW_CONDITIONAL_NZ);
+      bld.MOV(result, subscript(tmp, BRW_REGISTER_TYPE_UD, 0));
+      break;
+   }
+   case nir_op_i2b:
+      bld.CMP(result, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+      break;
+
+   case nir_op_ftrunc:
+      inst = bld.RNDZ(result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fceil: {
+      op[0].negate = !op[0].negate;
+      fs_reg temp = vgrf(glsl_type::float_type);
+      bld.RNDD(temp, op[0]);
+      temp.negate = true;
+      inst = bld.MOV(result, temp);
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+   case nir_op_ffloor:
+      inst = bld.RNDD(result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_ffract:
+      inst = bld.FRC(result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_fround_even:
+      inst = bld.RNDE(result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fquantize2f16: {
+      fs_reg tmp16 = bld.vgrf(BRW_REGISTER_TYPE_D);
+      fs_reg tmp32 = bld.vgrf(BRW_REGISTER_TYPE_F);
+      fs_reg zero = bld.vgrf(BRW_REGISTER_TYPE_F);
+
+      /* The destination stride must be at least as big as the source stride. */
+      tmp16.type = BRW_REGISTER_TYPE_W;
+      tmp16.stride = 2;
+
+      /* Check for denormal */
+      fs_reg abs_src0 = op[0];
+      abs_src0.abs = true;
+      bld.CMP(bld.null_reg_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+              BRW_CONDITIONAL_L);
+      /* Get the appropriately signed zero */
+      bld.AND(retype(zero, BRW_REGISTER_TYPE_UD),
+              retype(op[0], BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(0x80000000));
+      /* Do the actual F32 -> F16 -> F32 conversion */
+      bld.emit(BRW_OPCODE_F32TO16, tmp16, op[0]);
+      bld.emit(BRW_OPCODE_F16TO32, tmp32, tmp16);
+      /* Select that or zero based on normal status */
+      inst = bld.SEL(result, zero, tmp32);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
+   case nir_op_imin:
+   case nir_op_umin:
+   case nir_op_fmin:
+      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_L);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_imax:
+   case nir_op_umax:
+   case nir_op_fmax:
+      inst = bld.emit_minmax(result, op[0], op[1], BRW_CONDITIONAL_GE);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_pack_snorm_2x16:
+   case nir_op_pack_snorm_4x8:
+   case nir_op_pack_unorm_2x16:
+   case nir_op_pack_unorm_4x8:
+   case nir_op_unpack_snorm_2x16:
+   case nir_op_unpack_snorm_4x8:
+   case nir_op_unpack_unorm_2x16:
+   case nir_op_unpack_unorm_4x8:
+   case nir_op_unpack_half_2x16:
+   case nir_op_pack_half_2x16:
+      unreachable("not reached: should be handled by lower_packing_builtins");
+
+   case nir_op_unpack_half_2x16_split_x:
+      inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+   case nir_op_unpack_half_2x16_split_y:
+      inst = bld.emit(FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y, result, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_pack_64_2x32_split:
+      bld.emit(FS_OPCODE_PACK, result, op[0], op[1]);
+      break;
+
+   case nir_op_unpack_64_2x32_split_x:
+   case nir_op_unpack_64_2x32_split_y: {
+      if (instr->op == nir_op_unpack_64_2x32_split_x)
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 0));
+      else
+         bld.MOV(result, subscript(op[0], BRW_REGISTER_TYPE_UD, 1));
+      break;
+   }
+
+   case nir_op_fpow:
+      inst = bld.emit(SHADER_OPCODE_POW, result, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_bitfield_reverse:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.BFREV(result, op[0]);
+      break;
+
+   case nir_op_bit_count:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.CBIT(result, op[0]);
+      break;
+
+   case nir_op_ufind_msb: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_find_msb_using_lzd(bld, result, op[0], false);
+      break;
+   }
+
+   case nir_op_ifind_msb: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+
+      if (devinfo->gen < 7) {
+         emit_find_msb_using_lzd(bld, result, op[0], true);
+      } else {
+         bld.FBH(retype(result, BRW_REGISTER_TYPE_UD), op[0]);
+
+         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
+          * count from the LSB side. If FBH didn't return an error
+          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
+          * count into an LSB count.
+          */
+         bld.CMP(bld.null_reg_d(), result, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
+
+         inst = bld.ADD(result, result, brw_imm_d(31));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->src[0].negate = true;
+      }
+      break;
+   }
+
+   case nir_op_find_lsb:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+
+      if (devinfo->gen < 7) {
+         fs_reg temp = vgrf(glsl_type::int_type);
+
+         /* (x & -x) generates a value that consists of only the LSB of x.
+          * For all powers of 2, findMSB(y) == findLSB(y).
+          */
+         fs_reg src = retype(op[0], BRW_REGISTER_TYPE_D);
+         fs_reg negated_src = src;
+
+         /* One must be negated, and the other must be non-negated.  It
+          * doesn't matter which is which.
+          */
+         negated_src.negate = true;
+         src.negate = false;
+
+         bld.AND(temp, src, negated_src);
+         emit_find_msb_using_lzd(bld, result, temp, false);
+      } else {
+         bld.FBL(result, op[0]);
+      }
+      break;
+
+   case nir_op_ubitfield_extract:
+   case nir_op_ibitfield_extract:
+      unreachable("should have been lowered");
+   case nir_op_ubfe:
+   case nir_op_ibfe:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.BFE(result, op[2], op[1], op[0]);
+      break;
+   case nir_op_bfm:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.BFI1(result, op[0], op[1]);
+      break;
+   case nir_op_bfi:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      bld.BFI2(result, op[0], op[1], op[2]);
+      break;
+
+   case nir_op_bitfield_insert:
+      unreachable("not reached: should have been lowered");
+
+   case nir_op_ishl:
+      bld.SHL(result, op[0], op[1]);
+      break;
+   case nir_op_ishr:
+      bld.ASR(result, op[0], op[1]);
+      break;
+   case nir_op_ushr:
+      bld.SHR(result, op[0], op[1]);
+      break;
+
+   case nir_op_pack_half_2x16_split:
+      bld.emit(FS_OPCODE_PACK_HALF_2x16_SPLIT, result, op[0], op[1]);
+      break;
+
+   case nir_op_ffma:
+      inst = bld.MAD(result, op[2], op[1], op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_flrp:
+      inst = bld.LRP(result, op[0], op[1], op[2]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_bcsel:
+      if (optimize_frontfacing_ternary(instr, result))
+         return;
+
+      bld.CMP(bld.null_reg_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ);
+      inst = bld.SEL(result, op[1], op[2]);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case nir_op_extract_u8:
+   case nir_op_extract_i8: {
+      const brw_reg_type type = brw_int_type(1, instr->op == nir_op_extract_i8);
+      nir_const_value *byte = nir_src_as_const_value(instr->src[1].src);
+      assert(byte != NULL);
+      bld.MOV(result, subscript(op[0], type, byte->u32[0]));
+      break;
+   }
+
+   case nir_op_extract_u16:
+   case nir_op_extract_i16: {
+      const brw_reg_type type = brw_int_type(2, instr->op == nir_op_extract_i16);
+      nir_const_value *word = nir_src_as_const_value(instr->src[1].src);
+      assert(word != NULL);
+      bld.MOV(result, subscript(op[0], type, word->u32[0]));
+      break;
+   }
+
+   default:
+      unreachable("unhandled instruction");
+   }
+
+   /* If we need to do a boolean resolve, replace the result with -(x & 1)
+    * to sign extend the low bit to 0/~0
+    */
+   if (devinfo->gen <= 5 &&
+       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) == BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+      fs_reg masked = vgrf(glsl_type::int_type);
+      bld.AND(masked, result, brw_imm_d(1));
+      masked.negate = true;
+      bld.MOV(retype(result, BRW_REGISTER_TYPE_D), masked);
+   }
+}
+
+void
+fs_visitor::nir_emit_load_const(const fs_builder &bld,
+                                nir_load_const_instr *instr)
+{
+   const brw_reg_type reg_type =
+      instr->def.bit_size == 32 ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+   fs_reg reg = bld.vgrf(reg_type, instr->def.num_components);
+
+   switch (instr->def.bit_size) {
+   case 32: /* one MOV of a D immediate per component */
+      for (unsigned i = 0; i < instr->def.num_components; i++)
+         bld.MOV(offset(reg, bld, i), brw_imm_d(instr->value.i32[i]));
+      break;
+
+   case 64: /* doubles: the immediate is prepared by setup_imm_df() */
+      for (unsigned i = 0; i < instr->def.num_components; i++)
+         bld.MOV(offset(reg, bld, i),
+                 setup_imm_df(bld, instr->value.f64[i]));
+      break;
+
+   default:
+      unreachable("Invalid bit size");
+   }
+
+   nir_ssa_values[instr->def.index] = reg; /* looked up by get_nir_src() */
+}
+
+fs_reg
+fs_visitor::get_nir_src(const nir_src &src)
+{
+   fs_reg reg;
+   if (src.is_ssa) {
+      if (src.ssa->parent_instr->type == nir_instr_type_ssa_undef) {
+         const brw_reg_type reg_type = src.ssa->bit_size == 32 ?
+            BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_DF;
+         reg = bld.vgrf(reg_type, src.ssa->num_components); /* undef value */
+      } else {
+         reg = nir_ssa_values[src.ssa->index];
+      }
+   } else {
+      /* Indirect addressing of NIR local registers is not handled here. */
+      assert(src.reg.indirect == NULL);
+      reg = offset(nir_locals[src.reg.reg->index], bld,
+                   src.reg.base_offset * src.reg.reg->num_components);
+   }
+
+   /* To avoid floating-point denorm flushing problems, set the type to D by
+    * default.  Instructions that need floating-point semantics will retype
+    * the register to F themselves.
+    */
+   return retype(reg, BRW_REGISTER_TYPE_D);
+}
+
+/** Return a D-typed IMM holding i32[0] of a constant source; for any
+ * non-constant source, call get_nir_src() as normal.
+ */
+fs_reg
+fs_visitor::get_nir_src_imm(const nir_src &src)
+{
+   nir_const_value *val = nir_src_as_const_value(src);
+   return val ? fs_reg(brw_imm_d(val->i32[0])) : get_nir_src(src);
+}
+
+fs_reg
+fs_visitor::get_nir_dest(const nir_dest &dest)
+{
+   if (dest.is_ssa) { /* allocate a fresh VGRF and record it by SSA index */
+      const brw_reg_type reg_type =
+         dest.ssa.bit_size == 32 ? BRW_REGISTER_TYPE_F : BRW_REGISTER_TYPE_DF;
+      nir_ssa_values[dest.ssa.index] =
+         bld.vgrf(reg_type, dest.ssa.num_components);
+      return nir_ssa_values[dest.ssa.index];
+   } else {
+      /* Indirect addressing of NIR local registers is not handled here. */
+      assert(dest.reg.indirect == NULL);
+      return offset(nir_locals[dest.reg.reg->index], bld,
+                    dest.reg.base_offset * dest.reg.reg->num_components);
+   }
+}
+
+fs_reg
+fs_visitor::get_nir_image_deref(const nir_deref_var *deref)
+{
+   fs_reg image(UNIFORM, deref->var->data.driver_location / 4,
+                BRW_REGISTER_TYPE_UD);
+   fs_reg indirect; /* sum of scaled indirect indices; BAD_FILE if none */
+   unsigned indirect_max = 0; /* bound handed to MOV_INDIRECT below */
+
+   for (const nir_deref *tail = &deref->deref; tail->child;
+        tail = tail->child) {
+      const nir_deref_array *deref_array = nir_deref_as_array(tail->child);
+      assert(tail->child->deref_type == nir_deref_type_array);
+      const unsigned size = glsl_get_length(tail->type);
+      const unsigned element_size = type_size_scalar(deref_array->deref.type);
+      const unsigned base = MIN2(deref_array->base_offset, size - 1);
+      image = offset(image, bld, base * element_size);
+
+      if (deref_array->deref_array_type == nir_deref_array_type_indirect) {
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+
+         /* Accessing an invalid surface index with the dataport can result
+          * in a hang.  According to the spec "if the index used to
+          * select an individual element is negative or greater than or
+          * equal to the size of the array, the results of the operation
+          * are undefined but may not lead to termination" -- which is one
+          * of the possible outcomes of the hang.  Clamp the index to
+          * prevent access outside of the array bounds.
+          */
+         bld.emit_minmax(tmp, retype(get_nir_src(deref_array->indirect),
+                                     BRW_REGISTER_TYPE_UD),
+                         brw_imm_ud(size - base - 1), BRW_CONDITIONAL_L);
+
+         indirect_max += element_size * (tail->type->length - 1);
+
+         bld.MUL(tmp, tmp, brw_imm_ud(element_size * 4));
+         if (indirect.file == BAD_FILE) {
+            indirect = tmp; /* first indirect index seen */
+         } else {
+            bld.ADD(indirect, indirect, tmp);
+         }
+      }
+   }
+
+   if (indirect.file == BAD_FILE) {
+      return image; /* no indirect indexing: use the uniform directly */
+   } else {
+      /* Emit a pile of MOVs to load the uniform into a temporary.  The
+       * dead-code elimination pass will get rid of what we don't use.
+       */
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, BRW_IMAGE_PARAM_SIZE);
+      for (unsigned j = 0; j < BRW_IMAGE_PARAM_SIZE; j++) {
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                  offset(tmp, bld, j), offset(image, bld, j),
+                  indirect, brw_imm_ud((indirect_max + 1) * 4));
+      }
+      return tmp;
+   }
+}
+
+void
+fs_visitor::emit_percomp(const fs_builder &bld, const fs_inst &inst,
+                         unsigned wr_mask)
+{
+   for (unsigned i = 0; i < 4; i++) { /* bits 0..3 of wr_mask */
+      if (!((wr_mask >> i) & 1))
+         continue; /* component i is not enabled in wr_mask */
+
+      fs_inst *new_inst = new(mem_ctx) fs_inst(inst);
+      new_inst->dst = offset(new_inst->dst, bld, i);
+      for (unsigned j = 0; j < new_inst->sources; j++)
+         if (new_inst->src[j].file == VGRF) /* other files are left as-is */
+            new_inst->src[j] = offset(new_inst->src[j], bld, i);
+
+      bld.emit(new_inst);
+   }
+}
+
+/**
+ * Map the sampled type of a GLSL image type (uint, int or float) to the
+ * matching channel register datatype (UD, D or F) for an image intrinsic.
+ */
+static brw_reg_type
+get_image_base_type(const glsl_type *type)
+{
+   switch ((glsl_base_type)type->sampled_type) {
+   case GLSL_TYPE_UINT:
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_INT:
+      return BRW_REGISTER_TYPE_D;
+   case GLSL_TYPE_FLOAT:
+      return BRW_REGISTER_TYPE_F;
+   default:
+      unreachable("Not reached.");
+   }
+}
+
+/** Get the BRW_AOP_* atomic op for an image atomic intrinsic.  Min and max
+ * use the signed variant when the image base type is D, else the unsigned.
+ */
+static unsigned
+get_image_atomic_op(nir_intrinsic_op op, const glsl_type *type)
+{
+   switch (op) {
+   case nir_intrinsic_image_atomic_add:
+      return BRW_AOP_ADD;
+   case nir_intrinsic_image_atomic_min:
+      return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
+              BRW_AOP_IMIN : BRW_AOP_UMIN);
+   case nir_intrinsic_image_atomic_max:
+      return (get_image_base_type(type) == BRW_REGISTER_TYPE_D ?
+              BRW_AOP_IMAX : BRW_AOP_UMAX);
+   case nir_intrinsic_image_atomic_and:
+      return BRW_AOP_AND;
+   case nir_intrinsic_image_atomic_or:
+      return BRW_AOP_OR;
+   case nir_intrinsic_image_atomic_xor:
+      return BRW_AOP_XOR;
+   case nir_intrinsic_image_atomic_exchange:
+      return BRW_AOP_MOV;
+   case nir_intrinsic_image_atomic_comp_swap:
+      return BRW_AOP_CMPWR;
+   default:
+      unreachable("Not reachable.");
+   }
+}
+
+static fs_inst *
+emit_pixel_interpolater_send(const fs_builder &bld,
+                             enum opcode opcode,
+                             const fs_reg &dst,
+                             const fs_reg &src,
+                             const fs_reg &desc,
+                             glsl_interp_mode interpolation)
+{
+   struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(bld.shader->stage_prog_data);
+   fs_inst *inst;
+   fs_reg payload;
+   int mlen;
+
+   if (src.file == BAD_FILE) {
+      /* No source given: use a one-register dummy payload */
+      payload = bld.vgrf(BRW_REGISTER_TYPE_F, 1);
+      mlen = 1;
+   } else {
+      payload = src;
+      mlen = 2 * bld.dispatch_width() / 8;
+   }
+
+   inst = bld.emit(opcode, dst, payload, desc);
+   inst->mlen = mlen;
+   /* 2 floats per slot returned */
+   inst->size_written = 2 * dst.component_size(inst->exec_size);
+   inst->pi_noperspective = interpolation == INTERP_MODE_NOPERSPECTIVE;
+
+   wm_prog_data->pulls_bary = true; /* recorded in the WM program data */
+
+   return inst;
+}
+
+/** Computes 1 << x, given a D/UD register containing some value x.  The
+ * result is returned in a newly allocated register of the same type as x.
+ */
+static fs_reg
+intexp2(const fs_builder &bld, const fs_reg &x)
+{
+   assert(x.type == BRW_REGISTER_TYPE_UD || x.type == BRW_REGISTER_TYPE_D);
+
+   fs_reg result = bld.vgrf(x.type, 1);
+   fs_reg one = bld.vgrf(x.type, 1);
+
+   bld.MOV(one, retype(brw_imm_d(1), one.type)); /* one = 1 */
+   bld.SHL(result, one, x); /* result = one << x */
+   return result;
+}
+
+void
+fs_visitor::emit_gs_end_primitive(const nir_src &vertex_count_nir_src)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   if (gs_compile->control_data_header_size_bits == 0)
+      return;
+
+   /* We can only do EndPrimitive() functionality when the control data
+    * consists of cut bits.  Fortunately, the only time it isn't is when the
+    * output type is points, in which case EndPrimitive() is a no-op.
+    */
+   if (gs_prog_data->control_data_format !=
+       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
+      return;
+   }
+
+   /* Cut bits use one bit per vertex. */
+   assert(gs_compile->control_data_bits_per_vertex == 1);
+
+   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
+   vertex_count.type = BRW_REGISTER_TYPE_UD;
+
+   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
+    * vertex n, 0 otherwise.  So all we need to do here is mark bit
+    * (vertex_count - 1) % 32 in the cut_bits register to indicate that
+    * EndPrimitive() was called after emitting vertex (vertex_count - 1);
+    * fs_visitor::emit_gs_control_data_bits() will take care of the rest.
+    *
+    * Note that if EndPrimitive() is called before emitting any vertices, this
+    * will cause us to set bit 31 of the control_data_bits register to 1.
+    * That's fine because:
+    *
+    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
+    *   output, so the hardware will ignore cut bit 31.
+    *
+    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
+    *   last vertex, so setting cut bit 31 has no effect (since the primitive
+    *   is automatically ended when the GS terminates).
+    *
+    * - If max_vertices > 32, then fs_visitor::emit_gs_vertex() will reset the
+    *   control_data_bits register to 0 when the first vertex is emitted.
+    */
+
+   const fs_builder abld = bld.annotate("end primitive");
+
+   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
+   fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
+   fs_reg mask = intexp2(abld, prev_count);
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
+    * ((vertex_count - 1) % 32).
+    */
+   abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+void
+fs_visitor::emit_gs_control_data_bits(const fs_reg &vertex_count)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+   assert(gs_compile->control_data_bits_per_vertex != 0);
+
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   const fs_builder abld = bld.annotate("emit control data bits");
+   const fs_builder fwa_bld = bld.exec_all();
+
+   /* We use a single UD register to accumulate control data bits (32 bits
+    * for each of the SIMD8 channels).  So we need to write a DWord (32 bits)
+    * at a time.
+    *
+    * Unfortunately, the URB_WRITE_SIMD8 message uses 128-bit (OWord) offsets.
+    * We have to select a 128-bit group via the Global and Per-Slot Offsets,
+    * then use the Channel Mask phase to enable/disable which DWord within
+    * that group to write.  (Remember, different SIMD8 channels may have
+    * emitted different numbers of vertices, so we may need per-slot offsets.)
+    *
+    * Channel masking presents an annoying problem: we may have to replicate
+    * the data up to 4 times:
+    *
+    * Msg = Handles, Per-Slot Offsets, Channel Masks, Data, Data, Data, Data.
+    *
+    * To avoid penalizing shaders that emit a small number of vertices, we
+    * can avoid these sometimes: if the size of the control data header is
+    * <= 128 bits, then there is only 1 OWord.  All SIMD8 channels will land
+    * in the same 128-bit group, so we can skip per-slot offsets.
+    *
+    * Similarly, if the control data header is <= 32 bits, there is only one
+    * DWord, so we can skip channel masks.
+    */
+   enum opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+
+   fs_reg channel_mask, per_slot_offset;
+
+   if (gs_compile->control_data_header_size_bits > 32) {
+      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+      channel_mask = vgrf(glsl_type::uint_type);
+   }
+
+   if (gs_compile->control_data_header_size_bits > 128) {
+      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT;
+      per_slot_offset = vgrf(glsl_type::uint_type);
+   }
+
+   /* Figure out which DWord we're trying to write to using the formula:
+    *
+    *    dword_index = (vertex_count - 1) * bits_per_vertex / 32
+    *
+    * Since bits_per_vertex is a power of two, and is known at compile
+    * time, this can be optimized to (util_last_bit(x) == log2(x) + 1):
+    *
+    *    dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
+    */
+   if (opcode != SHADER_OPCODE_URB_WRITE_SIMD8) {
+      fs_reg dword_index = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fs_reg prev_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      abld.ADD(prev_count, vertex_count, brw_imm_ud(0xffffffffu));
+      unsigned log2_bits_per_vertex =
+         util_last_bit(gs_compile->control_data_bits_per_vertex);
+      abld.SHR(dword_index, prev_count, brw_imm_ud(6u - log2_bits_per_vertex));
+
+      if (per_slot_offset.file != BAD_FILE) {
+         /* Set the per-slot offset to dword_index / 4, so that we'll write to
+          * the appropriate OWord within the control data header.
+          */
+         abld.SHR(per_slot_offset, dword_index, brw_imm_ud(2u));
+      }
+
+      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+       * write to the appropriate DWORD within the OWORD.
+       */
+      fs_reg channel = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fwa_bld.AND(channel, dword_index, brw_imm_ud(3u));
+      channel_mask = intexp2(fwa_bld, channel);
+      /* Then the channel masks need to be in bits 23:16. */
+      fwa_bld.SHL(channel_mask, channel_mask, brw_imm_ud(16u));
+   }
+
+   /* Store the control data bits in the message payload and send it. */
+   int mlen = 2;
+   if (channel_mask.file != BAD_FILE)
+      mlen += 4; /* channel masks, plus 3 extra copies of the data */
+   if (per_slot_offset.file != BAD_FILE)
+      mlen++;
+
+   fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+   fs_reg *sources = ralloc_array(mem_ctx, fs_reg, mlen);
+   int i = 0;
+   sources[i++] = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+   if (per_slot_offset.file != BAD_FILE)
+      sources[i++] = per_slot_offset;
+   if (channel_mask.file != BAD_FILE)
+      sources[i++] = channel_mask;
+   while (i < mlen) { /* remaining slots all carry the data */
+      sources[i++] = this->control_data_bits;
+   }
+
+   abld.LOAD_PAYLOAD(payload, sources, mlen, mlen);
+   fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+   inst->mlen = mlen;
+   /* We need to increment Global Offset by 256-bits to make room for
+    * Broadwell's extra "Vertex Count" payload at the beginning of the
+    * URB entry.  Since this is an OWord message, Global Offset is counted
+    * in 128-bit units, so we must set it to 2.
+    */
+   if (gs_prog_data->static_vertex_count == -1)
+      inst->offset = 2;
+}
+
+void
+fs_visitor::set_gs_stream_control_data_bits(const fs_reg &vertex_count,
+                                            unsigned stream_id)
+{
+   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+   /* Note: we are calling this *before* increasing vertex_count, so
+    * this->vertex_count == vertex_count - 1 in the formula above.
+    */
+
+   /* Stream mode uses 2 bits per vertex */
+   assert(gs_compile->control_data_bits_per_vertex == 2);
+
+   /* Must be a valid stream; stream_id is unsigned, so >= 0 is implied */
+   assert(stream_id < MAX_VERTEX_STREAMS);
+
+   /* Control data bits are initialized to 0 so we don't have to set any
+    * bits when sending vertices to stream 0.
+    */
+   if (stream_id == 0)
+      return;
+
+   const fs_builder abld = bld.annotate("set stream control data bits", NULL);
+
+   /* reg::sid = stream_id */
+   fs_reg sid = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.MOV(sid, brw_imm_ud(stream_id));
+
+   /* reg::shift_count = 2 * (vertex_count - 1) */
+   fs_reg shift_count = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.SHL(shift_count, vertex_count, brw_imm_ud(1u));
+
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+    * stream_id << ((2 * (vertex_count - 1)) % 32).
+    */
+   fs_reg mask = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+   abld.SHL(mask, sid, shift_count);
+   abld.OR(this->control_data_bits, this->control_data_bits, mask);
+}
+
+void
+fs_visitor::emit_gs_vertex(const nir_src &vertex_count_nir_src,
+                           unsigned stream_id)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   fs_reg vertex_count = get_nir_src(vertex_count_nir_src);
+   vertex_count.type = BRW_REGISTER_TYPE_UD;
+
+   /* Haswell and later hardware ignores the "Render Stream Select" bits
+    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+    * and instead sends all primitives down the pipeline for rasterization.
+    * If the SOL stage is enabled, "Render Stream Select" is honored and
+    * primitives bound to non-zero streams are discarded after stream output.
+    *
+    * Since the only purpose of primitives sent to non-zero streams is to
+    * be recorded by transform feedback, we can simply discard all geometry
+    * bound to these streams when transform feedback is disabled.
+    */
+   if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
+      return;
+
+   /* If we're outputting 32 control data bits or less, then we can wait
+    * until the shader is over to output them all.  Otherwise we need to
+    * output them as we go.  Now is the time to do it, since we're about to
+    * output the vertex_count'th vertex, so it's guaranteed that the
+    * control data bits associated with the (vertex_count - 1)th vertex are
+    * correct.
+    */
+   if (gs_compile->control_data_header_size_bits > 32) {
+      const fs_builder abld =
+         bld.annotate("emit vertex: emit control data bits");
+
+      /* Only emit control data bits if we've finished accumulating a batch
+       * of 32 bits.  This is the case when:
+       *
+       *     (vertex_count * bits_per_vertex) % 32 == 0
+       *
+       * (in other words, when the last 5 bits of vertex_count *
+       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
+       * integer n (which is always the case, since bits_per_vertex is
+       * always 1 or 2), this is equivalent to requiring that the last 5-n
+       * bits of vertex_count are 0:
+       *
+       *     vertex_count & (2^(5-n) - 1) == 0
+       *
+       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
+       * equivalent to:
+       *
+       *     vertex_count & (32 / bits_per_vertex - 1) == 0
+       *
+       * TODO: If vertex_count is an immediate, we could do some of this math
+       *       at compile time...
+       */
+      fs_inst *inst =
+         abld.AND(bld.null_reg_d(), vertex_count,
+                  brw_imm_ud(32u / gs_compile->control_data_bits_per_vertex - 1u));
+      inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+      abld.IF(BRW_PREDICATE_NORMAL);
+      /* If vertex_count is 0, then no control data bits have been
+       * accumulated yet, so we can skip emitting them.
+       */
+      abld.CMP(bld.null_reg_d(), vertex_count, brw_imm_ud(0u),
+               BRW_CONDITIONAL_NEQ);
+      abld.IF(BRW_PREDICATE_NORMAL);
+      emit_gs_control_data_bits(vertex_count);
+      abld.emit(BRW_OPCODE_ENDIF);
+
+      /* Reset control_data_bits to 0 so we can start accumulating a new
+       * batch.
+       *
+       * Note: in the case where vertex_count == 0, this neutralizes the
+       * effect of any call to EndPrimitive() that the shader may have
+       * made before outputting its first vertex.
+       */
+      inst = abld.MOV(this->control_data_bits, brw_imm_ud(0u));
+      inst->force_writemask_all = true;
+      abld.emit(BRW_OPCODE_ENDIF);
+   }
+
+   emit_urb_writes(vertex_count);
+
+   /* In stream mode we have to set control data bits for all vertices
+    * unless we have disabled control data bits completely (which we do
+    * for GL_POINTS outputs that don't use streams).
+    */
+   if (gs_compile->control_data_header_size_bits > 0 &&
+       gs_prog_data->control_data_format ==
+          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
+      set_gs_stream_control_data_bits(vertex_count, stream_id);
+   }
+}
+
+void
+fs_visitor::emit_gs_input_load(const fs_reg &dst,
+                               const nir_src &vertex_src,
+                               unsigned base_offset,
+                               const nir_src &offset_src,
+                               unsigned num_components,
+                               unsigned first_component)
+{
+   struct brw_gs_prog_data *gs_prog_data = brw_gs_prog_data(prog_data);
+
+   nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+   nir_const_value *offset_const = nir_src_as_const_value(offset_src);
+   const unsigned push_reg_count = gs_prog_data->base.urb_read_length * 8;
+
+   /* Offset 0 is the VUE header, which contains VARYING_SLOT_LAYER [.y],
+    * VARYING_SLOT_VIEWPORT [.z], and VARYING_SLOT_PSIZ [.w].  Only
+    * gl_PointSize is available as a GS input, however, so it must be that.
+    */
+   const bool is_point_size = (base_offset == 0);
+
+   /* TODO: push inputs are only handled for invocations == 1 so far */
+   if (gs_prog_data->invocations == 1 &&
+       offset_const != NULL && vertex_const != NULL &&
+       4 * (base_offset + offset_const->u32[0]) < push_reg_count) {
+      int imm_offset = (base_offset + offset_const->u32[0]) * 4 +
+                       vertex_const->u32[0] * push_reg_count;
+      /* This input was pushed into registers. */
+      if (is_point_size) {
+         /* gl_PointSize comes in .w */
+         bld.MOV(dst, fs_reg(ATTR, imm_offset + 3, dst.type));
+      } else {
+         for (unsigned i = 0; i < num_components; i++) {
+            bld.MOV(offset(dst, bld, i),
+                    fs_reg(ATTR, imm_offset + i + first_component, dst.type));
+         }
+      }
+      return;
+   }
+
+   /* Resort to the pull model.  Ensure the VUE handles are provided. */
+   gs_prog_data->base.include_vue_handles = true;
+
+   unsigned first_icp_handle = gs_prog_data->include_primitive_id ? 3 : 2;
+   fs_reg icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+   if (gs_prog_data->invocations == 1) {
+      if (vertex_const) {
+         /* The vertex index is constant; just select the proper URB handle. */
+         icp_handle =
+            retype(brw_vec8_grf(first_icp_handle + vertex_const->i32[0], 0),
+                   BRW_REGISTER_TYPE_UD);
+      } else {
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          *
+          * First, we start with the sequence <7, 6, 5, 4, 3, 2, 1, 0>
+          * indicating that channel <n> should read the handle from
+          * DWord <n>.  We convert that to bytes by multiplying by 4.
+          *
+          * Next, we convert the vertex index to bytes by multiplying
+          * by 32 (shifting by 5), and add the two together.  This is
+          * the final indirect byte offset.
+          */
+         fs_reg sequence = bld.vgrf(BRW_REGISTER_TYPE_W, 1);
+         fs_reg channel_offsets = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* sequence = <7, 6, 5, 4, 3, 2, 1, 0> */
+         bld.MOV(sequence, fs_reg(brw_imm_v(0x76543210)));
+         /* channel_offsets = 4 * sequence = <28, 24, 20, 16, 12, 8, 4, 0> */
+         bld.SHL(channel_offsets, sequence, brw_imm_ud(2u));
+         /* Convert vertex_index to bytes (multiply by 32) */
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(5u));
+         bld.ADD(icp_offset_bytes, vertex_offset_bytes, channel_offsets);
+
+         /* Use first_icp_handle as the base offset.  There is one register
+          * of URB handles per vertex, so inform the register allocator that
+          * we might read up to nir->info->gs.vertices_in registers.
+          */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+                  fs_reg(icp_offset_bytes),
+                  brw_imm_ud(nir->info->gs.vertices_in * REG_SIZE));
+      }
+   } else {
+      assert(gs_prog_data->invocations > 1);
+
+      if (vertex_const) {
+         assert(devinfo->gen >= 9 || vertex_const->i32[0] <= 5);
+         bld.MOV(icp_handle,
+                 retype(brw_vec1_grf(first_icp_handle +
+                                     vertex_const->i32[0] / 8,
+                                     vertex_const->i32[0] % 8),
+                        BRW_REGISTER_TYPE_UD));
+      } else {
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          * Here the handles are packed one DWord per vertex (see below).
+          */
+         fs_reg icp_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* Convert vertex_index to bytes (multiply by 4) */
+         bld.SHL(icp_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(2u));
+
+         /* Use first_icp_handle as the base offset.  There is one DWord
+          * of URB handles per vertex, so inform the register allocator that
+          * we might read up to ceil(nir->info->gs.vertices_in / 8) registers.
+          */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  retype(brw_vec8_grf(first_icp_handle, 0), icp_handle.type),
+                  fs_reg(icp_offset_bytes),
+                  brw_imm_ud(DIV_ROUND_UP(nir->info->gs.vertices_in, 8) *
+                             REG_SIZE));
+      }
+   }
+
+   fs_inst *inst;
+
+   fs_reg tmp_dst = dst;
+   fs_reg indirect_offset = get_nir_src(offset_src);
+   unsigned num_iterations = 1;
+   unsigned orig_num_components = num_components;
+
+   if (type_sz(dst.type) == 8) { /* 64-bit destination type */
+      if (num_components > 2) {
+         num_iterations = 2; /* read at most two components per pass */
+         num_components = 2;
+      }
+      fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
+      tmp_dst = tmp;
+      first_component = first_component / 2;
+   }
+
+   for (unsigned iter = 0; iter < num_iterations; iter++) {
+      if (offset_const) {
+         /* Constant indexing - use global offset. */
+         if (first_component != 0) {
+            unsigned read_components = num_components + first_component;
+            fs_reg tmp = bld.vgrf(dst.type, read_components);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
+            inst->size_written = read_components *
+                                 tmp.component_size(inst->exec_size);
+            for (unsigned i = 0; i < num_components; i++) {
+               bld.MOV(offset(tmp_dst, bld, i),
+                       offset(tmp, bld, i + first_component));
+            }
+         } else {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp_dst,
+                            icp_handle);
+            inst->size_written = num_components *
+                                 tmp_dst.component_size(inst->exec_size);
+         }
+         inst->offset = base_offset + offset_const->u32[0];
+         inst->mlen = 1;
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = { icp_handle, indirect_offset };
+         unsigned read_components = num_components + first_component;
+         fs_reg tmp = bld.vgrf(dst.type, read_components);
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+         if (first_component != 0) {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+                            payload);
+            inst->size_written = read_components *
+                                 tmp.component_size(inst->exec_size);
+            for (unsigned i = 0; i < num_components; i++) {
+               bld.MOV(offset(tmp_dst, bld, i),
+                       offset(tmp, bld, i + first_component));
+            }
+         } else {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp_dst,
+                         payload);
+            inst->size_written = num_components *
+                                 tmp_dst.component_size(inst->exec_size);
+         }
+         inst->offset = base_offset;
+         inst->mlen = 2;
+      }
+
+      if (type_sz(dst.type) == 8) {
+         shuffle_32bit_load_result_to_64bit_data(
+            bld, tmp_dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), num_components);
+
+         for (unsigned c = 0; c < num_components; c++)
+            bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
+      }
+
+      if (num_iterations > 1) { /* set up the remaining components */
+         num_components = orig_num_components - 2;
+         if(offset_const) {
+            base_offset++;
+         } else {
+            fs_reg new_indirect = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+            bld.ADD(new_indirect, indirect_offset, brw_imm_ud(1u));
+            indirect_offset = new_indirect;
+         }
+      }
+   }
+
+   if (is_point_size) {
+      /* Read the whole VUE header (because of alignment) and read .w. */
+      fs_reg tmp = bld.vgrf(dst.type, 4);
+      inst->dst = tmp;
+      inst->size_written = 4 * REG_SIZE;
+      bld.MOV(dst, offset(tmp, bld, 3));
+   }
+}
+
+/* Return the I/O offset source of instr as a register when it is not a
+ * compile-time constant, or a default-constructed fs_reg when it is.
+ */
+fs_reg
+fs_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *io_offset = nir_get_io_offset_src(instr);
+   nir_const_value *imm = nir_src_as_const_value(*io_offset);
+   if (!imm)
+      return get_nir_src(*io_offset);
+
+   /* brw_nir.c's add_const_offset_to_base() folds constant offsets into
+    * instr->const_index[0], so the only constant left to find here is 0.
+    */
+   assert(imm->u32[0] == 0);
+   return fs_reg();
+}
+
+/* Read num_components components from surface surf_index at offset_reg into
+ * dest.  64-bit data is fetched as 32-bit pairs, two components per message.
+ */
+static void
+do_untyped_vector_read(const fs_builder &bld,
+                       const fs_reg dest,
+                       const fs_reg surf_index,
+                       const fs_reg offset_reg,
+                       unsigned num_components)
+{
+   if (type_sz(dest.type) == 4) {
+      fs_reg read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                             1 /* dims */, num_components,
+                                             BRW_PREDICATE_NONE);
+      read_result.type = dest.type;
+      for (unsigned i = 0; i < num_components; i++)
+         bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
+   } else if (type_sz(dest.type) == 8) {
+      /* Reading a dvec, so we need to:
+       *
+       * 1. Multiply num_components by 2, to account for the fact that we
+       *    need to read 64-bit components.
+       * 2. Shuffle the result of the load to form valid 64-bit elements
+       * 3. Emit a second load (for components z/w) if needed.
+       */
+      fs_reg read_offset = bld.vgrf(BRW_REGISTER_TYPE_UD);
+      bld.MOV(read_offset, offset_reg);
+
+      int iters = num_components <= 2 ? 1 : 2;
+
+      /* Load the dvec, the first iteration loads components x/y, the second
+       * iteration, if needed, loads components z/w
+       */
+      for (int it = 0; it < iters; it++) {
+         /* Compute number of components to read in this iteration */
+         int iter_components = MIN2(2, num_components);
+         num_components -= iter_components;
+
+         /* Read. Since this message reads 32-bit components, we need to
+          * read twice as many components.
+          */
+         fs_reg read_result = emit_untyped_read(bld, surf_index, read_offset,
+                                                1 /* dims */,
+                                                iter_components * 2,
+                                                BRW_PREDICATE_NONE);
+
+         /* Shuffle the 32-bit load result into valid 64-bit data */
+         const fs_reg packed = bld.vgrf(dest.type, iter_components);
+         shuffle_32bit_load_result_to_64bit_data(
+            bld, packed, read_result, iter_components);
+
+         /* Move each component to its destination */
+         for (int c = 0; c < iter_components; c++)
+            bld.MOV(offset(dest, bld, it * 2 + c), offset(packed, bld, c));
+
+         /* Step read_offset past the 16 bytes (two 64-bit components) read */
+         bld.ADD(read_offset, read_offset, brw_imm_ud(16));
+      }
+   } else {
+      unreachable("Unsupported type");
+   }
+}
+
+/* Emit FS IR for one vertex-shader intrinsic.  System values are copied out
+ * of nir_system_values[], load_input reads from the ATTR file, and every
+ * other intrinsic is handed to the generic nir_emit_intrinsic().
+ */
+void
+fs_visitor::nir_emit_vs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_VERTEX);
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_draw_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      /* Copy using the system value's own register type. */
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   case nir_intrinsic_load_input: {
+      fs_reg src = fs_reg(ATTR, instr->const_index[0], dest.type);
+      unsigned first_component = nir_intrinsic_component(instr);
+      unsigned num_components = instr->num_components;
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      assert(const_offset && "Indirect input loads not allowed");
+      src = offset(src, bld, const_offset->u32[0]);
+
+      for (unsigned j = 0; j < num_components; j++) {
+         bld.MOV(offset(dest, bld, j), offset(src, bld, j + first_component));
+      }
+
+      /* src was built with dest.type above, so no retype is needed before
+       * testing its size.  8-byte data is run through the 32-bit to 64-bit
+       * shuffle in place after it has been copied.
+       */
+      if (type_sz(src.type) == 8) {
+         shuffle_32bit_load_result_to_64bit_data(bld,
+                                                 dest,
+                                                 retype(dest, BRW_REGISTER_TYPE_F),
+                                                 instr->num_components);
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/* Emit FS IR for one tessellation-control intrinsic: system values, the
+ * barrier, and URB reads/writes for inputs and outputs.  Intrinsics not
+ * handled by the switch are forwarded to nir_emit_intrinsic().
+ */
+void
+fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_CTRL);
+   struct brw_tcs_prog_key *tcs_key = (struct brw_tcs_prog_key *) key;
+   struct brw_tcs_prog_data *tcs_prog_data = brw_tcs_prog_data(prog_data);
+
+   fs_reg dst;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dst = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dst, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_invocation_id:
+      bld.MOV(retype(dst, invocation_id.type), invocation_id);
+      break;
+   case nir_intrinsic_load_patch_vertices_in:
+      bld.MOV(retype(dst, BRW_REGISTER_TYPE_D),
+              brw_imm_d(tcs_key->input_vertices));
+      break;
+
+   case nir_intrinsic_barrier: {
+      if (tcs_prog_data->instances == 1)
+         break;
+
+      fs_reg m0 = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+      fs_reg m0_2 = component(m0, 2);
+
+      const fs_builder chanbld = bld.exec_all().group(1, 0);
+
+      /* Zero the message header */
+      bld.exec_all().MOV(m0, brw_imm_ud(0u));
+
+      /* Copy "Barrier ID" from r0.2, bits 16:13 */
+      chanbld.AND(m0_2, retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD),
+                  brw_imm_ud(INTEL_MASK(16, 13)));
+
+      /* Shift it up to bits 27:24. */
+      chanbld.SHL(m0_2, m0_2, brw_imm_ud(11));
+
+      /* Set the Barrier Count and the enable bit */
+      chanbld.OR(m0_2, m0_2,
+                 brw_imm_ud(tcs_prog_data->instances << 9 | (1 << 15)));
+
+      bld.emit(SHADER_OPCODE_BARRIER, bld.null_reg_ud(), m0);
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should never give us these.");
+      break;
+
+   case nir_intrinsic_load_per_vertex_input: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      const nir_src &vertex_src = instr->src[0];
+      nir_const_value *vertex_const = nir_src_as_const_value(vertex_src);
+
+      fs_inst *inst;
+      fs_reg icp_handle;
+
+      if (vertex_const) {
+         /* Emit a MOV to resolve <0,1,0> regioning. */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(icp_handle,
+                 retype(brw_vec1_grf(1 + (vertex_const->i32[0] >> 3),
+                                     vertex_const->i32[0] & 7),
+                        BRW_REGISTER_TYPE_UD));
+      } else if (tcs_prog_data->instances == 1 &&
+                 vertex_src.is_ssa &&
+                 vertex_src.ssa->parent_instr->type == nir_instr_type_intrinsic &&
+                 nir_instr_as_intrinsic(vertex_src.ssa->parent_instr)->intrinsic == nir_intrinsic_load_invocation_id) {
+         /* For the common case of only 1 instance, an array index of
+          * gl_InvocationID means reading g1.  Skip all the indirect work.
+          */
+         icp_handle = retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD);
+      } else {
+         /* The vertex index is non-constant.  We need to use indirect
+          * addressing to fetch the proper URB handle.
+          */
+         icp_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+
+         /* Each ICP handle is a single DWord (4 bytes) */
+         fs_reg vertex_offset_bytes = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.SHL(vertex_offset_bytes,
+                 retype(get_nir_src(vertex_src), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(2u));
+
+         /* Start at g1.  We might read up to 4 registers. */
+         bld.emit(SHADER_OPCODE_MOV_INDIRECT, icp_handle,
+                  retype(brw_vec8_grf(1, 0), icp_handle.type), vertex_offset_bytes,
+                  brw_imm_ud(4 * REG_SIZE));
+      }
+
+      /* We can only read two double components with each URB read, so
+       * we send two read messages in that case, each one loading up to
+       * two double components.
+       */
+      unsigned num_iterations = 1;
+      unsigned num_components = instr->num_components;
+      unsigned first_component = nir_intrinsic_component(instr);
+      fs_reg orig_dst = dst;
+      if (type_sz(dst.type) == 8) {
+         first_component = first_component / 2;
+         if (instr->num_components > 2) {
+            num_iterations = 2;
+            num_components = 2;
+         }
+
+         fs_reg tmp = fs_reg(VGRF, alloc.allocate(4), dst.type);
+         dst = tmp;
+      }
+
+      for (unsigned iter = 0; iter < num_iterations; iter++) {
+         if (indirect_offset.file == BAD_FILE) {
+            /* Constant indexing - use global offset. */
+            if (first_component != 0) {
+               unsigned read_components = num_components + first_component;
+               fs_reg tmp = bld.vgrf(dst.type, read_components);
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
+               for (unsigned i = 0; i < num_components; i++)
+                  bld.MOV(offset(dst, bld, i),
+                          offset(tmp, bld, i + first_component));
+            } else {
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst, icp_handle);
+            }
+            inst->offset = imm_offset;
+            inst->mlen = 1;
+         } else {
+            /* Indirect indexing - use per-slot offsets as well. */
+            const fs_reg srcs[] = { icp_handle, indirect_offset };
+            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+            if (first_component != 0) {
+               unsigned read_components = num_components + first_component;
+               fs_reg tmp = bld.vgrf(dst.type, read_components);
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+                               payload);
+               for (unsigned i = 0; i < num_components; i++)
+                  bld.MOV(offset(dst, bld, i),
+                          offset(tmp, bld, i + first_component));
+            } else {
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
+                               payload);
+            }
+            inst->offset = imm_offset;
+            inst->mlen = 2;
+         }
+         inst->size_written = (num_components + first_component) *
+                              inst->dst.component_size(inst->exec_size);
+
+         /* If we are reading 64-bit data using 32-bit read messages we need
+          * build proper 64-bit data elements by shuffling the low and high
+          * 32-bit components around like we do for other things like UBOs
+          * or SSBOs.
+          */
+         if (type_sz(dst.type) == 8) {
+            shuffle_32bit_load_result_to_64bit_data(
+               bld, dst, retype(dst, BRW_REGISTER_TYPE_F), num_components);
+
+            for (unsigned c = 0; c < num_components; c++)
+               bld.MOV(offset(orig_dst, bld, iter * 2 + c),
+                       offset(dst, bld, c));
+         }
+
+         /* Copy the temporary to the destination to deal with writemasking.
+          *
+          * Also attempt to deal with gl_PointSize being in the .w component.
+          */
+         if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+            assert(type_sz(dst.type) < 8);
+            inst->dst = bld.vgrf(dst.type, 4);
+            inst->size_written = 4 * REG_SIZE;
+            bld.MOV(dst, offset(inst->dst, bld, 3));
+         }
+
+         /* If we are loading double data and we need a second read message
+          * adjust the write offset
+          */
+         if (num_iterations > 1) {
+            num_components = instr->num_components - 2;
+            imm_offset++;
+         }
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: {
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      unsigned first_component = nir_intrinsic_component(instr);
+
+      fs_inst *inst;
+      if (indirect_offset.file == BAD_FILE) {
+         /* Replicate the patch handle to all enabled channels */
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.MOV(patch_handle,
+                 retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+         {
+            if (first_component != 0) {
+               unsigned read_components =
+                  instr->num_components + first_component;
+               fs_reg tmp = bld.vgrf(dst.type, read_components);
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp,
+                               patch_handle);
+               inst->size_written = read_components * REG_SIZE;
+               for (unsigned i = 0; i < instr->num_components; i++)
+                  bld.MOV(offset(dst, bld, i),
+                          offset(tmp, bld, i + first_component));
+            } else {
+               inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, dst,
+                               patch_handle);
+               inst->size_written = instr->num_components * REG_SIZE;
+            }
+            inst->offset = imm_offset;
+            inst->mlen = 1;
+         }
+      } else {
+         /* Indirect indexing - use per-slot offsets as well. */
+         const fs_reg srcs[] = {
+            retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
+            indirect_offset
+         };
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+         if (first_component != 0) {
+            unsigned read_components =
+               instr->num_components + first_component;
+            fs_reg tmp = bld.vgrf(dst.type, read_components);
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, tmp,
+                            payload);
+            inst->size_written = read_components * REG_SIZE;
+            for (unsigned i = 0; i < instr->num_components; i++)
+               bld.MOV(offset(dst, bld, i),
+                       offset(tmp, bld, i + first_component));
+         } else {
+            inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, dst,
+                            payload);
+            inst->size_written = instr->num_components * REG_SIZE;
+         }
+         inst->offset = imm_offset;
+         inst->mlen = 2;
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: {
+      fs_reg value = get_nir_src(instr->src[0]);
+      bool is_64bit = (instr->src[0].is_ssa ?
+         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
+      fs_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+      unsigned mask = instr->const_index[1];
+      unsigned header_regs = 0;
+      fs_reg srcs[7];
+      srcs[header_regs++] = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+      if (indirect_offset.file != BAD_FILE)
+         srcs[header_regs++] = indirect_offset;
+
+      if (mask == 0)
+         break;
+
+      unsigned num_components = util_last_bit(mask);
+      enum opcode opcode;
+
+      /* We can only pack two 64-bit components in a single message, so send
+       * 2 messages if we have more components
+       */
+      unsigned num_iterations = 1;
+      unsigned iter_components = num_components;
+      unsigned first_component = nir_intrinsic_component(instr);
+      if (is_64bit) {
+         first_component = first_component / 2;
+         if (instr->num_components > 2) {
+            num_iterations = 2;
+            iter_components = 2;
+         }
+      }
+
+      /* 64-bit data needs to be shuffled before we can write it to the URB.
+       * We will use this temporary to shuffle the components in each
+       * iteration.
+       */
+      fs_reg tmp =
+         fs_reg(VGRF, alloc.allocate(2 * iter_components), value.type);
+
+      mask = mask << first_component;
+
+      for (unsigned iter = 0; iter < num_iterations; iter++) {
+         /* Count this message's header registers separately so that a
+          * channel mask added for one message is not kept for the next.
+          */
+         unsigned hdr_regs = header_regs;
+         if (!is_64bit && mask != WRITEMASK_XYZW) {
+            srcs[hdr_regs++] = brw_imm_ud(mask << 16);
+            opcode = indirect_offset.file != BAD_FILE ?
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+         } else if (is_64bit && ((mask & WRITEMASK_XY) != WRITEMASK_XY)) {
+            /* Expand the 64-bit mask to 32-bit channels. We only handle
+             * two channels in each iteration, so we only care about X/Y.
+             */
+            unsigned mask32 = 0;
+            if (mask & WRITEMASK_X)
+               mask32 |= WRITEMASK_XY;
+            if (mask & WRITEMASK_Y)
+               mask32 |= WRITEMASK_ZW;
+
+            /* If the mask does not include any of the channels X or Y there
+             * is nothing to do in this iteration. Move on to the next couple
+             * of 64-bit channels.
+             */
+            if (!mask32) {
+               mask >>= 2;
+               imm_offset++;
+               continue;
+            }
+
+            srcs[hdr_regs++] = brw_imm_ud(mask32 << 16);
+            opcode = indirect_offset.file != BAD_FILE ?
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT :
+               SHADER_OPCODE_URB_WRITE_SIMD8_MASKED;
+         } else {
+            opcode = indirect_offset.file != BAD_FILE ?
+               SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT :
+               SHADER_OPCODE_URB_WRITE_SIMD8;
+         }
+
+         for (unsigned i = 0; i < iter_components; i++) {
+            if (!(mask & (1 << (i + first_component))))
+               continue;
+
+            if (!is_64bit) {
+               srcs[hdr_regs + i + first_component] =
+                  offset(value, bld, BRW_GET_SWZ(swiz, i));
+            } else {
+               /* We need to shuffle the 64-bit data to match the layout
+                * expected by our 32-bit URB write messages. We use a temporary
+                * for that.
+                */
+               unsigned channel = BRW_GET_SWZ(swiz, iter * 2 + i);
+               shuffle_64bit_data_for_32bit_write(bld,
+                  retype(offset(tmp, bld, 2 * i), BRW_REGISTER_TYPE_F),
+                  retype(offset(value, bld, 2 * channel), BRW_REGISTER_TYPE_DF),
+                  1);
+
+               /* Now copy the data to the destination */
+               fs_reg dest = fs_reg(VGRF, alloc.allocate(2), value.type);
+               unsigned idx = 2 * i;
+               bld.MOV(dest, offset(tmp, bld, idx));
+               bld.MOV(offset(dest, bld, 1), offset(tmp, bld, idx + 1));
+               srcs[hdr_regs + idx + first_component * 2] = dest;
+               srcs[hdr_regs + idx + 1 + first_component * 2] =
+                  offset(dest, bld, 1);
+            }
+         }
+
+         unsigned mlen =
+            hdr_regs + (is_64bit ? 2 * iter_components : iter_components) +
+            (is_64bit ? 2 * first_component : first_component);
+         fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, mlen);
+         bld.LOAD_PAYLOAD(payload, srcs, mlen, hdr_regs);
+
+         fs_inst *inst = bld.emit(opcode, bld.null_reg_ud(), payload);
+         inst->offset = imm_offset;
+         inst->mlen = mlen;
+
+         /* If this is a 64-bit attribute, select the next two 64-bit channels
+          * to be handled in the next iteration.
+          */
+         if (is_64bit) {
+            mask >>= 2;
+            imm_offset++;
+         }
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/* Emit FS IR for one tessellation-evaluation intrinsic: the primitive ID,
+ * gl_TessCoord, and input loads (pushed attributes, a plain URB read, or a
+ * per-slot URB read).  Other intrinsics go to nir_emit_intrinsic().
+ */
+void
+fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
+                                   nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_TESS_EVAL);
+   struct brw_tes_prog_data *tes_prog_data = brw_tes_prog_data(prog_data);
+
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      bld.MOV(dest, fs_reg(brw_vec1_grf(0, 1)));
+      break;
+   case nir_intrinsic_load_tess_coord:
+      /* gl_TessCoord is part of the payload in g1-3 */
+      for (unsigned comp = 0; comp < 3; comp++)
+         bld.MOV(offset(dest, bld, comp), fs_reg(brw_vec8_grf(1 + comp, 0)));
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input: {
+      const fs_reg per_slot_offset = get_indirect_offset(instr);
+      unsigned slot = instr->const_index[0];
+      unsigned first_component = nir_intrinsic_component(instr);
+      const bool is_64bit = type_sz(dest.type) == 8;
+      if (is_64bit)
+         first_component /= 2;
+
+      /* Arbitrarily only push up to 32 vec4 slots worth of data,
+       * which is 16 registers (since each holds 2 vec4 slots).
+       */
+      const unsigned max_push_slots = 32;
+      const fs_reg handle_src = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+      fs_inst *read;
+
+      if (per_slot_offset.file == BAD_FILE && slot < max_push_slots) {
+         /* Constant slot within the pushed range: copy from the ATTR file. */
+         const fs_reg src = fs_reg(ATTR, slot / 2, dest.type);
+         const unsigned base_comp =
+            16 / type_sz(dest.type) * (slot % 2) + first_component;
+         for (int i = 0; i < instr->num_components; i++)
+            bld.MOV(offset(dest, bld, i), component(src, base_comp + i));
+
+         /* Raise urb_read_length to at least DIV_ROUND_UP(slot + 1, 2). */
+         tes_prog_data->base.urb_read_length =
+            MAX2(tes_prog_data->base.urb_read_length,
+                 DIV_ROUND_UP(slot + 1, 2));
+      } else if (per_slot_offset.file == BAD_FILE) {
+         /* Constant slot beyond the pushed range: plain URB read.  First
+          * replicate the patch handle to all enabled channels.
+          */
+         const fs_reg srcs[] = { handle_src };
+         fs_reg patch_handle = bld.vgrf(BRW_REGISTER_TYPE_UD, 1);
+         bld.LOAD_PAYLOAD(patch_handle, srcs, ARRAY_SIZE(srcs), 0);
+
+         /* With a non-zero first component, read into a temporary starting
+          * at component 0 and copy out the requested components afterwards.
+          */
+         const unsigned read_components =
+            instr->num_components + first_component;
+         fs_reg read_dst = dest;
+         if (first_component != 0)
+            read_dst = bld.vgrf(dest.type, read_components);
+
+         read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, read_dst, patch_handle);
+         read->size_written = read_components * REG_SIZE;
+         read->mlen = 1;
+         read->offset = slot;
+
+         if (first_component != 0) {
+            for (unsigned i = 0; i < instr->num_components; i++)
+               bld.MOV(offset(dest, bld, i),
+                       offset(read_dst, bld, i + first_component));
+         }
+      } else {
+         /* Indirect indexing - use per-slot offsets as well.
+          *
+          * Only two double components can be read per URB read message, so
+          * wider 64-bit loads are split into two messages.
+          */
+         unsigned num_iterations = 1;
+         unsigned num_components = instr->num_components;
+         fs_reg load_dst = dest;
+         if (is_64bit) {
+            if (instr->num_components > 2) {
+               num_iterations = 2;
+               num_components = 2;
+            }
+            /* Stage 64-bit data in a temporary before copying it to dest. */
+            load_dst = fs_reg(VGRF, alloc.allocate(4), dest.type);
+         }
+
+         for (unsigned iter = 0; iter < num_iterations; iter++) {
+            const fs_reg srcs[] = { handle_src, per_slot_offset };
+            fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+            bld.LOAD_PAYLOAD(payload, srcs, ARRAY_SIZE(srcs), 0);
+
+            const unsigned read_components = num_components + first_component;
+            fs_reg read_dst = load_dst;
+            if (first_component != 0)
+               read_dst = bld.vgrf(load_dst.type, read_components);
+
+            read = bld.emit(SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT, read_dst,
+                            payload);
+            read->mlen = 2;
+            read->offset = slot;
+            read->size_written =
+               read_components * read->dst.component_size(read->exec_size);
+
+            if (first_component != 0) {
+               for (unsigned i = 0; i < num_components; i++)
+                  bld.MOV(offset(load_dst, bld, i),
+                          offset(read_dst, bld, i + first_component));
+            }
+
+            /* If we are reading 64-bit data using 32-bit read messages we need
+             * build proper 64-bit data elements by shuffling the low and high
+             * 32-bit components around like we do for other things like UBOs
+             * or SSBOs.
+             */
+            if (is_64bit) {
+               shuffle_32bit_load_result_to_64bit_data(
+                  bld, load_dst, retype(load_dst, BRW_REGISTER_TYPE_F),
+                  num_components);
+
+               for (unsigned c = 0; c < num_components; c++)
+                  bld.MOV(offset(dest, bld, iter * 2 + c),
+                          offset(load_dst, bld, c));
+            }
+
+            /* If we are loading double data and we need a second read message
+             * adjust the offset
+             */
+            if (num_iterations > 1) {
+               num_components = instr->num_components - 2;
+               slot++;
+            }
+         }
+      }
+      break;
+   }
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/* Emit FS IR for one geometry-shader intrinsic; intrinsics not listed in the
+ * switch are forwarded to nir_emit_intrinsic().
+ */
+void
+fs_visitor::nir_emit_gs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_GEOMETRY);
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      assert(brw_gs_prog_data(prog_data)->include_primitive_id);
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD),
+              retype(fs_reg(brw_vec8_grf(2, 0)), BRW_REGISTER_TYPE_UD));
+      break;
+
+   case nir_intrinsic_load_input:
+      unreachable("load_input intrinsics are invalid for the GS stage");
+
+   case nir_intrinsic_load_per_vertex_input:
+      emit_gs_input_load(dest, instr->src[0], instr->const_index[0],
+                         instr->src[1], instr->num_components,
+                         nir_intrinsic_component(instr));
+      break;
+
+   case nir_intrinsic_emit_vertex_with_counter:
+      emit_gs_vertex(instr->src[0], instr->const_index[0]);
+      break;
+
+   case nir_intrinsic_end_primitive_with_counter:
+      emit_gs_end_primitive(instr->src[0]);
+      break;
+
+   case nir_intrinsic_set_vertex_count:
+      bld.MOV(this->final_gs_vertex_count, get_nir_src(instr->src[0]));
+      break;
+
+   case nir_intrinsic_load_invocation_id: {
+      fs_reg val = nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+/**
+ * Return a register holding the layer index of the render target currently
+ * being rendered to (an immediate 0 before Gen6).
+ */
+static fs_reg
+fetch_render_target_array_index(const fs_builder &bld)
+{
+   /* Pre-SNB we only ever render into the first layer of the framebuffer
+    * since layered rendering is not implemented.
+    */
+   if (bld.shader->devinfo->gen < 6)
+      return brw_imm_ud(0);
+
+   /* The render target array index is provided in the thread payload as
+    * bits 26:16 of r0.0, hence the 11-bit mask.
+    */
+   const fs_reg layer = bld.vgrf(BRW_REGISTER_TYPE_UD);
+   bld.AND(layer, brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, 0, 1),
+           brw_imm_uw(0x7ff));
+   return layer;
+}
+
+/**
+ * Fake non-coherent framebuffer read: fetch render target \p target with TXF
+ * at the current fragment coordinates and sample index into \p dst.
+ */
+fs_inst *
+fs_visitor::emit_non_coherent_fb_read(const fs_builder &bld, const fs_reg &dst,
+                                      unsigned target)
+{
+   const struct gen_device_info *devinfo = bld.shader->devinfo;
+
+   assert(bld.shader->stage == MESA_SHADER_FRAGMENT);
+   const brw_wm_prog_key *wm_key =
+      reinterpret_cast<const brw_wm_prog_key *>(key);
+   assert(!wm_key->coherent_fb_fetch);
+   const struct brw_wm_prog_data *wm_prog_data =
+      brw_wm_prog_data(stage_prog_data);
+   const bool multisampled = wm_key->multisample_fbo;
+
+   /* The texturing messages expect a surface index relative to the start of
+    * the texture binding table block; the absolute index is marked as used.
+    */
+   const unsigned rt_read_index =
+      wm_prog_data->binding_table.render_target_read_start + target;
+   const unsigned surface =
+      rt_read_index - wm_prog_data->base.binding_table.texture_start;
+   brw_mark_surface_used(bld.shader->stage_prog_data, rt_read_index);
+
+   const fs_reg coords = bld.vgrf(BRW_REGISTER_TYPE_UD, 3);
+   bld.MOV(offset(coords, bld, 0), pixel_x);
+   bld.MOV(offset(coords, bld, 1), pixel_y);
+   const fs_reg layer = fetch_render_target_array_index(bld);
+   bld.MOV(offset(coords, bld, 2), layer);
+
+   /* Calculate the sample index and MCS payload when multisampling.  Luckily
+    * the MCS fetch message behaves deterministically for UMS surfaces, so it
+    * shouldn't be necessary to recompile based on whether the framebuffer is
+    * CMS or UMS.
+    */
+   if (multisampled &&
+       nir_system_values[SYSTEM_VALUE_SAMPLE_ID].file == BAD_FILE)
+      nir_system_values[SYSTEM_VALUE_SAMPLE_ID] = *emit_sampleid_setup();
+
+   const fs_reg sample = nir_system_values[SYSTEM_VALUE_SAMPLE_ID];
+   fs_reg mcs;
+   if (multisampled)
+      mcs = emit_mcs_fetch(coords, 3, brw_imm_ud(surface));
+
+   /* Single-sample framebuffers use a normal texel fetch, multisample ones a
+    * CMS fetch.  On SKL+ use the wide CMS message just in case the
+    * framebuffer uses 16x multisampling, it should be equivalent to the
+    * normal CMS fetch for lower multisampling modes.
+    */
+   opcode op = SHADER_OPCODE_TXF_LOGICAL;
+   if (multisampled)
+      op = devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W_LOGICAL :
+                               SHADER_OPCODE_TXF_CMS_LOGICAL;
+
+   /* Emit the instruction. */
+   const fs_reg srcs[] = { coords, fs_reg(), brw_imm_ud(0), fs_reg(),
+                           sample, mcs,
+                           brw_imm_ud(surface), brw_imm_ud(0),
+                           fs_reg(), brw_imm_ud(3), brw_imm_ud(0) };
+   STATIC_ASSERT(ARRAY_SIZE(srcs) == TEX_LOGICAL_NUM_SRCS);
+
+   fs_inst *inst = bld.emit(op, dst, srcs, ARRAY_SIZE(srcs));
+   inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
+   return inst;
+}
+
+/**
+ * Coherent framebuffer fetch done with the native render target read
+ * message.  Only valid on Gen9 (SKL) and later.
+ */
+static fs_inst *
+emit_coherent_fb_read(const fs_builder &bld, const fs_reg &dst, unsigned target)
+{
+   assert(bld.shader->devinfo->gen >= 9);
+   fs_inst *const fb_read = bld.emit(FS_OPCODE_FB_READ_LOGICAL, dst);
+   /* The read writes four components of the destination. */
+   fb_read->size_written = 4 * fb_read->dst.component_size(fb_read->exec_size);
+   fb_read->target = target;
+   return fb_read;
+}
+
+static fs_reg
+alloc_temporary(const fs_builder &bld, unsigned size, fs_reg *regs, unsigned n)
+{
+   /* Reuse the register already stored in the first slot, if any. */
+   if (n != 0 && regs[0].file != BAD_FILE)
+      return regs[0];
+
+   /* Otherwise allocate a float-typed VGRF sized by \p size and record it in
+    * each of the \p n slots so that later calls return the same register.
+    */
+   const fs_reg fresh = bld.vgrf(BRW_REGISTER_TYPE_F, size);
+   for (fs_reg *slot = regs; slot != regs + n; slot++)
+      *slot = fresh;
+   return fresh;
+}
+
+static fs_reg
+alloc_frag_output(fs_visitor *v, unsigned location)
+{
+   assert(v->stage == MESA_SHADER_FRAGMENT);
+   const brw_wm_prog_key *const key =
+      reinterpret_cast<const brw_wm_prog_key *>(v->key);
+   const unsigned slot = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_LOCATION);
+   const unsigned index = GET_FIELD(location, BRW_NIR_FRAG_OUTPUT_INDEX);
+
+   /* Outputs with a non-zero index, and DATA1 when dual-color blending is
+    * forced, go to the dual-source output register. */
+   if (index > 0 || (key->force_dual_color_blend && slot == FRAG_RESULT_DATA1))
+      return alloc_temporary(v->bld, 4, &v->dual_src_output, 1);
+
+   switch (slot) {
+   case FRAG_RESULT_COLOR:
+      /* Shared by every color region (at least one slot is filled in). */
+      return alloc_temporary(v->bld, 4, v->outputs,
+                             MAX2(key->nr_color_regions, 1));
+   case FRAG_RESULT_DEPTH:
+      return alloc_temporary(v->bld, 1, &v->frag_depth, 1);
+   case FRAG_RESULT_STENCIL:
+      return alloc_temporary(v->bld, 1, &v->frag_stencil, 1);
+   case FRAG_RESULT_SAMPLE_MASK:
+      return alloc_temporary(v->bld, 1, &v->sample_mask, 1);
+   default:
+      if (slot < FRAG_RESULT_DATA0 ||
+          slot >= FRAG_RESULT_DATA0 + BRW_MAX_DRAW_BUFFERS)
+         unreachable("Invalid location");
+      return alloc_temporary(v->bld, 4,
+                             &v->outputs[slot - FRAG_RESULT_DATA0], 1);
+   }
+}
+
+void
+fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   /* Handles fragment-only intrinsics; the rest hit the default case. */
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_front_face:
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D),
+              *emit_frontfacing_interpolation());
+      break;
+
+   case nir_intrinsic_load_sample_pos: {
+      fs_reg sample_pos = nir_system_values[SYSTEM_VALUE_SAMPLE_POS];
+      assert(sample_pos.file != BAD_FILE);
+      dest.type = sample_pos.type;
+      bld.MOV(dest, sample_pos);
+      bld.MOV(offset(dest, bld, 1), offset(sample_pos, bld, 1));
+      break;
+   }
+
+   case nir_intrinsic_load_layer_id:
+      dest.type = BRW_REGISTER_TYPE_UD;
+      bld.MOV(dest, fetch_render_target_array_index(bld));
+      break;
+
+   case nir_intrinsic_load_helper_invocation:
+   case nir_intrinsic_load_sample_mask_in:
+   case nir_intrinsic_load_sample_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      bld.MOV(dest, val);
+      break;
+   }
+
+   case nir_intrinsic_store_output: {
+      const fs_reg src = get_nir_src(instr->src[0]);
+      const nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      assert(const_offset && "Indirect output stores not allowed");
+      const unsigned location = nir_intrinsic_base(instr) +
+         SET_FIELD(const_offset->u32[0], BRW_NIR_FRAG_OUTPUT_LOCATION);
+      const fs_reg new_dest = retype(alloc_frag_output(this, location),
+                                     src.type);
+
+      for (unsigned j = 0; j < instr->num_components; j++)
+         bld.MOV(offset(new_dest, bld, nir_intrinsic_component(instr) + j),
+                 offset(src, bld, j));
+
+      break;
+   }
+
+   case nir_intrinsic_load_output: {
+      const unsigned l = GET_FIELD(nir_intrinsic_base(instr),
+                                   BRW_NIR_FRAG_OUTPUT_LOCATION);
+      assert(l >= FRAG_RESULT_DATA0);
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      assert(const_offset && "Indirect output loads not allowed");
+      const unsigned target = l - FRAG_RESULT_DATA0 + const_offset->u32[0];
+      const fs_reg tmp = bld.vgrf(dest.type, 4);
+      /* Read all 4 channels into tmp, then copy out the requested ones. */
+      if (reinterpret_cast<const brw_wm_prog_key *>(key)->coherent_fb_fetch)
+         emit_coherent_fb_read(bld, tmp, target);
+      else
+         emit_non_coherent_fb_read(bld, tmp, target);
+
+      for (unsigned j = 0; j < instr->num_components; j++) {
+         bld.MOV(offset(dest, bld, j),
+                 offset(tmp, bld, nir_intrinsic_component(instr) + j));
+      }
+
+      break;
+   }
+
+   case nir_intrinsic_discard:
+   case nir_intrinsic_discard_if: {
+      /* We track our discarded pixels in f0.1.  By predicating on it, we can
+       * update just the flag bits that aren't yet discarded.  If there's no
+       * condition, we emit a CMP of g0 != g0, so all currently executing
+       * channels will get turned off.
+       */
+      fs_inst *cmp;
+      if (instr->intrinsic == nir_intrinsic_discard_if) {
+         cmp = bld.CMP(bld.null_reg_f(), get_nir_src(instr->src[0]),
+                       brw_imm_d(0), BRW_CONDITIONAL_Z);
+      } else {
+         fs_reg some_reg = fs_reg(retype(brw_vec8_grf(0, 0),
+                                       BRW_REGISTER_TYPE_UW));
+         cmp = bld.CMP(bld.null_reg_f(), some_reg, some_reg, BRW_CONDITIONAL_NZ);
+      }
+      cmp->predicate = BRW_PREDICATE_NORMAL;
+      cmp->flag_subreg = 1;
+
+      if (devinfo->gen >= 6) {
+         emit_discard_jump();
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_input: {
+      /* load_input is only used for flat inputs */
+      unsigned base = nir_intrinsic_base(instr);
+      unsigned component = nir_intrinsic_component(instr);
+      unsigned num_components = instr->num_components;
+      enum brw_reg_type type = dest.type;
+
+      /* Special case fields in the VUE header */
+      if (base == VARYING_SLOT_LAYER)
+         component = 1;
+      else if (base == VARYING_SLOT_VIEWPORT)
+         component = 2;
+
+      if (nir_dest_bit_size(instr->dest) == 64) {
+         /* const_index is in 32-bit type size units that could not be aligned
+          * with DF. We need to read the double vector as if it was a float
+          * vector of twice the number of components to fetch the right data.
+          */
+         type = BRW_REGISTER_TYPE_F;
+         num_components *= 2;
+      }
+
+      for (unsigned int i = 0; i < num_components; i++) {
+         struct brw_reg interp = interp_reg(base, component + i);
+         interp = suboffset(interp, 3);
+         bld.emit(FS_OPCODE_CINTERP, offset(retype(dest, type), bld, i),
+                  retype(fs_reg(interp), type));
+      }
+
+      if (nir_dest_bit_size(instr->dest) == 64) {
+         shuffle_32bit_load_result_to_64bit_data(bld,
+                                                 dest,
+                                                 retype(dest, type),
+                                                 instr->num_components);
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_barycentric_pixel:
+   case nir_intrinsic_load_barycentric_centroid:
+   case nir_intrinsic_load_barycentric_sample:
+      /* Do nothing - load_interpolated_input handling will handle it later. */
+      break;
+
+   case nir_intrinsic_load_barycentric_at_sample: {
+      const glsl_interp_mode interpolation =
+         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
+
+      nir_const_value *const_sample = nir_src_as_const_value(instr->src[0]);
+
+      if (const_sample) {
+         unsigned msg_data = const_sample->i32[0] << 4;
+
+         emit_pixel_interpolater_send(bld,
+                                      FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                      dest,
+                                      fs_reg(), /* src */
+                                      brw_imm_ud(msg_data),
+                                      interpolation);
+      } else {
+         const fs_reg sample_src = retype(get_nir_src(instr->src[0]),
+                                          BRW_REGISTER_TYPE_UD);
+
+         if (nir_src_is_dynamically_uniform(instr->src[0])) {
+            const fs_reg sample_id = bld.emit_uniformize(sample_src);
+            const fs_reg msg_data = vgrf(glsl_type::uint_type);
+            bld.exec_all().group(1, 0)
+               .SHL(msg_data, sample_id, brw_imm_ud(4u));
+            emit_pixel_interpolater_send(bld,
+                                         FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                         dest,
+                                         fs_reg(), /* src */
+                                         msg_data,
+                                         interpolation);
+         } else {
+            /* Make a loop that sends a message to the pixel interpolater
+             * for the sample number in each live channel. If there are
+             * multiple channels with the same sample number then these
+             * will be handled simultaneously with a single iteration of
+             * the loop.
+             */
+            bld.emit(BRW_OPCODE_DO);
+
+            /* Get the next live sample number into sample_id */
+            const fs_reg sample_id = bld.emit_uniformize(sample_src);
+
+            /* Set the flag register so that we can perform the send
+             * message on all channels that have the same sample number
+             */
+            bld.CMP(bld.null_reg_ud(),
+                    sample_src, sample_id,
+                    BRW_CONDITIONAL_EQ);
+            const fs_reg msg_data = vgrf(glsl_type::uint_type);
+            bld.exec_all().group(1, 0)
+               .SHL(msg_data, sample_id, brw_imm_ud(4u));
+            fs_inst *inst =
+               emit_pixel_interpolater_send(bld,
+                                            FS_OPCODE_INTERPOLATE_AT_SAMPLE,
+                                            dest,
+                                            fs_reg(), /* src */
+                                            msg_data,
+                                            interpolation);
+            set_predicate(BRW_PREDICATE_NORMAL, inst);
+
+            /* Continue the loop if there are any live channels left */
+            set_predicate_inv(BRW_PREDICATE_NORMAL,
+                              true, /* inverse */
+                              bld.emit(BRW_OPCODE_WHILE));
+         }
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_barycentric_at_offset: {
+      const glsl_interp_mode interpolation =
+         (enum glsl_interp_mode) nir_intrinsic_interp_mode(instr);
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+
+      if (const_offset) {
+         unsigned off_x = MIN2((int)(const_offset->f32[0] * 16), 7) & 0xf;
+         unsigned off_y = MIN2((int)(const_offset->f32[1] * 16), 7) & 0xf;
+
+         emit_pixel_interpolater_send(bld,
+                                      FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET,
+                                      dest,
+                                      fs_reg(), /* src */
+                                      brw_imm_ud(off_x | (off_y << 4)),
+                                      interpolation);
+      } else {
+         fs_reg src = vgrf(glsl_type::ivec2_type);
+         fs_reg offset_src = retype(get_nir_src(instr->src[0]),
+                                    BRW_REGISTER_TYPE_F);
+         for (int i = 0; i < 2; i++) {
+            fs_reg temp = vgrf(glsl_type::float_type);
+            bld.MUL(temp, offset(offset_src, bld, i), brw_imm_f(16.0f));
+            fs_reg itemp = vgrf(glsl_type::int_type);
+            /* float to int */
+            bld.MOV(itemp, temp);
+
+            /* Clamp the upper end of the range to +7/16.
+             * ARB_gpu_shader5 requires that we support a maximum offset
+             * of +0.5, which isn't representable in a S0.4 value -- if
+             * we didn't clamp it, we'd end up with -8/16, which is the
+             * opposite of what the shader author wanted.
+             *
+             * This is legal due to ARB_gpu_shader5's quantization
+             * rules:
+             *
+             * "Not all values of <offset> may be supported; x and y
+             * offsets may be rounded to fixed-point values with the
+             * number of fraction bits given by the
+             * implementation-dependent constant
+             * FRAGMENT_INTERPOLATION_OFFSET_BITS"
+             */
+            set_condmod(BRW_CONDITIONAL_L,
+                        bld.SEL(offset(src, bld, i), itemp, brw_imm_d(7)));
+         }
+
+         const enum opcode opcode = FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET;
+         emit_pixel_interpolater_send(bld,
+                                      opcode,
+                                      dest,
+                                      src,
+                                      brw_imm_ud(0u),
+                                      interpolation);
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_interpolated_input: {
+      if (nir_intrinsic_base(instr) == VARYING_SLOT_POS) {
+         emit_fragcoord_interpolation(dest);
+         break;
+      }
+
+      assert(instr->src[0].ssa &&
+             instr->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic);
+      nir_intrinsic_instr *bary_intrinsic =
+         nir_instr_as_intrinsic(instr->src[0].ssa->parent_instr);
+      nir_intrinsic_op bary_intrin = bary_intrinsic->intrinsic;
+      enum glsl_interp_mode interp_mode =
+         (enum glsl_interp_mode) nir_intrinsic_interp_mode(bary_intrinsic);
+      fs_reg dst_xy;
+
+      if (bary_intrin == nir_intrinsic_load_barycentric_at_offset ||
+          bary_intrin == nir_intrinsic_load_barycentric_at_sample) {
+         /* Use the result of the pixel interpolator (PI) message */
+         dst_xy = retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_F);
+      } else {
+         /* Use the delta_xy values computed from the payload */
+         enum brw_barycentric_mode bary =
+            brw_barycentric_mode(interp_mode, bary_intrin);
+
+         dst_xy = this->delta_xy[bary];
+      }
+
+      for (unsigned int i = 0; i < instr->num_components; i++) {
+         fs_reg interp =
+            fs_reg(interp_reg(nir_intrinsic_base(instr),
+                              nir_intrinsic_component(instr) + i));
+         interp.type = BRW_REGISTER_TYPE_F;
+         dest.type = BRW_REGISTER_TYPE_F;
+
+         if (devinfo->gen < 6 && interp_mode == INTERP_MODE_SMOOTH) {
+            fs_reg tmp = vgrf(glsl_type::float_type);
+            bld.emit(FS_OPCODE_LINTERP, tmp, dst_xy, interp);
+            bld.MUL(offset(dest, bld, i), tmp, this->pixel_w);
+         } else {
+            bld.emit(FS_OPCODE_LINTERP, offset(dest, bld, i), dst_xy, interp);
+         }
+      }
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_cs_intrinsic(const fs_builder &bld,
+                                  nir_intrinsic_instr *instr)
+{
+   assert(stage == MESA_SHADER_COMPUTE);
+   struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
+   /* Handles compute-only intrinsics; the rest hit the default case. */
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_barrier:
+      emit_barrier();
+      cs_prog_data->uses_barrier = true;
+      break;
+
+   case nir_intrinsic_load_local_invocation_id:
+   case nir_intrinsic_load_work_group_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      fs_reg val = nir_system_values[sv];
+      assert(val.file != BAD_FILE);
+      dest.type = val.type;
+      for (unsigned i = 0; i < 3; i++)
+         bld.MOV(offset(dest, bld, i), offset(val, bld, i));
+      break;
+   }
+
+   case nir_intrinsic_load_num_work_groups: {
+      const unsigned surface =
+         cs_prog_data->binding_table.work_groups_start;
+
+      cs_prog_data->uses_num_work_groups = true;
+
+      fs_reg surf_index = brw_imm_ud(surface);
+      brw_mark_surface_used(prog_data, surface);
+
+      /* Read the 3 GLuint components of gl_NumWorkGroups */
+      for (unsigned i = 0; i < 3; i++) {
+         fs_reg read_result =
+            emit_untyped_read(bld, surf_index,
+                              brw_imm_ud(i << 2),
+                              1 /* dims */, 1 /* size */,
+                              BRW_PREDICATE_NONE);
+         read_result.type = dest.type;
+         bld.MOV(dest, read_result);
+         dest = offset(dest, bld, 1);
+      }
+      break;
+   }
+
+   case nir_intrinsic_shared_atomic_add:
+      nir_emit_shared_atomic(bld, BRW_AOP_ADD, instr);
+      break;
+   case nir_intrinsic_shared_atomic_imin:
+      nir_emit_shared_atomic(bld, BRW_AOP_IMIN, instr);
+      break;
+   case nir_intrinsic_shared_atomic_umin:
+      nir_emit_shared_atomic(bld, BRW_AOP_UMIN, instr);
+      break;
+   case nir_intrinsic_shared_atomic_imax:
+      nir_emit_shared_atomic(bld, BRW_AOP_IMAX, instr);
+      break;
+   case nir_intrinsic_shared_atomic_umax:
+      nir_emit_shared_atomic(bld, BRW_AOP_UMAX, instr);
+      break;
+   case nir_intrinsic_shared_atomic_and:
+      nir_emit_shared_atomic(bld, BRW_AOP_AND, instr);
+      break;
+   case nir_intrinsic_shared_atomic_or:
+      nir_emit_shared_atomic(bld, BRW_AOP_OR, instr);
+      break;
+   case nir_intrinsic_shared_atomic_xor:
+      nir_emit_shared_atomic(bld, BRW_AOP_XOR, instr);
+      break;
+   case nir_intrinsic_shared_atomic_exchange:
+      nir_emit_shared_atomic(bld, BRW_AOP_MOV, instr);
+      break;
+   case nir_intrinsic_shared_atomic_comp_swap:
+      nir_emit_shared_atomic(bld, BRW_AOP_CMPWR, instr);
+      break;
+
+   case nir_intrinsic_load_shared: {
+      assert(devinfo->gen >= 7);
+
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Get the offset to read from */
+      fs_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0]);
+      } else {
+         offset_reg = vgrf(glsl_type::uint_type);
+         bld.ADD(offset_reg,
+                 retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+                 brw_imm_ud(instr->const_index[0]));
+      }
+
+      /* Read the vector */
+      do_untyped_vector_read(bld, dest, surf_index, offset_reg,
+                             instr->num_components);
+      break;
+   }
+
+   case nir_intrinsic_store_shared: {
+      assert(devinfo->gen >= 7);
+
+      /* Surface index (shared local memory binding) */
+      fs_reg surf_index = brw_imm_ud(GEN7_BTI_SLM);
+
+      /* Value */
+      fs_reg val_reg = get_nir_src(instr->src[0]);
+
+      /* Writemask */
+      unsigned writemask = instr->const_index[1];
+
+      /* get_nir_src() retypes to integer. Be wary of 64-bit types though
+       * since the untyped writes below operate in units of 32-bits, which
+       * means that we need to write twice as many components each time.
+       * Also, we have to shuffle 64-bit data to be in the appropriate layout
+       * expected by our 32-bit write messages.
+       */
+      unsigned type_size = 4;
+      unsigned bit_size = instr->src[0].is_ssa ?
+         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+      if (bit_size == 64) {
+         type_size = 8;
+         fs_reg tmp =
+           fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
+         shuffle_64bit_data_for_32bit_write(
+            bld,
+            retype(tmp, BRW_REGISTER_TYPE_F),
+            retype(val_reg, BRW_REGISTER_TYPE_DF),
+            instr->num_components);
+         val_reg = tmp;
+      }
+
+      unsigned type_slots = type_size / 4;
+
+      /* Combine groups of consecutive enabled channels in one write
+       * message. We use ffs to find the first enabled channel and then ffs on
+       * the bit-inverse, down-shifted writemask to determine the length of
+       * the block of enabled bits.
+       */
+      while (writemask) {
+         unsigned first_component = ffs(writemask) - 1;
+         unsigned length = ffs(~(writemask >> first_component)) - 1;
+
+         /* We can't write more than 2 64-bit components at once. Limit the
+          * length of the write to what we can do and let the next iteration
+          * handle the rest
+          */
+         if (type_size > 4)
+            length = MIN2(2, length);
+
+         fs_reg offset_reg;
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+         if (const_offset) {
+            offset_reg = brw_imm_ud(instr->const_index[0] + const_offset->u32[0] +
+                                    type_size * first_component);
+         } else {
+            offset_reg = vgrf(glsl_type::uint_type);
+            bld.ADD(offset_reg,
+                    retype(get_nir_src(instr->src[1]), BRW_REGISTER_TYPE_UD),
+                    brw_imm_ud(instr->const_index[0] + type_size * first_component));
+         }
+
+         emit_untyped_write(bld, surf_index, offset_reg,
+                            offset(val_reg, bld, first_component * type_slots),
+                            1 /* dims */, length * type_slots,
+                            BRW_PREDICATE_NONE);
+
+         /* Clear the bits in the writemask that we just wrote, then try
+          * again to see if more channels are left.
+          */
+         writemask &= (15 << (first_component + length));
+      }
+
+      break;
+   }
+
+   default:
+      nir_emit_intrinsic(bld, instr);
+      break;
+   }
+}
+
+void
+fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr)
+{
+   fs_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_dec:
+   case nir_intrinsic_atomic_counter_read:
+   case nir_intrinsic_atomic_counter_add:
+   case nir_intrinsic_atomic_counter_min:
+   case nir_intrinsic_atomic_counter_max:
+   case nir_intrinsic_atomic_counter_and:
+   case nir_intrinsic_atomic_counter_or:
+   case nir_intrinsic_atomic_counter_xor:
+   case nir_intrinsic_atomic_counter_exchange:
+   case nir_intrinsic_atomic_counter_comp_swap: {
+      if (stage == MESA_SHADER_FRAGMENT &&
+          instr->intrinsic != nir_intrinsic_atomic_counter_read)
+         brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+      /* Get some metadata from the image intrinsic. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+
+      /* Get the arguments of the atomic intrinsic. */
+      const fs_reg offset = get_nir_src(instr->src[0]);
+      const unsigned surface = (stage_prog_data->binding_table.abo_start +
+                                instr->const_index[0]);
+      const fs_reg src0 = (info->num_srcs >= 2
+                           ? get_nir_src(instr->src[1]) : fs_reg());
+      const fs_reg src1 = (info->num_srcs >= 3
+                           ? get_nir_src(instr->src[2]) : fs_reg());
+      fs_reg tmp;
+
+      assert(info->num_srcs <= 3);
+
+      /* Emit a surface read or atomic op. */
+      if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
+         tmp = emit_untyped_read(bld, brw_imm_ud(surface), offset, 1, 1);
+      } else {
+         tmp = emit_untyped_atomic(bld, brw_imm_ud(surface), offset, src0,
+                                   src1, 1, 1,
+                                   get_atomic_counter_op(instr->intrinsic));
+      }
+
+      /* Assign the result. */
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_UD), tmp);
+
+      /* Mark the surface as used. */
+      brw_mark_surface_used(stage_prog_data, surface);
+      break;
+   }
+
+   case nir_intrinsic_image_load:
+   case nir_intrinsic_image_store:
+   case nir_intrinsic_image_atomic_add:
+   case nir_intrinsic_image_atomic_min:
+   case nir_intrinsic_image_atomic_max:
+   case nir_intrinsic_image_atomic_and:
+   case nir_intrinsic_image_atomic_or:
+   case nir_intrinsic_image_atomic_xor:
+   case nir_intrinsic_image_atomic_exchange:
+   case nir_intrinsic_image_atomic_comp_swap: {
+      using namespace image_access;
+
+      if (stage == MESA_SHADER_FRAGMENT &&
+          instr->intrinsic != nir_intrinsic_image_load)
+         brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+      const brw_reg_type base_type = get_image_base_type(type);
+
+      /* Get some metadata from the image intrinsic. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      const unsigned arr_dims = type->sampler_array ? 1 : 0;
+      const unsigned surf_dims = type->coordinate_components() - arr_dims;
+      const unsigned format = var->data.image.format;
+
+      /* Get the arguments of the image intrinsic. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg addr = retype(get_nir_src(instr->src[0]),
+                                 BRW_REGISTER_TYPE_UD);
+      const fs_reg src0 = (info->num_srcs >= 3 ?
+                           retype(get_nir_src(instr->src[2]), base_type) :
+                           fs_reg());
+      const fs_reg src1 = (info->num_srcs >= 4 ?
+                           retype(get_nir_src(instr->src[3]), base_type) :
+                           fs_reg());
+      fs_reg tmp;
+
+      /* Emit an image load, store or atomic op. */
+      if (instr->intrinsic == nir_intrinsic_image_load)
+         tmp = emit_image_load(bld, image, addr, surf_dims, arr_dims, format);
+
+      else if (instr->intrinsic == nir_intrinsic_image_store)
+         emit_image_store(bld, image, addr, src0, surf_dims, arr_dims,
+                          var->data.image.write_only ? GL_NONE : format);
+
+      else
+         tmp = emit_image_atomic(bld, image, addr, src0, src1,
+                                 surf_dims, arr_dims, info->dest_components,
+                                 get_image_atomic_op(instr->intrinsic, type));
+
+      /* Assign the result. */
+      for (unsigned c = 0; c < info->dest_components; ++c)
+         bld.MOV(offset(retype(dest, base_type), bld, c),
+                 offset(tmp, bld, c));
+      break;
+   }
+
+   case nir_intrinsic_memory_barrier_atomic_counter:
+   case nir_intrinsic_memory_barrier_buffer:
+   case nir_intrinsic_memory_barrier_image:
+   case nir_intrinsic_memory_barrier: {
+      const fs_builder ubld = bld.group(8, 0);
+      const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      ubld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+         ->size_written = 2 * REG_SIZE;
+      break;
+   }
+
+   case nir_intrinsic_group_memory_barrier:
+   case nir_intrinsic_memory_barrier_shared:
+      /* We treat these workgroup-level barriers as no-ops.  This should be
+       * safe at present and as long as:
+       *
+       *  - Memory access instructions are not subsequently reordered by the
+       *    compiler back-end.
+       *
+       *  - All threads from a given compute shader workgroup fit within a
+       *    single subslice and therefore talk to the same HDC shared unit
+       *    which supposedly guarantees ordering and coherency between threads
+       *    from the same workgroup.  This may change in the future when we
+       *    start splitting workgroups across multiple subslices.
+       *
+       *  - The context is not in fault-and-stream mode, which could cause
+       *    memory transactions (including to SLM) prior to the barrier to be
+       *    replayed after the barrier if a pagefault occurs.  This shouldn't
+       *    be a problem up to and including SKL because fault-and-stream is
+       *    not usable due to hardware issues, but that's likely to change in
+       *    the future.
+       */
+      break;
+
+   case nir_intrinsic_shader_clock: {
+      /* We cannot do anything if there is an event, so ignore it for now */
+      const fs_reg shader_clock = get_timestamp(bld);
+      const fs_reg srcs[] = { component(shader_clock, 0),
+                              component(shader_clock, 1) };
+      bld.LOAD_PAYLOAD(dest, srcs, ARRAY_SIZE(srcs), 0);
+      break;
+   }
+
+   case nir_intrinsic_image_size: {
+      /* Get the referenced image variable and type. */
+      const nir_variable *var = instr->variables[0]->var;
+      const glsl_type *type = var->type->without_array();
+
+      /* Get the size of the image. */
+      const fs_reg image = get_nir_image_deref(instr->variables[0]);
+      const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+
+      /* For 1DArray image types, the array index is stored in the Z component.
+       * Fix this by swizzling the Z component to the Y component.
+       */
+      const bool is_1d_array_image =
+                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_1D &&
+                  type->sampler_array;
+
+      /* For CubeArray images, we should count the number of cubes instead
+       * of the number of faces. Fix it by dividing the (Z component) by 6.
+       */
+      const bool is_cube_array_image =
+                  type->sampler_dimensionality == GLSL_SAMPLER_DIM_CUBE &&
+                  type->sampler_array;
+
+      /* Copy all the components. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+      for (unsigned c = 0; c < info->dest_components; ++c) {
+         if ((int)c >= type->coordinate_components()) {
+             bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                     brw_imm_d(1));
+         } else if (c == 1 && is_1d_array_image) {
+            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                    offset(size, bld, 2));
+         } else if (c == 2 && is_cube_array_image) {
+            bld.emit(SHADER_OPCODE_INT_QUOTIENT,
+                     offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                     offset(size, bld, c), brw_imm_d(6));
+         } else {
+            bld.MOV(offset(retype(dest, BRW_REGISTER_TYPE_D), bld, c),
+                    offset(size, bld, c));
+         }
+       }
+
+      break;
+   }
+
+   case nir_intrinsic_image_samples:
+      /* The driver does not support multi-sampled images. */
+      bld.MOV(retype(dest, BRW_REGISTER_TYPE_D), brw_imm_d(1));
+      break;
+
+   case nir_intrinsic_load_uniform: {
+      /* Offsets are in bytes but they should always be multiples of 4 */
+      assert(instr->const_index[0] % 4 == 0);
+
+      fs_reg src(UNIFORM, instr->const_index[0] / 4, dest.type);
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      if (const_offset) {
+         /* Offsets are in bytes but they should always be multiples of 4 */
+         assert(const_offset->u32[0] % 4 == 0);
+         src.offset = const_offset->u32[0];
+
+         for (unsigned j = 0; j < instr->num_components; j++) {
+            bld.MOV(offset(dest, bld, j), offset(src, bld, j));
+         }
+      } else {
+         fs_reg indirect = retype(get_nir_src(instr->src[0]),
+                                  BRW_REGISTER_TYPE_UD);
+
+         /* We need to pass a size to the MOV_INDIRECT but we don't want it to
+          * go past the end of the uniform.  In order to keep the n'th
+          * component from running past, we subtract off the size of all but
+          * one component of the vector.
+          */
+         assert(instr->const_index[1] >=
+                instr->num_components * (int) type_sz(dest.type));
+         unsigned read_size = instr->const_index[1] -
+            (instr->num_components - 1) * type_sz(dest.type);
+
+         bool supports_64bit_indirects =
+            !devinfo->is_cherryview && !devinfo->is_broxton;
+
+         if (type_sz(dest.type) != 8 || supports_64bit_indirects) {
+            for (unsigned j = 0; j < instr->num_components; j++) {
+               bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                        offset(dest, bld, j), offset(src, bld, j),
+                        indirect, brw_imm_ud(read_size));
+            }
+         } else {
+            const unsigned num_mov_indirects =
+               type_sz(dest.type) / type_sz(BRW_REGISTER_TYPE_UD);
+            /* We read a little bit less per MOV INDIRECT, as they are now
+             * 32-bits ones instead of 64-bit. Fix read_size then.
+             */
+            const unsigned read_size_32bit = read_size -
+                (num_mov_indirects - 1) * type_sz(BRW_REGISTER_TYPE_UD);
+            for (unsigned j = 0; j < instr->num_components; j++) {
+               for (unsigned i = 0; i < num_mov_indirects; i++) {
+                  bld.emit(SHADER_OPCODE_MOV_INDIRECT,
+                           subscript(offset(dest, bld, j), BRW_REGISTER_TYPE_UD, i),
+                           subscript(offset(src, bld, j), BRW_REGISTER_TYPE_UD, i),
+                           indirect, brw_imm_ud(read_size_32bit));
+               }
+            }
+         }
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_ubo: {
+      nir_const_value *const_index = nir_src_as_const_value(instr->src[0]);
+      fs_reg surf_index;
+
+      if (const_index) {
+         const unsigned index = stage_prog_data->binding_table.ubo_start +
+                                const_index->u32[0];
+         surf_index = brw_imm_ud(index);
+         brw_mark_surface_used(prog_data, index);
+      } else {
+         /* The block index is not a constant. Evaluate the index expression
+          * per-channel and add the base UBO index; we have to select a value
+          * from any live channel.
+          */
+         surf_index = vgrf(glsl_type::uint_type);
+         bld.ADD(surf_index, get_nir_src(instr->src[0]),
+                 brw_imm_ud(stage_prog_data->binding_table.ubo_start));
+         surf_index = bld.emit_uniformize(surf_index);
+
+         /* Assume this may touch any UBO. It would be nice to provide
+          * a tighter bound, but the array information is already lowered away.
+          */
+         brw_mark_surface_used(prog_data,
+                               stage_prog_data->binding_table.ubo_start +
+                               nir->info->num_ubos - 1);
+      }
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      if (const_offset == NULL) {
+         fs_reg base_offset = retype(get_nir_src(instr->src[1]),
+                                     BRW_REGISTER_TYPE_UD);
+
+         for (int i = 0; i < instr->num_components; i++)
+            VARYING_PULL_CONSTANT_LOAD(bld, offset(dest, bld, i), surf_index,
+                                       base_offset, i * type_sz(dest.type));
+      } else {
+         /* Even if we are loading doubles, a pull constant load will load
+          * a 32-bit vec4, so should only reserve vgrf space for that. If we
+          * need to load a full dvec4 we will have to emit 2 loads. This is
+          * similar to demote_pull_constants(), except that in that case we
+          * see individual accesses to each component of the vector and then
+          * we let CSE deal with duplicate loads. Here we see a vector access
+          * and we have to split it if necessary.
+          */
+         const unsigned type_size = type_sz(dest.type);
+         const unsigned block_sz = 64; /* Fetch one cacheline at a time. */
+         const fs_builder ubld = bld.exec_all().group(block_sz / 4, 0);
+         const fs_reg packed_consts = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         for (unsigned c = 0; c < instr->num_components;) {
+            const unsigned base = const_offset->u32[0] + c * type_size;
+            /* Number of usable components in the next block-aligned load. */
+            const unsigned count = MIN2(instr->num_components - c,
+                                        (block_sz - base % block_sz) / type_size);
+
+            ubld.emit(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
+                      packed_consts, surf_index,
+                      brw_imm_ud(base & ~(block_sz - 1)));
+
+            const fs_reg consts =
+               retype(byte_offset(packed_consts, base & (block_sz - 1)),
+                      dest.type);
+
+            for (unsigned d = 0; d < count; d++)
+               bld.MOV(offset(dest, bld, c + d), component(consts, d));
+
+            c += count;
+         }
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_ssbo: {
+      assert(devinfo->gen >= 7);
+
+      nir_const_value *const_uniform_block =
+         nir_src_as_const_value(instr->src[0]);
+
+      fs_reg surf_index;
+      if (const_uniform_block) {
+         unsigned index = stage_prog_data->binding_table.ssbo_start +
+                          const_uniform_block->u32[0];
+         surf_index = brw_imm_ud(index);
+         brw_mark_surface_used(prog_data, index);
+      } else {
+         surf_index = vgrf(glsl_type::uint_type);
+         bld.ADD(surf_index, get_nir_src(instr->src[0]),
+                 brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
+
+         /* Assume this may touch any SSBO. It would be nice to provide
+          * a tighter bound, but the array information is already lowered away.
+          */
+         brw_mark_surface_used(prog_data,
+                               stage_prog_data->binding_table.ssbo_start +
+                               nir->info->num_ssbos - 1);
+      }
+
+      fs_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
+      } else {
+         offset_reg = get_nir_src(instr->src[1]);
+      }
+
+      /* Read the vector */
+      do_untyped_vector_read(bld, dest, surf_index, offset_reg,
+                             instr->num_components);
+
+      break;
+   }
+
+   case nir_intrinsic_store_ssbo: {
+      assert(devinfo->gen >= 7);
+
+      if (stage == MESA_SHADER_FRAGMENT)
+         brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+      /* Block index */
+      fs_reg surf_index;
+      nir_const_value *const_uniform_block =
+         nir_src_as_const_value(instr->src[1]);
+      if (const_uniform_block) {
+         unsigned index = stage_prog_data->binding_table.ssbo_start +
+                          const_uniform_block->u32[0];
+         surf_index = brw_imm_ud(index);
+         brw_mark_surface_used(prog_data, index);
+      } else {
+         surf_index = vgrf(glsl_type::uint_type);
+         bld.ADD(surf_index, get_nir_src(instr->src[1]),
+                  brw_imm_ud(stage_prog_data->binding_table.ssbo_start));
+
+         brw_mark_surface_used(prog_data,
+                               stage_prog_data->binding_table.ssbo_start +
+                               nir->info->num_ssbos - 1);
+      }
+
+      /* Value */
+      fs_reg val_reg = get_nir_src(instr->src[0]);
+
+      /* Writemask */
+      unsigned writemask = instr->const_index[0];
+
+      /* get_nir_src() retypes to integer. Be wary of 64-bit types though
+       * since the untyped writes below operate in units of 32-bits, which
+       * means that we need to write twice as many components each time.
+       * Also, we have to shuffle 64-bit data to be in the appropriate layout
+       * expected by our 32-bit write messages.
+       */
+      unsigned type_size = 4;
+      unsigned bit_size = instr->src[0].is_ssa ?
+         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+      if (bit_size == 64) {
+         type_size = 8;
+         fs_reg tmp =
+           fs_reg(VGRF, alloc.allocate(alloc.sizes[val_reg.nr]), val_reg.type);
+         shuffle_64bit_data_for_32bit_write(bld,
+            retype(tmp, BRW_REGISTER_TYPE_F),
+            retype(val_reg, BRW_REGISTER_TYPE_DF),
+            instr->num_components);
+         val_reg = tmp;
+      }
+
+      unsigned type_slots = type_size / 4;
+
+      /* Combine groups of consecutive enabled channels in one write
+       * message. We use ffs to find the first enabled channel and then ffs on
+       * the bit-inverse, down-shifted writemask to determine the length of
+       * the block of enabled bits.
+       */
+      while (writemask) {
+         unsigned first_component = ffs(writemask) - 1;
+         unsigned length = ffs(~(writemask >> first_component)) - 1;
+
+         /* We can't write more than 2 64-bit components at once. Limit the
+          * length of the write to what we can do and let the next iteration
+          * handle the rest
+          */
+         if (type_size > 4)
+            length = MIN2(2, length);
+
+         fs_reg offset_reg;
+         nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+         if (const_offset) {
+            offset_reg = brw_imm_ud(const_offset->u32[0] +
+                                    type_size * first_component);
+         } else {
+            offset_reg = vgrf(glsl_type::uint_type);
+            bld.ADD(offset_reg,
+                    retype(get_nir_src(instr->src[2]), BRW_REGISTER_TYPE_UD),
+                    brw_imm_ud(type_size * first_component));
+         }
+
+
+         emit_untyped_write(bld, surf_index, offset_reg,
+                            offset(val_reg, bld, first_component * type_slots),
+                            1 /* dims */, length * type_slots,
+                            BRW_PREDICATE_NONE);
+
+         /* Clear the bits in the writemask that we just wrote, then try
+          * again to see if more channels are left.
+          */
+         writemask &= (15 << (first_component + length));
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_output: {
+      fs_reg src = get_nir_src(instr->src[0]);
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      assert(const_offset && "Indirect output stores not allowed");
+      fs_reg new_dest = retype(offset(outputs[instr->const_index[0]], bld,
+                                      4 * const_offset->u32[0]), src.type);
+
+      unsigned num_components = instr->num_components;
+      unsigned first_component = nir_intrinsic_component(instr);
+      unsigned bit_size = instr->src[0].is_ssa ?
+         instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+      if (bit_size == 64) {
+         fs_reg tmp =
+            fs_reg(VGRF, alloc.allocate(2 * num_components),
+                   BRW_REGISTER_TYPE_F);
+         shuffle_64bit_data_for_32bit_write(
+            bld, tmp, retype(src, BRW_REGISTER_TYPE_DF), num_components);
+         src = retype(tmp, src.type);
+         num_components *= 2;
+      }
+
+      for (unsigned j = 0; j < num_components; j++) {
+         bld.MOV(offset(new_dest, bld, j + first_component),
+                 offset(src, bld, j));
+      }
+      break;
+   }
+
+   case nir_intrinsic_ssbo_atomic_add:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_ADD, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_imin:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_IMIN, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umin:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_UMIN, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_imax:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_IMAX, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umax:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_UMAX, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_and:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_AND, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_or:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_OR, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_xor:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_XOR, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_exchange:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_MOV, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      nir_emit_ssbo_atomic(bld, BRW_AOP_CMPWR, instr);
+      break;
+
+   case nir_intrinsic_get_buffer_size: {
+      nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
+
+      /* A resinfo's sampler message is used to get the buffer size.  The
+       * SIMD8's writeback message consists of four registers and SIMD16's
+       * writeback message consists of 8 destination registers (two per each
+       * component).  Because we are only interested on the first channel of
+       * the first returned component, where resinfo returns the buffer size
+       * for SURFTYPE_BUFFER, we can just use the SIMD8 variant regardless of
+       * the dispatch width.
+       */
+      const fs_builder ubld = bld.exec_all().group(8, 0);
+      fs_reg src_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD);
+      fs_reg ret_payload = ubld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+
+      /* Set LOD = 0 */
+      ubld.MOV(src_payload, brw_imm_d(0));
+
+      const unsigned index = prog_data->binding_table.ssbo_start + ssbo_index;
+      fs_inst *inst = ubld.emit(FS_OPCODE_GET_BUFFER_SIZE, ret_payload,
+                                src_payload, brw_imm_ud(index));
+      inst->header_size = 0;
+      inst->mlen = 1;
+      inst->size_written = 4 * REG_SIZE;
+
+      bld.MOV(retype(dest, ret_payload.type), component(ret_payload, 0));
+      brw_mark_surface_used(prog_data, index);
+      break;
+   }
+
+   case nir_intrinsic_load_channel_num: {
+      fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UW);
+      dest = retype(dest, BRW_REGISTER_TYPE_UD);
+      const fs_builder allbld8 = bld.group(8, 0).exec_all();
+      allbld8.MOV(tmp, brw_imm_v(0x76543210));
+      if (dispatch_width > 8)
+         allbld8.ADD(byte_offset(tmp, 16), tmp, brw_imm_uw(8u));
+      if (dispatch_width > 16) {
+         const fs_builder allbld16 = bld.group(16, 0).exec_all();
+         allbld16.ADD(byte_offset(tmp, 32), tmp, brw_imm_uw(16u));
+      }
+      bld.MOV(dest, tmp);
+      break;
+   }
+
+   default:
+      unreachable("unknown intrinsic");
+   }
+}
+
+void
+fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld,
+                                 int op, nir_intrinsic_instr *instr)
+{
+   /* Fragment shaders that perform SSBO atomics have side effects. */
+   if (stage == MESA_SHADER_FRAGMENT)
+      brw_wm_prog_data(prog_data)->has_side_effects = true;
+
+   /* Without a NIR destination the result register stays default-built. */
+   const bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
+   fs_reg result_reg = has_dest ? get_nir_dest(instr->dest) : fs_reg();
+
+   /* Binding table index: an immediate when the block index is constant. */
+   const unsigned ssbo_start = stage_prog_data->binding_table.ssbo_start;
+   nir_const_value *const block = nir_src_as_const_value(instr->src[0]);
+   fs_reg surf;
+   if (block != NULL) {
+      const unsigned bti = ssbo_start + block->u32[0];
+      surf = brw_imm_ud(bti);
+      brw_mark_surface_used(prog_data, bti);
+   } else {
+      surf = vgrf(glsl_type::uint_type);
+      bld.ADD(surf, get_nir_src(instr->src[0]), brw_imm_ud(ssbo_start));
+
+      /* The block index is dynamic, so any SSBO may be touched; mark the
+       * last SSBO binding table entry as used.
+       */
+      brw_mark_surface_used(prog_data,
+                            ssbo_start + nir->info->num_ssbos - 1);
+   }
+
+   const fs_reg offset_reg = get_nir_src(instr->src[1]);
+   const fs_reg operand = get_nir_src(instr->src[2]);
+
+   /* Only compare-and-swap (BRW_AOP_CMPWR) reads a second data operand. */
+   const fs_reg operand2 =
+      op == BRW_AOP_CMPWR ? get_nir_src(instr->src[3]) : fs_reg();
+
+   const fs_reg atomic_result =
+      emit_untyped_atomic(bld, surf, offset_reg, operand, operand2,
+                          1 /* dims */, 1 /* rsize */, op,
+                          BRW_PREDICATE_NONE);
+
+   /* Copy the result out using the type the atomic returned. */
+   result_reg.type = atomic_result.type;
+   bld.MOV(result_reg, atomic_result);
+}
+
+void
+fs_visitor::nir_emit_shared_atomic(const fs_builder &bld,
+                                   int op, nir_intrinsic_instr *instr)
+{
+   /* Without a NIR destination the result register stays default-built. */
+   const bool has_dest = nir_intrinsic_infos[instr->intrinsic].has_dest;
+   fs_reg result_reg = has_dest ? get_nir_dest(instr->dest) : fs_reg();
+
+   /* The surface operand is always the immediate GEN7_BTI_SLM. */
+   const fs_reg slm_surface = brw_imm_ud(GEN7_BTI_SLM);
+
+   /* Only compare-and-swap (BRW_AOP_CMPWR) reads a second data operand. */
+   const fs_reg operand = get_nir_src(instr->src[1]);
+   const fs_reg operand2 =
+      op == BRW_AOP_CMPWR ? get_nir_src(instr->src[2]) : fs_reg();
+
+   /* The offset is const_index[0] plus the (possibly dynamic) src[0]. */
+   nir_const_value *const imm_offset = nir_src_as_const_value(instr->src[0]);
+   fs_reg offset_reg;
+   if (imm_offset != NULL) {
+      offset_reg = brw_imm_ud(instr->const_index[0] + imm_offset->u32[0]);
+   } else {
+      offset_reg = vgrf(glsl_type::uint_type);
+      bld.ADD(offset_reg,
+              retype(get_nir_src(instr->src[0]), BRW_REGISTER_TYPE_UD),
+              brw_imm_ud(instr->const_index[0]));
+   }
+
+   const fs_reg atomic_result =
+      emit_untyped_atomic(bld, slm_surface, offset_reg, operand, operand2,
+                          1 /* dims */, 1 /* rsize */, op, BRW_PREDICATE_NONE);
+
+   /* Copy the result out using the type the atomic returned. */
+   result_reg.type = atomic_result.type;
+   bld.MOV(result_reg, atomic_result);
+}
+
+/**
+ * Emit a logical sampler instruction for @instr and copy its result to the
+ * NIR destination (samples_identical uses a MOV/CMP and returns early).
+ */
+void
+fs_visitor::nir_emit_texture(const fs_builder &bld, nir_tex_instr *instr)
+{
+   unsigned texture = instr->texture_index;
+   unsigned sampler = instr->sampler_index;
+
+   fs_reg srcs[TEX_LOGICAL_NUM_SRCS];
+
+   srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture);
+   srcs[TEX_LOGICAL_SRC_SAMPLER] = brw_imm_ud(sampler);
+
+   int lod_components = 0; /* size of the ddx source, when there is one */
+
+   /* The hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_d(0);
+
+   uint32_t header_bits = 0; /* stored in inst->offset once emitted */
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      fs_reg src = get_nir_src(instr->src[i].src);
+      switch (instr->src[i].src_type) {
+      case nir_tex_src_bias:
+         srcs[TEX_LOGICAL_SRC_LOD] =
+            retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
+         break;
+      case nir_tex_src_comparator:
+         srcs[TEX_LOGICAL_SRC_SHADOW_C] = retype(src, BRW_REGISTER_TYPE_F);
+         break;
+      case nir_tex_src_coord:
+         switch (instr->op) {
+         case nir_texop_txf:
+         case nir_texop_txf_ms:
+         case nir_texop_txf_ms_mcs:
+         case nir_texop_samples_identical:
+            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_D);
+            break;
+         default:
+            srcs[TEX_LOGICAL_SRC_COORDINATE] = retype(src, BRW_REGISTER_TYPE_F);
+            break;
+         }
+         break;
+      case nir_tex_src_ddx:
+         srcs[TEX_LOGICAL_SRC_LOD] = retype(src, BRW_REGISTER_TYPE_F);
+         lod_components = nir_tex_instr_src_size(instr, i);
+         break;
+      case nir_tex_src_ddy:
+         srcs[TEX_LOGICAL_SRC_LOD2] = retype(src, BRW_REGISTER_TYPE_F);
+         break;
+      case nir_tex_src_lod:
+         switch (instr->op) {
+         case nir_texop_txs:
+            srcs[TEX_LOGICAL_SRC_LOD] =
+               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_UD);
+            break;
+         case nir_texop_txf:
+            srcs[TEX_LOGICAL_SRC_LOD] =
+               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_D);
+            break;
+         default:
+            srcs[TEX_LOGICAL_SRC_LOD] =
+               retype(get_nir_src_imm(instr->src[i].src), BRW_REGISTER_TYPE_F);
+            break;
+         }
+         break;
+      case nir_tex_src_ms_index:
+         srcs[TEX_LOGICAL_SRC_SAMPLE_INDEX] = retype(src, BRW_REGISTER_TYPE_UD);
+         break;
+
+      case nir_tex_src_offset: {
+         nir_const_value *const_offset =
+            nir_src_as_const_value(instr->src[i].src);
+         unsigned offset_bits = 0;
+         if (const_offset &&
+             brw_texture_offset(const_offset->i32,
+                                nir_tex_instr_src_size(instr, i),
+                                &offset_bits)) {
+            header_bits |= offset_bits;
+         } else {
+            srcs[TEX_LOGICAL_SRC_TG4_OFFSET] =
+               retype(src, BRW_REGISTER_TYPE_D);
+         }
+         break;
+      }
+
+      case nir_tex_src_projector:
+         unreachable("should be lowered");
+
+      case nir_tex_src_texture_offset: {
+         /* Figure out the highest possible texture index and mark it as used */
+         uint32_t max_used = texture + instr->texture_array_size - 1;
+         if (instr->op == nir_texop_tg4 && devinfo->gen < 8) {
+            max_used += stage_prog_data->binding_table.gather_texture_start;
+         } else {
+            max_used += stage_prog_data->binding_table.texture_start;
+         }
+         brw_mark_surface_used(prog_data, max_used);
+
+         /* Emit code to evaluate the actual indexing expression */
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+         bld.ADD(tmp, src, brw_imm_ud(texture));
+         srcs[TEX_LOGICAL_SRC_SURFACE] = bld.emit_uniformize(tmp);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+         bld.ADD(tmp, src, brw_imm_ud(sampler));
+         srcs[TEX_LOGICAL_SRC_SAMPLER] = bld.emit_uniformize(tmp);
+         break;
+      }
+
+      case nir_tex_src_ms_mcs:
+         assert(instr->op == nir_texop_txf_ms);
+         srcs[TEX_LOGICAL_SRC_MCS] = retype(src, BRW_REGISTER_TYPE_D);
+         break;
+
+      case nir_tex_src_plane: {
+         nir_const_value *const_plane =
+            nir_src_as_const_value(instr->src[i].src);
+         const uint32_t plane = const_plane->u32[0]; /* assumes non-NULL */
+         const uint32_t texture_index =
+            instr->texture_index +
+            stage_prog_data->binding_table.plane_start[plane] -
+            stage_prog_data->binding_table.texture_start;
+
+         srcs[TEX_LOGICAL_SRC_SURFACE] = brw_imm_ud(texture_index);
+         break;
+      }
+
+      default:
+         unreachable("unknown texture source");
+      }
+   }
+
+   if (srcs[TEX_LOGICAL_SRC_MCS].file == BAD_FILE &&
+       (instr->op == nir_texop_txf_ms ||
+        instr->op == nir_texop_samples_identical)) {
+      if (devinfo->gen >= 7 &&
+          key_tex->compressed_multisample_layout_mask & (1 << texture)) {
+         srcs[TEX_LOGICAL_SRC_MCS] =
+            emit_mcs_fetch(srcs[TEX_LOGICAL_SRC_COORDINATE],
+                           instr->coord_components,
+                           srcs[TEX_LOGICAL_SRC_SURFACE]);
+      } else {
+         srcs[TEX_LOGICAL_SRC_MCS] = brw_imm_ud(0u);
+      }
+   }
+
+   srcs[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(instr->coord_components);
+   srcs[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(lod_components);
+
+   if (instr->op == nir_texop_query_levels ||
+       (instr->op == nir_texop_tex && stage != MESA_SHADER_FRAGMENT)) {
+      /* textureQueryLevels() and texture() are implemented in terms of TXS
+       * and TXL respectively, so we need to pass a valid LOD argument.
+       */
+      assert(srcs[TEX_LOGICAL_SRC_LOD].file == BAD_FILE);
+      srcs[TEX_LOGICAL_SRC_LOD] = brw_imm_ud(0u);
+   }
+
+   enum opcode opcode;
+   switch (instr->op) {
+   case nir_texop_tex:
+      opcode = (stage == MESA_SHADER_FRAGMENT ? SHADER_OPCODE_TEX_LOGICAL :
+                SHADER_OPCODE_TXL_LOGICAL);
+      break;
+   case nir_texop_txb:
+      opcode = FS_OPCODE_TXB_LOGICAL;
+      break;
+   case nir_texop_txl:
+      opcode = SHADER_OPCODE_TXL_LOGICAL;
+      break;
+   case nir_texop_txd:
+      opcode = SHADER_OPCODE_TXD_LOGICAL;
+      break;
+   case nir_texop_txf:
+      opcode = SHADER_OPCODE_TXF_LOGICAL;
+      break;
+   case nir_texop_txf_ms:
+      if ((key_tex->msaa_16 & (1 << sampler)))
+         opcode = SHADER_OPCODE_TXF_CMS_W_LOGICAL;
+      else
+         opcode = SHADER_OPCODE_TXF_CMS_LOGICAL;
+      break;
+   case nir_texop_txf_ms_mcs:
+      opcode = SHADER_OPCODE_TXF_MCS_LOGICAL;
+      break;
+   case nir_texop_query_levels:
+   case nir_texop_txs:
+      opcode = SHADER_OPCODE_TXS_LOGICAL;
+      break;
+   case nir_texop_lod:
+      opcode = SHADER_OPCODE_LOD_LOGICAL;
+      break;
+   case nir_texop_tg4:
+      if (srcs[TEX_LOGICAL_SRC_TG4_OFFSET].file != BAD_FILE)
+         opcode = SHADER_OPCODE_TG4_OFFSET_LOGICAL;
+      else
+         opcode = SHADER_OPCODE_TG4_LOGICAL;
+      break;
+   case nir_texop_texture_samples:
+      opcode = SHADER_OPCODE_SAMPLEINFO_LOGICAL;
+      break;
+   case nir_texop_samples_identical: {
+      fs_reg dst = retype(get_nir_dest(instr->dest), BRW_REGISTER_TYPE_D);
+
+      /* An immediate MCS means there is no MCS: just return false. */
+      if (srcs[TEX_LOGICAL_SRC_MCS].file == BRW_IMMEDIATE_VALUE) {
+         bld.MOV(dst, brw_imm_ud(0u));
+      } else if ((key_tex->msaa_16 & (1 << sampler))) {
+         fs_reg tmp = vgrf(glsl_type::uint_type);
+         bld.OR(tmp, srcs[TEX_LOGICAL_SRC_MCS],
+                offset(srcs[TEX_LOGICAL_SRC_MCS], bld, 1));
+         bld.CMP(dst, tmp, brw_imm_ud(0u), BRW_CONDITIONAL_EQ);
+      } else {
+         bld.CMP(dst, srcs[TEX_LOGICAL_SRC_MCS], brw_imm_ud(0u),
+                 BRW_CONDITIONAL_EQ);
+      }
+      return;
+   }
+   default:
+      unreachable("unknown texture opcode");
+   }
+
+   if (instr->op == nir_texop_tg4) {
+      if (instr->component == 1 &&
+          key_tex->gather_channel_quirk_mask & (1 << texture)) {
+         /* gather4 of green is broken on RG32F; ask for blue instead. */
+         header_bits |= 2 << 16;
+      } else {
+         header_bits |= instr->component << 16;
+      }
+   }
+
+   fs_reg dst = bld.vgrf(brw_type_for_nir_type(devinfo, instr->dest_type), 4);
+   fs_inst *inst = bld.emit(opcode, dst, srcs, ARRAY_SIZE(srcs));
+   inst->offset = header_bits;
+
+   const unsigned dest_size = nir_tex_instr_dest_size(instr);
+   if (devinfo->gen >= 9 &&
+       instr->op != nir_texop_tg4 && instr->op != nir_texop_query_levels) {
+      unsigned write_mask = instr->dest.is_ssa ?
+                            nir_ssa_def_components_read(&instr->dest.ssa):
+                            (1 << dest_size) - 1;
+      assert(write_mask != 0); /* dead code should have been eliminated */
+      inst->size_written = util_last_bit(write_mask) *
+                           inst->dst.component_size(inst->exec_size);
+   } else {
+      inst->size_written = 4 * inst->dst.component_size(inst->exec_size);
+   }
+
+   if (srcs[TEX_LOGICAL_SRC_SHADOW_C].file != BAD_FILE)
+      inst->shadow_compare = true;
+
+   if (instr->op == nir_texop_tg4 && devinfo->gen == 6)
+      emit_gen6_gather_wa(key_tex->gen6_gather_wa[texture], dst);
+
+   fs_reg nir_dest[4];
+   for (unsigned i = 0; i < dest_size; i++)
+      nir_dest[i] = offset(dst, bld, i);
+
+   if (instr->op == nir_texop_query_levels) {
+      /* # levels is in .w */
+      nir_dest[0] = offset(dst, bld, 3);
+   } else if (instr->op == nir_texop_txs &&
+              dest_size >= 3 && devinfo->gen < 7) {
+      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+      fs_reg depth = offset(dst, bld, 2);
+      nir_dest[2] = vgrf(glsl_type::int_type);
+      bld.emit_minmax(nir_dest[2], depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
+   }
+
+   bld.LOAD_PAYLOAD(get_nir_dest(instr->dest), nir_dest, dest_size, 0);
+}
+
+void
+fs_visitor::nir_emit_jump(const fs_builder &bld, nir_jump_instr *instr)
+{
+   /* Only the loop jumps get an instruction.  nir_jump_return and any
+    * other jump type are not expected here and are treated as
+    * unreachable.
+    */
+   if (instr->type == nir_jump_break) {
+      bld.emit(BRW_OPCODE_BREAK);
+   } else if (instr->type == nir_jump_continue) {
+      bld.emit(BRW_OPCODE_CONTINUE);
+   } else {
+      unreachable("unknown jump");
+   }
+}
+
+/**
+ * Repack the result of a 32-bit load so that it can be read as 64-bit data.
+ *
+ * The load returns one 32-bit component per register row:
+ *
+ * x x x x x x x x
+ * y y y y y y y y
+ * z z z z z z z z
+ * w w w w w w w w
+ *
+ * and this helper interleaves each pair of rows to produce:
+ *
+ * x y x y x y x y
+ * x y x y x y x y
+ * z w z w z w z w
+ * z w z w z w z w
+ *
+ * Here x and y are the low and high 32 bits of the first 64-bit component,
+ * and z and w the low and high 32 bits of the second one.
+ *
+ * @components is the number of 64-bit components held in @src; a vec4 load
+ * result has room for at most two of them.
+ *
+ * @src must have a 32-bit type and @dst a 64-bit type.  @dst and @src are
+ * allowed to be the same register.
+ */
+void
+shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
+                                        const fs_reg &dst,
+                                        const fs_reg &src,
+                                        uint32_t components)
+{
+   assert(type_sz(src.type) == 4);
+   assert(type_sz(dst.type) == 8);
+
+   /* Each 64-bit value is assembled in a scratch register and only then
+    * copied out: dst may alias src, so writing dst directly could clobber
+    * 32-bit words that have not been read yet.
+    */
+   const fs_reg scratch = bld.vgrf(dst.type);
+   const fs_reg lo_half = subscript(scratch, src.type, 0);
+   const fs_reg hi_half = subscript(scratch, src.type, 1);
+
+   for (unsigned comp = 0; comp < components; comp++) {
+      const fs_reg lo_src = offset(src, bld, 2 * comp);
+      const fs_reg hi_src = offset(lo_src, bld, 1);
+
+      bld.MOV(lo_half, lo_src);
+      bld.MOV(hi_half, hi_src);
+      bld.MOV(offset(dst, bld, comp), scratch);
+   }
+}
+
+/**
+ * Inverse of shuffle_32bit_load_result_to_64bit_data(): split every 64-bit
+ * component of @src into two consecutive 32-bit components of @dst, low
+ * half first.
+ *
+ * Untyped write messages operate on 32-bit components, so 64-bit data has
+ * to be laid out this way before it is written.
+ *
+ * Unlike the inverse operation, @dst and @src must not overlap, since the
+ * shuffle would corrupt 64-bit data that is still to be read.  This is
+ * asserted.
+ */
+void
+shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
+                                   const fs_reg &dst,
+                                   const fs_reg &src,
+                                   uint32_t components)
+{
+   assert(type_sz(src.type) == 8);
+   assert(type_sz(dst.type) == 4);
+
+   assert(!regions_overlap(
+             dst, 2 * components * dst.component_size(bld.dispatch_width()),
+             src, components * src.component_size(bld.dispatch_width())));
+
+   for (unsigned comp = 0; comp < components; comp++) {
+      const fs_reg value = offset(src, bld, comp);
+      /* Half 0 lands in slot 2 * comp, half 1 in the slot right after. */
+      for (unsigned half = 0; half < 2; half++)
+         bld.MOV(offset(dst, bld, 2 * comp + half),
+                 subscript(value, dst.type, half));
+   }
+}
+
+/**
+ * Return a register holding the double-precision constant @v.  Requires
+ * gen7 or later (asserted).
+ */
+fs_reg
+setup_imm_df(const fs_builder &bld, double v)
+{
+   const struct gen_device_info *devinfo = bld.shader->devinfo;
+   assert(devinfo->gen >= 7);
+
+   if (devinfo->gen >= 8)
+      return brw_imm_df(v);
+
+   const fs_builder ubld = bld.exec_all().group(1, 0);
+
+   /* gen7.5 has no DF immediates either, but its DIM instruction can load
+    * a 64-bit immediate into a register.
+    */
+   if (devinfo->is_haswell) {
+      const fs_reg dst = ubld.vgrf(BRW_REGISTER_TYPE_DF, 1);
+      ubld.DIM(dst, brw_imm_df(v));
+      return component(dst, 0);
+   }
+
+   /* gen7 has no DF immediates at all, so build the constant from two
+    * 32-bit writes: the low dword at suboffset 0 of a VGRF and the high
+    * dword at suboffset 4, then read it back with a stride of 0.
+    *
+    * Filling every channel of a normal (stride 1) VGRF would also work,
+    * but writes spanning more than one register would have to be split
+    * into width-4 instructions to avoid a gen7 execmask hardware bug.
+    *
+    * The bits go through a uint64_t and are split with shifts so that the
+    * low/high halves do not depend on host byte order.
+    */
+   union {
+      double d;
+      uint64_t u;
+   } bits;
+   bits.d = v;
+
+   const fs_reg tmp = ubld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+   ubld.MOV(tmp, brw_imm_ud((uint32_t)(bits.u & 0xffffffffu)));
+   ubld.MOV(horiz_offset(tmp, 1), brw_imm_ud((uint32_t)(bits.u >> 32)));
+
+   return component(retype(tmp, BRW_REGISTER_TYPE_DF), 0);
+}
diff --git a/src/intel/compiler/brw_fs_reg_allocate.cpp b/src/intel/compiler/brw_fs_reg_allocate.cpp
new file mode 100644
index 00000000000..5c6f3d490f0
--- /dev/null
+++ b/src/intel/compiler/brw_fs_reg_allocate.cpp
@@ -0,0 +1,992 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "util/register_allocate.h"
+
+using namespace brw;
+
+static void
+assign_reg(unsigned *reg_hw_locations, fs_reg *reg)
+{
+   if (reg->file != VGRF)
+      return; /* only virtual GRFs are remapped */
+   reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
+   reg->offset = reg->offset % REG_SIZE; /* whole registers went into nr */
+}
+
+void
+fs_visitor::assign_regs_trivial()
+{
+   /* hw_reg_mapping[n] is the first hardware GRF given to virtual GRF n;
+    * the extra final entry is the first GRF past every allocation.
+    */
+   const int reg_width = dispatch_width / 8;
+   unsigned hw_reg_mapping[this->alloc.count + 1];
+
+   /* Note that compressed instructions require alignment to 2 registers. */
+   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
+   for (unsigned n = 0; n < this->alloc.count; n++)
+      hw_reg_mapping[n + 1] = hw_reg_mapping[n] + this->alloc.sizes[n];
+   this->grf_used = hw_reg_mapping[this->alloc.count];
+
+   /* Rewrite every destination and source to its packed location. */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      assign_reg(hw_reg_mapping, &inst->dst);
+      for (unsigned s = 0; s < inst->sources; s++)
+         assign_reg(hw_reg_mapping, &inst->src[s]);
+   }
+
+   /* Nothing is spilled here: not fitting in max_grf goes through fail(). */
+   if (this->grf_used < max_grf) {
+      this->alloc.count = this->grf_used;
+   } else {
+      fail("Ran out of regs on trivial allocator (%d/%d)\n",
+           this->grf_used, max_grf);
+   }
+}
+
+/* Fills in the compiler->fs_reg_sets[] entry for the given dispatch width:
+ * the register set, its per-size classes and the mapping back to GRFs. */
+static void
+brw_alloc_reg_set(struct brw_compiler *compiler, int dispatch_width)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   int base_reg_count = BRW_MAX_GRF;
+   const int index = _mesa_logbase2(dispatch_width / 8);
+
+   if (dispatch_width > 8 && devinfo->gen >= 7) {
+      /* For IVB+, we don't need the PLN hacks or the even-reg alignment in
+       * SIMD16.  Therefore, we can use the exact same register sets for
+       * SIMD16 as we do for SIMD8 and we don't need to recalculate them.
+       */
+      compiler->fs_reg_sets[index] = compiler->fs_reg_sets[0];
+      return;
+   }
+
+   /* The registers used to make up almost all values handled in the compiler
+    * are a scalar value occupying a single register (or 2 registers in the
+    * case of SIMD16, which is handled by dividing base_reg_count by 2 and
+    * multiplying allocated register numbers by 2).  Things that were
+    * aggregates of scalar values at the GLSL level were split to scalar
+    * values by split_virtual_grfs().
+    *
+    * However, texture SEND messages return a series of contiguous registers
+    * to write into.  We currently always ask for 4 registers, but we may
+    * convert that to use less some day.
+    *
+    * Additionally, on gen5 we need aligned pairs of registers for the PLN
+    * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
+    * texturing.
+    */
+   const int class_count = MAX_VGRF_SIZE;
+   int class_sizes[MAX_VGRF_SIZE];
+   for (unsigned i = 0; i < MAX_VGRF_SIZE; i++)
+      class_sizes[i] = i + 1;
+
+   memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
+          sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
+   int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;
+
+   /* Compute the total number of registers across all classes. */
+   int ra_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         /* From the G45 PRM:
+          *
+          * In order to reduce the hardware complexity, the following
+          * rules and restrictions apply to the compressed instruction:
+          * ...
+          * * Operand Alignment Rule: With the exceptions listed below, a
+          *   source/destination operand in general should be aligned to
+          *   even 256-bit physical register with a region size equal to
+          *   two 256-bit physical register
+          */
+         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
+      } else {
+         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+      }
+      /* Mark the last register. We'll fill in the beginnings later. */
+      class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
+   }
+
+   /* Fill out the rest of the range markers */
+   for (int i = 1; i < 17; ++i) {
+      if (class_to_ra_reg_range[i] == 0)
+         class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
+   }
+
+   uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
+   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
+   if (devinfo->gen >= 6)
+      ra_set_allocate_round_robin(regs);
+   int *classes = ralloc_array(compiler, int, class_count);
+   int aligned_pairs_class = -1;
+
+   /* Allocate space for q values.  We allocate class_count + 1 because we
+    * want to leave room for the aligned pairs class if we have it. */
+   unsigned int **q_values = ralloc_array(compiler, unsigned int *,
+                                          class_count + 1);
+   for (int i = 0; i < class_count + 1; ++i)
+      q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);
+
+   /* Now, add the registers to their classes, and add the conflicts
+    * between them and the base GRF registers (and also each other).
+    */
+   int reg = 0;
+   int pairs_base_reg = 0;
+   int pairs_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      int class_reg_count;
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;
+
+         /* See comment below.  The only difference here is that we are
+          * dealing with pairs of registers instead of single registers.
+          * Registers of odd sizes simply get rounded up. */
+         for (int j = 0; j < class_count; j++)
+            q_values[i][j] = (class_sizes[i] + 1) / 2 +
+                             (class_sizes[j] + 1) / 2 - 1;
+      } else {
+         class_reg_count = base_reg_count - (class_sizes[i] - 1);
+
+         /* From register_allocate.c:
+          *
+          * q(B,C) (indexed by C, B is this register class) in
+          * Runeson/Nyström paper.  This is "how many registers of B could
+          * the worst choice register from C conflict with".
+          *
+          * If we just let the register allocation algorithm compute these
+          * values, it is extremely expensive.  However, since all of our
+          * registers are laid out, we can very easily compute them
+          * ourselves.  View the register from C as fixed starting at GRF n
+          * somewhere in the middle, and the register from B as sliding back
+          * and forth.  Then the first register to conflict from B is the
+          * one starting at n - class_size[B] + 1 and the last register to
+          * conflict will start at n + class_size[B] - 1.  Therefore, the
+          * number of conflicts from B is class_size[B] + class_size[C] - 1.
+          *
+          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+          * B | | | | | |n| --> | | | | | | |
+          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
+          *             +-+-+-+-+-+
+          * C           |n| | | | |
+          *             +-+-+-+-+-+
+          */
+         for (int j = 0; j < class_count; j++)
+            q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+      }
+      classes[i] = ra_alloc_reg_class(regs);
+
+      /* Save this off for the aligned pair class at the end. */
+      if (class_sizes[i] == 2) {
+         pairs_base_reg = reg;
+         pairs_reg_count = class_reg_count;
+      }
+
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
+
+            ra_reg_to_grf[reg] = j * 2;
+
+            for (int base_reg = j;
+                 base_reg < j + (class_sizes[i] + 1) / 2;
+                 base_reg++) {
+               ra_add_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
+      } else {
+         for (int j = 0; j < class_reg_count; j++) {
+            ra_class_add_reg(regs, classes[i], reg);
+
+            ra_reg_to_grf[reg] = j;
+
+            for (int base_reg = j;
+                 base_reg < j + class_sizes[i];
+                 base_reg++) {
+               ra_add_reg_conflict(regs, base_reg, reg);
+            }
+
+            reg++;
+         }
+      }
+   }
+   assert(reg == ra_reg_count);
+
+   /* Applying transitivity to all of the base registers gives us the
+    * appropriate register conflict relationships everywhere. */
+   for (int reg = 0; reg < base_reg_count; reg++)
+      ra_make_reg_conflicts_transitive(regs, reg);
+
+   /* Add a special class for aligned pairs, which we'll put delta_xy
+    * in on Gen <= 6 so that we can do PLN. */
+   if (devinfo->has_pln && dispatch_width == 8 && devinfo->gen <= 6) {
+      aligned_pairs_class = ra_alloc_reg_class(regs);
+
+      for (int i = 0; i < pairs_reg_count; i++) {
+	 if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
+	    ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
+	 }
+      }
+
+      for (int i = 0; i < class_count; i++) {
+         /* These are a little counter-intuitive because the pair registers
+          * are required to be aligned while the register they are
+          * potentially interfering with are not.  In the case where the
+          * size is even, the worst-case is that the register is
+          * odd-aligned.  In the odd-size case, it doesn't matter.
+          */
+         q_values[class_count][i] = class_sizes[i] / 2 + 1;
+         q_values[i][class_count] = class_sizes[i] + 1;
+      }
+      q_values[class_count][class_count] = 1;
+   }
+
+   ra_set_finalize(regs, q_values);
+
+   ralloc_free(q_values);
+
+   compiler->fs_reg_sets[index].regs = regs;
+   for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
+      compiler->fs_reg_sets[index].classes[i] = -1;
+   for (int i = 0; i < class_count; i++)
+      compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
+   compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
+   compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
+}
+
+void
+brw_fs_alloc_reg_sets(struct brw_compiler *compiler)
+{
+   /* One register set per dispatch width: SIMD8, SIMD16, then SIMD32. */
+   for (int width = 8; width <= 32; width *= 2)
+      brw_alloc_reg_set(compiler, width);
+}
+
+/* Returns the end_ip of the block whose WHILE closes the loop at block. */
+static int
+count_to_loop_end(const bblock_t *block)
+{
+   if (block->end()->opcode == BRW_OPCODE_WHILE)
+      return block->end_ip;
+
+   /* Start on the following block so that the DO the calling function found
+    * is not counted again; it is the one level of nesting we begin with.
+    * Every block starting with DO opens another level and every block ending
+    * in WHILE closes one; the WHILE that closes the last level is the answer.
+    */
+   int nesting = 1;
+   block = block->next();
+   while (nesting > 0) {
+      if (block->start()->opcode == BRW_OPCODE_DO)
+         nesting++;
+      if (block->end()->opcode == BRW_OPCODE_WHILE && --nesting == 0)
+         return block->end_ip;
+      block = block->next();
+   }
+   unreachable("not reached");
+}
+
+/* Stores in payload_last_use_ip[n] the ip of the last use of payload register
+ * n, or -1 if unused; uses inside a loop count as the outermost loop's end.
+ */
+void fs_visitor::calculate_payload_ranges(int payload_node_count,
+                                          int *payload_last_use_ip)
+{
+   int loop_depth = 0;
+   int loop_end_ip = 0;
+
+   for (int i = 0; i < payload_node_count; i++)
+      payload_last_use_ip[i] = -1;
+
+   int ip = 0;
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_DO:
+         loop_depth++;
+
+         /* Since payload regs are deffed only at the start of the shader
+          * execution, any uses of the payload within a loop mean the live
+          * interval extends to the end of the outermost loop.  Find the ip of
+          * the end now.
+          */
+         if (loop_depth == 1)
+            loop_end_ip = count_to_loop_end(block);
+         break;
+      case BRW_OPCODE_WHILE:
+         loop_depth--;
+         break;
+      default:
+         break;
+      }
+
+      const int use_ip = loop_depth > 0 ? loop_end_ip : ip;
+
+      /* Note that UNIFORM args have been turned into FIXED_GRF by
+       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
+       * the start (see interp_reg()).
+       */
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == FIXED_GRF) {
+            int node_nr = inst->src[i].nr;
+            if (node_nr >= payload_node_count)
+               continue;
+
+            for (unsigned j = 0; j < regs_read(inst, i); j++) {
+               /* Check the bound before storing through it, not after. */
+               assert(node_nr + j < unsigned(payload_node_count));
+               payload_last_use_ip[node_nr + j] = use_ip;
+            }
+         }
+      }
+
+      /* Special case instructions which have extra implied registers used. */
+      switch (inst->opcode) {
+      case CS_OPCODE_CS_TERMINATE:
+         payload_last_use_ip[0] = use_ip;
+         break;
+
+      default:
+         if (inst->eot) {
+            /* We could omit this for the !inst->header_present case, except
+             * that the simulator apparently incorrectly reads from g0/g1
+             * instead of sideband.  It also really freaks out driver
+             * developers to see g0 used in unusual places, so just always
+             * reserve it.
+             */
+            payload_last_use_ip[0] = use_ip;
+            payload_last_use_ip[1] = use_ip;
+         }
+         break;
+      }
+
+      ip++;
+   }
+}
+
+
+/**
+ * Sets up interference between thread payload registers and the virtual GRFs
+ * to be allocated for program temporaries.
+ *
+ * We want to be able to reallocate the payload for our virtual GRFs, notably
+ * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
+ * our 128 registers.
+ *
+ * The layout of the payload registers is:
+ *
+ * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
+ * payload.num_regs..payload.num_regs+curb_read_length-1: uniform data
+ * payload.num_regs+curb_read_length..first_non_payload_grf-1: setup coefficients.
+ *
+ * And we have payload_node_count nodes covering these registers in order
+ * (note that in SIMD16, a node is two registers).
+ * Payload node i is node first_payload_node + i of the graph g.
+ */
+void
+fs_visitor::setup_payload_interference(struct ra_graph *g,
+                                       int payload_node_count,
+                                       int first_payload_node)
+{
+   int payload_last_use_ip[payload_node_count];
+   calculate_payload_ranges(payload_node_count, payload_last_use_ip);
+
+   for (int i = 0; i < payload_node_count; i++) {
+      if (payload_last_use_ip[i] == -1)
+         continue;
+
+      /* Mark the payload node as interfering with any virtual grf that is
+       * live between the start of the program and our last use of the payload
+       * node.
+       */
+      for (unsigned j = 0; j < this->alloc.count; j++) {
+         /* Note that we use a <= comparison, unlike virtual_grf_interferes(),
+          * in order to not have to worry about the uniform issue described in
+          * calculate_live_intervals().
+          */
+         if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
+            ra_add_node_interference(g, first_payload_node + i, j);
+         }
+      }
+   }
+
+   for (int i = 0; i < payload_node_count; i++) {
+      /* Mark each payload node as being allocated to its physical register.
+       * The alternative would be to have per-physical-register classes, which
+       * would just be silly.
+       */
+      if (devinfo->gen <= 5 && dispatch_width >= 16) {
+         /* We have to divide by 2 here because we only have even numbered
+          * registers.  Some of the payload registers will be odd, but
+          * that's ok because their physical register numbers have already
+          * been assigned.  The only thing this is used for is interference.
+          */
+         ra_set_node_reg(g, first_payload_node + i, i / 2);
+      } else {
+         ra_set_node_reg(g, first_payload_node + i, i);
+      }
+   }
+}
+
+/**
+ * Fills mrf_used[] with a flag per MRF telling whether the shader IR uses it
+ *
+ * assign_regs() needs this to decide which of the GRFs that stand in for
+ * MRFs on gen7 can be register allocated normally, and register spilling
+ * needs it to see whether MRFs can be used for spills without overwriting
+ * normal MRF contents.
+ */
+static void
+get_used_mrfs(fs_visitor *v, bool *mrf_used)
+{
+   const bool compressed = v->dispatch_width / 8 == 2;
+
+   memset(mrf_used, 0, BRW_MAX_MRF(v->devinfo->gen) * sizeof(bool));
+
+   foreach_block_and_inst(block, fs_inst, inst, v->cfg) {
+      if (inst->dst.file == MRF) {
+         const int first = inst->dst.nr & ~BRW_MRF_COMPR4;
+         mrf_used[first] = true;
+
+         /* A register width of 2 writes a second MRF too: four registers up
+          * if the destination has the COMPR4 bit set, else the next one.
+          */
+         if (compressed) {
+            const int step = (inst->dst.nr & BRW_MRF_COMPR4) ? 4 : 1;
+            mrf_used[first + step] = true;
+         }
+      }
+
+      if (inst->mlen > 0) {
+         for (int i = 0; i < v->implied_mrf_writes(inst); i++)
+            mrf_used[inst->base_mrf + i] = true;
+      }
+   }
+}
+
+/**
+ * Adds interference between every virtual GRF and each high GRF that is used
+ * as an MRF for SEND messages, and reports the lowest MRF in use through
+ * first_used_mrf (BRW_MAX_MRF for the generation when none is used).
+ */
+static void
+setup_mrf_hack_interference(fs_visitor *v, struct ra_graph *g,
+                            int first_mrf_node, int *first_used_mrf)
+{
+   const int mrf_count = BRW_MAX_MRF(v->devinfo->gen);
+   bool mrf_used[mrf_count];
+   get_used_mrfs(v, mrf_used);
+
+   *first_used_mrf = mrf_count;
+   for (int mrf = 0; mrf < mrf_count; mrf++) {
+      /* Pin each MRF node to the physical register backing it, which is
+       * simpler than having per-physical-register classes.
+       */
+      ra_set_node_reg(g, first_mrf_node + mrf, GEN7_MRF_HACK_START + mrf);
+
+      if (!mrf_used[mrf])
+         continue;
+
+      if (mrf < *first_used_mrf)
+         *first_used_mrf = mrf;
+
+      /* There is no live/dead analysis on the MRFs, so a used one simply
+       * conflicts with all virtual GRFs.
+       */
+      for (unsigned vgrf = 0; vgrf < v->alloc.count; vgrf++)
+         ra_add_node_interference(g, first_mrf_node + mrf, vgrf);
+   }
+}
+
+/* Assigns a hardware GRF to every virtual GRF and returns true on success.
+ * On failure it returns false, having spilled one register if allow_spilling
+ * is set so the caller can retry.  spill_all (debug) spills before trying.
+ */
+bool
+fs_visitor::assign_regs(bool allow_spilling, bool spill_all)
+{
+   /* Most of this allocation was written for a reg_width of 1
+    * (dispatch_width == 8).  In extending to SIMD16, the code was left in
+    * place and it was converted to have the hardware registers it's
+    * allocating be contiguous physical pairs of regs for reg_width == 2.
+    */
+   int reg_width = dispatch_width / 8;
+   unsigned hw_reg_mapping[this->alloc.count];
+   int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
+   int rsi = _mesa_logbase2(reg_width); /* Which compiler->fs_reg_sets[] to use */
+   calculate_live_intervals();
+
+   int node_count = this->alloc.count;
+   int first_payload_node = node_count;
+   node_count += payload_node_count;
+   int first_mrf_hack_node = node_count;
+   if (devinfo->gen >= 7)
+      node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
+   struct ra_graph *g =
+      ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);
+
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      unsigned size = this->alloc.sizes[i];
+      int c;
+
+      assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
+             "Register allocation relies on split_virtual_grfs()");
+      c = compiler->fs_reg_sets[rsi].classes[size - 1];
+
+      /* Special case: on pre-GEN6 hardware that supports PLN, the second
+       * operand of a PLN instruction needs to be an even-numbered register,
+       * so we have a special register class (aligned_pairs_class) for it.
+       * pre-GEN6 always uses delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] as
+       * that operand (it supports no other interpolation modes), so all we
+       * need to do is find that register and set it to that class.
+       */
+      if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
+          this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].file == VGRF &&
+          this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL].nr == i) {
+         c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
+      }
+
+      ra_set_node_class(g, i, c);
+
+      for (unsigned j = 0; j < i; j++) {
+         if (virtual_grf_interferes(i, j)) {
+            ra_add_node_interference(g, i, j);
+         }
+      }
+   }
+
+   /* Certain instructions can't safely use the same register for their
+    * sources and destination.  Add interference for every source.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
+         for (unsigned i = 0; i < inst->sources; i++) {
+            if (inst->src[i].file == VGRF) {
+               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+            }
+         }
+      }
+   }
+
+   setup_payload_interference(g, payload_node_count, first_payload_node);
+   if (devinfo->gen >= 7) {
+      int first_used_mrf = BRW_MAX_MRF(devinfo->gen);
+      setup_mrf_hack_interference(this, g, first_mrf_hack_node,
+                                  &first_used_mrf);
+
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         /* When we do send-from-GRF for FB writes, we need to ensure that
+          * the last write instruction sends from a high register.  This is
+          * because the vertex fetcher wants to start filling the low
+          * payload registers while the pixel data port is still working on
+          * writing out the memory.  If we don't do this, we get rendering
+          * artifacts.
+          *
+          * We could just do "something high".  Instead, we just pick the
+          * highest register that works.
+          */
+         if (inst->eot) {
+            int size = alloc.sizes[inst->src[0].nr];
+            int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
+
+            /* If something happened to spill, we want to push the EOT send
+             * register early enough in the register file that we don't
+             * conflict with any used MRF hack registers.
+             */
+            reg -= BRW_MAX_MRF(devinfo->gen) - first_used_mrf;
+
+            ra_set_node_reg(g, inst->src[0].nr, reg);
+            break;
+         }
+      }
+   }
+
+   if (dispatch_width > 8) {
+      /* In 16-wide dispatch we have an issue where a compressed
+       * instruction is actually two instructions executed simultaneously.
+       * It's actually ok to have the source and destination registers be
+       * the same.  In this case, each instruction over-writes its own
+       * source and there's no problem.  The real problem here is if the
+       * source and destination registers are off by one.  Then you can end
+       * up in a scenario where the first instruction over-writes the
+       * source of the second instruction.  Since the compiler doesn't know
+       * about this level of granularity, we simply make the source and
+       * destination interfere.
+       */
+      foreach_block_and_inst(block, fs_inst, inst, cfg) {
+         if (inst->dst.file != VGRF)
+            continue;
+
+         for (int i = 0; i < inst->sources; ++i) {
+            if (inst->src[i].file == VGRF) {
+               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+            }
+         }
+      }
+   }
+
+   /* Debug of register spilling: Go spill everything. */
+   if (unlikely(spill_all)) {
+      int reg = choose_spill_reg(g);
+
+      if (reg != -1) {
+         spill_reg(reg);
+         ralloc_free(g);
+         return false;
+      }
+   }
+
+   if (!ra_allocate(g)) {
+      /* Failed to allocate registers.  Spill a reg, and the caller will
+       * loop back into here to try again.
+       */
+      int reg = choose_spill_reg(g);
+
+      if (reg == -1) {
+         fail("no register to spill:\n");
+         dump_instructions(NULL);
+      } else if (allow_spilling) {
+         spill_reg(reg);
+      }
+
+      ralloc_free(g);
+
+      return false;
+   }
+
+   /* Get the chosen virtual registers for each node, and map virtual regs
+    * in the register classes back down to real hardware reg numbers.
+    */
+   this->grf_used = payload_node_count;
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      int reg = ra_get_node_reg(g, i);
+
+      hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
+      this->grf_used = MAX2(this->grf_used,
+                            hw_reg_mapping[i] + this->alloc.sizes[i]);
+   }
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      assign_reg(hw_reg_mapping, &inst->dst);
+      for (int i = 0; i < inst->sources; i++) {
+         assign_reg(hw_reg_mapping, &inst->src[i]);
+      }
+   }
+
+   this->alloc.count = this->grf_used;
+
+   ralloc_free(g);
+
+   return true;
+}
+
+namespace {
+   /**
+    * Largest spill block we expect to handle, counted in 32B units.
+    *
+    * The value is a compromise and does not by itself cap the size of the
+    * variables that can be spilled.  Raising it lets a variable of a given
+    * size be spilled with fewer scratch messages, but makes it more likely
+    * that the MRFs reserved for spilling collide with other MRFs used by the
+    * program (and may raise GRF pressure on platforms without hardware
+    * MRFs), which could make register allocation fail.
+    *
+    * For now this reserves just enough room for a register of 32 bit
+    * component type and natural region width to be spilled without being
+    * split into multiple (force_writemask_all) scratch messages.
+    */
+   unsigned
+   spill_max_size(const backend_shader *s)
+   {
+      /* FINISHME - On Gen7+ it should be possible to avoid this limit
+       *            altogether by spilling directly from the temporary GRF
+       *            allocated to hold the result of the instruction (and the
+       *            scratch write header).
+       */
+      /* FINISHME - The shader's dispatch width probably belongs in
+       *            backend_shader (or some nonexistent fs_shader class?)
+       *            rather than in the visitor class.
+       */
+      const fs_visitor *const fs = static_cast<const fs_visitor *>(s);
+      return fs->dispatch_width / 8;
+   }
+
+   /**
+    * First of the MRF registers set aside for spilling.
+    */
+   unsigned
+   spill_base_mrf(const backend_shader *s)
+   {
+      return BRW_MAX_MRF(s->devinfo->gen) - spill_max_size(s) - 1;
+   }
+}
+
+/* Emits the scratch reads that fill count registers of dst with the data
+ * found at spill_offset, one read per block of reg_size registers. */
+static void
+emit_unspill(const fs_builder &bld, fs_reg dst,
+             uint32_t spill_offset, unsigned count)
+{
+   const gen_device_info *devinfo = bld.shader->devinfo;
+   const unsigned reg_size =
+      dst.component_size(bld.dispatch_width()) / REG_SIZE;
+   assert(count % reg_size == 0);
+
+   for (unsigned left = count / reg_size; left > 0; left--) {
+      /* The Gen7 descriptor-based offset is 12 bits of HWORD units.  On
+       * Gen9+ the Gen7-style scratch block read, being hardwired to BTI 255,
+       * would make the DC do an IA-coherent read, which costs far more than
+       * is saved by not passing the address in a message header, so plain
+       * old oword block reads are used there instead.
+       */
+      const bool gen7_read = devinfo->gen >= 7 && devinfo->gen < 9 &&
+                             spill_offset < (1 << 12) * REG_SIZE;
+      fs_inst *read = bld.emit(gen7_read ? SHADER_OPCODE_GEN7_SCRATCH_READ :
+                                           SHADER_OPCODE_GEN4_SCRATCH_READ,
+                               dst);
+      read->offset = spill_offset;
+
+      if (!gen7_read) {
+         read->base_mrf = spill_base_mrf(bld.shader);
+         read->mlen = 1; /* header contains offset */
+      }
+
+      dst.offset += reg_size * REG_SIZE;
+      spill_offset += reg_size * REG_SIZE;
+   }
+}
+
+/* Emits scratch writes storing count registers of src at spill_offset. */
+static void
+emit_spill(const fs_builder &bld, fs_reg src,
+           uint32_t spill_offset, unsigned count)
+{
+   const unsigned reg_size =
+      src.component_size(bld.dispatch_width()) / REG_SIZE;
+   assert(count % reg_size == 0);
+   for (unsigned i = 0; i < count / reg_size; i++) {
+      fs_inst *write = bld.emit(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
+                                bld.null_reg_f(), src);
+      write->base_mrf = spill_base_mrf(bld.shader);
+      write->mlen = 1 + reg_size; /* header, value */
+      write->offset = spill_offset + i * reg_size * REG_SIZE;
+      src.offset += reg_size * REG_SIZE;
+   }
+}
+
+/* Chooses the virtual GRF to spill next.
+ *
+ * A spill cost is estimated for every virtual GRF and set on its node in g,
+ * except for the ones flagged as not spillable, and the node picked by
+ * ra_get_best_spill_node() is returned.
+ */
+int
+fs_visitor::choose_spill_reg(struct ra_graph *g)
+{
+   const unsigned vgrf_count = this->alloc.count;
+   float spill_costs[vgrf_count];
+   bool no_spill[vgrf_count];
+   float loop_scale = 1.0;
+
+   for (unsigned n = 0; n < vgrf_count; n++) {
+      spill_costs[n] = 0.0;
+      no_spill[n] = false;
+   }
+
+   /* Each spill or unspill that a node would need counts as a cost of 1,
+    * and the inside of a loop is guessed to run 10 times.
+    */
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      /* Every read of a virtual GRF adds the current loop scale. */
+      for (unsigned int s = 0; s < inst->sources; s++) {
+         if (inst->src[s].file == VGRF)
+            spill_costs[inst->src[s].nr] += loop_scale;
+      }
+
+      /* A write adds the loop scale once per register written. */
+      if (inst->dst.file == VGRF) {
+         spill_costs[inst->dst.nr] +=
+            DIV_ROUND_UP(inst->size_written, REG_SIZE) * loop_scale;
+      }
+
+      if (inst->opcode == BRW_OPCODE_DO) {
+         loop_scale *= 10;
+      } else if (inst->opcode == BRW_OPCODE_WHILE) {
+         loop_scale /= 10;
+      } else if (inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE) {
+         /* Registers feeding a scratch write are flagged as no_spill... */
+         if (inst->src[0].file == VGRF)
+            no_spill[inst->src[0].nr] = true;
+      } else if (inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ ||
+                 inst->opcode == SHADER_OPCODE_GEN7_SCRATCH_READ) {
+         /* ... and so are the ones filled by a scratch read. */
+         if (inst->dst.file == VGRF)
+            no_spill[inst->dst.nr] = true;
+      }
+   }
+
+   /* Flagged registers get no spill cost set on their node. */
+   for (unsigned n = 0; n < vgrf_count; n++) {
+      if (!no_spill[n])
+         ra_set_node_spill_cost(g, n, spill_costs[n]);
+   }
+
+   return ra_get_best_spill_node(g);
+}
+
+void
+fs_visitor::spill_reg(int spill_reg)
+{  /* Rewrites all accesses to VGRF spill_reg to go through scratch space. */
+   int size = alloc.sizes[spill_reg]; /* in units of REG_SIZE */
+   unsigned int spill_offset = last_scratch; /* scratch offset of this VGRF */
+   assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */
+
+   /* Spills may use MRFs 13-15 in the SIMD16 case.  Our texturing is done
+    * using up to 11 MRFs starting from either m1 or m2, and fb writes can use
+    * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or
+    * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst
+    * depth), starting from m1.  In summary: We may not be able to spill in
+    * SIMD16 mode, because we'd stomp the FB writes.
+    */
+   if (!spilled_any_registers) { /* MRF check, done on the first spill only */
+      bool mrf_used[BRW_MAX_MRF(devinfo->gen)];
+      get_used_mrfs(this, mrf_used);
+
+      for (int i = spill_base_mrf(this); i < BRW_MAX_MRF(devinfo->gen); i++) {
+         if (mrf_used[i]) {
+            fail("Register spilling not supported with m%d used", i);
+          return;
+         }
+      }
+
+      spilled_any_registers = true;
+   }
+
+   last_scratch += size * REG_SIZE; /* reserve scratch for the whole VGRF */
+
+   /* Generate spill/unspill instructions for the objects being
+    * spilled.  Right now, we spill or unspill the whole thing to a
+    * virtual grf of the same size.  For most instructions, though, we
+    * could just spill/unspill the GRF being accessed.
+    */
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      const fs_builder ibld = fs_builder(this, block, inst);
+
+      for (unsigned int i = 0; i < inst->sources; i++) {
+	 if (inst->src[i].file == VGRF &&
+             inst->src[i].nr == spill_reg) {
+            int count = regs_read(inst, i);
+            int subset_spill_offset = spill_offset +
+               ROUND_DOWN_TO(inst->src[i].offset, REG_SIZE);
+            fs_reg unspill_dst(VGRF, alloc.allocate(count));
+
+            inst->src[i].nr = unspill_dst.nr; /* read the fresh temporary */
+            inst->src[i].offset %= REG_SIZE;  /* keep only the sub-register part */
+
+            /* We read the largest power-of-two divisor of the register count
+             * (because only POT scratch read blocks are allowed by the
+             * hardware) up to the maximum supported block size.
+             */
+            const unsigned width =
+               MIN2(32, 1u << (ffs(MAX2(1, count) * 8) - 1));
+
+            /* Set exec_all() on unspill messages under the (rather
+             * pessimistic) assumption that there is no one-to-one
+             * correspondence between channels of the spilled variable in
+             * scratch space and the scratch read message, which operates on
+             * 32 bit channels.  It shouldn't hurt in any case because the
+             * unspill destination is a block-local temporary.
+             */
+            emit_unspill(ibld.exec_all().group(width, 0),
+                         unspill_dst, subset_spill_offset, count);
+	 }
+      }
+
+      if (inst->dst.file == VGRF &&
+          inst->dst.nr == spill_reg) {
+         int subset_spill_offset = spill_offset +
+            ROUND_DOWN_TO(inst->dst.offset, REG_SIZE);
+         fs_reg spill_src(VGRF, alloc.allocate(regs_written(inst)));
+
+         inst->dst.nr = spill_src.nr; /* write a temporary, spilled below */
+         inst->dst.offset %= REG_SIZE;
+
+         /* If we're immediately spilling the register, we should not use
+          * destination dependency hints.  Doing so will cause the GPU to
+          * try to read and write the register at the same time and may
+          * hang the GPU.
+          */
+         inst->no_dd_clear = false;
+         inst->no_dd_check = false;
+
+         /* Calculate the execution width of the scratch messages (which work
+          * in terms of 32 bit components so we have a fixed number of eight
+          * channels per spilled register).  We attempt to write one
+          * exec_size-wide component of the variable at a time without
+          * exceeding the maximum number of (fake) MRF registers reserved for
+          * spills.
+          */
+         const unsigned width = 8 * MIN2(
+            DIV_ROUND_UP(inst->dst.component_size(inst->exec_size), REG_SIZE),
+            spill_max_size(this));
+
+         /* Spills should only write data initialized by the instruction for
+          * whichever channels are enabled in the execution mask.  If that's
+          * not possible we'll have to emit a matching unspill before the
+          * instruction and set force_writemask_all on the spill.
+          */
+         const bool per_channel =
+            inst->dst.is_contiguous() && type_sz(inst->dst.type) == 4 &&
+            inst->exec_size == width;
+
+         /* Builder used to emit the scratch messages. */
+         const fs_builder ubld = ibld.exec_all(!per_channel).group(width, 0);
+
+	 /* If our write is going to affect just part of the
+          * regs_written(inst), then we need to unspill the destination since
+          * we write back out all of the regs_written().  If the original
+          * instruction had force_writemask_all set and is not a partial
+          * write, there should be no need for the unspill since the
+          * instruction will be overwriting the whole destination in any case.
+	  */
+         if (inst->is_partial_write() ||
+             (!inst->force_writemask_all && !per_channel))
+            emit_unspill(ubld, spill_src, subset_spill_offset,
+                         regs_written(inst));
+
+         emit_spill(ubld.at(block, inst->next), spill_src,
+                    subset_spill_offset, regs_written(inst)); /* after inst */
+      }
+   }
+
+   invalidate_live_intervals();
+}
diff --git a/src/intel/compiler/brw_fs_register_coalesce.cpp b/src/intel/compiler/brw_fs_register_coalesce.cpp
new file mode 100644
index 00000000000..952276faed8
--- /dev/null
+++ b/src/intel/compiler/brw_fs_register_coalesce.cpp
@@ -0,0 +1,295 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_register_coalesce.cpp
+ *
+ * Implements register coalescing: Checks if the two registers involved in a
+ * raw move don't interfere, in which case they can both be stored in the same
+ * place and the MOV removed.
+ *
+ * To do this, all uses of the source of the MOV in the shader are replaced
+ * with the destination of the MOV. For example:
+ *
+ * add vgrf3:F, vgrf1:F, vgrf2:F
+ * mov vgrf4:F, vgrf3:F
+ * mul vgrf5:F, vgrf5:F, vgrf4:F
+ *
+ * becomes
+ *
+ * add vgrf4:F, vgrf1:F, vgrf2:F
+ * mul vgrf5:F, vgrf5:F, vgrf4:F
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_fs_live_variables.h"
+
+static bool
+is_nop_mov(const fs_inst *inst)
+{
+   /* A plain MOV is a no-op when source and destination are equal. */
+   if (inst->opcode == BRW_OPCODE_MOV)
+      return inst->dst.equals(inst->src[0]);
+
+   if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
+      return false;
+   /* Walk the payload: every source must equal the slot it is copied to. */
+   fs_reg slot = inst->dst;
+   for (int s = 0; s < inst->sources; s++) {
+      if (!slot.equals(inst->src[s]))
+         return false;
+      slot.offset += (s < inst->header_size ? REG_SIZE :
+                      inst->exec_size * slot.stride *
+                      type_sz(inst->src[s].type));
+   }
+   return true;
+}
+
+static bool
+is_coalesce_candidate(const fs_visitor *v, const fs_inst *inst)
+{
+   const bool is_payload = inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD;
+
+   /* Only complete, unsaturated MOVs and LOAD_PAYLOADs are considered. */
+   if (!is_payload && inst->opcode != BRW_OPCODE_MOV)
+      return false;
+   if (inst->is_partial_write() || inst->saturate)
+      return false;
+
+   /* The source must be an unmodified, contiguous VGRF... */
+   const fs_reg &src = inst->src[0];
+   if (src.file != VGRF || src.negate || src.abs || !src.is_contiguous())
+      return false;
+
+   /* ...copied to a VGRF of the same type that is at least as large. */
+   if (inst->dst.file != VGRF || inst->dst.type != src.type)
+      return false;
+   if (v->alloc.sizes[src.nr] > v->alloc.sizes[inst->dst.nr])
+      return false;
+
+   /* A LOAD_PAYLOAD additionally has to pass is_copy_payload(). */
+   if (is_payload && !inst->is_copy_payload(v->alloc))
+      return false;
+
+   return true;
+}
+
+static bool
+can_coalesce_vars(brw::fs_live_variables *live_intervals,
+                  const cfg_t *cfg, const fs_inst *inst,
+                  int dst_var, int src_var)
+{
+   if (!live_intervals->vars_interfere(src_var, dst_var))
+      return true;
+
+   const int dst_first = live_intervals->start[dst_var];
+   const int dst_last = live_intervals->end[dst_var];
+   const int src_first = live_intervals->start[src_var];
+   const int src_last = live_intervals->end[src_var];
+
+   /* Reject interfering variables whose live ranges only partially overlap,
+    * i.e. one range both starts and ends before the other one does.
+    */
+   const bool src_leads = src_first < dst_first && src_last < dst_last;
+   const bool dst_leads = dst_first < src_first && dst_last < src_last;
+   if (src_leads || dst_leads)
+      return false;
+
+   /* Look for a write to either register inside the intersection of the
+    * two live ranges.
+    */
+   const int first_ip = MAX2(dst_first, src_first);
+   const int last_ip = MIN2(dst_last, src_last);
+
+   foreach_block(blk, cfg) {
+      if (blk->end_ip < first_ip)
+         continue;
+
+      int next_ip = blk->start_ip;
+      foreach_inst_in_block(fs_inst, other, blk) {
+         const int ip = next_ip++;
+
+         /* Skip instructions before the intersection and the copy itself. */
+         if (ip < first_ip || other == inst)
+            continue;
+
+         /* Past the end of the intersection: no conflicting write found. */
+         if (ip > last_ip)
+            return true;
+
+         if (regions_overlap(other->dst, other->size_written,
+                             inst->dst, inst->size_written))
+            return false;
+         if (regions_overlap(other->dst, other->size_written,
+                             inst->src[0], inst->size_read(0)))
+            return false;
+      }
+   }
+
+   return true;
+}
+
+bool
+fs_visitor::register_coalesce()
+{  /* Coalesces VGRF copies (see is_coalesce_candidate) and removes them. */
+   bool progress = false;
+
+   calculate_live_intervals();
+
+   int src_size = 0;            /* alloc.sizes[] entry of the tracked source */
+   int channels_remaining = 0;  /* source registers with no copy seen yet */
+   int src_reg = -1, dst_reg = -1;
+   int dst_reg_offset[MAX_VGRF_SIZE];  /* dst register for each src register */
+   fs_inst *mov[MAX_VGRF_SIZE];        /* copy recorded per src offset */
+   int dst_var[MAX_VGRF_SIZE];
+   int src_var[MAX_VGRF_SIZE];
+
+   foreach_block_and_inst(block, fs_inst, inst, cfg) {
+      if (!is_coalesce_candidate(this, inst))
+         continue;
+
+      if (is_nop_mov(inst)) {
+         inst->opcode = BRW_OPCODE_NOP;  /* removed by the final loop below */
+         progress = true;
+         continue;
+      }
+
+      if (src_reg != inst->src[0].nr) {  /* new source VGRF: restart tracking */
+         src_reg = inst->src[0].nr;
+
+         src_size = alloc.sizes[inst->src[0].nr];
+         assert(src_size <= MAX_VGRF_SIZE);
+
+         channels_remaining = src_size;
+         memset(mov, 0, sizeof(mov));
+
+         dst_reg = inst->dst.nr;
+      }
+
+      if (dst_reg != inst->dst.nr)  /* only copies into the tracked dst count */
+         continue;
+
+      if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
+         for (int i = 0; i < src_size; i++) {
+            dst_reg_offset[i] = i;
+         }
+         mov[0] = inst;
+         channels_remaining -= regs_written(inst);
+      } else {
+         const int offset = inst->src[0].offset / REG_SIZE;
+         if (mov[offset]) {
+            /* This is the second time that this offset in the register has
+             * been set.  This means, in particular, that inst->dst was
+             * live before this instruction and that the live ranges of
+             * inst->dst and inst->src[0] overlap and we can't coalesce the
+             * two variables.  Let's ensure that doesn't happen.
+             */
+            channels_remaining = -1;
+            continue;
+         }
+         for (unsigned i = 0; i < MAX2(inst->size_written / REG_SIZE, 1); i++)
+            dst_reg_offset[offset + i] = inst->dst.offset / REG_SIZE + i;
+         mov[offset] = inst;
+         channels_remaining -= regs_written(inst);
+      }
+
+      if (channels_remaining)  /* source not exactly covered by copies yet */
+         continue;
+
+      bool can_coalesce = true;
+      for (int i = 0; i < src_size; i++) {
+         if (dst_reg_offset[i] != dst_reg_offset[0] + i) {
+            /* Registers are out-of-order. */
+            can_coalesce = false;
+            src_reg = -1;
+            break;
+         }
+
+         dst_var[i] = live_intervals->var_from_vgrf[dst_reg] + dst_reg_offset[i];
+         src_var[i] = live_intervals->var_from_vgrf[src_reg] + i;
+
+         if (!can_coalesce_vars(live_intervals, cfg, inst,
+                                dst_var[i], src_var[i])) {
+            can_coalesce = false;
+            src_reg = -1;
+            break;
+         }
+      }
+
+      if (!can_coalesce)
+         continue;
+
+      progress = true;
+
+      for (int i = 0; i < src_size; i++) {  /* turn the copies into NOPs */
+         if (mov[i]) {
+            mov[i]->opcode = BRW_OPCODE_NOP;
+            mov[i]->conditional_mod = BRW_CONDITIONAL_NONE;
+            mov[i]->dst = reg_undef;
+            for (int j = 0; j < mov[i]->sources; j++) {
+               mov[i]->src[j] = reg_undef;
+            }
+         }
+      }
+
+      foreach_block_and_inst(block, fs_inst, scan_inst, cfg) {  /* src -> dst */
+         if (scan_inst->dst.file == VGRF &&
+             scan_inst->dst.nr == src_reg) {
+            scan_inst->dst.nr = dst_reg;
+            scan_inst->dst.offset = scan_inst->dst.offset % REG_SIZE +
+               dst_reg_offset[scan_inst->dst.offset / REG_SIZE] * REG_SIZE;
+         }
+
+         for (int j = 0; j < scan_inst->sources; j++) {
+            if (scan_inst->src[j].file == VGRF &&
+                scan_inst->src[j].nr == src_reg) {
+               scan_inst->src[j].nr = dst_reg;
+               scan_inst->src[j].offset = scan_inst->src[j].offset % REG_SIZE +
+                  dst_reg_offset[scan_inst->src[j].offset / REG_SIZE] * REG_SIZE;
+            }
+         }
+      }
+
+      for (int i = 0; i < src_size; i++) {  /* widen dst ranges to cover src */
+         live_intervals->start[dst_var[i]] =
+            MIN2(live_intervals->start[dst_var[i]],
+                 live_intervals->start[src_var[i]]);
+         live_intervals->end[dst_var[i]] =
+            MAX2(live_intervals->end[dst_var[i]],
+                 live_intervals->end[src_var[i]]);
+      }
+      src_reg = -1;  /* forget the tracked source */
+   }
+
+   if (progress) {
+      foreach_block_and_inst_safe (block, backend_instruction, inst, cfg) {
+         if (inst->opcode == BRW_OPCODE_NOP) {
+            inst->remove(block);
+         }
+      }
+
+      invalidate_live_intervals();
+   }
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_saturate_propagation.cpp b/src/intel/compiler/brw_fs_saturate_propagation.cpp
new file mode 100644
index 00000000000..1c97a507d8c
--- /dev/null
+++ b/src/intel/compiler/brw_fs_saturate_propagation.cpp
@@ -0,0 +1,156 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_saturate_propagation.cpp
+ *
+ * Implements a pass that propagates the SAT modifier from a MOV.SAT into the
+ * instruction that produced the source of the MOV.SAT, thereby allowing the
+ * MOV's src and dst to be coalesced and the MOV removed.
+ *
+ * For instance,
+ *
+ *    ADD     tmp, src0, src1
+ *    MOV.SAT dst, tmp
+ *
+ * would be transformed into
+ *
+ *    ADD.SAT tmp, src0, src1
+ *    MOV     dst, tmp
+ */
+
+static bool
+opt_saturate_propagation_local(fs_visitor *v, bblock_t *block)
+{  /* Moves saturate from MOV.SAT to the writer of its source, per block. */
+   bool progress = false;
+   int ip = block->end_ip + 1;  /* decremented to inst's ip in the loop */
+
+   foreach_inst_in_block_reverse(fs_inst, inst, block) {
+      ip--;
+
+      if (inst->opcode != BRW_OPCODE_MOV ||
+          !inst->saturate ||
+          inst->dst.file != VGRF ||
+          inst->dst.type != inst->src[0].type ||
+          inst->src[0].file != VGRF ||
+          inst->src[0].abs)
+         continue;  /* not a same-type VGRF-to-VGRF MOV.SAT without abs */
+
+      int src_var = v->live_intervals->var_from_reg(inst->src[0]);
+      int src_end_ip = v->live_intervals->end[src_var];  /* end of src range */
+
+      bool interfered = false;
+      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst) {
+         if (regions_overlap(scan_inst->dst, scan_inst->size_written,
+                             inst->src[0], inst->size_read(0))) { /* writer */
+            if (scan_inst->is_partial_write() ||
+                (scan_inst->dst.type != inst->dst.type &&
+                 !scan_inst->can_change_types()))
+               break;
+
+            if (scan_inst->saturate) {  /* already saturated: drop MOV's sat */
+               inst->saturate = false;
+               progress = true;
+            } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) {
+               if (scan_inst->can_do_saturate()) {  /* src ends at the MOV */
+                  if (scan_inst->dst.type != inst->dst.type) {  /* retype */
+                     scan_inst->dst.type = inst->dst.type;
+                     for (int i = 0; i < scan_inst->sources; i++) {
+                        scan_inst->src[i].type = inst->dst.type;
+                     }
+                  }
+
+                  if (inst->src[0].negate) {  /* fold negate into scan_inst */
+                     if (scan_inst->opcode == BRW_OPCODE_MUL) {
+                        scan_inst->src[0].negate = !scan_inst->src[0].negate;
+                        inst->src[0].negate = false;
+                     } else if (scan_inst->opcode == BRW_OPCODE_MAD) {
+                        scan_inst->src[0].negate = !scan_inst->src[0].negate;
+                        scan_inst->src[1].negate = !scan_inst->src[1].negate;
+                        inst->src[0].negate = false;
+                     } else if (scan_inst->opcode == BRW_OPCODE_ADD) {
+                        if (scan_inst->src[1].file == IMM) {
+                           if (!brw_negate_immediate(scan_inst->src[1].type,
+                                                     &scan_inst->src[1].as_brw_reg())) {
+                              break;
+                           }
+                        } else {
+                           scan_inst->src[1].negate = !scan_inst->src[1].negate;
+                        }
+                        scan_inst->src[0].negate = !scan_inst->src[0].negate;
+                        inst->src[0].negate = false;
+                     } else {
+                        break;  /* negate cannot be folded into this opcode */
+                     }
+                  }
+
+                  scan_inst->saturate = true;
+                  inst->saturate = false;
+                  progress = true;
+               }
+            }
+            break;  /* the scan stops at the first overlapping write */
+         }
+         for (int i = 0; i < scan_inst->sources; i++) {  /* other readers */
+            if (scan_inst->src[i].file == VGRF &&
+                scan_inst->src[i].nr == inst->src[0].nr &&
+                scan_inst->src[i].offset / REG_SIZE ==
+                 inst->src[0].offset / REG_SIZE) {
+               if (scan_inst->opcode != BRW_OPCODE_MOV ||
+                   !scan_inst->saturate ||
+                   scan_inst->src[0].abs ||
+                   scan_inst->src[0].negate ||
+                   scan_inst->src[0].abs != inst->src[0].abs ||
+                   scan_inst->src[0].negate != inst->src[0].negate) {
+                  interfered = true;  /* a reader that is not a plain MOV.SAT */
+                  break;
+               }
+            }
+         }
+
+         if (interfered)
+            break;
+      }
+   }
+
+   return progress;
+}
+
+bool
+fs_visitor::opt_saturate_propagation()
+{
+   /* The per-block pass reads live_intervals, so compute them first. */
+   calculate_live_intervals();
+
+   bool any_change = false;
+   foreach_block (blk, cfg) {
+      if (opt_saturate_propagation_local(this, blk))
+         any_change = true;
+   }
+
+   /* No invalidate_live_intervals() call: they are treated as still valid. */
+   return any_change;
+}
diff --git a/src/intel/compiler/brw_fs_sel_peephole.cpp b/src/intel/compiler/brw_fs_sel_peephole.cpp
new file mode 100644
index 00000000000..8cd897f72e0
--- /dev/null
+++ b/src/intel/compiler/brw_fs_sel_peephole.cpp
@@ -0,0 +1,220 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+/** @file brw_fs_sel_peephole.cpp
+ *
+ * This file contains the opt_peephole_sel() optimization pass that replaces
+ * MOV instructions to the same destination in the "then" and "else" bodies of
+ * an if statement with SEL instructions.
+ */
+
+/* Four MOVs seems to be pretty typical, so I picked the next power of two in
+ * the hopes that it would handle almost anything possible in a single
+ * pass.
+ */
+#define MAX_MOVS 8 /**< The maximum number of MOVs to attempt to match. */
+
+using namespace brw;
+
+/**
+ * Scans forwards from an IF counting consecutive MOV instructions in the
+ * "then" and "else" blocks of the if statement.
+ *
+ * A pointer to the bblock_t following the IF is passed as the <then_block>
+ * argument. The function stores pointers to the MOV instructions in the
+ * <then_mov> and <else_mov> arrays.
+ *
+ * \return the minimum number of MOVs found in the two branches or zero if
+ *         an error occurred.
+ *
+ * E.g.:
+ *                  IF ...
+ *    then_mov[0] = MOV g4, ...
+ *    then_mov[1] = MOV g5, ...
+ *    then_mov[2] = MOV g6, ...
+ *                  ELSE ...
+ *    else_mov[0] = MOV g4, ...
+ *    else_mov[1] = MOV g5, ...
+ *    else_mov[2] = MOV g7, ...
+ *                  ENDIF
+ *    returns 3.
+ */
+static int
+count_movs_from_if(fs_inst *then_mov[MAX_MOVS], fs_inst *else_mov[MAX_MOVS],
+                   bblock_t *then_block, bblock_t *else_block)
+{
+   /* Record the leading MOVs of each block, stopping at MAX_MOVS entries. */
+   int num_then = 0;
+   foreach_inst_in_block(fs_inst, cur, then_block) {
+      if (num_then == MAX_MOVS)
+         break;
+      if (cur->opcode != BRW_OPCODE_MOV)
+         break;
+      then_mov[num_then++] = cur;
+   }
+   int num_else = 0;
+   foreach_inst_in_block(fs_inst, cur, else_block) {
+      if (num_else == MAX_MOVS)
+         break;
+      if (cur->opcode != BRW_OPCODE_MOV)
+         break;
+      else_mov[num_else++] = cur;
+   }
+
+   return num_then < num_else ? num_then : num_else;
+}
+
+/**
+ * Try to replace IF/MOV+/ELSE/MOV+/ENDIF with SEL.
+ *
+ * Many GLSL shaders contain the following pattern:
+ *
+ *    x = condition ? foo : bar
+ *
+ * or
+ *
+ *    if (...) a.xyzw = foo.xyzw;
+ *    else     a.xyzw = bar.xyzw;
+ *
+ * The compiler emits an ir_if tree for this, since each subexpression might be
+ * a complex tree that could have side-effects or short-circuit logic.
+ *
+ * However, the common case is to simply select one of two constants or
+ * variable values---which is exactly what SEL is for.  In this case, the
+ * assembly looks like:
+ *
+ *    (+f0) IF
+ *    MOV dst src0
+ *    ...
+ *    ELSE
+ *    MOV dst src1
+ *    ...
+ *    ENDIF
+ *
+ * where each pair of MOVs to a common destination can be easily translated
+ * into
+ *
+ *    (+f0) SEL dst src0 src1
+ *
+ * If src0 is an immediate value, we promote it to a temporary GRF.
+ */
+bool
+fs_visitor::opt_peephole_sel()
+{
+   bool progress = false;
+
+   foreach_block (block, cfg) {
+      /* IF instructions, by definition, can only be found at the ends of
+       * basic blocks.
+       */
+      fs_inst *if_inst = (fs_inst *)block->end();
+      if (if_inst->opcode != BRW_OPCODE_IF)
+         continue;
+
+      fs_inst *else_mov[MAX_MOVS] = { NULL };
+      fs_inst *then_mov[MAX_MOVS] = { NULL };
+
+      bblock_t *then_block = block->next(); /* block right after the IF */
+      bblock_t *else_block = NULL;
+      foreach_list_typed(bblock_link, child, link, &block->children) {
+         if (child->block != then_block) {
+            if (child->block->prev()->end()->opcode == BRW_OPCODE_ELSE) {
+               else_block = child->block;
+            }
+            break;
+         }
+      }
+      if (else_block == NULL) /* no ELSE branch found: nothing to pair up */
+         continue;
+
+      int movs = count_movs_from_if(then_mov, else_mov, then_block, else_block);
+
+      if (movs == 0)
+         continue;
+
+      /* Trim movs to the leading MOV pairs that can be turned into SELs. */
+      for (int i = 0; i < movs; i++) {
+         if (!then_mov[i] || !else_mov[i])
+            break;
+
+         /* Check that the MOVs are the right form. */
+         if (!then_mov[i]->dst.equals(else_mov[i]->dst) ||
+             then_mov[i]->exec_size != else_mov[i]->exec_size ||
+             then_mov[i]->group != else_mov[i]->group ||
+             then_mov[i]->force_writemask_all != else_mov[i]->force_writemask_all ||
+             then_mov[i]->is_partial_write() ||
+             else_mov[i]->is_partial_write() ||
+             then_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE ||
+             else_mov[i]->conditional_mod != BRW_CONDITIONAL_NONE) {
+            movs = i;
+            break;
+         }
+
+         /* Check that source types for mov operations match. */
+         if (then_mov[i]->src[0].type != else_mov[i]->src[0].type) {
+            movs = i;
+            break;
+         }
+      }
+
+      if (movs == 0)
+         continue;
+
+      for (int i = 0; i < movs; i++) { /* emit the replacements at the IF */
+         const fs_builder ibld = fs_builder(this, then_block, then_mov[i])
+                                 .at(block, if_inst);
+
+         if (then_mov[i]->src[0].equals(else_mov[i]->src[0])) {
+            ibld.MOV(then_mov[i]->dst, then_mov[i]->src[0]);
+         } else {
+            /* Only the last source register can be a constant, so if the MOV
+             * in the "then" clause uses a constant, we need to put it in a
+             * temporary.  Allocate it through the builder from the MOV's
+             * source type so that it is big enough for 64-bit immediates too.
+             */
+            fs_reg src0(then_mov[i]->src[0]);
+            if (src0.file == IMM) {
+               src0 = ibld.vgrf(then_mov[i]->src[0].type);
+               ibld.MOV(src0, then_mov[i]->src[0]);
+            }
+
+            set_predicate_inv(if_inst->predicate, if_inst->predicate_inverse,
+                              ibld.SEL(then_mov[i]->dst, src0,
+                                       else_mov[i]->src[0]));
+         }
+
+         then_mov[i]->remove(then_block);
+         else_mov[i]->remove(else_block);
+      }
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_fs_surface_builder.cpp b/src/intel/compiler/brw_fs_surface_builder.cpp
new file mode 100644
index 00000000000..8990a5ca710
--- /dev/null
+++ b/src/intel/compiler/brw_fs_surface_builder.cpp
@@ -0,0 +1,1194 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "isl/isl.h"
+#include "brw_fs_surface_builder.h"
+#include "brw_fs.h"
+
+using namespace brw;
+
+namespace brw {
+   namespace surface_access {
+      namespace {
+         /**
+          * Emit the logical send instruction \p opcode for a surface message
+          * and return its destination, allocated with \p rsize components.
+          */
+         fs_reg
+         emit_send(const fs_builder &bld, enum opcode opcode,
+                   const fs_reg &addr, const fs_reg &src, const fs_reg &surface,
+                   unsigned dims, unsigned arg, unsigned rsize,
+                   brw_predicate pred = BRW_PREDICATE_NONE)
+         {
+            /* The surface index is dynamically uniform, collapse it to a
+             * single scalar before handing it to the message.
+             */
+            const fs_reg scalar_surface = bld.emit_uniformize(surface);
+            const fs_reg payload[] = {
+               addr, src, scalar_surface, brw_imm_ud(dims), brw_imm_ud(arg)
+            };
+            const fs_reg result = bld.vgrf(BRW_REGISTER_TYPE_UD, rsize);
+            fs_inst *const send =
+               bld.emit(opcode, result, payload, ARRAY_SIZE(payload));
+            send->size_written = rsize * result.component_size(send->exec_size);
+            send->predicate = pred;
+            return result;
+         }
+      }
+
+      /**
+       * Emit an untyped surface read of \p size components addressed by the
+       * \p dims components of \p addr, and return the destination register.
+       */
+      fs_reg
+      emit_untyped_read(const fs_builder &bld,
+                        const fs_reg &surface, const fs_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred)
+      {
+         const fs_reg no_data;   /* reads carry no source payload */
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL,
+                          addr, no_data, surface, dims, size, size, pred);
+      }
+
+      /**
+       * Emit an untyped surface write of the \p size components of \p src to
+       * the address given by the \p dims components of \p addr.
+       */
+      void
+      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                         const fs_reg &addr, const fs_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred)
+      {
+         const unsigned no_response = 0;   /* zero response components */
+         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL,
+                   addr, src, surface, dims, size, no_response, pred);
+      }
+
+      /**
+       * Emit the untyped surface atomic operation \p op.  The address has
+       * \p dims components and \p rsize (either zero or one) is the number
+       * of components of the returned value.
+       */
+      fs_reg
+      emit_untyped_atomic(const fs_builder &bld,
+                          const fs_reg &surface, const fs_reg &addr,
+                          const fs_reg &src0, const fs_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred)
+      {
+         /* FINISHME: This payload set-up recurs often enough that it should
+          * be factored out into a helper function.
+          */
+         const unsigned num_operands =
+            (src0.file != BAD_FILE ? 1 : 0) + (src1.file != BAD_FILE ? 1 : 0);
+         const fs_reg operands[] = { src0, src1 };
+         const fs_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, num_operands);
+         bld.LOAD_PAYLOAD(payload, operands, num_operands, 0);
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL,
+                          addr, payload, surface, dims, op, rsize, pred);
+      }
+
+      /**
+       * Emit a typed surface read of \p size components addressed by the
+       * \p dims components of \p addr, and return the destination register.
+       */
+      fs_reg
+      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                      const fs_reg &addr, unsigned dims, unsigned size)
+      {
+         const fs_reg no_data;   /* reads carry no source payload */
+         return emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL,
+                          addr, no_data, surface, dims, size, size);
+      }
+
+      /**
+       * Emit a typed surface write of the \p size components of \p src to
+       * the address given by the \p dims components of \p addr.
+       */
+      void
+      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned dims, unsigned size)
+      {
+         const unsigned no_response = 0;   /* zero response components */
+         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL,
+                   addr, src, surface, dims, size, no_response);
+      }
+
+      /**
+       * Emit a typed surface atomic opcode.  \p dims determines the number of
+       * components of the address, \p rsize the number of components of the
+       * returned value (either zero or one) and \p pred the send predicate.
+       */
+      fs_reg
+      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                        const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred)
+      {
+         /* FINISHME: Factor this recurring payload set-up out into a helper. */
+         const unsigned n = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const fs_reg srcs[] = { src0, src1 };
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, n);
+         bld.LOAD_PAYLOAD(tmp, srcs, n, 0);
+         /* Forward \p pred to the send like the untyped variant does, so a
+          * predicate requested by the caller actually guards the atomic.
+          */
+         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC_LOGICAL,
+                          addr, tmp, surface, dims, op, rsize, pred);
+      }
+   }
+}
+
+namespace {
+   namespace image_format_info {
+      /* Translate the GL image format enum, which the higher compiler layers
+       * use even for SPIR-V or Vulkan input, into the matching ISL enum. */
+      enum isl_format
+      isl_format_for_gl_format(uint32_t gl_format)
+      {
+         static const struct { uint32_t gl; enum isl_format isl; } table[] = {
+            { GL_R8,             ISL_FORMAT_R8_UNORM },
+            { GL_R8_SNORM,       ISL_FORMAT_R8_SNORM },
+            { GL_R8UI,           ISL_FORMAT_R8_UINT },
+            { GL_R8I,            ISL_FORMAT_R8_SINT },
+            { GL_RG8,            ISL_FORMAT_R8G8_UNORM },
+            { GL_RG8_SNORM,      ISL_FORMAT_R8G8_SNORM },
+            { GL_RG8UI,          ISL_FORMAT_R8G8_UINT },
+            { GL_RG8I,           ISL_FORMAT_R8G8_SINT },
+            { GL_RGBA8,          ISL_FORMAT_R8G8B8A8_UNORM },
+            { GL_RGBA8_SNORM,    ISL_FORMAT_R8G8B8A8_SNORM },
+            { GL_RGBA8UI,        ISL_FORMAT_R8G8B8A8_UINT },
+            { GL_RGBA8I,         ISL_FORMAT_R8G8B8A8_SINT },
+            { GL_R11F_G11F_B10F, ISL_FORMAT_R11G11B10_FLOAT },
+            { GL_RGB10_A2,       ISL_FORMAT_R10G10B10A2_UNORM },
+            { GL_RGB10_A2UI,     ISL_FORMAT_R10G10B10A2_UINT },
+            { GL_R16,            ISL_FORMAT_R16_UNORM },
+            { GL_R16_SNORM,      ISL_FORMAT_R16_SNORM },
+            { GL_R16F,           ISL_FORMAT_R16_FLOAT },
+            { GL_R16UI,          ISL_FORMAT_R16_UINT },
+            { GL_R16I,           ISL_FORMAT_R16_SINT },
+            { GL_RG16,           ISL_FORMAT_R16G16_UNORM },
+            { GL_RG16_SNORM,     ISL_FORMAT_R16G16_SNORM },
+            { GL_RG16F,          ISL_FORMAT_R16G16_FLOAT },
+            { GL_RG16UI,         ISL_FORMAT_R16G16_UINT },
+            { GL_RG16I,          ISL_FORMAT_R16G16_SINT },
+            { GL_RGBA16,         ISL_FORMAT_R16G16B16A16_UNORM },
+            { GL_RGBA16_SNORM,   ISL_FORMAT_R16G16B16A16_SNORM },
+            { GL_RGBA16F,        ISL_FORMAT_R16G16B16A16_FLOAT },
+            { GL_RGBA16UI,       ISL_FORMAT_R16G16B16A16_UINT },
+            { GL_RGBA16I,        ISL_FORMAT_R16G16B16A16_SINT },
+            { GL_R32F,           ISL_FORMAT_R32_FLOAT },
+            { GL_R32UI,          ISL_FORMAT_R32_UINT },
+            { GL_R32I,           ISL_FORMAT_R32_SINT },
+            { GL_RG32F,          ISL_FORMAT_R32G32_FLOAT },
+            { GL_RG32UI,         ISL_FORMAT_R32G32_UINT },
+            { GL_RG32I,          ISL_FORMAT_R32G32_SINT },
+            { GL_RGBA32F,        ISL_FORMAT_R32G32B32A32_FLOAT },
+            { GL_RGBA32UI,       ISL_FORMAT_R32G32B32A32_UINT },
+            { GL_RGBA32I,        ISL_FORMAT_R32G32B32A32_SINT },
+            { GL_NONE,           ISL_FORMAT_UNSUPPORTED },
+         };
+         for (unsigned i = 0; i < ARRAY_SIZE(table); ++i)
+            if (table[i].gl == gl_format)
+               return table[i].isl;
+         assert(!"Invalid image format");
+         return ISL_FORMAT_UNSUPPORTED;
+      }
+
+      /**
+       * Four unsigned scalars, one per color component (r, g, b, a), bundled
+       * so that per-component values can be passed around together.
+       */
+      struct color_u {
+         /* Replicate \p x (zero by default) into all four components. */
+         color_u(unsigned x = 0) : r(x), g(x), b(x), a(x) {}
+         /* Initialize each component separately. */
+         color_u(unsigned r, unsigned g, unsigned b, unsigned a) :
+            r(r), g(g), b(b), a(a) {}
+
+         /* Component \p i in r, g, b, a order.  \p i is not range-checked. */
+         unsigned
+         operator[](unsigned i) const
+         {
+            static unsigned color_u::*const components[] = {
+               &color_u::r, &color_u::g, &color_u::b, &color_u::a
+            };
+            return this->*components[i];
+         }
+
+         unsigned r, g, b, a;
+      };
+
+      /**
+       * Return how many bits each color channel of \p format occupies.
+       */
+      inline color_u
+      get_bit_widths(isl_format format)
+      {
+         const isl_format_layout *const info = isl_format_get_layout(format);
+         const unsigned r_bits = info->channels.r.bits;
+         const unsigned g_bits = info->channels.g.bits;
+         const unsigned b_bits = info->channels.b.bits;
+         const unsigned a_bits = info->channels.a.bits;
+         return color_u(r_bits, g_bits, b_bits, a_bits);
+      }
+
+      /**
+       * Return the lowest bit of each channel of \p format, r being at bit 0.
+       */
+      inline color_u
+      get_bit_shifts(isl_format format)
+      {
+         const color_u bits = get_bit_widths(format);
+         const unsigned rg = bits.r + bits.g;
+         return color_u(0, bits.r, rg, rg + bits.b);
+      }
+
+      /**
+       * Return true if every channel of nonzero width is as wide as r.
+       */
+      inline bool
+      is_homogeneous(isl_format format)
+      {
+         const color_u bits = get_bit_widths(format);
+         const bool g_differs = bits.g != 0 && bits.g != bits.r;
+         const bool b_differs = bits.b != 0 && bits.b != bits.r;
+         return !(g_differs || b_differs || (bits.a != 0 && bits.a != bits.r));
+      }
+
+      /**
+       * Return true if \p format can be copied as is, without any conversion.
+       */
+      inline bool
+      is_conversion_trivial(const gen_device_info *devinfo, isl_format format)
+      {
+         return (is_homogeneous(format) && get_bit_widths(format).r == 32) ||
+                isl_lower_storage_image_format(devinfo, format) == format;
+      }
+
+      /**
+       * Return true if the format the hardware is given in place of \p format
+       * has identical channel widths, even if its data types differ.
+       */
+      inline bool
+      has_supported_bit_layout(const gen_device_info *devinfo,
+                               isl_format format)
+      {
+         const color_u requested = get_bit_widths(format);
+         const color_u lowered = get_bit_widths(
+            isl_lower_storage_image_format(devinfo, format));
+         bool same_layout = true;
+
+         for (unsigned c = 0; c < 4; ++c)
+            same_layout = same_layout && requested[c] == lowered[c];
+         return same_layout;
+      }
+
+      /**
+       * Return true if the format used by the hardware has more channels
+       * than \p format, which requires spreading individual components over
+       * several hardware components (RG32 and friends as RGBA16UI).
+       */
+      inline bool
+      has_split_bit_layout(const gen_device_info *devinfo, isl_format format)
+      {
+         const isl_format lowered =
+            isl_lower_storage_image_format(devinfo, format);
+         const unsigned api_channels = isl_format_get_num_channels(format);
+         const unsigned hw_channels = isl_format_get_num_channels(lowered);
+         return hw_channels > api_channels;
+      }
+
+      /**
+       * Return true if the unused high bits of each component read back
+       * from the hardware may be garbage.  This may happen on IVB because
+       * we rely on the undocumented behavior that typed reads from surfaces
+       * of the unsupported R8 and R16 formats return useful data in their
+       * least significant bits.
+       */
+      inline bool
+      has_undefined_high_bits(const gen_device_info *devinfo,
+                              isl_format format)
+      {
+         const isl_format lowered =
+            isl_lower_storage_image_format(devinfo, format);
+         const bool gen7_not_hsw = devinfo->gen == 7 && !devinfo->is_haswell;
+         const bool narrow_uint = (lowered == ISL_FORMAT_R16_UINT ||
+                                   lowered == ISL_FORMAT_R8_UINT);
+         return gen7_not_hsw && narrow_uint;
+      }
+
+      /**
+       * Return true if unpacking \p format requires sign extension.
+       */
+      inline bool
+      needs_sign_extension(isl_format format)
+      {
+         if (isl_format_has_snorm_channel(format))
+            return true;   /* signed normalized channels */
+         return isl_format_has_sint_channel(format);   /* signed integers */
+      }
+   }
+
+   namespace image_validity {
+      /**
+       * Emit a check of whether the bound image is suitable for untyped
+       * access and return the predicate to use for that access.
+       */
+      brw_predicate
+      emit_untyped_image_check(const fs_builder &bld, const fs_reg &image,
+                               brw_predicate pred)
+      {
+         const gen_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+         const bool needs_check = devinfo->gen == 7 && !devinfo->is_haswell;
+
+         /* More recent generations handle the format mismatch gracefully,
+          * so the incoming predicate is returned unchanged.
+          */
+         if (!needs_check)
+            return pred;
+
+         /* Check whether the first stride component (i.e. the Bpp value) is
+          * greater than four, which on Gen7 indicates that a surface of type
+          * RAW has been bound for untyped access.  Reading or writing to a
+          * surface of type other than RAW using untyped surface messages
+          * causes a hang on IVB and VLV.
+          */
+         fs_inst *const cmp = bld.CMP(bld.null_reg_ud(), stride, brw_imm_d(4),
+                                      BRW_CONDITIONAL_G);
+         set_predicate(pred, cmp);
+         return BRW_PREDICATE_NORMAL;
+      }
+
+      /**
+       * Emit a check of whether an image is bound at the given index,
+       * writing the comparison result to f0.0.  Returns the predication
+       * mode subsequent image operations should use.
+       */
+      brw_predicate
+      emit_typed_atomic_check(const fs_builder &bld, const fs_reg &image)
+      {
+         const gen_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+         const bool needs_check = devinfo->gen == 7 && !devinfo->is_haswell;
+
+         /* More recent platforms implement compliant behavior when a null
+          * surface is bound, so no predication is needed.
+          */
+         if (!needs_check)
+            return BRW_PREDICATE_NONE;
+
+         /* The first component of the size field tells whether the image is
+          * bound.  The check is necessary on IVB for typed atomics because
+          * they don't seem to respect null surfaces and will happily corrupt
+          * or read random memory when no image is bound.
+          */
+         const fs_reg bound_size = retype(size, BRW_REGISTER_TYPE_UD);
+
+         bld.CMP(bld.null_reg_ud(), bound_size, brw_imm_d(0),
+                 BRW_CONDITIONAL_NZ);
+         return BRW_PREDICATE_NORMAL;
+      }
+
+      /**
+       * Compare each of the \p dims coordinates in \p addr against the image
+       * size, writing the comparison result to f0.0.  Returns the
+       * predication mode subsequent image operations should use.
+       */
+      brw_predicate
+      emit_bounds_check(const fs_builder &bld, const fs_reg &image,
+                        const fs_reg &addr, unsigned dims)
+      {
+         const fs_reg size = offset(image, bld, BRW_IMAGE_PARAM_SIZE_OFFSET);
+         const fs_reg uaddr = retype(addr, BRW_REGISTER_TYPE_UD);
+         for (unsigned c = 0; c < dims; ++c) {
+            /* Every comparison but the first is itself predicated. */
+            fs_inst *const cmp =
+               bld.CMP(bld.null_reg_ud(), offset(uaddr, bld, c),
+                       offset(size, bld, c), BRW_CONDITIONAL_L);
+            set_predicate(c ? BRW_PREDICATE_NORMAL : BRW_PREDICATE_NONE, cmp);
+         }
+         return BRW_PREDICATE_NORMAL;
+      }
+   }
+
+   namespace image_coordinates {
+      /**
+       * Return the number of coordinate components needed to address a texel
+       * of the surface.  This exceeds the sum of \p surf_dims and \p arr_dims
+       * when a padding component has to be inserted.
+       */
+      unsigned
+      num_image_coordinates(const fs_builder &bld,
+                            unsigned surf_dims, unsigned arr_dims,
+                            isl_format format)
+      {
+         /* HSW in vec4 mode and our software coordinate handling for untyped
+          * reads want the array index to be at the Z component.
+          */
+         bool array_index_at_z = false;
+         if (format != ISL_FORMAT_UNSUPPORTED)
+            array_index_at_z = !isl_has_matching_typed_storage_image_format(
+               bld.shader->devinfo, format);
+
+         /* One padding component shifts the index of a 1D array to Z. */
+         const bool pad = surf_dims == 1 && arr_dims == 1 && array_index_at_z;
+         return surf_dims + (pad ? 1 : 0) + arr_dims;
+      }
+
+      /**
+       * Return the coordinates in \p addr laid out the way the
+       * implementation expects them, padding them if required.
+       */
+      fs_reg
+      emit_image_coordinates(const fs_builder &bld, const fs_reg &addr,
+                             unsigned surf_dims, unsigned arr_dims,
+                             isl_format format)
+      {
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         /* Without padding the coordinates can be used as they are. */
+         if (dims <= surf_dims + arr_dims)
+            return addr;
+
+         assert(surf_dims == 1 && arr_dims == 1 && dims == 3);
+         /* The array index is required to be passed in as the Z component,
+          * insert a zero at the Y component to shift it to the right
+          * position.
+          *
+          * FINISHME: Factor out this frequently recurring pattern into a
+          * helper function.
+          */
+         const fs_reg padded_srcs[] = { addr, brw_imm_d(0), offset(addr, bld, 1) };
+         const fs_reg padded = bld.vgrf(addr.type, dims);
+         bld.LOAD_PAYLOAD(padded, padded_srcs, dims, 0);
+         return padded;
+      }
+
+      /**
+       * Calculate the offset in memory of the texel given by the first \p
+       * dims components of \p coord, using the parameters of \p image.
+       *
+       * This is meant to be used with untyped surface messages to access a
+       * tiled surface, which involves handling the tiling and swizzling modes
+       * of the surface manually, so it will hopefully not happen very often.
+       *
+       * The tiling algorithm implemented here matches either the X or Y
+       * tiling layouts supported by the hardware depending on the tiling
+       * coefficients passed to the program as uniforms.  See Volume 1 Part 2
+       * Section 4.5 "Address Tiling Function" of the IVB PRM for an in-depth
+       * explanation of the hardware tiling format.
+       */
+      fs_reg
+      emit_address_calculation(const fs_builder &bld, const fs_reg &image,
+                               const fs_reg &coord, unsigned dims)
+      {
+         const gen_device_info *devinfo = bld.shader->devinfo;
+         const fs_reg off = offset(image, bld, BRW_IMAGE_PARAM_OFFSET_OFFSET);
+         const fs_reg stride = offset(image, bld, BRW_IMAGE_PARAM_STRIDE_OFFSET);
+         const fs_reg tile = offset(image, bld, BRW_IMAGE_PARAM_TILING_OFFSET);
+         const fs_reg swz = offset(image, bld, BRW_IMAGE_PARAM_SWIZZLING_OFFSET);
+         const fs_reg addr = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg minor = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg major = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+         const fs_reg dst = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         /* Shift the coordinates by the fixed surface offset.  It may be
+          * non-zero if the image is a single slice of a higher-dimensional
+          * surface, or if a non-zero mipmap level of the surface is bound to
+          * the pipeline.  The offset needs to be applied here rather than at
+          * surface state set-up time because the desired slice-level may
+          * start mid-tile, so simply shifting the surface base address
+          * wouldn't give a well-formed tiled surface in the general case.
+          */
+         for (unsigned c = 0; c < 2; ++c)
+            bld.ADD(offset(addr, bld, c), offset(off, bld, c),
+                    (c < dims ?
+                     offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, c) :
+                     fs_reg(brw_imm_d(0))));
+
+         /* The layout of 3-D textures in memory is sort-of like a tiling
+          * format.  At each miplevel, the slices are arranged in rows of
+          * 2^level slices per row.  The slice row is stored in tmp.y and
+          * the slice within the row is stored in tmp.x.
+          *
+          * The layout of 2-D array textures and cubemaps is much simpler:
+          * Depending on whether the ARYSPC_LOD0 layout is in use it will be
+          * stored in memory as an array of slices, each one being a 2-D
+          * arrangement of miplevels, or as a 2D arrangement of miplevels,
+          * each one being an array of slices.  In either case the separation
+          * between slices of the same LOD is equal to the qpitch value
+          * provided as stride.w.
+          *
+          * This code can be made to handle both 2D arrays and 3D textures
+          * by passing in the miplevel as tile.z for 3-D textures and 0 in
+          * tile.z for 2-D array textures.
+          *
+          * See Volume 1 Part 1 of the Gen7 PRM, sections 6.18.4.7 "Surface
+          * Arrays" and 6.18.6 "3D Surfaces" for a more extensive discussion
+          * of the hardware 3D texture and 2D array layouts.
+          */
+         if (dims > 2) {
+            /* Decompose z into a major (tmp.y) and a minor (tmp.x)
+             * index.
+             */
+            bld.BFE(offset(tmp, bld, 0), offset(tile, bld, 2), brw_imm_d(0),
+                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2));
+            bld.SHR(offset(tmp, bld, 1),
+                    offset(retype(coord, BRW_REGISTER_TYPE_UD), bld, 2),
+                    offset(tile, bld, 2));
+
+            /* Take into account the horizontal (tmp.x) and vertical (tmp.y)
+             * slice offset.
+             */
+            for (unsigned c = 0; c < 2; ++c) {
+               bld.MUL(offset(tmp, bld, c),
+                       offset(stride, bld, 2 + c), offset(tmp, bld, c));
+               bld.ADD(offset(addr, bld, c),
+                       offset(addr, bld, c), offset(tmp, bld, c));
+            }
+         }
+
+         if (dims > 1) {
+            /* Calculate the major/minor x and y indices.  In order to
+             * accommodate both X and Y tiling, the Y-major tiling format is
+             * treated as being a bunch of narrow X-tiles placed next to each
+             * other.  This means that the tile width for Y-tiling is actually
+             * the width of one sub-column of the Y-major tile where each 4K
+             * tile has 8 512B sub-columns.
+             *
+             * The major Y value is the row of tiles in which the pixel lives.
+             * The major X value is the tile sub-column in which the pixel
+             * lives; for X tiling, this is the same as the tile column, for Y
+             * tiling, each tile has 8 sub-columns.  The minor X and Y indices
+             * are the position within the sub-column.
+             */
+            for (unsigned c = 0; c < 2; ++c) {
+               /* Calculate the minor x and y indices. */
+               bld.BFE(offset(minor, bld, c), offset(tile, bld, c),
+                       brw_imm_d(0), offset(addr, bld, c));
+
+               /* Calculate the major x and y indices. */
+               bld.SHR(offset(major, bld, c),
+                       offset(addr, bld, c), offset(tile, bld, c));
+            }
+
+            /* Calculate the texel index from the start of the tile row and
+             * the vertical coordinate of the row.
+             * Equivalent to:
+             *   tmp.x = (major.x << tile.y << tile.x) +
+             *           (minor.y << tile.x) + minor.x
+             *   tmp.y = major.y << tile.y
+             */
+            bld.SHL(tmp, major, offset(tile, bld, 1));
+            bld.ADD(tmp, tmp, offset(minor, bld, 1));
+            bld.SHL(tmp, tmp, offset(tile, bld, 0));
+            bld.ADD(tmp, tmp, minor);
+            bld.SHL(offset(tmp, bld, 1),
+                    offset(major, bld, 1), offset(tile, bld, 1));
+
+            /* Add it to the start of the tile row. */
+            bld.MUL(offset(tmp, bld, 1),
+                    offset(tmp, bld, 1), offset(stride, bld, 1));
+            bld.ADD(tmp, tmp, offset(tmp, bld, 1));
+
+            /* Multiply by the Bpp value. */
+            bld.MUL(dst, tmp, stride);
+
+            if (devinfo->gen < 8 && !devinfo->is_baytrail) {
+               /* Take into account the two dynamically specified shifts.
+                * Both are needed to implement swizzling of X-tiled
+                * surfaces.  For Y-tiled surfaces only one bit needs to be
+                * XOR-ed with bit 6 of the memory address, so a swz value of
+                * 0xff (actually interpreted as 31 by the hardware) will be
+                * provided to cause the relevant bit of tmp.y to be zero and
+                * turn the first XOR into the identity.  For linear surfaces
+                * or platforms lacking address swizzling both shifts will be
+                * 0xff causing the relevant bits of both tmp.x and .y to be
+                * zero, which effectively disables swizzling.
+                */
+               for (unsigned c = 0; c < 2; ++c)
+                  bld.SHR(offset(tmp, bld, c), dst, offset(swz, bld, c));
+
+               /* XOR tmp.x and tmp.y with bit 6 of the memory address. */
+               bld.XOR(tmp, tmp, offset(tmp, bld, 1));
+               bld.AND(tmp, tmp, brw_imm_d(1 << 6));
+               bld.XOR(dst, dst, tmp);
+            }
+
+         } else {
+            /* Multiply by the Bpp/stride value.  Note that the addr.y may be
+             * non-zero even if the image is one-dimensional because a
+             * vertical offset may have been applied above to select a
+             * non-zero slice or level of a higher-dimensional texture.
+             */
+            bld.MUL(offset(addr, bld, 1),
+                    offset(addr, bld, 1), offset(stride, bld, 1));
+            bld.ADD(addr, addr, offset(addr, bld, 1));
+            bld.MUL(dst, addr, stride);
+         }
+
+         return dst;
+      }
+   }
+
+   namespace image_format_conversion {
+      using image_format_info::color_u;
+
+      namespace {
+         /**
+          * Maximum value representable in an unsigned integer of \p n bits.
+          * Computed unsigned, with n >= 32 saturating, to avoid shift UB.
+          */
+         inline unsigned
+         scale(unsigned n)
+         {
+            return n >= 32 ? ~0u : (1u << n) - 1u;
+         }
+      }
+
+      /**
+       * Pack the components of \p src into a bitfield, placing each one at
+       * the bit position given by \p shifts.  Components whose entry in \p
+       * widths is zero are skipped.  Fields may not cross 32-bit boundaries.
+       */
+      fs_reg
+      emit_pack(const fs_builder &bld, const fs_reg &src,
+                const color_u &shifts, const color_u &widths)
+      {
+         const fs_reg packed = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         bool written[4] = {};
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (!widths[c])
+               continue;
+            const unsigned dword = shifts[c] / 32;
+            const fs_reg shifted = bld.vgrf(BRW_REGISTER_TYPE_UD);
+            /* Move the component to its position within the dword. */
+            bld.SHL(shifted, offset(src, bld, c), brw_imm_ud(shifts[c] % 32));
+
+            /* The first field of a dword initializes it, others are OR-ed in. */
+            if (written[dword]) {
+               bld.OR(offset(packed, bld, dword),
+                      offset(packed, bld, dword), shifted);
+            } else {
+               bld.MOV(offset(packed, bld, dword), shifted);
+               written[dword] = true;
+            }
+         }
+
+         return packed;
+      }
+
+      /**
+       * Extract a vector from the bitfield \p src, taking each component
+       * from the position and size given by \p shifts and \p widths.  Fields
+       * are not allowed to cross 32-bit boundaries.
+       */
+      fs_reg
+      emit_unpack(const fs_builder &bld, const fs_reg &src,
+                  const color_u &shifts, const color_u &widths)
+      {
+         const fs_reg unpacked = bld.vgrf(src.type, 4);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (!widths[c])
+               continue;
+            const fs_reg field = offset(unpacked, bld, c);
+            const unsigned dword = shifts[c] / 32;
+            const unsigned high_pad = 32 - shifts[c] % 32 - widths[c];
+            /* Shift left to push the bits above the field out of the dword. */
+            bld.SHL(field, offset(src, bld, dword), brw_imm_ud(high_pad));
+
+            /* Shift back to the least significant bits.  The shift is
+             * arithmetic so that signed types get sign-extended.
+             */
+            bld.ASR(field, field, brw_imm_ud(32 - widths[c]));
+         }
+
+         return unpacked;
+      }
+
+      /**
+       * Convert an integer vector into another integer vector whose components
+       * are clamped to the range of the specified bit widths and signedness.
+       */
+      fs_reg
+      emit_convert_to_integer(const fs_builder &bld, const fs_reg &src,
+                              const color_u &widths, bool is_signed)
+      {
+         const unsigned sign_bits = (is_signed ? 1 : 0);
+         const fs_reg converted = bld.vgrf(
+            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+         assert(src.type == converted.type);
+
+         for (unsigned c = 0; c < 4; ++c) {
+            if (!widths[c])
+               continue;
+            const fs_reg result = offset(converted, bld, c);
+            const int max_value = (int)scale(widths[c] - sign_bits);
+
+            /* Clamp to the maximum value. */
+            bld.emit_minmax(result, offset(src, bld, c), brw_imm_d(max_value),
+                            BRW_CONDITIONAL_L);
+
+            /* Clamp to the minimum value. */
+            if (is_signed)
+               bld.emit_minmax(result, result, brw_imm_d(-max_value - 1),
+                               BRW_CONDITIONAL_GE);
+
+            /* Mask off all but the bits we actually want.  Otherwise, if
+             * we pass a negative number into the hardware when it's
+             * expecting something like UINT8, it will happily clamp it to
+             * +255 for us.
+             */
+            if (is_signed && widths[c] < 32)
+               bld.AND(result, result, brw_imm_d(scale(widths[c])));
+         }
+
+         return converted;
+      }
+
+      /**
+       * Convert a normalized fixed-point vector of the given signedness and
+       * per-component bit widths to floating point.
+       */
+      fs_reg
+      emit_convert_from_scaled(const fs_builder &bld, const fs_reg &src,
+                               const color_u &widths, bool is_signed)
+      {
+         const unsigned sign_bits = is_signed ? 1 : 0;
+         const fs_reg result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            const fs_reg comp = offset(result, bld, i);
+            const float inv_max = 1.0f / scale(widths[i] - sign_bits);
+
+            /* Integer to float, then divide by the normalization constant. */
+            bld.MOV(comp, offset(src, bld, i));
+            bld.MUL(comp, comp, brw_imm_f(inv_max));
+
+            /* Signed formats are clamped to a minimum of -1.0. */
+            if (is_signed)
+               bld.emit_minmax(comp, comp, brw_imm_f(-1.0f),
+                               BRW_CONDITIONAL_GE);
+         }
+         return result;
+      }
+
+      /**
+       * Convert a floating-point vector to normalized fixed point of the
+       * given signedness and per-component bit widths.
+       */
+      fs_reg
+      emit_convert_to_scaled(const fs_builder &bld, const fs_reg &src,
+                             const color_u &widths, bool is_signed)
+      {
+         const unsigned sign_bits = is_signed ? 1 : 0;
+         const fs_reg result = bld.vgrf(
+            is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fresult = retype(result, BRW_REGISTER_TYPE_F);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            const fs_reg icomp = offset(result, bld, i);
+            const fs_reg fcomp = offset(fresult, bld, i);
+
+            if (!is_signed) {
+               /* Unsigned: a saturating MOV clamps the input. */
+               set_saturate(true, bld.MOV(fcomp, offset(src, bld, i)));
+            } else {
+               /* Signed: clamp to [-1, 1] with a max and then a min. */
+               bld.emit_minmax(fcomp, offset(src, bld, i),
+                               brw_imm_f(-1.0f), BRW_CONDITIONAL_GE);
+               bld.emit_minmax(fcomp, fcomp,
+                               brw_imm_f(1.0f), BRW_CONDITIONAL_L);
+            }
+
+            /* Multiply by the normalization constant. */
+            bld.MUL(fcomp, fcomp,
+                    brw_imm_f((float)scale(widths[i] - sign_bits)));
+
+            /* Round (RNDE), then convert to integer. */
+            bld.RNDE(fcomp, fcomp);
+            bld.MOV(icomp, fcomp);
+
+            /* Keep only the low widths[i] bits.  Otherwise a negative
+             * number passed to hardware expecting something like UINT8
+             * would happily be clamped to +255.
+             */
+            if (is_signed && widths[i] < 32)
+               bld.AND(icomp, icomp, brw_imm_d(scale(widths[i])));
+         }
+         return result;
+      }
+
+      /**
+       * Expand a vector of floats of the given per-component bit widths
+       * into 32-bit floats.
+       */
+      fs_reg
+      emit_convert_from_float(const fs_builder &bld, const fs_reg &src,
+                              const color_u &widths)
+      {
+         const fs_reg bits = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg result = retype(bits, BRW_REGISTER_TYPE_F);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            const fs_reg raw = offset(bits, bld, i);
+            bld.MOV(raw, offset(src, bld, i));
+
+            /* 10- and 11-bit floats have the same 5-bit exponent as the
+             * 16-bit format and no sign bit, so shifting them left extends
+             * them to 15 bits.
+             */
+            if (widths[i] < 16)
+               bld.SHL(raw, raw, brw_imm_ud(15 - widths[i]));
+
+            /* Widen the 16-bit float to 32 bits. */
+            bld.F16TO32(offset(result, bld, i), raw);
+         }
+         return result;
+      }
+
+      /**
+       * Convert a vector into floats of the given per-component bit widths
+       * (16-bit floats, or the narrower unsigned 10/11-bit formats).
+       */
+      fs_reg
+      emit_convert_to_float(const fs_builder &bld, const fs_reg &src,
+                            const color_u &widths)
+      {
+         const fs_reg bits = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+         const fs_reg fbits = retype(bits, BRW_REGISTER_TYPE_F);
+
+         for (unsigned i = 0; i < 4; ++i) {
+            if (!widths[i])
+               continue;
+
+            const fs_reg icomp = offset(bits, bld, i);
+            const fs_reg fcomp = offset(fbits, bld, i);
+            const bool is_narrow = widths[i] < 16;
+
+            bld.MOV(fcomp, offset(src, bld, i));
+
+            /* The narrow formats have no sign bit: clamp to zero. */
+            if (is_narrow)
+               bld.emit_minmax(fcomp, fcomp, brw_imm_f(0.0f),
+                               BRW_CONDITIONAL_GE);
+
+            /* Narrow to a 16-bit float. */
+            bld.F32TO16(icomp, fcomp);
+
+            /* Drop the low bits to reach the requested width; the 10/11-bit
+             * formats share the 16-bit format's 5-bit exponent.
+             */
+            if (is_narrow)
+               bld.SHR(icomp, icomp, brw_imm_ud(15 - widths[i]));
+         }
+         return bits;
+      }
+
+      /**
+       * Copy \p src, filling zero-width components with 0, 0, 0, 1.
+       */
+      fs_reg
+      emit_pad(const fs_builder &bld, const fs_reg &src,
+               const color_u &widths)
+      {
+         const fs_reg result = bld.vgrf(src.type, 4);
+         for (unsigned i = 0; i < 4; ++i) {
+            const fs_reg comp = offset(result, bld, i);
+
+            if (widths[i])
+               bld.MOV(comp, offset(src, bld, i));
+            else /* only the last component defaults to one */
+               bld.MOV(comp, fs_reg(brw_imm_ud(i == 3 ? 1u : 0u)));
+         }
+         return result;
+      }
+   }
+}
+
+namespace brw {
+   namespace image_access {
+      /**
+       * Load a vector from \p image at \p addr, doing in the shader any
+       * unpacking or conversion of \p gl_format the hardware cannot do.  \p
+       * surf_dims / \p arr_dims: number of non-array / array coordinates.
+       */
+      fs_reg
+      emit_image_load(const fs_builder &bld,
+                      const fs_reg &image, const fs_reg &addr,
+                      unsigned surf_dims, unsigned arr_dims,
+                      unsigned gl_format)
+      {
+         using namespace image_format_info;
+         using namespace image_format_conversion;
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         const gen_device_info *devinfo = bld.shader->devinfo;
+         const isl_format format = isl_format_for_gl_format(gl_format);
+         const isl_format lower_format =
+            isl_lower_storage_image_format(devinfo, format);
+         fs_reg tmp; /* the loaded data, refined step by step below */
+
+         /* Transform the image coordinates into actual surface coordinates. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
+            /* Fast path: typed read of as many channels as lower_format has. */
+            tmp = emit_typed_read(bld, image, saddr, dims,
+                                  isl_format_get_num_channels(lower_format));
+         } else {
+            /* Untyped surface reads return 32 bits of the surface per
+             * component, without any sort of unpacking or type conversion,
+             */
+            const unsigned size = isl_format_get_layout(format)->bpb / 32;
+            /* they don't properly handle out of bounds access, so we have to
+             * check manually if the coordinates are valid and predicate the
+             * surface read on the result,
+             */
+            const brw_predicate pred =
+               emit_untyped_image_check(bld, image,
+                                        emit_bounds_check(bld, image,
+                                                          saddr, dims));
+
+            /* and they don't know about surface coordinates, so we need to
+             * convert them to a raw memory offset.
+             */
+            const fs_reg laddr = emit_address_calculation(bld, image, saddr, dims);
+
+            tmp = emit_untyped_read(bld, image, laddr, 1, size, pred);
+
+            /* An out of bounds surface access should give zero as result. */
+            for (unsigned c = 0; c < size; ++c)
+               set_predicate(pred, bld.SEL(offset(tmp, bld, c),
+                                           offset(tmp, bld, c), brw_imm_d(0)));
+         }
+
+         /* Set the register type to D instead of UD if the data type is
+          * represented as a signed integer in memory so that sign extension
+          * is handled correctly by unpack.
+          */
+         if (needs_sign_extension(format))
+            tmp = retype(tmp, BRW_REGISTER_TYPE_D);
+
+         if (!has_supported_bit_layout(devinfo, format)) {
+            /* The hardware can't unpack this layout: merge the pieces of a
+             * split layout with emit_pack, else extract each bitfield.
+             */
+            if (has_split_bit_layout(devinfo, format))
+               tmp = emit_pack(bld, tmp, get_bit_shifts(lower_format),
+                               get_bit_widths(lower_format));
+            else
+               tmp = emit_unpack(bld, tmp, get_bit_shifts(format),
+                                 get_bit_widths(format));
+
+         } else if ((needs_sign_extension(format) &&
+                     !is_conversion_trivial(devinfo, format)) ||
+                    has_undefined_high_bits(devinfo, format)) {
+            /* Perform a trivial unpack even though the bit layout matches in
+             * order to get the most significant bits of each component
+             * initialized properly.
+             */
+            tmp = emit_unpack(bld, tmp, color_u(0, 32, 64, 96),
+                              get_bit_widths(format));
+         }
+
+         if (!isl_format_has_int_channel(format)) {
+            if (is_conversion_trivial(devinfo, format)) {
+               /* Just need to cast the vector to the target type. */
+               tmp = retype(tmp, BRW_REGISTER_TYPE_F);
+            } else {
+               /* Do the right sort of type conversion to float. */
+               if (isl_format_has_float_channel(format))
+                  tmp = emit_convert_from_float(
+                     bld, tmp, get_bit_widths(format));
+               else
+                  tmp = emit_convert_from_scaled(
+                     bld, tmp, get_bit_widths(format),
+                     isl_format_has_snorm_channel(format));
+            }
+         }
+
+         /* Fill the components the format lacks with 0, 0, 0, 1. */
+         return emit_pad(bld, tmp, get_bit_widths(format));
+      }
+
+      /**
+       * Store \p src to \p image at \p addr, doing in the shader any
+       * conversion or packing for \p gl_format the hardware cannot do.  \p
+       * surf_dims / \p arr_dims: number of non-array / array coordinates.
+       */
+      void
+      emit_image_store(const fs_builder &bld, const fs_reg &image,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned surf_dims, unsigned arr_dims,
+                       unsigned gl_format)
+      {
+         using namespace image_format_info;
+         using namespace image_format_conversion;
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         const isl_format format = isl_format_for_gl_format(gl_format);
+         const gen_device_info *devinfo = bld.shader->devinfo;
+
+         /* Transform the image coordinates into actual surface coordinates. */
+         const fs_reg saddr =
+            emit_image_coordinates(bld, addr, surf_dims, arr_dims, format);
+         const unsigned dims =
+            num_image_coordinates(bld, surf_dims, arr_dims, format);
+
+         if (gl_format == GL_NONE) {
+            /* We don't know what the format is, but that's fine because it
+             * implies write-only access, and typed surface writes are always
+             * able to take care of type conversion and packing for us.
+             */
+            emit_typed_write(bld, image, saddr, src, dims, 4);
+
+         } else {
+            const isl_format lower_format =
+               isl_lower_storage_image_format(devinfo, format);
+            fs_reg tmp = src; /* the data to store, refined step by step */
+
+            if (!is_conversion_trivial(devinfo, format)) {
+               /* Do the right sort of type conversion. */
+               if (isl_format_has_float_channel(format))
+                  tmp = emit_convert_to_float(bld, tmp, get_bit_widths(format));
+
+               else if (isl_format_has_int_channel(format))
+                  tmp = emit_convert_to_integer(bld, tmp, get_bit_widths(format),
+                                                isl_format_has_sint_channel(format));
+
+               else
+                  tmp = emit_convert_to_scaled(bld, tmp, get_bit_widths(format),
+                                               isl_format_has_snorm_channel(format));
+            }
+
+            /* We're down to bit manipulation at this point. */
+            tmp = retype(tmp, BRW_REGISTER_TYPE_UD);
+
+            if (!has_supported_bit_layout(devinfo, format)) {
+               /* The hardware can't pack this layout: break up the data of
+                * a split layout with emit_unpack, else pack the bitfields.
+                */
+               if (has_split_bit_layout(devinfo, format))
+                  tmp = emit_unpack(bld, tmp, get_bit_shifts(lower_format),
+                                    get_bit_widths(lower_format));
+
+               else
+                  tmp = emit_pack(bld, tmp, get_bit_shifts(format),
+                                  get_bit_widths(format));
+            }
+
+            if (isl_has_matching_typed_storage_image_format(devinfo, format)) {
+               /* Fast path: typed write of lower_format's channel count. */
+               emit_typed_write(bld, image, saddr, tmp, dims,
+                                isl_format_get_num_channels(lower_format));
+
+            } else {
+               /* Untyped surface writes store 32 bits of the surface per
+                * component, without any sort of packing or type conversion,
+                */
+               const unsigned size = isl_format_get_layout(format)->bpb / 32;
+
+               /* they don't properly handle out of bounds access, so we have
+                * to check manually if the coordinates are valid and predicate
+                * the surface write on the result,
+                */
+               const brw_predicate pred =
+                  emit_untyped_image_check(bld, image,
+                                           emit_bounds_check(bld, image,
+                                                             saddr, dims));
+
+               /* and, phew, they don't know about surface coordinates, so we
+                * need to convert them to a raw memory offset.
+                */
+               const fs_reg laddr = emit_address_calculation(
+                  bld, image, saddr, dims);
+
+               emit_untyped_write(bld, image, laddr, tmp, 1, size, pred);
+            }
+         }
+      }
+
+      /**
+       * Emit an atomic read-modify-write on an image surface of the given
+       * dimensionality at the given coordinates; this is the main building
+       * block of the imageAtomic GLSL built-ins.  \p surf_dims and \p
+       * arr_dims are the numbers of non-array and array coordinates of the
+       * image.  The returned register is retyped to the type of \p src0.
+       */
+      fs_reg
+      emit_image_atomic(const fs_builder &bld,
+                        const fs_reg &image, const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned surf_dims, unsigned arr_dims,
+                        unsigned rsize, unsigned op)
+      {
+         using namespace image_validity;
+         using namespace image_coordinates;
+         using namespace surface_access;
+         /* Coordinates are computed as for an R32_UINT image. */
+         const isl_format coord_format = ISL_FORMAT_R32_UINT;
+
+         /* Predicate that keeps us from touching an unbound surface. */
+         const brw_predicate pred = emit_typed_atomic_check(bld, image);
+
+         /* Map the image coordinates to actual surface coordinates. */
+         const fs_reg saddr = emit_image_coordinates(
+            bld, addr, surf_dims, arr_dims, coord_format);
+         const unsigned dims = num_image_coordinates(
+            bld, surf_dims, arr_dims, coord_format);
+
+         /* Typed atomics suffice here; no untyped fallback is needed. */
+         const fs_reg result = emit_typed_atomic(
+            bld, image, saddr, src0, src1, dims, rsize, op, pred);
+
+         /* An unbound surface access should give zero as result. */
+         if (rsize && pred)
+            set_predicate(pred, bld.SEL(result, result, brw_imm_d(0)));
+         return retype(result, src0.type);
+      }
+   }
+}
diff --git a/src/intel/compiler/brw_fs_surface_builder.h b/src/intel/compiler/brw_fs_surface_builder.h
new file mode 100644
index 00000000000..32b56d387f6
--- /dev/null
+++ b/src/intel/compiler/brw_fs_surface_builder.h
@@ -0,0 +1,88 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_FS_SURFACE_BUILDER_H
+#define BRW_FS_SURFACE_BUILDER_H
+
+#include "brw_fs_builder.h"
+
+namespace brw {
+   namespace surface_access { /* typed and untyped surface access helpers */
+      fs_reg /* untyped read: raw 32 bits per component, no conversion */
+      emit_untyped_read(const fs_builder &bld,
+                        const fs_reg &surface, const fs_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+
+      void /* untyped write: raw 32 bits per component, no conversion */
+      emit_untyped_write(const fs_builder &bld, const fs_reg &surface,
+                         const fs_reg &addr, const fs_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred = BRW_PREDICATE_NONE);
+
+      fs_reg /* untyped atomic; same parameter list as emit_typed_atomic */
+      emit_untyped_atomic(const fs_builder &bld,
+                          const fs_reg &surface, const fs_reg &addr,
+                          const fs_reg &src0, const fs_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred = BRW_PREDICATE_NONE);
+
+      fs_reg /* typed read; unlike the untyped calls it takes no predicate */
+      emit_typed_read(const fs_builder &bld, const fs_reg &surface,
+                      const fs_reg &addr, unsigned dims, unsigned size);
+
+      void /* typed write; the hardware converts and packs \p src itself */
+      emit_typed_write(const fs_builder &bld, const fs_reg &surface,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned dims, unsigned size);
+
+      fs_reg /* typed atomic \p op, optionally predicated by \p pred */
+      emit_typed_atomic(const fs_builder &bld, const fs_reg &surface,
+                        const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+   }
+
+   namespace image_access { /* format-aware image load / store / atomic */
+      fs_reg /* loaded vector; missing components are padded with 0,0,0,1 */
+      emit_image_load(const fs_builder &bld,
+                      const fs_reg &image, const fs_reg &addr,
+                      unsigned surf_dims, unsigned arr_dims,
+                      unsigned gl_format);
+
+      void /* stores \p src; gl_format == GL_NONE means a plain typed write */
+      emit_image_store(const fs_builder &bld, const fs_reg &image,
+                       const fs_reg &addr, const fs_reg &src,
+                       unsigned surf_dims, unsigned arr_dims,
+                       unsigned gl_format);
+      fs_reg /* result of the atomic, retyped to the type of \p src0 */
+      emit_image_atomic(const fs_builder &bld,
+                        const fs_reg &image, const fs_reg &addr,
+                        const fs_reg &src0, const fs_reg &src1,
+                        unsigned surf_dims, unsigned arr_dims,
+                        unsigned rsize, unsigned op);
+   }
+}
+#endif
diff --git a/src/intel/compiler/brw_fs_validate.cpp b/src/intel/compiler/brw_fs_validate.cpp
new file mode 100644
index 00000000000..676942c19c0
--- /dev/null
+++ b/src/intel/compiler/brw_fs_validate.cpp
@@ -0,0 +1,57 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_validate.cpp
+ *
+ * Implements a pass that validates various invariants of the IR.  The current
+ * pass only validates that GRF's uses are sane.  More can be added later.
+ */
+
+#include "brw_fs.h"
+#include "brw_cfg.h"
+
+#define fsv_assert(cond) \
+   if (!(cond)) { \
+      fprintf(stderr, "ASSERT: Scalar %s validation failed!\n", stage_abbrev); \
+      dump_instruction(inst, stderr); \
+      fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__, #cond); \
+      abort(); \
+   }
+
+void
+fs_visitor::validate() /* aborts via fsv_assert on an out-of-range VGRF use */
+{
+   foreach_block_and_inst (block, fs_inst, inst, cfg) {
+      if (inst->dst.file == VGRF) { /* the write must end inside the VGRF */
+         fsv_assert(inst->dst.offset / REG_SIZE + regs_written(inst) <=
+                    alloc.sizes[inst->dst.nr]);
+      }
+
+      for (unsigned i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF) { /* same check for each VGRF read */
+            fsv_assert(inst->src[i].offset / REG_SIZE + regs_read(inst, i) <=
+                       alloc.sizes[inst->src[i].nr]);
+         }
+      }
+   }
+}
diff --git a/src/intel/compiler/brw_fs_visitor.cpp b/src/intel/compiler/brw_fs_visitor.cpp
new file mode 100644
index 00000000000..cea38d86237
--- /dev/null
+++ b/src/intel/compiler/brw_fs_visitor.cpp
@@ -0,0 +1,953 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/** @file brw_fs_visitor.cpp
+ *
+ * This file supports generating the FS LIR from the GLSL IR.  The LIR
+ * makes it easier to do backend-specific optimizations than doing so
+ * in the GLSL IR or in the native code.
+ */
+#include "brw_fs.h"
+#include "compiler/glsl_types.h"
+
+using namespace brw;
+
+fs_reg *
+fs_visitor::emit_vs_system_value(int location) /* ATTR reg for a VS sysval */
+{
+   fs_reg *reg = new(this->mem_ctx) /* 2nd arg: 4 * number of inputs read */
+      fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
+             BRW_REGISTER_TYPE_D);
+   struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
+
+   switch (location) { /* pick the offset and record the use in prog_data */
+   case SYSTEM_VALUE_BASE_VERTEX:
+      reg->offset = 0;
+      vs_prog_data->uses_basevertex = true;
+      break;
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      reg->offset = REG_SIZE;
+      vs_prog_data->uses_baseinstance = true;
+      break;
+   case SYSTEM_VALUE_VERTEX_ID:
+      unreachable("should have been lowered");
+   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      reg->offset = 2 * REG_SIZE;
+      vs_prog_data->uses_vertexid = true;
+      break;
+   case SYSTEM_VALUE_INSTANCE_ID:
+      reg->offset = 3 * REG_SIZE;
+      vs_prog_data->uses_instanceid = true;
+      break;
+   case SYSTEM_VALUE_DRAW_ID:
+      if (nir->info->system_values_read & /* any of the four values above? */
+          (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+           BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+           BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+           BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID)))
+         reg->nr += 4; /* then the draw ID lives four registers further on */
+      reg->offset = 0;
+      vs_prog_data->uses_drawid = true;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   return reg; /* allocated from mem_ctx */
+}
+
+/* Fetch from the MCS surface that belongs to a multisample texture. */
+fs_reg
+fs_visitor::emit_mcs_fetch(const fs_reg &coordinate, unsigned components,
+                           const fs_reg &texture)
+{
+   const fs_reg result = vgrf(glsl_type::uvec4_type);
+
+   /* The texture is passed as both the surface and the sampler source. */
+   fs_reg args[TEX_LOGICAL_NUM_SRCS];
+   args[TEX_LOGICAL_SRC_SURFACE] = texture;
+   args[TEX_LOGICAL_SRC_SAMPLER] = texture;
+   args[TEX_LOGICAL_SRC_COORDINATE] = coordinate;
+   args[TEX_LOGICAL_SRC_COORD_COMPONENTS] = brw_imm_d(components);
+   args[TEX_LOGICAL_SRC_GRAD_COMPONENTS] = brw_imm_d(0);
+
+   fs_inst *const mcs = bld.emit(SHADER_OPCODE_TXF_MCS_LOGICAL, result,
+                                 args, ARRAY_SIZE(args));
+
+   /* Only one or two registers of the response matter to us, but the
+    * sampler always writes 4/8, so account for all of them.
+    */
+   mcs->size_written = 4 * result.component_size(mcs->exec_size);
+   return result;
+}
+
+/**
+ * Gen6 gather workaround for UINT/SINT: rescale the four result components.
+ */
+void
+fs_visitor::emit_gen6_gather_wa(uint8_t wa, fs_reg dst)
+{
+   if (wa == 0)
+      return;
+
+   const int bits = (wa & WA_8BIT) ? 8 : 16;
+   const bool is_sint = (wa & WA_SIGN) != 0;
+
+   for (int i = 0; i < 4; i++) {
+      const fs_reg dst_as_float = retype(dst, BRW_REGISTER_TYPE_F);
+
+      /* UNORM -> UINT: scale by the maximum value, then convert. */
+      bld.MUL(dst_as_float, dst_as_float, brw_imm_f((1 << bits) - 1));
+      bld.MOV(dst, dst_as_float);
+
+      if (is_sint) {
+         /* Reinterpret the UINT as a signed INT: shift the sign bit
+          * into place, then shift back preserving sign.
+          */
+         bld.SHL(dst, dst, brw_imm_d(32 - bits));
+         bld.ASR(dst, dst, brw_imm_d(32 - bits));
+      }
+      dst = offset(dst, bld, 1);
+   }
+}
+
+/** Bring-up aid: emits a dummy fragment shader that just writes magenta. */
+void
+fs_visitor::emit_dummy_fs()
+{
+   const int regs_per_channel = dispatch_width / 8;
+   const bool pre_gen6 = devinfo->gen < 6;
+
+   /* Magenta with zero alpha, written to MRFs starting at 2. */
+   const float magenta[4] = { 1.0, 0.0, 1.0, 0.0 };
+   for (int chan = 0; chan < 4; chan++) {
+      bld.MOV(fs_reg(MRF, 2 + chan * regs_per_channel, BRW_REGISTER_TYPE_F),
+              brw_imm_f(magenta[chan]));
+   }
+
+   fs_inst *const fb_write = bld.emit(FS_OPCODE_FB_WRITE);
+   fb_write->eot = true;
+   if (pre_gen6) {
+      fb_write->header_size = 2;
+      fb_write->base_mrf = 0;
+      fb_write->mlen = 2 + 4 * regs_per_channel;
+   } else {
+      fb_write->base_mrf = 2;
+      fb_write->mlen = 4 * regs_per_channel;
+   }
+
+   /* The SF is told there are no inputs, except that Gen4-5 need at least
+    * one varying to avoid GPU hangs.
+    */
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(this->prog_data);
+   wm_prog_data->num_varying_inputs = pre_gen6 ? 1 : 0;
+   memset(wm_prog_data->urb_setup, -1,
+          sizeof(wm_prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
+
+   /* No uniforms either. */
+   stage_prog_data->nr_params = 0;
+   stage_prog_data->nr_pull_params = 0;
+   stage_prog_data->curb_read_length = 0;
+   stage_prog_data->dispatch_grf_start_reg = 2;
+   wm_prog_data->dispatch_grf_start_reg_2 = 2;
+   grf_used = 1; /* Gen4-5 don't allow zero GRF blocks */
+
+   calculate_cfg();
+}
+
+/* Returns the setup register holding \p channel of varying \p location.
+ * Its number is relative to the start of the URB data and gets adjusted
+ * to a real location before generate_code() time.
+ */
+struct brw_reg
+fs_visitor::interp_reg(int location, int channel)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   const int slot = prog_data->urb_setup[location];
+
+   /* The varying must have been assigned a setup slot. */
+   assert(prog_data->urb_setup[location] != -1);
+
+   return brw_vec1_grf(slot * 2 + channel / 2, (channel & 1) * 4);
+}
+
+/** Emits pixel X/Y, deltas from v0, pos.w and 1/pos.w for interpolation. */
+void
+fs_visitor::emit_interpolation_setup_gen4()
+{
+   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+   fs_builder abld = bld.annotate("compute pixel centers");
+   this->pixel_x = vgrf(glsl_type::uint_type);
+   this->pixel_y = vgrf(glsl_type::uint_type);
+   this->pixel_x.type = BRW_REGISTER_TYPE_UW;
+   this->pixel_y.type = BRW_REGISTER_TYPE_UW;
+   abld.ADD(this->pixel_x,
+            fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
+            fs_reg(brw_imm_v(0x10101010))); /* presumably X offsets - confirm */
+   abld.ADD(this->pixel_y,
+            fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
+            fs_reg(brw_imm_v(0x11001100))); /* presumably Y offsets - confirm */
+
+   abld = bld.annotate("compute pixel deltas from v0");
+
+   this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL] =
+      vgrf(glsl_type::vec2_type);
+   const fs_reg &delta_xy = this->delta_xy[BRW_BARYCENTRIC_PERSPECTIVE_PIXEL];
+   const fs_reg xstart(negate(brw_vec1_grf(1, 0))); /* added, so negated */
+   const fs_reg ystart(negate(brw_vec1_grf(1, 1)));
+
+   if (devinfo->has_pln && dispatch_width == 16) {
+      for (unsigned i = 0; i < 2; i++) { /* laid out as x0 y0 x1 y1 halves */
+         abld.half(i).ADD(half(offset(delta_xy, abld, i), 0),
+                          half(this->pixel_x, i), xstart);
+         abld.half(i).ADD(half(offset(delta_xy, abld, i), 1),
+                          half(this->pixel_y, i), ystart);
+      }
+   } else { /* component 0 holds all X deltas, component 1 all Y deltas */
+      abld.ADD(offset(delta_xy, abld, 0), this->pixel_x, xstart);
+      abld.ADD(offset(delta_xy, abld, 1), this->pixel_y, ystart);
+   }
+
+   abld = bld.annotate("compute pos.w and 1/pos.w");
+   /* Compute wpos.w.  It's always in our setup, since it's needed to
+    * interpolate the other attributes.
+    */
+   this->wpos_w = vgrf(glsl_type::float_type);
+   abld.emit(FS_OPCODE_LINTERP, wpos_w, delta_xy,
+             interp_reg(VARYING_SLOT_POS, 3));
+   /* Compute the pixel 1/W value from wpos.w. */
+   this->pixel_w = vgrf(glsl_type::float_type);
+   abld.emit(SHADER_OPCODE_RCP, this->pixel_w, wpos_w);
+}
+
+/** Gen6 path: sets up pixel_x/y, pixel_w, wpos_w and delta_xy[]. */
+void
+fs_visitor::emit_interpolation_setup_gen6()
+{
+   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
+
+   fs_builder abld = bld.annotate("compute pixel centers");
+   if (devinfo->gen >= 8 || dispatch_width == 8) {
+      /* The "Register Region Restrictions" page says for BDW (and newer,
+       * presumably):
+       *
+       *     "When destination spans two registers, the source may be one or
+       *      two registers. The destination elements must be evenly split
+       *      between the two registers."
+       *
+       * Thus we can do a single add(16) in SIMD8 or an add(32) in SIMD16 to
+       * compute our pixel centers.
+       */
+      fs_reg int_pixel_xy(VGRF, alloc.allocate(dispatch_width / 8),
+                          BRW_REGISTER_TYPE_UW);
+
+      const fs_builder dbld = abld.exec_all().group(dispatch_width * 2, 0);
+      dbld.ADD(int_pixel_xy,
+               fs_reg(stride(suboffset(g1_uw, 4), 1, 4, 0)),
+               fs_reg(brw_imm_v(0x11001010)));
+
+      this->pixel_x = vgrf(glsl_type::float_type);
+      this->pixel_y = vgrf(glsl_type::float_type);
+      abld.emit(FS_OPCODE_PIXEL_X, this->pixel_x, int_pixel_xy);
+      abld.emit(FS_OPCODE_PIXEL_Y, this->pixel_y, int_pixel_xy);
+   } else {
+      /* The "Register Region Restrictions" page says for SNB, IVB, HSW:
+       *
+       *     "When destination spans two registers, the source MUST span two
+       *      registers."
+       *
+       * Since the GRF source of the ADD will only read a single register, we
+       * must do two separate ADDs in SIMD16.
+       */
+      fs_reg int_pixel_x = vgrf(glsl_type::uint_type);
+      fs_reg int_pixel_y = vgrf(glsl_type::uint_type);
+      int_pixel_x.type = BRW_REGISTER_TYPE_UW;
+      int_pixel_y.type = BRW_REGISTER_TYPE_UW;
+      abld.ADD(int_pixel_x,
+               fs_reg(stride(suboffset(g1_uw, 4), 2, 4, 0)),
+               fs_reg(brw_imm_v(0x10101010)));
+      abld.ADD(int_pixel_y,
+               fs_reg(stride(suboffset(g1_uw, 5), 2, 4, 0)),
+               fs_reg(brw_imm_v(0x11001100)));
+
+      /* As of gen6, we can no longer mix float and int sources.  We have
+       * to turn the integer pixel centers into floats for their actual
+       * use.
+       */
+      this->pixel_x = vgrf(glsl_type::float_type);
+      this->pixel_y = vgrf(glsl_type::float_type);
+      abld.MOV(this->pixel_x, int_pixel_x);
+      abld.MOV(this->pixel_y, int_pixel_y);
+   }
+
+   abld = bld.annotate("compute pos.w");
+   this->pixel_w = fs_reg(brw_vec8_grf(payload.source_w_reg, 0));
+   this->wpos_w = vgrf(glsl_type::float_type);
+   abld.emit(SHADER_OPCODE_RCP, this->wpos_w, this->pixel_w);
+
+   struct brw_wm_prog_data *wm_prog_data = brw_wm_prog_data(prog_data);
+   uint32_t centroid_modes = wm_prog_data->barycentric_interp_modes &
+      (1 << BRW_BARYCENTRIC_PERSPECTIVE_CENTROID |
+       1 << BRW_BARYCENTRIC_NONPERSPECTIVE_CENTROID);
+
+   for (int i = 0; i < BRW_BARYCENTRIC_MODE_COUNT; ++i) {
+      uint8_t reg = payload.barycentric_coord_reg[i];
+      this->delta_xy[i] = fs_reg(brw_vec16_grf(reg, 0));
+
+      if (devinfo->needs_unlit_centroid_workaround &&
+          (centroid_modes & (1 << i))) {
+         /* Get the pixel/sample mask into f0 so that we know which
+          * pixels are lit.  Then, for each channel that is unlit, replace
+          * the centroid data with the data of mode i - 1 (presumably the
+          * matching non-centroid mode -- relies on enum order; confirm). */
+         bld.emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
+
+         uint8_t pixel_reg = payload.barycentric_coord_reg[i - 1];
+
+         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+                           bld.half(0).MOV(brw_vec8_grf(reg, 0),
+                                           brw_vec8_grf(pixel_reg, 0)));
+         set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+                           bld.half(0).MOV(brw_vec8_grf(reg + 1, 0),
+                                           brw_vec8_grf(pixel_reg + 1, 0)));
+         if (dispatch_width == 16) {
+            set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+                              bld.half(1).MOV(brw_vec8_grf(reg + 2, 0),
+                                              brw_vec8_grf(pixel_reg + 2, 0)));
+            set_predicate_inv(BRW_PREDICATE_NORMAL, true,
+                              bld.half(1).MOV(brw_vec8_grf(reg + 3, 0),
+                                              brw_vec8_grf(pixel_reg + 3, 0)));
+         }
+         assert(dispatch_width != 32); /* not implemented yet */
+      }
+   }
+}
+
+/**
+ * Translates a GL alpha-test comparison into the matching EU conditional
+ * modifier.  Any value without a case below is treated as unreachable.
+ */
+static enum brw_conditional_mod
+cond_for_alpha_func(GLenum func)
+{
+   /* GL_ALWAYS and GL_NEVER never get here: emit_alpha_test() deals with
+    * both of them before it calls this helper. */
+   switch (func) {
+   case GL_GREATER:  return BRW_CONDITIONAL_G;
+   case GL_GEQUAL:   return BRW_CONDITIONAL_GE;
+   case GL_LESS:     return BRW_CONDITIONAL_L;
+   case GL_LEQUAL:   return BRW_CONDITIONAL_LE;
+   case GL_EQUAL:    return BRW_CONDITIONAL_EQ;
+   case GL_NOTEQUAL: return BRW_CONDITIONAL_NEQ;
+   default:
+      unreachable("Not reached");
+   }
+}
+
+/**
+ * Emits the alpha test as shader code, for the cases where it is compiled
+ * into the shader instead of using the normal fixed-function alpha test.
+ */
+void
+fs_visitor::emit_alpha_test()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
+   const fs_builder abld = bld.annotate("Alpha test");
+
+   /* A test that always passes needs no code at all. */
+   if (wm_key->alpha_test_func == GL_ALWAYS)
+      return;
+
+   fs_inst *test;
+   if (wm_key->alpha_test_func != GL_NEVER) {
+      /* f0.1 &= func(RT0 alpha, ref) */
+      const fs_reg rt0_alpha = offset(outputs[0], bld, 3);
+      test = abld.CMP(bld.null_reg_f(), rt0_alpha,
+                      brw_imm_f(wm_key->alpha_test_ref),
+                      cond_for_alpha_func(wm_key->alpha_test_func));
+   } else {
+      /* f0.1 = 0: comparing a register against itself for inequality. */
+      const fs_reg g0_uw = fs_reg(retype(brw_vec8_grf(0, 0),
+                                         BRW_REGISTER_TYPE_UW));
+      test = abld.CMP(bld.null_reg_f(), g0_uw, g0_uw, BRW_CONDITIONAL_NEQ);
+   }
+
+   test->predicate = BRW_PREDICATE_NORMAL;
+   test->flag_subreg = 1;
+}
+
+/** Emits one logical FB write for the given colors, depth and stencil. */
+fs_inst *
+fs_visitor::emit_single_fb_write(const fs_builder &bld,
+                                 fs_reg color0, fs_reg color1,
+                                 fs_reg src0_alpha, unsigned components)
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *wm = brw_wm_prog_data(this->prog_data);
+   fs_reg dst_depth, src_depth, src_stencil;
+
+   /* Destination depth comes from the payload, when it is present. */
+   if (payload.dest_depth_reg)
+      dst_depth = fs_reg(brw_vec8_grf(payload.dest_depth_reg, 0));
+
+   /* Hand over gl_FragDepth or the payload depth. */
+   if (source_depth_to_render_target) {
+      if (!(nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_DEPTH)))
+         src_depth = fs_reg(brw_vec8_grf(payload.source_depth_reg, 0));
+      else
+         src_depth = frag_depth;
+   }
+
+   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL))
+      src_stencil = frag_stencil;
+
+   /* The component count rides along as the final, immediate source. */
+   const fs_reg srcs[] = {
+      color0, color1, src0_alpha, src_depth, dst_depth, src_stencil,
+      (wm->uses_omask ? sample_mask : fs_reg()), brw_imm_ud(components)
+   };
+   assert(ARRAY_SIZE(srcs) - 1 == FB_WRITE_LOGICAL_SRC_COMPONENTS);
+   fs_inst *write = bld.emit(FS_OPCODE_FB_WRITE_LOGICAL, fs_reg(),
+                             srcs, ARRAY_SIZE(srcs));
+
+   if (wm->uses_kill) {
+      write->predicate = BRW_PREDICATE_NORMAL;
+      write->flag_subreg = 1;
+   }
+   return write;
+}
+
+void
+fs_visitor::emit_fb_writes()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+
+   fs_inst *inst = NULL;
+
+   if (source_depth_to_render_target && devinfo->gen == 6) {
+      /* For outputting oDepth on gen6, SIMD8 writes have to be used.  This
+       * would require SIMD8 moves of each half to message regs, e.g. by using
+       * the SIMD lowering pass.  Unfortunately this is more difficult than it
+       * sounds because the SIMD8 single-source message lacks channel selects
+       * for the second and third subspans.
+       */
+      limit_dispatch_width(8, "Depth writes unsupported in SIMD16+ mode.\n");
+   }
+
+   if (nir->info->outputs_written & BITFIELD64_BIT(FRAG_RESULT_STENCIL)) {
+      /* From the 'Render Target Write message' section of the docs:
+       * "Output Stencil is not supported with SIMD16 Render Target Write
+       * Messages."
+       */
+      limit_dispatch_width(8, "gl_FragStencilRefARB unsupported "
+                           "in SIMD16+ mode.\n");
+   }
+
+   for (int target = 0; target < key->nr_color_regions; target++) {
+      /* Skip over outputs that weren't written. */
+      if (this->outputs[target].file == BAD_FILE)
+         continue;
+
+      const fs_builder abld = bld.annotate(
+         ralloc_asprintf(this->mem_ctx, "FB write target %d", target));
+
+      fs_reg src0_alpha;
+      if (devinfo->gen >= 6 && key->replicate_alpha && target != 0)
+         src0_alpha = offset(outputs[0], bld, 3);
+
+      inst = emit_single_fb_write(abld, this->outputs[target],
+                                  this->dual_src_output, src0_alpha, 4);
+      inst->target = target;
+   }
+
+   prog_data->dual_src_blend = (this->dual_src_output.file != BAD_FILE);
+   assert(!prog_data->dual_src_blend || key->nr_color_regions == 1);
+
+   if (inst == NULL) {
+      /* Even if there are no color buffers enabled, we still need to send
+       * alpha out the pipeline to our null renderbuffer to support
+       * alpha-testing, alpha-to-coverage, and so on.
+       */
+      /* FINISHME: Factor out this frequently recurring pattern into a
+       * helper function.
+       */
+      const fs_reg srcs[] = { reg_undef, reg_undef,
+                              reg_undef, offset(this->outputs[0], bld, 3) };
+      const fs_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 4);
+      bld.LOAD_PAYLOAD(tmp, srcs, 4, 0);
+
+      inst = emit_single_fb_write(bld, tmp, reg_undef, reg_undef, 4);
+      inst->target = 0;
+   }
+
+   inst->eot = true;
+}
+
+/** Reserves four uniforms per user clip plane, backed by clip_planes[]. */
+void
+fs_visitor::setup_uniform_clipplane_values(gl_clip_plane *clip_planes)
+{
+   const struct brw_vs_prog_key *vs_key =
+      (const struct brw_vs_prog_key *) this->key;
+   const int num_planes = vs_key->nr_userclip_plane_consts;
+   for (int plane = 0; plane < num_planes; plane++, uniforms += 4) {
+      this->userplane[plane] = fs_reg(UNIFORM, uniforms);
+      for (int comp = 0; comp < 4; ++comp) {
+         stage_prog_data->param[uniforms + comp] =
+            (gl_constant_value *) &clip_planes[plane][comp];
+      }
+   }
+}
+
+/**
+ * Lowers legacy fixed-function and gl_ClipVertex clipping to clip distances.
+ *
+ * Nothing is emitted when no user clip planes are enabled or when the
+ * vertex used for clipping was never written.
+ */
+void fs_visitor::compute_clip_distance(gl_clip_plane *clip_planes)
+{
+   struct brw_vue_prog_data *vue = brw_vue_prog_data(prog_data);
+   const struct brw_vs_prog_key *vs_key =
+      (const struct brw_vs_prog_key *) this->key;
+
+   /* Bail unless some sort of legacy clipping is enabled */
+   if (vs_key->nr_userclip_plane_consts == 0)
+      return;
+
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    *     "If a linked set of shaders forming the vertex stage contains no
+    *     static write to gl_ClipVertex or gl_ClipDistance, but the
+    *     application has requested clipping against user clip planes through
+    *     the API, then the coordinate written to gl_Position is used for
+    *     comparison against the user clip planes."
+    *
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
+    */
+
+   const gl_varying_slot clip_vertex =
+      (vue->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX) ?
+      VARYING_SLOT_CLIP_VERTEX : VARYING_SLOT_POS;
+
+   /* If the clip vertex isn't written, skip this.  Typically this means
+    * the GS will set up clipping. */
+   if (outputs[clip_vertex].file == BAD_FILE)
+      return;
+
+   setup_uniform_clipplane_values(clip_planes);
+
+   const fs_builder abld = bld.annotate("user clip distances");
+
+   this->outputs[VARYING_SLOT_CLIP_DIST0] = vgrf(glsl_type::vec4_type);
+   this->outputs[VARYING_SLOT_CLIP_DIST1] = vgrf(glsl_type::vec4_type);
+
+   /* Each distance is the dot product of the clip vertex and one plane. */
+   for (int plane = 0; plane < vs_key->nr_userclip_plane_consts; plane++) {
+      const fs_reg dist =
+         offset(outputs[VARYING_SLOT_CLIP_DIST0 + plane / 4], bld, plane & 3);
+      fs_reg coeff = userplane[plane];
+      abld.MUL(dist, outputs[clip_vertex], coeff);
+      for (int comp = 1; comp < 4; comp++) {
+         coeff.nr = userplane[plane].nr + comp;
+         abld.MAD(dist, dist, offset(outputs[clip_vertex], bld, comp), coeff);
+      }
+   }
+}
+
+void
+fs_visitor::emit_urb_writes(const fs_reg &gs_vertex_count)
+{
+   int slot, urb_offset, length;
+   int starting_urb_offset = 0;
+   const struct brw_vue_prog_data *vue_prog_data =
+      brw_vue_prog_data(this->prog_data);
+   const struct brw_vs_prog_key *vs_key =
+      (const struct brw_vs_prog_key *) this->key;
+   const GLbitfield64 psiz_mask =
+      VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT | VARYING_BIT_PSIZ;
+   const struct brw_vue_map *vue_map = &vue_prog_data->vue_map;
+   bool flush;
+   fs_reg sources[8];
+   fs_reg urb_handle;
+
+   if (stage == MESA_SHADER_TESS_EVAL)
+      urb_handle = fs_reg(retype(brw_vec8_grf(4, 0), BRW_REGISTER_TYPE_UD));
+   else
+      urb_handle = fs_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+
+   /* If we don't have any valid slots to write, just do a minimal urb write
+    * send to terminate the shader.  This includes 1 slot of undefined data,
+    * because it's invalid to write 0 data:
+    *
+    * From the Broadwell PRM, Volume 7: 3D Media GPGPU, Shared Functions -
+    * Unified Return Buffer (URB) > URB_SIMD8_Write and URB_SIMD8_Read >
+    * Write Data Payload:
+    *
+    *    "The write data payload can be between 1 and 8 message phases long."
+    */
+   if (vue_map->slots_valid == 0) {
+      /* For GS, just turn EmitVertex() into a no-op.  We don't want it to
+       * end the thread, and emit_gs_thread_end() already emits a SEND with
+       * EOT at the end of the program for us.
+       */
+      if (stage == MESA_SHADER_GEOMETRY)
+         return;
+
+      fs_reg payload = fs_reg(VGRF, alloc.allocate(2), BRW_REGISTER_TYPE_UD);
+      bld.exec_all().MOV(payload, urb_handle);
+
+      fs_inst *inst = bld.emit(SHADER_OPCODE_URB_WRITE_SIMD8, reg_undef, payload);
+      inst->eot = true;
+      inst->mlen = 2;
+      inst->offset = 1;
+      return;
+   }
+
+   opcode opcode = SHADER_OPCODE_URB_WRITE_SIMD8;
+   int header_size = 1;
+   fs_reg per_slot_offsets;
+
+   if (stage == MESA_SHADER_GEOMETRY) {
+      const struct brw_gs_prog_data *gs_prog_data =
+         brw_gs_prog_data(this->prog_data);
+
+      /* We need to increment the Global Offset to skip over the control data
+       * header and the extra "Vertex Count" field (1 HWord) at the beginning
+       * of the VUE.  We're counting in OWords, so the units are doubled.
+       */
+      starting_urb_offset = 2 * gs_prog_data->control_data_header_size_hwords;
+      if (gs_prog_data->static_vertex_count == -1)
+         starting_urb_offset += 2;
+
+      /* We also need to use per-slot offsets.  The per-slot offset is the
+       * Vertex Count.  SIMD8 mode processes 8 different primitives at a
+       * time; each may output a different number of vertices.
+       */
+      opcode = SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT;
+      header_size++;
+
+      /* The URB offset is in 128-bit units, so we need to multiply by 2 */
+      const int output_vertex_size_owords =
+         gs_prog_data->output_vertex_size_hwords * 2;
+
+      if (gs_vertex_count.file == IMM) {
+         per_slot_offsets = brw_imm_ud(output_vertex_size_owords *
+                                       gs_vertex_count.ud);
+      } else {
+         per_slot_offsets = vgrf(glsl_type::int_type);
+         bld.MUL(per_slot_offsets, gs_vertex_count,
+                 brw_imm_ud(output_vertex_size_owords));
+      }
+   }
+
+   length = 0;
+   urb_offset = starting_urb_offset;
+   flush = false;
+
+   /* SSO shaders can have VUE slots allocated which are never actually
+    * written to, so ignore them when looking for the last (written) slot.
+    */
+   int last_slot = vue_map->num_slots - 1;
+   while (last_slot > 0 &&
+          (vue_map->slot_to_varying[last_slot] == BRW_VARYING_SLOT_PAD ||
+           outputs[vue_map->slot_to_varying[last_slot]].file == BAD_FILE)) {
+      last_slot--;
+   }
+
+   for (slot = 0; slot < vue_map->num_slots; slot++) {
+      int varying = vue_map->slot_to_varying[slot];
+      switch (varying) {
+      case VARYING_SLOT_PSIZ: {
+         /* The point size varying slot is the vue header and is always in the
+          * vue map.  But often none of the special varyings that live there
+          * are written and in that case we can skip writing to the vue
+          * header, provided the corresponding state properly clamps the
+          * values further down the pipeline. */
+         if ((vue_map->slots_valid & psiz_mask) == 0) {
+            assert(length == 0);
+            urb_offset++;
+            break;
+         }
+
+         fs_reg zero(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+         bld.MOV(zero, brw_imm_ud(0u));
+
+         sources[length++] = zero;
+         if (vue_map->slots_valid & VARYING_BIT_LAYER)
+            sources[length++] = this->outputs[VARYING_SLOT_LAYER];
+         else
+            sources[length++] = zero;
+
+         if (vue_map->slots_valid & VARYING_BIT_VIEWPORT)
+            sources[length++] = this->outputs[VARYING_SLOT_VIEWPORT];
+         else
+            sources[length++] = zero;
+
+         if (vue_map->slots_valid & VARYING_BIT_PSIZ)
+            sources[length++] = this->outputs[VARYING_SLOT_PSIZ];
+         else
+            sources[length++] = zero;
+         break;
+      }
+      case BRW_VARYING_SLOT_NDC:
+      case VARYING_SLOT_EDGE:
+         unreachable("unexpected scalar vs output");
+         break;
+
+      default:
+         /* gl_Position is always in the vue map, but isn't always written by
+          * the shader.  Other varyings (clip distances) get added to the vue
+          * map but don't always get written.  In those cases, the
+          * corresponding this->outputs[] slot will be invalid and we can skip
+          * the urb write for the varying.  If we've already queued up a vue
+          * slot for writing we flush a mlen 5 urb write, otherwise we just
+          * advance the urb_offset.
+          */
+         if (varying == BRW_VARYING_SLOT_PAD ||
+             this->outputs[varying].file == BAD_FILE) {
+            if (length > 0)
+               flush = true;
+            else
+               urb_offset++;
+            break;
+         }
+
+         if (stage == MESA_SHADER_VERTEX && vs_key->clamp_vertex_color &&
+             (varying == VARYING_SLOT_COL0 ||
+              varying == VARYING_SLOT_COL1 ||
+              varying == VARYING_SLOT_BFC0 ||
+              varying == VARYING_SLOT_BFC1)) {
+            /* We need to clamp these guys, so do a saturating MOV into a
+             * temp register and use that for the payload.
+             */
+            for (int i = 0; i < 4; i++) {
+               fs_reg reg = fs_reg(VGRF, alloc.allocate(1), outputs[varying].type);
+               fs_reg src = offset(this->outputs[varying], bld, i);
+               set_saturate(true, bld.MOV(reg, src));
+               sources[length++] = reg;
+            }
+         } else {
+            for (unsigned i = 0; i < 4; i++)
+               sources[length++] = offset(this->outputs[varying], bld, i);
+         }
+         break;
+      }
+
+      const fs_builder abld = bld.annotate("URB write");
+
+      /* If we've queued up 8 registers of payload (2 VUE slots), if this is
+       * the last slot or if we need to flush (see BAD_FILE varying case
+       * above), emit a URB write send now to flush out the data.
+       */
+      if (length == 8 || slot == last_slot)
+         flush = true;
+      if (flush) {
+         fs_reg *payload_sources =
+            ralloc_array(mem_ctx, fs_reg, length + header_size);
+         fs_reg payload = fs_reg(VGRF, alloc.allocate(length + header_size),
+                                 BRW_REGISTER_TYPE_F);
+         payload_sources[0] = urb_handle;
+
+         if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT)
+            payload_sources[1] = per_slot_offsets;
+
+         memcpy(&payload_sources[header_size], sources,
+                length * sizeof sources[0]);
+
+         abld.LOAD_PAYLOAD(payload, payload_sources, length + header_size,
+                           header_size);
+
+         fs_inst *inst = abld.emit(opcode, reg_undef, payload);
+         inst->eot = slot == last_slot && stage != MESA_SHADER_GEOMETRY;
+         inst->mlen = length + header_size;
+         inst->offset = urb_offset;
+         urb_offset = starting_urb_offset + slot + 1;
+         length = 0;
+         flush = false;
+      }
+   }
+}
+
+/** Ends a compute shader thread by sending g0 to the thread spawner. */
+void
+fs_visitor::emit_cs_terminate()
+{
+   assert(devinfo->gen >= 7);
+   /* We are getting the thread ID from the compute shader header */
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   /* Sends with EOT have to use g112-127, so g0 can't be sent directly.
+    * Copy it to a virtual register instead; the register allocator will
+    * make sure it uses the appropriate register range.
+    */
+   const fs_reg g0_copy(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   const fs_builder all8 = bld.group(8, 0).exec_all();
+   all8.MOV(g0_copy, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+   /* Send a message to the thread spawner to terminate the thread. */
+   fs_inst *terminate =
+      bld.exec_all().emit(CS_OPCODE_CS_TERMINATE, reg_undef, g0_copy);
+   terminate->eot = true;
+}
+
+/** Emits a barrier message whose payload carries the barrier ID from r0.2. */
+void
+fs_visitor::emit_barrier()
+{
+   assert(devinfo->gen >= 7);
+   /* We are getting the barrier ID from the compute shader header */
+   assert(stage == MESA_SHADER_COMPUTE);
+
+   /* Gen9+ keeps one more bit of r0.2 (bit 31) than the earlier gens. */
+   uint32_t id_mask = 0x0f000000u;
+   if (devinfo->gen >= 9)
+      id_mask = 0x8f000000u;
+
+   const fs_reg msg(VGRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD);
+   const fs_builder all8 = bld.exec_all().group(8, 0);
+
+   /* Clear the message payload */
+   all8.MOV(msg, brw_imm_ud(0u));
+
+   /* Copy the barrier id from r0.2 to the message payload reg.2 */
+   const fs_reg r0_2(retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD));
+   all8.AND(component(msg, 2), r0_2, brw_imm_ud(id_mask));
+
+   /* Emit a gateway "barrier" message using the payload we set up. */
+   bld.exec_all().emit(SHADER_OPCODE_BARRIER, reg_undef, msg);
+}
+
+/** General-purpose constructor: the caller supplies the stage key and the
+ *  dispatch width; gs_compile is left NULL. */
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
+                       void *mem_ctx, const void *key,
+                       struct brw_stage_prog_data *prog_data,
+                       struct gl_program *prog, const nir_shader *shader,
+                       unsigned dispatch_width, int shader_time_index,
+                       const struct brw_vue_map *input_vue_map)
+   : backend_shader(compiler, log_data, mem_ctx, shader, prog_data),
+     key(key), gs_compile(NULL),
+     prog_data(prog_data), prog(prog),
+     input_vue_map(input_vue_map),
+     dispatch_width(dispatch_width),
+     shader_time_index(shader_time_index),
+     bld(fs_builder(this, dispatch_width).at_end())
+{
+   init();
+}
+
+/** Geometry shader constructor: always SIMD8, keyed off the GS compile. */
+fs_visitor::fs_visitor(const struct brw_compiler *compiler, void *log_data,
+                       void *mem_ctx, struct brw_gs_compile *c,
+                       struct brw_gs_prog_data *prog_data,
+                       const nir_shader *shader, int shader_time_index)
+   : backend_shader(compiler, log_data, mem_ctx, shader,
+                    &prog_data->base.base),
+     key(&c->key), gs_compile(c),
+     prog_data(&prog_data->base.base), prog(NULL),
+     input_vue_map(NULL), /* don't leave the pointer indeterminate */
+     dispatch_width(8),
+     shader_time_index(shader_time_index),
+     bld(fs_builder(this, dispatch_width).at_end())
+{
+   init();
+}
+
+
+/** Resets the visitor's per-compile state; both constructors call this. */
+void
+fs_visitor::init()
+{
+   switch (stage) {
+   case MESA_SHADER_FRAGMENT:
+      key_tex = &((const brw_wm_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_VERTEX:
+      key_tex = &((const brw_vs_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_TESS_CTRL:
+      key_tex = &((const brw_tcs_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_TESS_EVAL:
+      key_tex = &((const brw_tes_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_GEOMETRY:
+      key_tex = &((const brw_gs_prog_key *) key)->tex;
+      break;
+   case MESA_SHADER_COMPUTE:
+      key_tex = &((const brw_cs_prog_key*) key)->tex;
+      break;
+   default:
+      unreachable("unhandled shader stage");
+   }
+
+   if (stage == MESA_SHADER_COMPUTE) {
+      /* Narrowest SIMD width that fits the workgroup in max_cs_threads. */
+      const struct brw_cs_prog_data *cs_prog_data = brw_cs_prog_data(prog_data);
+      unsigned size = cs_prog_data->local_size[0] *
+                      cs_prog_data->local_size[1] *
+                      cs_prog_data->local_size[2];
+      size = DIV_ROUND_UP(size, devinfo->max_cs_threads);
+      min_dispatch_width = size > 16 ? 32 : (size > 8 ? 16 : 8);
+   } else {
+      min_dispatch_width = 8;
+   }
+
+   this->max_dispatch_width = 32;
+   this->prog_data = this->stage_prog_data;
+   this->failed = false;
+
+   this->nir_locals = NULL;
+   this->nir_ssa_values = NULL;
+
+   memset(&this->payload, 0, sizeof(this->payload));
+   this->source_depth_to_render_target = false;
+   this->runtime_check_aads_emit = false;
+   this->first_non_payload_grf = 0;
+   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
+
+   this->virtual_grf_start = NULL;
+   this->virtual_grf_end = NULL;
+   this->live_intervals = NULL;
+   this->regs_live_at_ip = NULL;
+
+   this->uniforms = 0;
+   this->last_scratch = 0;
+   this->pull_constant_loc = NULL;
+   this->push_constant_loc = NULL;
+
+   this->promoted_constants = 0;
+   this->spilled_any_registers = false;
+}
+
+fs_visitor::~fs_visitor()
+{  /* Empty: no explicit teardown is performed here. */
+}
diff --git a/src/intel/compiler/brw_inst.h b/src/intel/compiler/brw_inst.h
new file mode 100644
index 00000000000..a0b8fb66dd6
--- /dev/null
+++ b/src/intel/compiler/brw_inst.h
@@ -0,0 +1,866 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_inst.h
+ *
+ * A representation of i965 EU assembly instructions, with helper methods to
+ * get and set various fields.  This is the actual hardware format.
+ */
+
+#ifndef BRW_INST_H
+#define BRW_INST_H
+
+#include <assert.h>
+#include <stdint.h>
+
+#include "brw_eu_defines.h"
+#include "common/gen_device_info.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/**
+ * One full-size EU instruction: 128 bits held as two 64-bit words.
+ * brw_inst_bits() and brw_inst_set_bits() index it as data[bit / 64], so
+ * data[0] carries bits 63:0 and data[1] carries bits 127:64.
+ *
+ * brw_context.h has a forward declaration of brw_inst, so name the struct.
+ */
+typedef struct brw_inst {
+   uint64_t data[2];
+} brw_inst;
+
+/* Forward declarations: the accessors generated by the macros below call
+ * these two helpers, which are defined later in this header.
+ */
+static inline uint64_t brw_inst_bits(const brw_inst *inst,
+                                     unsigned high, unsigned low);
+static inline void brw_inst_set_bits(brw_inst *inst,
+                                     unsigned high, unsigned low,
+                                     uint64_t value);
+
+/**
+ * Define a setter brw_inst_set_<name>() and a getter brw_inst_<name>() for
+ * the field occupying bits [high:low] of a brw_inst.
+ *
+ * `assertions` is an expression (usually written in terms of devinfo) that
+ * both generated functions assert() before touching the instruction.
+ */
+#define FC(name, high, low, assertions)                       \
+static inline void                                            \
+brw_inst_set_##name(const struct gen_device_info *devinfo,    \
+                    brw_inst *inst, uint64_t v)               \
+{                                                             \
+   assert(assertions);                                        \
+   (void) devinfo;                                            \
+   brw_inst_set_bits(inst, high, low, v);                     \
+}                                                             \
+static inline uint64_t                                        \
+brw_inst_##name(const struct gen_device_info *devinfo,        \
+                const brw_inst *inst)                         \
+{                                                             \
+   assert(assertions);                                        \
+   (void) devinfo;                                            \
+   return brw_inst_bits(inst, high, low);                     \
+}
+
+/* A simple macro for fields which stay in the same place on all generations:
+ * FC with an assertion that is always true.
+ */
+#define F(name, high, low) FC(name, high, low, true)
+
+/**
+ * Declare locals `high` and `low` and set them to the bit range that applies
+ * to devinfo.  The tests run from newest to oldest: gen >= 8, 7, 6, 5, then
+ * is_g4x, with the hi4/lo4 pair as the fallback.  A -1 entry marks a
+ * generation on which the field does not exist and trips the final assert.
+ */
+#define BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8) \
+   unsigned high, low;                                                       \
+   if (devinfo->gen >= 8) {                                                  \
+      high = hi8;  low = lo8;                                                \
+   } else if (devinfo->gen >= 7) {                                           \
+      high = hi7;  low = lo7;                                                \
+   } else if (devinfo->gen >= 6) {                                           \
+      high = hi6;  low = lo6;                                                \
+   } else if (devinfo->gen >= 5) {                                           \
+      high = hi5;  low = lo5;                                                \
+   } else if (devinfo->is_g4x) {                                             \
+      high = hi45; low = lo45;                                               \
+   } else {                                                                  \
+      high = hi4;  low = lo4;                                                \
+   }                                                                         \
+   assert(((int) high) != -1 && ((int) low) != -1);                          \
+
+/* A general macro for cases where the field has moved to several different
+ * bit locations across generations.  GCC appears to combine cases where the
+ * bits are identical, removing some of the inefficiency.
+ *
+ * The arguments are (high, low) pairs for Gen4, G4X ("4.5"), Gen5, Gen6,
+ * Gen7 and Gen8+, in that order; BOUNDS picks the pair matching devinfo.
+ * Pass -1, -1 for a generation on which the field does not exist.
+ */
+#define FF(name, hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8)\
+static inline void                                                            \
+brw_inst_set_##name(const struct gen_device_info *devinfo,                    \
+                    brw_inst *inst, uint64_t value)                           \
+{                                                                             \
+   BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8)       \
+   brw_inst_set_bits(inst, high, low, value);                                 \
+}                                                                             \
+static inline uint64_t                                                        \
+brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst)  \
+{                                                                             \
+   BOUNDS(hi4, lo4, hi45, lo45, hi5, lo5, hi6, lo6, hi7, lo7, hi8, lo8)       \
+   return brw_inst_bits(inst, high, low);                                     \
+}
+
+/* A macro for fields which moved as of Gen8+: Gen4 through Gen7 share one bit
+ * range and Gen8+ uses another.
+ *
+ * No trailing semicolon: FF expands to function definitions, so a ';' here
+ * would leave a stray empty declaration at file scope after every F8 use.
+ */
+#define F8(name, gen4_high, gen4_low, gen8_high, gen8_low) \
+FF(name,                                                   \
+   /* 4:   */ gen4_high, gen4_low,                         \
+   /* 4.5: */ gen4_high, gen4_low,                         \
+   /* 5:   */ gen4_high, gen4_low,                         \
+   /* 6:   */ gen4_high, gen4_low,                         \
+   /* 7:   */ gen4_high, gen4_low,                         \
+   /* 8:   */ gen8_high, gen8_low)
+
+/* Field accessors for the basic instruction layout (the three-source layout
+ * is listed separately below).  Each line generates brw_inst_set_<name>()
+ * and brw_inst_<name>(); the numbers are the inclusive high and low bit
+ * positions within the 128-bit instruction.
+ *
+ * Several names share bits (for example src1_width at 116:114 and
+ * src1_da16_swiz_w at 115:114).  Which one applies presumably depends on the
+ * access and addressing mode in use -- TODO confirm against the hardware
+ * documentation.
+ */
+F(src1_vstride,        120, 117)
+F(src1_width,          116, 114)
+F(src1_da16_swiz_w,    115, 114)
+F(src1_da16_swiz_z,    113, 112)
+F(src1_hstride,        113, 112)
+F(src1_address_mode,   111, 111)
+/** Src1.SrcMod @{ */
+F(src1_negate,         110, 110)
+F(src1_abs,            109, 109)
+/** @} */
+F8(src1_ia_subreg_nr,  /* 4+ */ 108, 106, /* 8+ */ 108, 105)
+F(src1_da_reg_nr,      108, 101)
+F(src1_da16_subreg_nr, 100, 100)
+F(src1_da1_subreg_nr,  100,  96)
+F(src1_da16_swiz_y,     99,  98)
+F(src1_da16_swiz_x,     97,  96)
+F8(src1_reg_type,      /* 4+ */  46,  44, /* 8+ */  94,  91)
+F8(src1_reg_file,      /* 4+ */  43,  42, /* 8+ */  90,  89)
+F(src0_vstride,         88,  85)
+F(src0_width,           84,  82)
+F(src0_da16_swiz_w,     83,  82)
+F(src0_da16_swiz_z,     81,  80)
+F(src0_hstride,         81,  80)
+F(src0_address_mode,    79,  79)
+/** Src0.SrcMod @{ */
+F(src0_negate,          78,  78)
+F(src0_abs,             77,  77)
+/** @} */
+F8(src0_ia_subreg_nr,  /* 4+ */  76,  74, /* 8+ */  76,  73)
+F(src0_da_reg_nr,       76,  69)
+F(src0_da16_subreg_nr,  68,  68)
+F(src0_da1_subreg_nr,   68,  64)
+F(src0_da16_swiz_y,     67,  66)
+F(src0_da16_swiz_x,     65,  64)
+F(dst_address_mode,     63,  63)
+F(dst_hstride,          62,  61)
+F8(dst_ia_subreg_nr,   /* 4+ */  60,  58, /* 8+ */  60,  57)
+F(dst_da_reg_nr,        60,  53)
+F(dst_da16_subreg_nr,   52,  52)
+F(dst_da1_subreg_nr,    52,  48)
+F(da16_writemask,       51,  48) /* Dst.ChanEn */
+F8(src0_reg_type,      /* 4+ */  41,  39, /* 8+ */  46,  43)
+F8(src0_reg_file,      /* 4+ */  38,  37, /* 8+ */  42,  41)
+F8(dst_reg_type,       /* 4+ */  36,  34, /* 8+ */  40,  37)
+F8(dst_reg_file,       /* 4+ */  33,  32, /* 8+ */  36,  35)
+F8(mask_control,       /* 4+ */   9,   9, /* 8+ */  34,  34)
+FF(flag_reg_nr,
+   /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1,
+   /* 7: */ 90, 90,
+   /* 8: */ 33, 33)
+F8(flag_subreg_nr,     /* 4+ */  89, 89, /* 8+ */ 32, 32)
+F(saturate,             31,  31)
+F(debug_control,        30,  30)
+F(cmpt_control,         29,  29)
+FC(branch_control,      28,  28, devinfo->gen >= 8)
+FC(acc_wr_control,      28,  28, devinfo->gen >= 6)
+FC(mask_control_ex,     28,  28, devinfo->is_g4x || devinfo->gen == 5)
+F(cond_modifier,        27,  24)
+FC(math_function,       27,  24, devinfo->gen >= 6)
+F(exec_size,            23,  21)
+F(pred_inv,             20,  20)
+F(pred_control,         19,  16)
+F(thread_control,       15,  14)
+F(qtr_control,          13,  12)
+FF(nib_control,
+   /* 4-6: doesn't exist */ -1, -1, -1, -1, -1, -1, -1, -1,
+   /* 7: */ 47, 47,
+   /* 8: */ 11, 11)
+F8(no_dd_check,        /* 4+ */  11, 11, /* 8+ */  10,  10)
+F8(no_dd_clear,        /* 4+ */  10, 10, /* 8+ */   9,   9)
+F(access_mode,           8,   8)
+/* Bit 7 is Reserved (for future Opcode expansion) */
+F(opcode,                6,   0)
+
+/**
+ * Three-source instructions:
+ *
+ * Accessors are named brw_inst_3src_<field>() / brw_inst_set_3src_<field>().
+ * F8 entries list the Gen4-7 bit range first and the Gen8+ range second.
+ * 3src_dst_reg_file has a valid bit range on Gen6 only; using it on any
+ * other generation trips the BOUNDS assert.
+ *  @{
+ */
+F(3src_src2_reg_nr,    125, 118)
+F(3src_src2_subreg_nr, 117, 115) /* Extra discontiguous bit on CHV? */
+F(3src_src2_swizzle,   114, 107)
+F(3src_src2_rep_ctrl,  106, 106)
+F(3src_src1_reg_nr,    104,  97)
+F(3src_src1_subreg_nr,  96,  94) /* Extra discontiguous bit on CHV? */
+F(3src_src1_swizzle,    93,  86)
+F(3src_src1_rep_ctrl,   85,  85)
+F(3src_src0_reg_nr,     83,  76)
+F(3src_src0_subreg_nr,  75,  73) /* Extra discontiguous bit on CHV? */
+F(3src_src0_swizzle,    72,  65)
+F(3src_src0_rep_ctrl,   64,  64)
+F(3src_dst_reg_nr,      63,  56)
+F(3src_dst_subreg_nr,   55,  53)
+F(3src_dst_writemask,   52,  49)
+F8(3src_nib_ctrl,       47, 47, 11, 11) /* only exists on IVB+ */
+F8(3src_dst_type,       45, 44, 48, 46) /* only exists on IVB+ */
+F8(3src_src_type,       43, 42, 45, 43)
+F8(3src_src2_negate,    41, 41, 42, 42)
+F8(3src_src2_abs,       40, 40, 41, 41)
+F8(3src_src1_negate,    39, 39, 40, 40)
+F8(3src_src1_abs,       38, 38, 39, 39)
+F8(3src_src0_negate,    37, 37, 38, 38)
+F8(3src_src0_abs,       36, 36, 37, 37)
+F8(3src_flag_reg_nr,    34, 34, 33, 33)
+F8(3src_flag_subreg_nr, 33, 33, 32, 32)
+FF(3src_dst_reg_file,
+   /* 4-5: doesn't exist - no 3-source instructions */ -1, -1, -1, -1, -1, -1,
+   /* 6: */ 32, 32,
+   /* 7-8: doesn't exist - no MRFs */ -1, -1, -1, -1)
+F(3src_saturate,        31, 31)
+F(3src_debug_control,   30, 30)
+F(3src_cmpt_control,    29, 29)
+F(3src_acc_wr_control,  28, 28)
+F(3src_cond_modifier,   27, 24)
+F(3src_exec_size,       23, 21)
+F(3src_pred_inv,        20, 20)
+F(3src_pred_control,    19, 16)
+F(3src_thread_control,  15, 14)
+F(3src_qtr_control,     13, 12)
+F8(3src_no_dd_check,    11, 11, 10, 10)
+F8(3src_no_dd_clear,    10, 10,  9,  9)
+F8(3src_mask_control,    9,  9, 34, 34)
+F(3src_access_mode,      8,  8)
+/* Bit 7 is Reserved (for future Opcode expansion) */
+F(3src_opcode,           6,  0)
+/** @} */
+
+/**
+ * Flow control instruction bits:
+ *  @{
+ */
+/**
+ * Set the UIP field of a flow-control instruction (Gen6+ only).
+ *
+ * Gen8+ stores the full 32-bit value in bits 95:64.  Gen6-7 store a 16-bit
+ * field in bits 127:112 which brw_inst_uip() reads back through int16_t, so
+ * the value must fit in [-32768, 32767] -- the same range that
+ * brw_inst_set_jip() enforces for its own 16-bit field.
+ */
+static inline void
+brw_inst_set_uip(const struct gen_device_info *devinfo,
+                 brw_inst *inst, int32_t value)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen >= 8) {
+      brw_inst_set_bits(inst, 95, 64, (uint32_t)value);
+   } else {
+      /* Anything outside the int16_t range would be silently truncated by
+       * the cast below and read back as a different value.
+       */
+      assert(value <= (1 << 15) - 1);
+      assert(value >= -(1 << 15));
+      brw_inst_set_bits(inst, 127, 112, (uint16_t)value);
+   }
+}
+
+/**
+ * Read back the UIP field (Gen6+ only): the 32 bits at 95:64 on Gen8+,
+ * otherwise the 16-bit field at 127:112 sign-extended through int16_t.
+ */
+static inline int32_t
+brw_inst_uip(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen < 8)
+      return (int16_t)brw_inst_bits(inst, 127, 112);
+
+   return brw_inst_bits(inst, 95, 64);
+}
+
+/**
+ * Set the JIP field of a flow-control instruction (Gen6+ only).
+ *
+ * Before Gen8 the field is the 16 bits at 111:96 and the value is asserted
+ * to fit in a signed 16-bit integer; Gen8+ stores all 32 bits at 127:96.
+ */
+static inline void
+brw_inst_set_jip(const struct gen_device_info *devinfo,
+                 brw_inst *inst, int32_t value)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen < 8) {
+      assert(value <= (1 << 15) - 1);
+      assert(value >= -(1 << 15));
+      brw_inst_set_bits(inst, 111, 96, (uint16_t)value);
+      return;
+   }
+
+   brw_inst_set_bits(inst, 127, 96, (uint32_t)value);
+}
+
+/**
+ * Read back the JIP field (Gen6+ only): the 32 bits at 127:96 on Gen8+,
+ * otherwise the 16-bit field at 111:96 sign-extended through int16_t.
+ */
+static inline int32_t
+brw_inst_jip(const struct gen_device_info *devinfo, const brw_inst *inst)
+{
+   assert(devinfo->gen >= 6);
+
+   if (devinfo->gen < 8)
+      return (int16_t)brw_inst_bits(inst, 111, 96);
+
+   return brw_inst_bits(inst, 127, 96);
+}
+
+/**
+ * Like FC, but using int16_t to handle negative jump targets.
+ *
+ * The setter stores the value's 16-bit two's-complement pattern; the getter
+ * converts the extracted bits back to int16_t.
+ */
+#define FJ(name, high, low, assertions)                                       \
+static inline void                                                            \
+brw_inst_set_##name(const struct gen_device_info *devinfo, brw_inst *inst, int16_t v) \
+{                                                                             \
+   assert(assertions);                                                        \
+   (void) devinfo;                                                            \
+   brw_inst_set_bits(inst, high, low, (uint16_t) v);                          \
+}                                                                             \
+static inline int16_t                                                         \
+brw_inst_##name(const struct gen_device_info *devinfo, const brw_inst *inst)  \
+{                                                                             \
+   assert(assertions);                                                        \
+   (void) devinfo;                                                            \
+   return brw_inst_bits(inst, high, low);                                     \
+}
+
+FJ(gen6_jump_count,  63,  48, devinfo->gen == 6)
+FJ(gen4_jump_count, 111,  96, devinfo->gen < 6)
+FC(gen4_pop_count,  115, 112, devinfo->gen < 6)
+
+/* FJ has no further users; don't leak a two-letter macro to includers. */
+#undef FJ
+/** @} */
+
+/* Message descriptor bits: MD(x) turns descriptor bit x into an instruction
+ * bit position by adding 96.
+ */
+#define MD(x) ((x) + 96)
+
+/**
+ * Fields for SEND messages:
+ *
+ * The FF()/FC() invocations carry no trailing ';' -- they expand to function
+ * definitions, and a ';' after one is a stray empty declaration.
+ *  @{
+ */
+F(eot,                 127, 127)
+FF(mlen,
+   /* 4:   */ 119, 116,
+   /* 4.5: */ 119, 116,
+   /* 5:   */ 124, 121,
+   /* 6:   */ 124, 121,
+   /* 7:   */ 124, 121,
+   /* 8:   */ 124, 121)
+FF(rlen,
+   /* 4:   */ 115, 112,
+   /* 4.5: */ 115, 112,
+   /* 5:   */ 120, 116,
+   /* 6:   */ 120, 116,
+   /* 7:   */ 120, 116,
+   /* 8:   */ 120, 116)
+FF(header_present,
+   /* 4: doesn't exist */ -1, -1, -1, -1,
+   /* 5:   */ 115, 115,
+   /* 6:   */ 115, 115,
+   /* 7:   */ 115, 115,
+   /* 8:   */ 115, 115)
+F(gateway_notify, MD(16), MD(15))
+FF(function_control,
+   /* 4:   */ 111,  96,
+   /* 4.5: */ 111,  96,
+   /* 5:   */ 114,  96,
+   /* 6:   */ 114,  96,
+   /* 7:   */ 114,  96,
+   /* 8:   */ 114,  96)
+FF(gateway_subfuncid,
+   /* 4:   */ MD(1), MD(0),
+   /* 4.5: */ MD(1), MD(0),
+   /* 5:   */ MD(1), MD(0), /* 2:0, but bit 2 is reserved MBZ */
+   /* 6:   */ MD(2), MD(0),
+   /* 7:   */ MD(2), MD(0),
+   /* 8:   */ MD(2), MD(0))
+FF(sfid,
+   /* 4:   */ 123, 120, /* called msg_target */
+   /* 4.5: */ 123, 120,
+   /* 5:   */  95,  92,
+   /* 6:   */  27,  24,
+   /* 7:   */  27,  24,
+   /* 8:   */  27,  24)
+FC(base_mrf,   27,  24, devinfo->gen < 6)
+/** @} */
+
+/**
+ * URB message function control bits:
+ *
+ * MD(15) is urb_channel_mask_present on Gen8+ and urb_complete before Gen8;
+ * the FC assertions restrict each accessor to its own generations.
+ *  @{
+ */
+FF(urb_per_slot_offset,
+   /* 4-6: */ -1, -1, -1, -1, -1, -1, -1, -1,
+   /* 7:   */ MD(16), MD(16),
+   /* 8:   */ MD(17), MD(17))
+FC(urb_channel_mask_present, MD(15), MD(15), devinfo->gen >= 8)
+FC(urb_complete, MD(15), MD(15), devinfo->gen < 8)
+FC(urb_used, MD(14), MD(14), devinfo->gen < 7)
+FC(urb_allocate, MD(13), MD(13), devinfo->gen < 7)
+FF(urb_swizzle_control,
+   /* 4:   */ MD(11), MD(10),
+   /* 4.5: */ MD(11), MD(10),
+   /* 5:   */ MD(11), MD(10),
+   /* 6:   */ MD(11), MD(10),
+   /* 7:   */ MD(14), MD(14),
+   /* 8:   */ MD(15), MD(15))
+FF(urb_global_offset,
+   /* 4:   */ MD( 9), MD(4),
+   /* 4.5: */ MD( 9), MD(4),
+   /* 5:   */ MD( 9), MD(4),
+   /* 6:   */ MD( 9), MD(4),
+   /* 7:   */ MD(13), MD(3),
+   /* 8:   */ MD(14), MD(4))
+FF(urb_opcode,
+   /* 4:   */ MD( 3), MD(0),
+   /* 4.5: */ MD( 3), MD(0),
+   /* 5:   */ MD( 3), MD(0),
+   /* 6:   */ MD( 3), MD(0),
+   /* 7:   */ MD( 2), MD(0),
+   /* 8:   */ MD( 3), MD(0))
+/** @} */
+
+/**
+ * Gen4-5 math messages:
+ *
+ * Every accessor in this group asserts devinfo->gen < 6.
+ *  @{
+ */
+FC(math_msg_data_type,  MD(7), MD(7), devinfo->gen < 6)
+FC(math_msg_saturate,   MD(6), MD(6), devinfo->gen < 6)
+FC(math_msg_precision,  MD(5), MD(5), devinfo->gen < 6)
+FC(math_msg_signed_int, MD(4), MD(4), devinfo->gen < 6)
+FC(math_msg_function,   MD(3), MD(0), devinfo->gen < 6)
+/** @} */
+
+/**
+ * Sampler message function control bits:
+ *
+ * sampler_return_format is limited by its assertion to original Gen4
+ * (gen == 4 and not G4X); its bits MD(13):MD(12) fall inside the wider
+ * sampler_msg_type range used by G4X and later.
+ *  @{
+ */
+FF(sampler_simd_mode,
+   /* 4: doesn't exist */ -1, -1, -1, -1,
+   /* 5:   */ MD(17), MD(16),
+   /* 6:   */ MD(17), MD(16),
+   /* 7:   */ MD(18), MD(17),
+   /* 8:   */ MD(18), MD(17))
+FF(sampler_msg_type,
+   /* 4:   */ MD(15), MD(14),
+   /* 4.5: */ MD(15), MD(12),
+   /* 5:   */ MD(15), MD(12),
+   /* 6:   */ MD(15), MD(12),
+   /* 7:   */ MD(16), MD(12),
+   /* 8:   */ MD(16), MD(12))
+FC(sampler_return_format, MD(13), MD(12), devinfo->gen == 4 && !devinfo->is_g4x)
+F(sampler,                MD(11), MD(8))
+F(binding_table_index,    MD( 7), MD(0)) /* also used by other messages */
+/** @} */
+
+/**
+ * Data port message function control bits:
+ *
+ * The FF()/FC() invocations carry no trailing ';' -- they expand to function
+ * definitions, and a ';' after one is a stray empty declaration.
+ *  @{
+ */
+FC(dp_category,         MD(18), MD(18), devinfo->gen >= 7)
+
+/* Gen4-5 store fields in different bits for read/write messages. */
+FF(dp_read_msg_type,
+   /* 4:   */ MD(13), MD(12),
+   /* 4.5: */ MD(13), MD(11),
+   /* 5:   */ MD(13), MD(11),
+   /* 6:   */ MD(16), MD(13),
+   /* 7:   */ MD(17), MD(14),
+   /* 8:   */ MD(17), MD(14))
+FF(dp_write_msg_type,
+   /* 4:   */ MD(14), MD(12),
+   /* 4.5: */ MD(14), MD(12),
+   /* 5:   */ MD(14), MD(12),
+   /* 6:   */ MD(16), MD(13),
+   /* 7:   */ MD(17), MD(14),
+   /* 8:   */ MD(17), MD(14))
+FF(dp_read_msg_control,
+   /* 4:   */ MD(11), MD( 8),
+   /* 4.5: */ MD(10), MD( 8),
+   /* 5:   */ MD(10), MD( 8),
+   /* 6:   */ MD(12), MD( 8),
+   /* 7:   */ MD(13), MD( 8),
+   /* 8:   */ MD(13), MD( 8))
+FF(dp_write_msg_control,
+   /* 4:   */ MD(11), MD( 8),
+   /* 4.5: */ MD(11), MD( 8),
+   /* 5:   */ MD(11), MD( 8),
+   /* 6:   */ MD(12), MD( 8),
+   /* 7:   */ MD(13), MD( 8),
+   /* 8:   */ MD(13), MD( 8))
+FC(dp_read_target_cache, MD(15), MD(14), devinfo->gen < 6)
+
+FF(dp_write_commit,
+   /* 4:   */ MD(15),  MD(15),
+   /* 4.5: */ MD(15),  MD(15),
+   /* 5:   */ MD(15),  MD(15),
+   /* 6:   */ MD(17),  MD(17),
+   /* 7+: does not exist */ -1, -1, -1, -1)
+
+/* Gen6+ use the same bit locations for everything. */
+FF(dp_msg_type,
+   /* 4-5: use dp_read_msg_type or dp_write_msg_type instead */
+   -1, -1, -1, -1, -1, -1,
+   /* 6:   */ MD(16), MD(13),
+   /* 7:   */ MD(17), MD(14),
+   /* 8:   */ MD(17), MD(14))
+FF(dp_msg_control,
+   /* 4:   */ MD(11), MD( 8),
+   /* 4.5-5: use dp_read_msg_control or dp_write_msg_control */ -1, -1, -1, -1,
+   /* 6:   */ MD(12), MD( 8),
+   /* 7:   */ MD(13), MD( 8),
+   /* 8:   */ MD(13), MD( 8))
+/** @} */
+
+/**
+ * Scratch message bits (Gen7+):
+ *
+ * Every accessor in this group asserts devinfo->gen >= 7.
+ *  @{
+ */
+FC(scratch_read_write, MD(17), MD(17), devinfo->gen >= 7) /* 0 = read,  1 = write */
+FC(scratch_type,       MD(16), MD(16), devinfo->gen >= 7) /* 0 = OWord, 1 = DWord */
+FC(scratch_invalidate_after_read, MD(15), MD(15), devinfo->gen >= 7)
+FC(scratch_block_size,  MD(13),  MD(12), devinfo->gen >= 7)
+FC(scratch_addr_offset, MD(11),  MD( 0), devinfo->gen >= 7)
+/** @} */
+
+/**
+ * Render Target message function control bits:
+ *
+ * MD(11) is rt_last before Gen6 and rt_slot_group (asserted gen >= 6) from
+ * Gen6 on, where rt_last moves up to MD(12).
+ *  @{
+ */
+FF(rt_last,
+   /* 4:   */ MD(11), MD(11),
+   /* 4.5: */ MD(11), MD(11),
+   /* 5:   */ MD(11), MD(11),
+   /* 6:   */ MD(12), MD(12),
+   /* 7:   */ MD(12), MD(12),
+   /* 8:   */ MD(12), MD(12))
+FC(rt_slot_group,      MD(11),  MD(11), devinfo->gen >= 6)
+F(rt_message_type,     MD(10),  MD( 8))
+/** @} */
+
+/**
+ * Thread Spawn message function control bits:
+ *
+ * All three are single-bit fields with no generation assertion.
+ *  @{
+ */
+F(ts_resource_select,  MD( 4),  MD( 4))
+F(ts_request_type,     MD( 1),  MD( 1))
+F(ts_opcode,           MD( 0),  MD( 0))
+/** @} */
+
+/**
+ * Pixel Interpolator message function control bits:
+ *
+ * These use F(), so no generation check is asserted on access.
+ *  @{
+ */
+F(pi_simd_mode,      MD(16),  MD(16))
+F(pi_nopersp,        MD(14),  MD(14))
+F(pi_message_type,   MD(13),  MD(12))
+F(pi_slot_group,     MD(11),  MD(11))
+F(pi_message_data,   MD(7),   MD(0))
+/** @} */
+
+/**
+ * Immediates:
+ *  @{
+ */
+/** Return the 32-bit immediate held in bits 127:96, converted to int. */
+static inline int
+brw_inst_imm_d(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   const uint64_t raw = brw_inst_bits(insn, 127, 96);
+
+   (void) devinfo;
+   return raw;
+}
+
+/** Return the 32-bit immediate held in bits 127:96 as an unsigned value. */
+static inline unsigned
+brw_inst_imm_ud(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   const uint64_t raw = brw_inst_bits(insn, 127, 96);
+
+   (void) devinfo;
+   return raw;
+}
+
+/**
+ * Return the immediate in bits 127:96 reinterpreted as a float: the 32-bit
+ * pattern is written to one union member and read through the other.
+ */
+static inline float
+brw_inst_imm_f(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   union {
+      uint32_t bits;
+      float value;
+   } pun;
+
+   (void) devinfo;
+   pun.bits = brw_inst_bits(insn, 127, 96);
+   return pun.value;
+}
+
+/**
+ * Return the immediate in bits 127:64 reinterpreted as a double: the 64-bit
+ * pattern is written to one union member and read through the other.
+ */
+static inline double
+brw_inst_imm_df(const struct gen_device_info *devinfo, const brw_inst *insn)
+{
+   union {
+      uint64_t bits;
+      double value;
+   } pun;
+
+   (void) devinfo;
+   pun.bits = brw_inst_bits(insn, 127, 64);
+   return pun.value;
+}
+
+/**
+ * Store a signed 32-bit immediate in bits 127:96.
+ *
+ * The value is stored as its 32-bit two's-complement pattern, so negative
+ * immediates are accepted.
+ */
+static inline void
+brw_inst_set_imm_d(const struct gen_device_info *devinfo,
+                   brw_inst *insn, int value)
+{
+   (void) devinfo;
+   /* Go through uint32_t first: converting a negative int straight to the
+    * uint64_t parameter sign-extends it, and the 32 extra set bits then fail
+    * the "value fits in the field" assert in brw_inst_set_bits().
+    */
+   brw_inst_set_bits(insn, 127, 96, (uint32_t)value);
+}
+
+/**
+ * Store an unsigned 32-bit immediate in bits 127:96.
+ */
+static inline void
+brw_inst_set_imm_ud(const struct gen_device_info *devinfo,
+                    brw_inst *insn, unsigned value)
+{
+   (void) devinfo;
+   /* Plain call, not "return f();": ISO C does not allow a return statement
+    * with an expression in a function returning void.
+    */
+   brw_inst_set_bits(insn, 127, 96, value);
+}
+
+/**
+ * Store a float immediate: its 32-bit representation, obtained through a
+ * union, is written to bits 127:96.
+ */
+static inline void
+brw_inst_set_imm_f(const struct gen_device_info *devinfo,
+                   brw_inst *insn, float value)
+{
+   union {
+      uint32_t bits;
+      float value;
+   } pun;
+
+   (void) devinfo;
+   pun.value = value;
+   brw_inst_set_bits(insn, 127, 96, pun.bits);
+}
+
+/**
+ * Store a double immediate: its 64-bit representation, obtained through a
+ * union, is written to bits 127:64.
+ */
+static inline void
+brw_inst_set_imm_df(const struct gen_device_info *devinfo,
+                    brw_inst *insn, double value)
+{
+   union {
+      uint64_t bits;
+      double value;
+   } pun;
+
+   (void) devinfo;
+   pun.value = value;
+   brw_inst_set_bits(insn, 127, 64, pun.bits);
+}
+
+/** Store an unsigned 64-bit immediate in bits 127:64. */
+static inline void
+brw_inst_set_imm_uq(const struct gen_device_info *devinfo,
+                    brw_inst *insn, uint64_t value)
+{
+   brw_inst_set_bits(insn, 127, 64, value);
+   (void) devinfo;
+}
+
+/** @} */
+
+/* The AddrImm fields are split into two discontiguous sections on Gen8+:
+ * bits 8:0 of the value go to [g8_high:g8_low] and bit 9 goes to the single
+ * bit g8_nine.  Before Gen8 the whole value is stored in [g4_high:g4_low].
+ * The setter asserts that the value fits in 10 bits; the getter reassembles
+ * the value from the same pieces.
+ */
+#define BRW_IA1_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \
+static inline void                                                       \
+brw_inst_set_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo, \
+                                  brw_inst *inst,                        \
+                                  unsigned value)                        \
+{                                                                        \
+   assert((value & ~0x3ff) == 0);                                        \
+   if (devinfo->gen >= 8) {                                              \
+      brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff);           \
+      brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9);             \
+   } else {                                                              \
+      brw_inst_set_bits(inst, g4_high, g4_low, value);                   \
+   }                                                                     \
+}                                                                        \
+static inline unsigned                                                   \
+brw_inst_##reg##_ia1_addr_imm(const struct gen_device_info *devinfo,     \
+                              const brw_inst *inst)                      \
+{                                                                        \
+   if (devinfo->gen >= 8) {                                              \
+      return brw_inst_bits(inst, g8_high, g8_low) |                      \
+             (brw_inst_bits(inst, g8_nine, g8_nine) << 9);               \
+   } else {                                                              \
+      return brw_inst_bits(inst, g4_high, g4_low);                       \
+   }                                                                     \
+}
+
+/* AddrImm[9:0] for Align1 Indirect Addressing */
+/*                     -Gen 4-  ----Gen8---- */
+BRW_IA1_ADDR_IMM(src1, 105, 96, 121, 104, 96)
+BRW_IA1_ADDR_IMM(src0,  73, 64,  95,  72, 64)
+BRW_IA1_ADDR_IMM(dst,   57, 48,  47,  56, 48)
+
+/* Align16 counterpart of BRW_IA1_ADDR_IMM.
+ *
+ * NOTE(review): the setter and getter below do not look like inverses of
+ * each other.
+ *  - Gen8+: the setter stores (value & 0x1ff) into [g8_high:g8_low], but the
+ *    ranges passed in below are only 5 bits wide, so any value with one of
+ *    bits 8:5 set trips the range assert in brw_inst_set_bits().  Neither
+ *    side shifts by 4, even though the comment further down says the low 4
+ *    bits are missing from the encoding.
+ *  - Before Gen8: the setter stores (value >> 9), i.e. only bit 9, while the
+ *    getter returns the whole [g4_high:g4_low] field.
+ * Confirm the intended encoding against the hardware documentation before
+ * relying on these accessors.
+ */
+#define BRW_IA16_ADDR_IMM(reg, g4_high, g4_low, g8_nine, g8_high, g8_low) \
+static inline void                                                        \
+brw_inst_set_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo, \
+                                   brw_inst *inst, unsigned value)        \
+{                                                                         \
+   assert((value & ~0x3ff) == 0);                                         \
+   if (devinfo->gen >= 8) {                                               \
+      brw_inst_set_bits(inst, g8_high, g8_low, value & 0x1ff);            \
+      brw_inst_set_bits(inst, g8_nine, g8_nine, value >> 9);              \
+   } else {                                                               \
+      brw_inst_set_bits(inst, g4_high, g4_low, value >> 9);               \
+   }                                                                      \
+}                                                                         \
+static inline unsigned                                                    \
+brw_inst_##reg##_ia16_addr_imm(const struct gen_device_info *devinfo,     \
+                               const brw_inst *inst)                      \
+{                                                                         \
+   if (devinfo->gen >= 8) {                                               \
+      return brw_inst_bits(inst, g8_high, g8_low) |                       \
+             (brw_inst_bits(inst, g8_nine, g8_nine) << 9);                \
+   } else {                                                               \
+      return brw_inst_bits(inst, g4_high, g4_low);                        \
+   }                                                                      \
+}
+
+/* AddrImm[9:0] for Align16 Indirect Addressing:
+ * Compared to Align1, these are missing the low 4 bits.
+ *                     -Gen 4-  ----Gen8----
+ */
+BRW_IA16_ADDR_IMM(src1, 105, 96, 121, 104, 100)
+BRW_IA16_ADDR_IMM(src0,  73, 64,  95,  72,  68)
+BRW_IA16_ADDR_IMM(dst,   57, 52,  47,  56,  52)
+
+/**
+ * Extract the contiguous field [high:low] from the instruction.
+ *
+ * Bit indices range from 0..127.  A field must lie entirely within one of
+ * the two 64-bit words; this is asserted.
+ */
+static inline uint64_t
+brw_inst_bits(const brw_inst *inst, unsigned high, unsigned low)
+{
+   /* Both ends of the field must land in the same 64-bit word. */
+   const unsigned word = high / 64;
+   assert(word == low / 64);
+
+   /* Position and size of the field inside that word. */
+   const unsigned shift = low % 64;
+   const unsigned width = high % 64 - shift + 1;
+
+   return (inst->data[word] >> shift) & (~0ull >> (64 - width));
+}
+
+/**
+ * Overwrite the contiguous field [high:low] of the instruction with value,
+ * leaving every other bit untouched.
+ *
+ * Bit indices range from 0..127.  A field must lie entirely within one of
+ * the two 64-bit words, and value must fit in the field; both are asserted.
+ */
+static inline void
+brw_inst_set_bits(brw_inst *inst, unsigned high, unsigned low, uint64_t value)
+{
+   const unsigned word = high / 64;
+   assert(word == low / 64);
+
+   /* From here on `low` is the bit offset inside the selected word. */
+   low %= 64;
+
+   const unsigned width = high % 64 - low + 1;
+   const uint64_t mask = (~0ull >> (64 - width)) << low;
+
+   /* Make sure the supplied value actually fits in the given bitfield. */
+   assert((value & (mask >> low)) == value);
+
+   inst->data[word] &= ~mask;
+   inst->data[word] |= value << low;
+}
+
+/* Drop the helper macros used to generate the brw_inst accessors.  FC and F
+ * are defined again below with brw_compact_inst variants.
+ */
+#undef BRW_IA16_ADDR_IMM
+#undef BRW_IA1_ADDR_IMM
+#undef MD
+#undef F8
+#undef FF
+#undef BOUNDS
+#undef F
+#undef FC
+
+/** A compacted instruction: 64 bits held in a single word. */
+typedef struct {
+   uint64_t data;
+} brw_compact_inst;
+
+/**
+ * Extract the contiguous field [high:low] from the compacted instruction.
+ *
+ * Bit indices range from 0..63.
+ */
+static inline unsigned
+brw_compact_inst_bits(const brw_compact_inst *inst, unsigned high, unsigned low)
+{
+   const unsigned width = high - low + 1;
+   const uint64_t field = inst->data >> low;
+
+   return field & ((1ull << width) - 1);
+}
+
+/**
+ * Overwrite the contiguous field [high:low] of the compacted instruction
+ * with value, leaving every other bit untouched.
+ *
+ * Bit indices range from 0..63.  value must fit in the field; this is
+ * asserted.
+ */
+static inline void
+brw_compact_inst_set_bits(brw_compact_inst *inst, unsigned high, unsigned low,
+                          uint64_t value)
+{
+   const unsigned width = high - low + 1;
+   const uint64_t mask = ((1ull << width) - 1) << low;
+
+   /* Make sure the supplied value actually fits in the given bitfield. */
+   assert((value & (mask >> low)) == value);
+
+   inst->data &= ~mask;
+   inst->data |= value << low;
+}
+
+/**
+ * brw_compact_inst flavour of FC: define brw_compact_inst_set_<name>() and
+ * brw_compact_inst_<name>() for the field at bits [high:low], each asserting
+ * `assertions` first.  Values are passed and returned as unsigned.
+ */
+#define FC(name, high, low, assertions)                            \
+static inline void                                                 \
+brw_compact_inst_set_##name(const struct gen_device_info *devinfo, \
+                            brw_compact_inst *inst, unsigned v)    \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   brw_compact_inst_set_bits(inst, high, low, v);                  \
+}                                                                  \
+static inline unsigned                                             \
+brw_compact_inst_##name(const struct gen_device_info *devinfo,     \
+                        const brw_compact_inst *inst)              \
+{                                                                  \
+   assert(assertions);                                             \
+   (void) devinfo;                                                 \
+   return brw_compact_inst_bits(inst, high, low);                  \
+}
+
+/* A simple macro for fields which stay in the same place on all generations:
+ * FC with an assertion that is always true.
+ */
+#define F(name, high, low) FC(name, high, low, true)
+
+/* Field accessors for the 64-bit compacted layout.  Bits 23 and 28 each
+ * carry a generation-specific field, guarded by the FC assertion on that
+ * line.
+ */
+F(src1_reg_nr,      63, 56)
+F(src0_reg_nr,      55, 48)
+F(dst_reg_nr,       47, 40)
+F(src1_index,       39, 35)
+F(src0_index,       34, 30)
+F(cmpt_control,     29, 29) /* Same location as brw_inst */
+FC(flag_subreg_nr,  28, 28, devinfo->gen <= 6)
+F(cond_modifier,    27, 24) /* Same location as brw_inst */
+FC(acc_wr_control,  23, 23, devinfo->gen >= 6)
+FC(mask_control_ex, 23, 23, devinfo->is_g4x || devinfo->gen == 5)
+F(subreg_index,     22, 18)
+F(datatype_index,   17, 13)
+F(control_index,    12,  8)
+F(debug_control,     7,  7)
+F(opcode,            6,  0) /* Same location as brw_inst */
+
+/**
+ * (Gen8+) Compacted three-source instructions:
+ *
+ * Every accessor in this group asserts devinfo->gen >= 8.  No accessor is
+ * defined for bits 27:19 (marked Reserved below).
+ *  @{
+ */
+FC(3src_src2_reg_nr,    63, 57, devinfo->gen >= 8)
+FC(3src_src1_reg_nr,    56, 50, devinfo->gen >= 8)
+FC(3src_src0_reg_nr,    49, 43, devinfo->gen >= 8)
+FC(3src_src2_subreg_nr, 42, 40, devinfo->gen >= 8)
+FC(3src_src1_subreg_nr, 39, 37, devinfo->gen >= 8)
+FC(3src_src0_subreg_nr, 36, 34, devinfo->gen >= 8)
+FC(3src_src2_rep_ctrl,  33, 33, devinfo->gen >= 8)
+FC(3src_src1_rep_ctrl,  32, 32, devinfo->gen >= 8)
+FC(3src_saturate,       31, 31, devinfo->gen >= 8)
+FC(3src_debug_control,  30, 30, devinfo->gen >= 8)
+FC(3src_cmpt_control,   29, 29, devinfo->gen >= 8)
+FC(3src_src0_rep_ctrl,  28, 28, devinfo->gen >= 8)
+/* Reserved */
+FC(3src_dst_reg_nr,     18, 12, devinfo->gen >= 8)
+FC(3src_source_index,   11, 10, devinfo->gen >= 8)
+FC(3src_control_index,   9,  8, devinfo->gen >= 8)
+/* Bit 7 is Reserved (for future Opcode expansion) */
+FC(3src_opcode,          6,  0, devinfo->gen >= 8)
+/** @} */
+
+/* Drop both accessor-generating macros so neither of these short names
+ * leaks into files that include this header.
+ */
+#undef F
+#undef FC
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/intel/compiler/brw_interpolation_map.c b/src/intel/compiler/brw_interpolation_map.c
new file mode 100644
index 00000000000..7b9f58eb6ee
--- /dev/null
+++ b/src/intel/compiler/brw_interpolation_map.c
@@ -0,0 +1,109 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_compiler.h"
+#include "compiler/nir/nir.h"
+
+static char const *get_qual_name(int mode)
+{
+   /* Short tag for an interpolation mode; "???" for anything unrecognised. */
+   if (mode == INTERP_MODE_NONE)          return "none";
+   if (mode == INTERP_MODE_FLAT)          return "flat";
+   if (mode == INTERP_MODE_SMOOTH)        return "smooth";
+   if (mode == INTERP_MODE_NOPERSPECTIVE) return "nopersp";
+
+   return "???";
+}
+
+static void
+gen4_frag_prog_set_interp_modes(struct brw_wm_prog_data *prog_data,
+                                struct brw_vue_map *vue_map,
+                                unsigned location, unsigned slot_count,
+                                enum glsl_interp_mode interp)
+{
+   /* Only mapped slots whose mode is still INTERP_MODE_NONE are updated. */
+   for (unsigned i = 0; i < slot_count; i++) {
+      const unsigned slot = vue_map->varying_to_slot[location + i];
+      if (slot == -1 || prog_data->interp_mode[slot] != INTERP_MODE_NONE)
+         continue;
+
+      prog_data->interp_mode[slot] = interp;
+      if (prog_data->interp_mode[slot] == INTERP_MODE_FLAT)
+         prog_data->contains_flat_varying = true;
+      else if (prog_data->interp_mode[slot] == INTERP_MODE_NOPERSPECTIVE)
+         prog_data->contains_noperspective_varying = true;
+   }
+}
+
+/* Reset, then fill in, the interpolation mode of every slot in the VUE. */
+void
+brw_setup_vue_interpolation(struct brw_vue_map *vue_map, nir_shader *nir,
+                            struct brw_wm_prog_data *prog_data,
+                            const struct gen_device_info *devinfo)
+{
+   /* Initialise interp_mode. INTERP_MODE_NONE == 0 */
+   memset(prog_data->interp_mode, 0, sizeof(prog_data->interp_mode));
+
+   if (!vue_map) /* no map: every slot stays INTERP_MODE_NONE */
+      return;
+
+   /* HPOS always wants noperspective. setting it up here allows
+    * us to not need special handling in the SF program.
+    */
+   unsigned pos_slot = vue_map->varying_to_slot[VARYING_SLOT_POS];
+   if (pos_slot != -1) {
+      prog_data->interp_mode[pos_slot] = INTERP_MODE_NOPERSPECTIVE;
+      prog_data->contains_noperspective_varying = true;
+   }
+
+   foreach_list_typed(nir_variable, var, node, &nir->inputs) {
+      unsigned location = var->data.location;
+      unsigned slot_count = glsl_count_attribute_slots(var->type, false);
+
+      gen4_frag_prog_set_interp_modes(prog_data, vue_map, location, slot_count,
+                                      var->data.interpolation);
+
+      if (location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1) {
+         /* Apply the same mode at the same distance from VARYING_SLOT_BFC0. */
+         location = location + VARYING_SLOT_BFC0 - VARYING_SLOT_COL0;
+         gen4_frag_prog_set_interp_modes(prog_data, vue_map, location,
+                                         slot_count, var->data.interpolation);
+      }
+   }
+
+   bool debug = false; /* compile-time switch for the dump below */
+   if (debug) {
+      fprintf(stderr, "VUE map:\n");
+      for (int i = 0; i < vue_map->num_slots; i++) {
+         int varying = vue_map->slot_to_varying[i];
+         if (varying == -1) { /* slot has no varying assigned */
+            fprintf(stderr, "%d: --\n", i);
+            continue;
+         }
+         fprintf(stderr, "%d: %d %s ofs %d\n",
+                 i, varying,
+                 get_qual_name(prog_data->interp_mode[i]),
+                 brw_vue_slot_to_offset(i));
+      }
+   }
+}
diff --git a/src/intel/compiler/brw_ir_allocator.h b/src/intel/compiler/brw_ir_allocator.h
new file mode 100644
index 00000000000..b1237ed38e7
--- /dev/null
+++ b/src/intel/compiler/brw_ir_allocator.h
@@ -0,0 +1,87 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_IR_ALLOCATOR_H
+#define BRW_IR_ALLOCATOR_H
+
+#include "main/macros.h"
+
+namespace brw {
+   /**
+    * Simple allocator used to keep track of virtual GRFs.
+    */
+   class simple_allocator {
+   public:
+      simple_allocator() :
+         sizes(NULL), offsets(NULL), count(0), total_size(0), capacity(0)
+      {
+      }
+
+      ~simple_allocator()
+      {
+         free(offsets);
+         free(sizes);
+      } /* NOTE(review): no copy operations declared; a copy would double-free. */
+
+      unsigned
+      allocate(unsigned size)
+      {
+         if (capacity <= count) { /* arrays are full: grow both in lockstep */
+            capacity = MAX2(16, capacity * 2); /* double, starting at 16 */
+            sizes = (unsigned *)realloc(sizes, capacity * sizeof(unsigned));
+            offsets = (unsigned *)realloc(offsets, capacity * sizeof(unsigned));
+         } /* NOTE(review): the realloc() results are not checked for NULL. */
+
+         sizes[count] = size;
+         offsets[count] = total_size; /* new entry starts at the current end */
+         total_size += size;
+
+         return count++; /* index of the entry just recorded */
+      }
+
+      /**
+       * Array of sizes for each allocation.  The allocation unit is up to the
+       * back-end, but it's expected to be one scalar value in the FS back-end
+       * and one vec4 in the VEC4 back-end.
+       */
+      unsigned *sizes;
+
+      /**
+       * Array of offsets from the start of the VGRF space in allocation
+       * units.
+       */
+      unsigned *offsets;
+
+      /** Total number of VGRFs allocated. */
+      unsigned count;
+
+      /** Cumulative size in allocation units. */
+      unsigned total_size;
+
+   private:
+      unsigned capacity; /**< Number of entries sizes[] and offsets[] can hold. */
+   };
+}
+
+#endif
diff --git a/src/intel/compiler/brw_ir_fs.h b/src/intel/compiler/brw_ir_fs.h
new file mode 100644
index 00000000000..cad371248c4
--- /dev/null
+++ b/src/intel/compiler/brw_ir_fs.h
@@ -0,0 +1,451 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_IR_FS_H
+#define BRW_IR_FS_H
+
+#include "brw_shader.h"
+
+class fs_inst;
+
+class fs_reg : public backend_reg {
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(fs_reg)
+
+   void init();
+
+   fs_reg();
+   fs_reg(struct ::brw_reg reg);
+   fs_reg(enum brw_reg_file file, int nr);
+   fs_reg(enum brw_reg_file file, int nr, enum brw_reg_type type);
+
+   bool equals(const fs_reg &r) const;
+   bool is_contiguous() const;
+
+   /**
+    * Return the size in bytes of a single logical component of the
+    * register assuming the given execution width.
+    */
+   unsigned component_size(unsigned width) const;
+
+   /** Horizontal stride in elements of the register type; 0 replicates. */
+   uint8_t stride;
+};
+
+static inline fs_reg
+negate(fs_reg reg)
+{
+   assert(reg.file != IMM); /* immediates are rejected by this helper */
+   reg.negate = !reg.negate; /* toggle: applying negate() twice is a no-op */
+   return reg;
+}
+
+static inline fs_reg
+retype(fs_reg reg, enum brw_reg_type type)
+{
+   reg.type = type; /* only the type field changes on the returned copy */
+   return reg;
+}
+
+static inline fs_reg
+byte_offset(fs_reg reg, unsigned delta)
+{
+   switch (reg.file) {
+   case BAD_FILE:
+      break; /* returned unchanged */
+   case VGRF:
+   case ATTR:
+   case UNIFORM:
+      reg.offset += delta; /* these files keep one flat byte offset */
+      break;
+   case MRF: {
+      const unsigned suboffset = reg.offset + delta;
+      reg.nr += suboffset / REG_SIZE; /* whole registers carry into nr */
+      reg.offset = suboffset % REG_SIZE;
+      break;
+   }
+   case ARF:
+   case FIXED_GRF: {
+      const unsigned suboffset = reg.subnr + delta;
+      reg.nr += suboffset / REG_SIZE; /* same carry, remainder kept in subnr */
+      reg.subnr = suboffset % REG_SIZE;
+      break;
+   }
+   case IMM:
+   default:
+      assert(delta == 0); /* only a zero offset is accepted here */
+   }
+   return reg;
+}
+
+static inline fs_reg
+horiz_offset(const fs_reg &reg, unsigned delta)
+{
+   switch (reg.file) {
+   case BAD_FILE:
+   case UNIFORM:
+   case IMM:
+      /* These only have a single component that is implicitly splatted.  A
+       * horizontal offset should be a harmless no-op.
+       * XXX - Handle vector immediates correctly.
+       */
+      return reg;
+   case VGRF:
+   case MRF:
+   case ATTR:
+      return byte_offset(reg, delta * reg.stride * type_sz(reg.type));
+   case ARF:
+   case FIXED_GRF:
+      if (reg.is_null()) { /* the null register is returned unchanged */
+         return reg;
+      } else { /* decode hstride: 0 stays 0, otherwise 2^(hstride - 1) */
+         const unsigned stride = reg.hstride ? 1 << (reg.hstride - 1) : 0;
+         return byte_offset(reg, delta * stride * type_sz(reg.type));
+      }
+   }
+   unreachable("Invalid register file");
+}
+
+static inline fs_reg
+offset(fs_reg reg, unsigned width, unsigned delta)
+{
+   switch (reg.file) {
+   case BAD_FILE:
+      break; /* returned unchanged */
+   case ARF:
+   case FIXED_GRF:
+   case MRF:
+   case VGRF:
+   case ATTR:
+   case UNIFORM: /* step by delta whole components of the given width */
+      return byte_offset(reg, delta * reg.component_size(width));
+   case IMM:
+      assert(delta == 0); /* then leaves the switch and returns reg as is */
+   }
+   return reg;
+}
+
+/**
+ * Get the scalar channel of \p reg given by \p idx and replicate it to all
+ * channels of the result.
+ */
+static inline fs_reg
+component(fs_reg reg, unsigned idx)
+{
+   reg = horiz_offset(reg, idx); /* move the region start to channel idx */
+   reg.stride = 0; /* zero stride provides the replication */
+   return reg;
+}
+
+/**
+ * Return an integer identifying the discrete address space a register is
+ * contained in.  A register is by definition fully contained in the single
+ * reg_space it belongs to, so two registers with different reg_space ids are
+ * guaranteed not to overlap.  Each register file is one reg_space, except
+ * VGRF where every allocation is its own space.  The id is computed as
+ * (file << 16) | nr, with nr taken as 0 for every non-VGRF file.
+ */
+static inline uint32_t
+reg_space(const fs_reg &r)
+{
+   return r.file << 16 | (r.file == VGRF ? r.nr : 0);
+}
+
+/**
+ * Return the base offset in bytes of a register relative to the start of its
+ * reg_space().
+ */
+static inline unsigned
+reg_offset(const fs_reg &r)
+{
+   return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * /* nr skipped here */
+          (r.file == UNIFORM ? 4 : REG_SIZE) + r.offset + /* 4B per UNIFORM */
+          (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
+}
+
+/**
+ * Return the amount of padding in bytes left unused between individual
+ * components of register \p r due to a (horizontal) stride value greater than
+ * one, or zero if components are tightly packed in the register file.
+ */
+static inline unsigned
+reg_padding(const fs_reg &r)
+{
+   const unsigned stride = ((r.file != ARF && r.file != FIXED_GRF) ? r.stride :
+                            r.hstride == 0 ? 0 :
+                            1 << (r.hstride - 1)); /* decoded from hstride */
+   return (MAX2(1, stride) - 1) * type_sz(r.type); /* strides 0 and 1 give 0 */
+}
+
+/**
+ * Return whether the register region starting at \p r and spanning \p dr
+ * bytes could potentially overlap the register region starting at \p s and
+ * spanning \p ds bytes.
+ */
+static inline bool
+regions_overlap(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
+{
+   if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
+      fs_reg t = r;
+      t.nr &= ~BRW_MRF_COMPR4; /* clear the flag bit, keep the rest of nr */
+      /* COMPR4 regions are translated by the hardware during decompression
+       * into two separate half-regions 4 MRFs apart from each other.
+       */
+      return regions_overlap(t, dr / 2, s, ds) ||
+             regions_overlap(byte_offset(t, 4 * REG_SIZE), dr / 2, s, ds);
+
+   } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
+      return regions_overlap(s, ds, r, dr); /* swap so the branch above runs */
+
+   } else { /* same space and the two byte ranges intersect */
+      return reg_space(r) == reg_space(s) &&
+             !(reg_offset(r) + dr <= reg_offset(s) ||
+               reg_offset(s) + ds <= reg_offset(r));
+   }
+}
+
+/**
+ * Check that the byte range of \p dr bytes starting at reg_offset(r) lies
+ * fully inside the byte range of \p ds bytes starting at reg_offset(s),
+ * and that both registers belong to the same reg_space().
+ */
+static inline bool
+region_contained_in(const fs_reg &r, unsigned dr, const fs_reg &s, unsigned ds)
+{
+   return reg_space(r) == reg_space(s) &&
+          reg_offset(r) >= reg_offset(s) &&
+          reg_offset(r) + dr <= reg_offset(s) + ds;
+}
+
+/**
+ * Return whether the given register region is n-periodic, i.e. whether the
+ * original region remains invariant after shifting it by \p n scalar
+ * channels.
+ */
+static inline bool
+is_periodic(const fs_reg &reg, unsigned n)
+{
+   if (reg.file == BAD_FILE || reg.is_null()) {
+      return true; /* treated as periodic for every n */
+
+   } else if (reg.file == IMM) {
+      const unsigned period = (reg.type == BRW_REGISTER_TYPE_UV ||
+                               reg.type == BRW_REGISTER_TYPE_V ? 8 :
+                               reg.type == BRW_REGISTER_TYPE_VF ? 4 :
+                               1); /* 8 and 4: presumably packed vector sizes */
+      return n % period == 0;
+
+   } else if (reg.file == ARF || reg.file == FIXED_GRF) {
+      const unsigned period = (reg.hstride == 0 && reg.vstride == 0 ? 1 :
+                               reg.vstride == 0 ? 1 << reg.width :
+                               ~0); /* all-ones: only n == 0 divides evenly */
+      return n % period == 0;
+
+   } else {
+      return reg.stride == 0; /* n is ignored: only zero-stride regions pass */
+   }
+}
+
+static inline bool
+is_uniform(const fs_reg &reg)
+{
+   return is_periodic(reg, 1); /* invariant under a one-channel shift */
+}
+
+/**
+ * Get the specified 8-component quarter of a register.
+ * XXX - Maybe come up with a less misleading name for this (e.g. quarter())?
+ */
+static inline fs_reg
+half(const fs_reg &reg, unsigned idx)
+{
+   assert(idx < 2); /* only indices 0 and 1 are accepted */
+   return horiz_offset(reg, 8 * idx); /* skip 8 channels per index */
+}
+
+/**
+ * Reinterpret each channel of register \p reg as a vector of values of the
+ * given smaller type and take the i-th subcomponent from each.
+ */
+static inline fs_reg
+subscript(fs_reg reg, brw_reg_type type, unsigned i)
+{
+   assert((i + 1) * type_sz(type) <= type_sz(reg.type)); /* must fit inside */
+
+   if (reg.file == ARF || reg.file == FIXED_GRF) {
+      /* The stride is encoded inconsistently for fixed GRF and ARF registers
+       * as the log2 of the actual vertical and horizontal strides.
+       */
+      const int delta = _mesa_logbase2(type_sz(reg.type)) -
+                        _mesa_logbase2(type_sz(type));
+      reg.hstride += (reg.hstride ? delta : 0); /* zero strides stay zero */
+      reg.vstride += (reg.vstride ? delta : 0);
+
+   } else if (reg.file == IMM) {
+      assert(reg.type == type); /* immediates only accept their own type */
+
+   } else {
+      reg.stride *= type_sz(reg.type) / type_sz(type); /* scale by size ratio */
+   }
+
+   return byte_offset(retype(reg, type), i * type_sz(type));
+}
+
+static const fs_reg reg_undef; /**< Default-constructed fs_reg. */
+
+class fs_inst : public backend_instruction {
+   fs_inst &operator=(const fs_inst &); /* private: not assignable from outside */
+
+   void init(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
+             const fs_reg *src, unsigned sources);
+
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(fs_inst)
+
+   fs_inst();
+   fs_inst(enum opcode opcode, uint8_t exec_size);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+           const fs_reg &src0);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+           const fs_reg &src0, const fs_reg &src1);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+           const fs_reg &src0, const fs_reg &src1, const fs_reg &src2);
+   fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
+           const fs_reg src[], unsigned sources);
+   fs_inst(const fs_inst &that);
+   ~fs_inst();
+
+   void resize_sources(uint8_t num_sources);
+
+   bool equals(fs_inst *inst) const;
+   bool is_send_from_grf() const;
+   bool is_partial_write() const;
+   bool is_copy_payload(const brw::simple_allocator &grf_alloc) const;
+   unsigned components_read(unsigned i) const;
+   unsigned size_read(int arg) const;
+   bool can_do_source_mods(const struct gen_device_info *devinfo);
+   bool can_change_types() const;
+   bool has_side_effects() const;
+   bool has_source_and_destination_hazard() const;
+
+   /**
+    * Return the subset of flag registers read by the instruction as a bitset
+    * with byte granularity.
+    */
+   unsigned flags_read(const gen_device_info *devinfo) const;
+
+   /**
+    * Return the subset of flag registers updated by the instruction (either
+    * partially or fully) as a bitset with byte granularity.
+    */
+   unsigned flags_written() const;
+
+   fs_reg dst;
+   fs_reg *src; /**< Source registers, accessed as src[i]. */
+
+   uint8_t sources; /**< Number of fs_reg sources. */
+
+   bool eot:1;
+   bool pi_noperspective:1;   /**< Pixel interpolator noperspective flag */
+};
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a possibly
+ * inverted predicate.
+ */
+static inline fs_inst *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+                  fs_inst *inst)
+{
+   inst->predicate = pred;
+   inst->predicate_inverse = inverse;
+   return inst; /* the same pointer that was passed in */
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a predicate.
+ */
+static inline fs_inst *
+set_predicate(enum brw_predicate pred, fs_inst *inst)
+{
+   return set_predicate_inv(pred, false, inst); /* non-inverted form */
+}
+
+/**
+ * Write the result of evaluating the condition given by \p mod to a flag
+ * register.
+ */
+static inline fs_inst *
+set_condmod(enum brw_conditional_mod mod, fs_inst *inst)
+{
+   inst->conditional_mod = mod; /* replaces any modifier already set */
+   return inst;
+}
+
+/**
+ * Set whether the result of \p inst is clamped to the saturation range of
+ * its destination datatype.
+ */
+static inline fs_inst *
+set_saturate(bool saturate, fs_inst *inst)
+{
+   inst->saturate = saturate; /* false clears a previously set flag */
+   return inst;
+}
+
+/**
+ * Return the number of dataflow registers written by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->dst) /
+ * register_size)'.  The somewhat arbitrary register size unit is 4B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_written(const fs_inst *inst)
+{
+   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); /* REG_SIZE only */
+   return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE +
+                       inst->size_written -
+                       MIN2(inst->size_written, reg_padding(inst->dst)),
+                       REG_SIZE); /* reg_padding() is subtracted once, clamped */
+}
+
+/**
+ * Return the number of dataflow registers read by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
+ * register_size)'.  The somewhat arbitrary register size unit is 4B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_read(const fs_inst *inst, unsigned i)
+{
+   const unsigned reg_size = /* counting unit for this source's file */
+      inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 4 : REG_SIZE;
+   return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size +
+                       inst->size_read(i) -
+                       MIN2(inst->size_read(i), reg_padding(inst->src[i])),
+                       reg_size); /* reg_padding() is subtracted once, clamped */
+}
+
+#endif
diff --git a/src/intel/compiler/brw_ir_vec4.h b/src/intel/compiler/brw_ir_vec4.h
new file mode 100644
index 00000000000..bd026eb2aeb
--- /dev/null
+++ b/src/intel/compiler/brw_ir_vec4.h
@@ -0,0 +1,409 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2011-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_IR_VEC4_H
+#define BRW_IR_VEC4_H
+
+#include "brw_shader.h"
+
+namespace brw {
+
+class dst_reg;
+
+class src_reg : public backend_reg
+{
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(src_reg)
+
+   void init();
+
+   src_reg(enum brw_reg_file file, int nr, const glsl_type *type);
+   src_reg();
+   src_reg(struct ::brw_reg reg);
+
+   bool equals(const src_reg &r) const;
+
+   src_reg(class vec4_visitor *v, const struct glsl_type *type);
+   src_reg(class vec4_visitor *v, const struct glsl_type *type, int size);
+
+   explicit src_reg(const dst_reg &reg);
+
+   src_reg *reladdr; /**< May be NULL; is_uniform() tests it before use. */
+};
+
+static inline src_reg
+retype(src_reg reg, enum brw_reg_type type)
+{
+   reg.type = type; /* only the type field changes on the returned copy */
+   return reg;
+}
+
+namespace detail {
+
+static inline void
+add_byte_offset(backend_reg *reg, unsigned bytes)
+{
+   switch (reg->file) {
+      case BAD_FILE:
+         break; /* left unchanged */
+      case VGRF:
+      case ATTR:
+      case UNIFORM:
+         reg->offset += bytes; /* these files keep one flat byte offset */
+         assert(reg->offset % 16 == 0); /* result must be 16-byte aligned */
+         break;
+      case MRF: {
+         const unsigned suboffset = reg->offset + bytes;
+         reg->nr += suboffset / REG_SIZE; /* whole registers carry into nr */
+         reg->offset = suboffset % REG_SIZE;
+         assert(reg->offset % 16 == 0);
+         break;
+      }
+      case ARF:
+      case FIXED_GRF: {
+         const unsigned suboffset = reg->subnr + bytes;
+         reg->nr += suboffset / REG_SIZE; /* same carry, remainder in subnr */
+         reg->subnr = suboffset % REG_SIZE;
+         assert(reg->subnr % 16 == 0);
+         break;
+      }
+      default:
+         assert(bytes == 0); /* any other file accepts only a zero offset */
+   }
+}
+
+} /* namespace detail */
+
+static inline src_reg
+byte_offset(src_reg reg, unsigned bytes)
+{
+   detail::add_byte_offset(&reg, bytes); /* modifies the by-value copy */
+   return reg;
+}
+
+static inline src_reg
+offset(src_reg reg, unsigned width, unsigned delta)
+{
+   const unsigned stride = (reg.file == UNIFORM ? 0 : 4); /* 0: width ignored */
+   const unsigned num_components = MAX2(width / 4 * stride, 4); /* at least 4 */
+   return byte_offset(reg, num_components * type_sz(reg.type) * delta);
+}
+
+static inline src_reg
+horiz_offset(src_reg reg, unsigned delta)
+{
+   return byte_offset(reg, delta * type_sz(reg.type)); /* delta elements */
+}
+
+/**
+ * Reswizzle a given source register.
+ * \sa brw_swizzle().
+ */
+static inline src_reg
+swizzle(src_reg reg, unsigned swizzle)
+{
+   if (reg.file == IMM) /* immediates: the stored value itself is swizzled */
+      reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle);
+   else /* otherwise compose with the swizzle already on the register */
+      reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle);
+
+   return reg;
+}
+
+static inline src_reg
+negate(src_reg reg)
+{
+   assert(reg.file != IMM); /* immediates are rejected by this helper */
+   reg.negate = !reg.negate; /* toggle: applying negate() twice is a no-op */
+   return reg;
+}
+
+static inline bool
+is_uniform(const src_reg &reg)
+{
+   return (reg.file == IMM || reg.file == UNIFORM || reg.is_null()) &&
+          (!reg.reladdr || is_uniform(*reg.reladdr)); /* recurse into reladdr */
+}
+
+class dst_reg : public backend_reg
+{
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(dst_reg)
+
+   void init();
+
+   dst_reg();
+   dst_reg(enum brw_reg_file file, int nr);
+   dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
+           unsigned writemask);
+   dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
+           unsigned writemask);
+   dst_reg(struct ::brw_reg reg);
+   dst_reg(class vec4_visitor *v, const struct glsl_type *type);
+
+   explicit dst_reg(const src_reg &reg); /* explicit: no silent src -> dst */
+
+   bool equals(const dst_reg &r) const;
+
+   src_reg *reladdr; /**< Relative-addressing register, a src_reg pointer. */
+};
+
+static inline dst_reg
+retype(dst_reg reg, enum brw_reg_type type)
+{
+   reg.type = type; /* only the type field changes on the returned copy */
+   return reg;
+}
+
+static inline dst_reg
+byte_offset(dst_reg reg, unsigned bytes)
+{
+   detail::add_byte_offset(&reg, bytes); /* modifies the by-value copy */
+   return reg;
+}
+
+static inline dst_reg
+offset(dst_reg reg, unsigned width, unsigned delta)
+{
+   const unsigned stride = (reg.file == UNIFORM ? 0 : 4); /* 0: width ignored */
+   const unsigned num_components = MAX2(width / 4 * stride, 4); /* at least 4 */
+   return byte_offset(reg, num_components * type_sz(reg.type) * delta);
+}
+
+static inline dst_reg
+horiz_offset(dst_reg reg, unsigned delta)
+{
+   return byte_offset(reg, delta * type_sz(reg.type)); /* delta elements */
+}
+
+static inline dst_reg
+writemask(dst_reg reg, unsigned mask)
+{
+   assert(reg.file != IMM);
+   assert((reg.writemask & mask) != 0); /* result must keep some channel */
+   reg.writemask &= mask; /* can only narrow the existing mask */
+   return reg;
+}
+
+/**
+ * Return an integer identifying the discrete address space a register is
+ * contained in.  A register is by definition fully contained in the single
+ * reg_space it belongs to, so two registers with different reg_space ids are
+ * guaranteed not to overlap.  Each register file is one reg_space, except
+ * VGRF where every allocation is its own space.  The id is computed as
+ * (file << 16) | nr, with nr taken as 0 for every non-VGRF file.
+ */
+static inline uint32_t
+reg_space(const backend_reg &r)
+{
+   return r.file << 16 | (r.file == VGRF ? r.nr : 0);
+}
+
+/**
+ * Return the base offset in bytes of a register relative to the start of its
+ * reg_space().
+ */
+static inline unsigned
+reg_offset(const backend_reg &r)
+{
+   return (r.file == VGRF || r.file == IMM ? 0 : r.nr) * /* nr skipped here */
+          (r.file == UNIFORM ? 16 : REG_SIZE) + r.offset + /* 16B per UNIFORM */
+          (r.file == ARF || r.file == FIXED_GRF ? r.subnr : 0);
+}
+
+/**
+ * Return whether the register region starting at \p r and spanning \p dr
+ * bytes could potentially overlap the register region starting at \p s and
+ * spanning \p ds bytes.
+ */
+static inline bool
+regions_overlap(const backend_reg &r, unsigned dr,
+                const backend_reg &s, unsigned ds)
+{
+   if (r.file == MRF && (r.nr & BRW_MRF_COMPR4)) {
+      /* COMPR4 regions are translated by the hardware during decompression
+       * into two separate half-regions 4 MRFs apart from each other.
+       */
+      backend_reg t0 = r;
+      t0.nr &= ~BRW_MRF_COMPR4; /* clear the flag bit, keep the rest of nr */
+      backend_reg t1 = t0;
+      t1.offset += 4 * REG_SIZE; /* second half-region, 4 registers further */
+      return regions_overlap(t0, dr / 2, s, ds) ||
+             regions_overlap(t1, dr / 2, s, ds);
+
+   } else if (s.file == MRF && (s.nr & BRW_MRF_COMPR4)) {
+      return regions_overlap(s, ds, r, dr); /* swap so the branch above runs */
+
+   } else { /* same space and the two byte ranges intersect */
+      return reg_space(r) == reg_space(s) &&
+             !(reg_offset(r) + dr <= reg_offset(s) ||
+               reg_offset(s) + ds <= reg_offset(r));
+   }
+}
+
+class vec4_instruction : public backend_instruction {
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(vec4_instruction)
+
+   vec4_instruction(enum opcode opcode,
+                    const dst_reg &dst = dst_reg(),
+                    const src_reg &src0 = src_reg(),
+                    const src_reg &src1 = src_reg(),
+                    const src_reg &src2 = src_reg());
+
+   dst_reg dst;
+   src_reg src[3]; /**< Up to three sources; unused ones default to src_reg(). */
+
+   enum brw_urb_write_flags urb_write_flags;
+
+   unsigned sol_binding; /**< gen6: SOL binding table index */
+   bool sol_final_write; /**< gen6: send commit message */
+   unsigned sol_vertex; /**< gen6: used for setting dst index in SVB header */
+
+   bool is_send_from_grf();
+   unsigned size_read(unsigned arg) const;
+   bool can_reswizzle(const struct gen_device_info *devinfo, int dst_writemask,
+                      int swizzle, int swizzle_mask);
+   void reswizzle(int dst_writemask, int swizzle);
+   bool can_do_source_mods(const struct gen_device_info *devinfo);
+   bool can_do_writemask(const struct gen_device_info *devinfo);
+   bool can_change_types() const;
+   bool has_source_and_destination_hazard() const;
+
+   bool is_align1_partial_write() const /* true for the SET_*_32BIT opcodes */
+   {
+      return opcode == VEC4_OPCODE_SET_LOW_32BIT ||
+             opcode == VEC4_OPCODE_SET_HIGH_32BIT;
+   }
+
+   bool reads_flag() const /* any predicate, or the flag-unpacking opcode */
+   {
+      return predicate || opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2;
+   }
+
+   bool reads_flag(unsigned c) const /* per-channel variant, c in 0..3 */
+   {
+      if (opcode == VS_OPCODE_UNPACK_FLAGS_SIMD4X2)
+         return true;
+
+      switch (predicate) {
+      case BRW_PREDICATE_NONE:
+         return false;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_X:
+         return c == 0;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_Y:
+         return c == 1;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_Z:
+         return c == 2;
+      case BRW_PREDICATE_ALIGN16_REPLICATE_W:
+         return c == 3;
+      default: /* every other predicate mode reports all channels */
+         return true;
+      }
+   }
+
+   bool writes_flag() const /* SEL, IF and WHILE are excluded */
+   {
+      return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
+                                  opcode != BRW_OPCODE_IF &&
+                                  opcode != BRW_OPCODE_WHILE));
+   }
+};
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a possibly
+ * inverted predicate.
+ */
+inline vec4_instruction *
+set_predicate_inv(enum brw_predicate pred, bool inverse,
+                  vec4_instruction *inst)
+{
+   inst->predicate = pred;
+   inst->predicate_inverse = inverse;
+   return inst; /* the same pointer that was passed in */
+}
+
+/**
+ * Make the execution of \p inst dependent on the evaluation of a predicate.
+ */
+inline vec4_instruction *
+set_predicate(enum brw_predicate pred, vec4_instruction *inst)
+{
+   return set_predicate_inv(pred, false, inst); /* non-inverted form */
+}
+
+/**
+ * Write the result of evaluating the condition given by \p mod to a flag
+ * register.
+ */
+inline vec4_instruction *
+set_condmod(enum brw_conditional_mod mod, vec4_instruction *inst)
+{
+   inst->conditional_mod = mod; /* replaces any modifier already set */
+   return inst;
+}
+
+/**
+ * Set whether the result of \p inst is clamped to the saturation range of
+ * its destination datatype.
+ */
+inline vec4_instruction *
+set_saturate(bool saturate, vec4_instruction *inst)
+{
+   inst->saturate = saturate; /* false clears a previously set flag */
+   return inst;
+}
+
+/**
+ * Return the number of dataflow registers written by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->dst) /
+ * register_size)'.  The somewhat arbitrary register size unit is 16B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_written(const vec4_instruction *inst)
+{
+   assert(inst->dst.file != UNIFORM && inst->dst.file != IMM); /* REG_SIZE only */
+   return DIV_ROUND_UP(reg_offset(inst->dst) % REG_SIZE + inst->size_written,
+                       REG_SIZE); /* rounds up: partial registers count */
+}
+
+/**
+ * Return the number of dataflow registers read by the instruction (either
+ * fully or partially) counted from 'floor(reg_offset(inst->src[i]) /
+ * register_size)'.  The somewhat arbitrary register size unit is 16B for the
+ * UNIFORM and IMM files and 32B for all other files.
+ */
+inline unsigned
+regs_read(const vec4_instruction *inst, unsigned i)
+{
+   const unsigned reg_size = /* counting unit for this source's file */
+      inst->src[i].file == UNIFORM || inst->src[i].file == IMM ? 16 : REG_SIZE;
+   return DIV_ROUND_UP(reg_offset(inst->src[i]) % reg_size + inst->size_read(i),
+                       reg_size); /* rounds up: partial registers count */
+}
+
+} /* namespace brw */
+
+#endif
diff --git a/src/intel/compiler/brw_nir.c b/src/intel/compiler/brw_nir.c
new file mode 100644
index 00000000000..f86308521e9
--- /dev/null
+++ b/src/intel/compiler/brw_nir.c
@@ -0,0 +1,764 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_shader.h"
+#include "common/gen_debug.h"
+#include "compiler/glsl_types.h"
+#include "compiler/nir/nir_builder.h"
+
+static bool
+is_input(nir_intrinsic_instr *intrin)
+{
+   return intrin->intrinsic == nir_intrinsic_load_interpolated_input ||
+          intrin->intrinsic == nir_intrinsic_load_per_vertex_input ||
+          intrin->intrinsic == nir_intrinsic_load_input;
+}
+
+static bool
+is_output(nir_intrinsic_instr *intrin)
+{
+   const nir_intrinsic_op op = intrin->intrinsic;
+   return op == nir_intrinsic_load_output || op == nir_intrinsic_store_output ||
+          op == nir_intrinsic_load_per_vertex_output ||
+          op == nir_intrinsic_store_per_vertex_output;
+}
+
+/**
+ * Fold constant I/O offsets into the intrinsic's base.
+ *
+ * Usually the base and offset just get added together, so keeping them
+ * separate buys nothing.  Sometimes merging them is essential: if a shader
+ * only accesses part of a compound variable (such as a matrix or array),
+ * the variable's base may not actually exist in the VUE map.
+ *
+ * A constant offset is added to const_index[0] and the offset source is
+ * reset to 0.  Non-constant offsets are left alone.  Always returns true.
+ */
+
+static bool
+add_const_offset_to_base_block(nir_block *block, nir_builder *b,
+                               nir_variable_mode mode)
+{
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *io = nir_instr_as_intrinsic(instr);
+      const bool selected = (mode == nir_var_shader_in && is_input(io)) ||
+                            (mode == nir_var_shader_out && is_output(io));
+      if (!selected)
+         continue;
+
+      nir_src *offset_src = nir_get_io_offset_src(io);
+      nir_const_value *imm = nir_src_as_const_value(*offset_src);
+      if (!imm)
+         continue;
+      io->const_index[0] += imm->u32[0];
+      b->cursor = nir_before_instr(&io->instr);
+      nir_instr_rewrite_src(&io->instr, offset_src,
+                            nir_src_for_ssa(nir_imm_int(b, 0)));
+   }
+   return true;
+}
+
+static void
+add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode)
+{
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+      /* One builder per implementation, shared by all of its blocks. */
+      nir_builder build;
+      nir_builder_init(&build, func->impl);
+      nir_foreach_block(blk, func->impl)
+         add_const_offset_to_base_block(blk, &build, mode);
+   }
+}
+
+static bool
+remap_vs_attrs(nir_block *block, shader_info *nir_info)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+      if (load->intrinsic != nir_intrinsic_load_input)
+         continue;
+
+      /* Attributes arrive as one contiguous block ordered by gl_vert_attrib
+       * value, so an attribute's slot is the number of enabled attributes
+       * that precede it: mask inputs_read below the attribute's bit and
+       * count what is left.  The slot is stored back multiplied by 4.
+       */
+      const int attr = load->const_index[0];
+      const int slot = _mesa_bitcount_64(nir_info->inputs_read &
+                                         BITFIELD64_MASK(attr));
+      load->const_index[0] = 4 * slot;
+   }
+   return true;
+}
+
+static bool
+remap_inputs_with_vue_map(nir_block *block, const struct brw_vue_map *vue_map)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *load = nir_instr_as_intrinsic(instr);
+      if (load->intrinsic != nir_intrinsic_load_input &&
+          load->intrinsic != nir_intrinsic_load_per_vertex_input)
+         continue;
+      /* Replace the varying location with its slot in the VUE map. */
+      const int slot = vue_map->varying_to_slot[load->const_index[0]];
+      assert(slot != -1);
+      load->const_index[0] = slot;
+   }
+   return true;
+}
+
+static bool
+remap_tess_levels(nir_builder *b, nir_intrinsic_instr *intr,
+                  GLenum primitive_mode)
+{
+   const int location = nir_intrinsic_base(intr);
+   const unsigned component = nir_intrinsic_component(intr);
+   bool out_of_bounds; /* set on every path that does not return early */
+
+   if (location == VARYING_SLOT_TESS_LEVEL_INNER) {
+      switch (primitive_mode) {
+      case GL_QUADS:
+         /* gl_TessLevelInner[0..1] lives at DWords 3-2 (reversed). */
+         nir_intrinsic_set_base(intr, 0);
+         nir_intrinsic_set_component(intr, 3 - component);
+         out_of_bounds = false;
+         break;
+      case GL_TRIANGLES:
+         /* gl_TessLevelInner[0] lives at DWord 4. */
+         nir_intrinsic_set_base(intr, 1);
+         out_of_bounds = component > 0;
+         break;
+      case GL_ISOLINES:
+         out_of_bounds = true;
+         break;
+      default:
+         unreachable("Bogus tessellation domain");
+      }
+   } else if (location == VARYING_SLOT_TESS_LEVEL_OUTER) {
+      if (primitive_mode == GL_ISOLINES) {
+         /* gl_TessLevelOuter[0..1] lives at DWords 6-7 (in order). */
+         nir_intrinsic_set_base(intr, 1);
+         nir_intrinsic_set_component(intr, 2 + nir_intrinsic_component(intr));
+         out_of_bounds = component > 1;
+      } else {
+         /* Triangles use DWords 7-5 (reversed); Quads use 7-4 (reversed) */
+         nir_intrinsic_set_base(intr, 1);
+         nir_intrinsic_set_component(intr, 3 - nir_intrinsic_component(intr));
+         out_of_bounds = component == 3 && primitive_mode == GL_TRIANGLES;
+      }
+   } else {
+      return false; /* not a tess level: left for the caller to remap */
+   }
+   /* Out-of-bounds loads yield undef; the access itself is removed. */
+   if (out_of_bounds) {
+      if (nir_intrinsic_infos[intr->intrinsic].has_dest) {
+         b->cursor = nir_before_instr(&intr->instr);
+         nir_ssa_def *undef = nir_ssa_undef(b, 1, 32);
+         nir_ssa_def_rewrite_uses(&intr->dest.ssa, nir_src_for_ssa(undef));
+      }
+      nir_instr_remove(&intr->instr);
+   }
+
+   return true; /* tess level handled: rewritten in place or removed */
+}
+
+static bool
+remap_patch_urb_offsets(nir_block *block, nir_builder *b,
+                        const struct brw_vue_map *vue_map,
+                        GLenum tes_primitive_mode)
+{
+   const bool is_passthrough_tcs = b->shader->info->name &&
+      strcmp(b->shader->info->name, "passthrough") == 0;
+   /* _safe iteration: remap_tess_levels() may remove the instruction. */
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+
+      gl_shader_stage stage = b->shader->stage;
+
+      if ((stage == MESA_SHADER_TESS_CTRL && is_output(intrin)) ||
+          (stage == MESA_SHADER_TESS_EVAL && is_input(intrin))) {
+         /* Tess levels are handled apart, except in a "passthrough" shader. */
+         if (!is_passthrough_tcs &&
+             remap_tess_levels(b, intrin, tes_primitive_mode))
+            continue;
+
+         int vue_slot = vue_map->varying_to_slot[intrin->const_index[0]];
+         assert(vue_slot != -1);
+         intrin->const_index[0] = vue_slot;
+         /* Per-vertex accesses: fold in vertex * num_per_vertex_slots. */
+         nir_src *vertex = nir_get_io_vertex_index_src(intrin);
+         if (vertex) {
+            nir_const_value *const_vertex = nir_src_as_const_value(*vertex);
+            if (const_vertex) {
+               intrin->const_index[0] += const_vertex->u32[0] *
+                                         vue_map->num_per_vertex_slots;
+            } else {
+               b->cursor = nir_before_instr(&intrin->instr);
+
+               /* Multiply by the number of per-vertex slots. */
+               nir_ssa_def *vertex_offset =
+                  nir_imul(b,
+                           nir_ssa_for_src(b, *vertex, 1),
+                           nir_imm_int(b,
+                                       vue_map->num_per_vertex_slots));
+
+               /* Add it to the existing offset */
+               nir_src *offset = nir_get_io_offset_src(intrin);
+               nir_ssa_def *total_offset =
+                  nir_iadd(b, vertex_offset,
+                           nir_ssa_for_src(b, *offset, 1));
+
+               nir_instr_rewrite_src(&intrin->instr, offset,
+                                     nir_src_for_ssa(total_offset));
+            }
+         }
+      }
+   }
+   return true;
+}
+
+void
+brw_nir_lower_vs_inputs(nir_shader *nir,
+                        bool is_scalar,
+                        bool use_legacy_snorm_formula,
+                        const uint8_t *vs_attrib_wa_flags)
+{
+   /* Start each input's driver_location at the variable's location. */
+   nir_foreach_variable(input, &nir->inputs)
+      input->data.driver_location = input->data.location;
+
+   /* Let nir_lower_io walk the dereference chains.  Attribute arrays are
+    * loaded as one vec4 or dvec4 per element (or matrix column), depending on
+    * whether it is a double-precision type or not.
+    */
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
+
+   /* add_const_offset_to_base() only folds offsets that are already
+    * constants, so run constant folding first.
+    */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_in);
+
+   brw_nir_apply_attribute_workarounds(nir, use_legacy_snorm_formula,
+                                       vs_attrib_wa_flags);
+
+   /* Only the scalar backend translates VERT_ATTRIB_* values into registers. */
+   if (!is_scalar)
+      return;
+
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_foreach_block(blk, func->impl)
+         remap_vs_attrs(blk, nir->info);
+   }
+}
+
+void
+brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+                         const struct brw_vue_map *vue_map)
+{
+   nir_foreach_variable(input, &nir->inputs)
+      input->data.driver_location = input->data.location;
+
+   /* Inputs are stored in vec4 slots, so use type_size_vec4(). */
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
+
+   /* Non-scalar geometry shaders keep the locations nir_lower_io produced. */
+   if (!is_scalar && nir->stage == MESA_SHADER_GEOMETRY)
+      return;
+
+   /* This pass needs actual constants */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_in);
+
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_foreach_block(blk, func->impl)
+         remap_inputs_with_vue_map(blk, vue_map);
+   }
+}
+
+void
+brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue_map)
+{
+   nir_foreach_variable(input, &nir->inputs)
+      input->data.driver_location = input->data.location;
+
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, 0);
+
+   /* This pass needs actual constants */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_in);
+
+   /* Remap the lowered locations with remap_patch_urb_offsets(). */
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder build;
+      nir_builder_init(&build, func->impl);
+      nir_foreach_block(blk, func->impl) {
+         remap_patch_urb_offsets(blk, &build, vue_map,
+                                 nir->info->tess.primitive_mode);
+      }
+   }
+}
+
+void
+brw_nir_lower_fs_inputs(nir_shader *nir,
+                        const struct gen_device_info *devinfo,
+                        const struct brw_wm_prog_key *key)
+{
+   nir_foreach_variable(input, &nir->inputs) {
+      input->data.driver_location = input->data.location;
+
+      /* Pick a default interpolation mode where none was specified.
+       *
+       * Smooth is the default for everything but the legacy GL color
+       * built-in variables, which might be flat depending on API state.
+       */
+      if (input->data.interpolation == INTERP_MODE_NONE) {
+         const bool is_color = input->data.location == VARYING_SLOT_COL0 ||
+                               input->data.location == VARYING_SLOT_COL1;
+         if (key->flat_shade && is_color)
+            input->data.interpolation = INTERP_MODE_FLAT;
+         else
+            input->data.interpolation = INTERP_MODE_SMOOTH;
+      }
+
+      /* Ironlake and below have a single interpolation mode.  Centroid
+       * interpolation doesn't mean anything on this hardware -- there is
+       * no multisampling.
+       */
+      if (devinfo->gen < 6) {
+         input->data.centroid = false;
+         input->data.sample = false;
+      }
+   }
+
+   /* Force per-sample interpolation when the program key asks for it. */
+   const nir_lower_io_options io_options =
+      key->persample_interp ? nir_lower_io_force_sample_interpolation : 0;
+
+   nir_lower_io(nir, nir_var_shader_in, type_size_vec4, io_options);
+
+   /* This pass needs actual constants */
+   nir_opt_constant_folding(nir);
+
+   add_const_offset_to_base(nir, nir_var_shader_in);
+}
+
+void
+brw_nir_lower_vue_outputs(nir_shader *nir,
+                          bool is_scalar)
+{
+   /* Note: is_scalar is not used by this function. */
+   nir_foreach_variable(output, &nir->outputs)
+      output->data.driver_location = output->data.location;
+
+   nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0);
+}
+
+void
+brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue_map,
+                          GLenum tes_primitive_mode)
+{
+   nir_foreach_variable(output, &nir->outputs)
+      output->data.driver_location = output->data.location;
+
+   nir_lower_io(nir, nir_var_shader_out, type_size_vec4, 0);
+
+   /* This pass needs actual constants */
+   nir_opt_constant_folding(nir);
+   add_const_offset_to_base(nir, nir_var_shader_out);
+
+   /* Remap the lowered locations with remap_patch_urb_offsets(). */
+   nir_foreach_function(func, nir) {
+      if (!func->impl)
+         continue;
+
+      nir_builder build;
+      nir_builder_init(&build, func->impl);
+      nir_foreach_block(blk, func->impl) {
+         remap_patch_urb_offsets(blk, &build, vue_map, tes_primitive_mode);
+      }
+   }
+}
+
+void
+brw_nir_lower_fs_outputs(nir_shader *nir)
+{
+   /* Pack each output's index and location into its driver_location. */
+   nir_foreach_variable(output, &nir->outputs)
+      output->data.driver_location =
+         SET_FIELD(output->data.location, BRW_NIR_FRAG_OUTPUT_LOCATION) |
+         SET_FIELD(output->data.index, BRW_NIR_FRAG_OUTPUT_INDEX);
+
+   nir_lower_io(nir, nir_var_shader_out, type_size_dvec4, 0);
+}
+
+void
+brw_nir_lower_cs_shared(nir_shader *nir)
+{
+   nir_assign_var_locations(&nir->shared, &nir->num_shared,
+                            type_size_scalar_bytes); /* sizes in bytes */
+   nir_lower_io(nir, nir_var_shared, type_size_scalar_bytes, 0);
+}
+
+#define OPT(pass, ...) ({                                  \
+   bool this_progress = false;                             \
+   NIR_PASS(this_progress, nir, pass, ##__VA_ARGS__);      \
+   if (this_progress)                                      \
+      progress = true;                                     \
+   this_progress;                                          \
+})
+/* OPT_V runs a pass via NIR_PASS_V and is not given the "progress" flag. */
+#define OPT_V(pass, ...) NIR_PASS_V(nir, pass, ##__VA_ARGS__)
+
+static nir_shader *
+nir_optimize(nir_shader *nir, const struct brw_compiler *compiler,
+             bool is_scalar)
+{
+   nir_variable_mode indirect_mask = 0; /* for nir_opt_loop_unroll */
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput)
+      indirect_mask |= nir_var_shader_in;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput)
+      indirect_mask |= nir_var_shader_out;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp)
+      indirect_mask |= nir_var_local;
+   /* Re-run the whole pass list until an iteration makes no progress. */
+   bool progress; /* set by OPT() whenever a pass reports progress */
+   do {
+      progress = false;
+      OPT_V(nir_lower_vars_to_ssa);
+      OPT(nir_opt_copy_prop_vars);
+
+      if (is_scalar) {
+         OPT(nir_lower_alu_to_scalar);
+      }
+
+      OPT(nir_copy_prop);
+
+      if (is_scalar) {
+         OPT(nir_lower_phis_to_scalar);
+      }
+
+      OPT(nir_copy_prop);
+      OPT(nir_opt_dce);
+      OPT(nir_opt_cse);
+      OPT(nir_opt_peephole_select, 0);
+      OPT(nir_opt_algebraic);
+      OPT(nir_opt_constant_folding);
+      OPT(nir_opt_dead_cf);
+      if (OPT(nir_opt_trivial_continues)) {
+         /* If nir_opt_trivial_continues makes progress, then we need to clean
+          * things up if we want any hope of nir_opt_if or nir_opt_loop_unroll
+          * to make progress.
+          */
+         OPT(nir_copy_prop);
+         OPT(nir_opt_dce);
+      }
+      OPT(nir_opt_if);
+      if (nir->options->max_unroll_iterations != 0) {
+         OPT(nir_opt_loop_unroll, indirect_mask);
+      }
+      OPT(nir_opt_remove_phis);
+      OPT(nir_opt_undef);
+      OPT_V(nir_lower_doubles, nir_lower_drcp |
+                               nir_lower_dsqrt |
+                               nir_lower_drsq |
+                               nir_lower_dtrunc |
+                               nir_lower_dfloor |
+                               nir_lower_dceil |
+                               nir_lower_dfract |
+                               nir_lower_dround_even |
+                               nir_lower_dmod);
+      OPT_V(nir_lower_64bit_pack);
+   } while (progress);
+
+   return nir;
+}
+
+/* Does some simple lowering and runs the standard suite of optimizations
+ *
+ * This is intended to be called more-or-less directly after you get the
+ * shader out of GLSL or some other source.  While it is geared towards i965,
+ * it is not at all generator-specific except for is_scalar, which is read
+ * from compiler->scalar_stage[nir->stage].  Even there, a non-scalar run is
+ * safe for a shader intended for the FS backend as long as nir_optimize is
+ * called again with is_scalar = true to scalarize everything before code gen.
+ */
+nir_shader *
+brw_preprocess_nir(const struct brw_compiler *compiler, nir_shader *nir)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   bool progress; /* Written by OPT (OPT_V is not given it) */
+   (void)progress;
+
+   const bool is_scalar = compiler->scalar_stage[nir->stage];
+
+   if (nir->stage == MESA_SHADER_GEOMETRY)
+      OPT(nir_lower_gs_intrinsics);
+
+   /* See also brw_nir_trig_workarounds.py */
+   if (compiler->precise_trig &&
+       !(devinfo->gen >= 10 || devinfo->is_kabylake))
+      OPT(brw_nir_apply_trig_workarounds);
+
+   static const nir_lower_tex_options tex_options = {
+      .lower_txp = ~0,
+      .lower_txf_offset = true,
+      .lower_rect_offset = true,
+      .lower_txd_cube_map = true,
+   };
+
+   OPT(nir_lower_tex, &tex_options);
+   OPT(nir_normalize_cubemap_coords);
+
+   OPT(nir_lower_global_vars_to_local);
+
+   OPT(nir_split_var_copies);
+
+   nir = nir_optimize(nir, compiler, is_scalar);
+
+   if (is_scalar) {
+      OPT_V(nir_lower_load_const_to_scalar);
+   }
+
+   /* Lower a bunch of stuff */
+   OPT_V(nir_lower_var_copies);
+
+   OPT_V(nir_lower_clip_cull_distance_arrays);
+
+   nir_variable_mode indirect_mask = 0;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectInput)
+      indirect_mask |= nir_var_shader_in;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectOutput)
+      indirect_mask |= nir_var_shader_out;
+   if (compiler->glsl_compiler_options[nir->stage].EmitNoIndirectTemp)
+      indirect_mask |= nir_var_local;
+
+   nir_lower_indirect_derefs(nir, indirect_mask);
+
+   nir_lower_int64(nir, nir_lower_imul64 |
+                        nir_lower_isign64 |
+                        nir_lower_divmod64);
+
+   /* Get rid of split copies */
+   nir = nir_optimize(nir, compiler, is_scalar);
+
+   OPT(nir_remove_dead_variables, nir_var_local);
+
+   return nir;
+}
+
+/* Prepare the given shader for codegen
+ *
+ * This function is intended to be called right before going into the actual
+ * backend and is highly backend-specific.  Also, once this function has been
+ * called on a shader, it will no longer be in SSA form so most optimizations
+ * will not work.
+ */
+nir_shader *
+brw_postprocess_nir(nir_shader *nir, const struct brw_compiler *compiler,
+                    bool is_scalar)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   bool debug_enabled =
+      (INTEL_DEBUG & intel_debug_flag_for_shader_stage(nir->stage));
+
+   bool progress; /* Written by OPT (OPT_V is not given it) */
+   (void)progress;
+
+   nir = nir_optimize(nir, compiler, is_scalar);
+
+   if (devinfo->gen >= 6) {
+      /* Try and fuse multiply-adds */
+      OPT(brw_nir_opt_peephole_ffma);
+   }
+
+   OPT(nir_opt_algebraic_late);
+
+   OPT_V(nir_lower_to_source_mods);
+   OPT(nir_copy_prop);
+   OPT(nir_opt_dce);
+   OPT(nir_opt_move_comparisons);
+
+   OPT(nir_lower_locals_to_regs);
+
+   if (unlikely(debug_enabled)) {
+      /* Re-index SSA defs so we print more sensible numbers. */
+      nir_foreach_function(function, nir) {
+         if (function->impl)
+            nir_index_ssa_defs(function->impl);
+      }
+
+      fprintf(stderr, "NIR (SSA form) for %s shader:\n",
+              _mesa_shader_stage_to_string(nir->stage));
+      nir_print_shader(nir, stderr);
+   }
+
+   OPT_V(nir_convert_from_ssa, true);
+
+   if (!is_scalar) {
+      OPT_V(nir_move_vec_src_uses_to_dest);
+      OPT(nir_lower_vec_to_movs);
+   }
+
+   /* This is the last pass we run before we start emitting stuff.  It
+    * determines when we need to insert boolean resolves on Gen <= 5.  We
+    * run it last because it stashes data in instr->pass_flags and we don't
+    * want that to be squashed by other NIR passes.
+    */
+   if (devinfo->gen <= 5)
+      brw_nir_analyze_boolean_resolves(nir);
+
+   nir_sweep(nir);
+
+   if (unlikely(debug_enabled)) {
+      fprintf(stderr, "NIR (final form) for %s shader:\n",
+              _mesa_shader_stage_to_string(nir->stage));
+      nir_print_shader(nir, stderr);
+   }
+
+   return nir;
+}
+
+nir_shader *
+brw_nir_apply_sampler_key(nir_shader *nir,
+                          const struct brw_compiler *compiler,
+                          const struct brw_sampler_prog_key_data *key_tex,
+                          bool is_scalar)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   nir_lower_tex_options tex_options = { 0 };
+
+   /* Iron Lake and prior require lowering of all rectangle textures */
+   if (devinfo->gen < 6)
+      tex_options.lower_rect = true;
+
+   /* Prior to Broadwell, our hardware can't actually do GL_CLAMP */
+   if (devinfo->gen < 8) {
+      tex_options.saturate_s = key_tex->gl_clamp_mask[0];
+      tex_options.saturate_t = key_tex->gl_clamp_mask[1];
+      tex_options.saturate_r = key_tex->gl_clamp_mask[2];
+   }
+
+   /* Prior to Haswell, we have to fake texture swizzle */
+   for (unsigned s = 0; s < MAX_SAMPLERS; s++) {
+      if (key_tex->swizzles[s] == SWIZZLE_NOOP)
+         continue;
+      /* Shift an unsigned 1: a signed "1 << 31" would be undefined in C. */
+      tex_options.swizzle_result |= (1u << s);
+      for (unsigned c = 0; c < 4; c++)
+         tex_options.swizzles[s][c] = GET_SWZ(key_tex->swizzles[s], c);
+   }
+
+   /* Prior to Haswell, we have to lower gradients on shadow samplers */
+   tex_options.lower_txd_shadow = devinfo->gen < 8 && !devinfo->is_haswell;
+
+   tex_options.lower_y_uv_external = key_tex->y_uv_image_mask;
+   tex_options.lower_y_u_v_external = key_tex->y_u_v_image_mask;
+   tex_options.lower_yx_xuxv_external = key_tex->yx_xuxv_image_mask;
+   /* Re-validate and re-optimize only when nir_lower_tex() returns true. */
+   if (nir_lower_tex(nir, &tex_options)) {
+      nir_validate_shader(nir);
+      nir = nir_optimize(nir, compiler, is_scalar);
+   }
+
+   return nir;
+}
+
+enum brw_reg_type
+brw_type_for_nir_type(const struct gen_device_info *devinfo, nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_uint:
+   case nir_type_uint32:
+      return BRW_REGISTER_TYPE_UD;
+   case nir_type_bool:
+   case nir_type_int:
+   case nir_type_bool32:
+   case nir_type_int32:
+      return BRW_REGISTER_TYPE_D;
+   case nir_type_float:
+   case nir_type_float32:
+      return BRW_REGISTER_TYPE_F;
+   case nir_type_float64:
+      return BRW_REGISTER_TYPE_DF;
+   case nir_type_int64: /* Gen < 8 uses DF in place of Q */
+      return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_Q;
+   case nir_type_uint64: /* Gen < 8 uses DF in place of UQ */
+      return devinfo->gen < 8 ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_UQ;
+   default:
+      unreachable("unknown type");
+   }
+
+   return BRW_REGISTER_TYPE_F; /* after the switch; every case returns */
+}
+
+/* Returns the glsl_base_type corresponding to a nir_alu_type.  Only float,
+ * double, int and uint are handled.  Used by brw_vec4_nir and brw_fs_nir.
+ */
+enum glsl_base_type
+brw_glsl_base_type_for_nir_type(nir_alu_type type)
+{
+   switch (type) {
+   case nir_type_float:
+   case nir_type_float32:
+      return GLSL_TYPE_FLOAT;
+
+   case nir_type_float64:
+      return GLSL_TYPE_DOUBLE;
+
+   case nir_type_int:
+   case nir_type_int32:
+      return GLSL_TYPE_INT;
+
+   case nir_type_uint:
+   case nir_type_uint32:
+      return GLSL_TYPE_UINT;
+
+   default:
+      unreachable("bad type");
+   }
+}
diff --git a/src/intel/compiler/brw_nir.h b/src/intel/compiler/brw_nir.h
new file mode 100644
index 00000000000..76d7ec89f9b
--- /dev/null
+++ b/src/intel/compiler/brw_nir.h
@@ -0,0 +1,154 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include "brw_reg.h"
+#include "compiler/nir/nir.h"
+#include "brw_compiler.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+int type_size_scalar(const struct glsl_type *type);
+int type_size_vec4(const struct glsl_type *type);
+int type_size_dvec4(const struct glsl_type *type);
+
+static inline int
+type_size_scalar_bytes(const struct glsl_type *type)
+{
+   return type_size_scalar(type) * 4; /* 4x the type_size_scalar() size */
+}
+
+static inline int
+type_size_vec4_bytes(const struct glsl_type *type)
+{
+   return type_size_vec4(type) * 16; /* 16x the type_size_vec4() size */
+}
+
+/* Flags set in the instr->pass_flags field by i965 analysis passes */
+enum {
+   BRW_NIR_NON_BOOLEAN           = 0x0,
+
+   /* Indicates that the given instruction's destination is a boolean
+    * value but that it needs to be resolved before it can be used.
+    * On Gen <= 5, CMP instructions return a 32-bit value where the bottom
+    * bit represents the actual true/false value of the compare and the top
+    * 31 bits are undefined.  In order to use this value, we have to do a
+    * "resolve" operation by replacing the value of the CMP with -(x & 1)
+    * to sign-extend the bottom bit to 0/~0.
+    */
+   BRW_NIR_BOOLEAN_NEEDS_RESOLVE = 0x1,
+
+   /* Indicates that the given instruction's destination is a boolean
+    * value that has intentionally been left unresolved.  Not all boolean
+    * values need to be resolved immediately.  For instance, if we have
+    *
+    *    CMP r1 r2 r3
+    *    CMP r4 r5 r6
+    *    AND r7 r1 r4
+    *
+    * We don't have to resolve the result of the two CMP instructions
+    * immediately because the AND still does an AND of the bottom bits.
+    * Instead, we can save ourselves instructions by delaying the resolve
+    * until after the AND.  The result of the two CMP instructions is left
+    * as BRW_NIR_BOOLEAN_UNRESOLVED.
+    */
+   BRW_NIR_BOOLEAN_UNRESOLVED    = 0x2,
+
+   /* Indicates that the given instruction's destination is a boolean
+    * value that does not need a resolve.  For instance, if you AND two
+    * values that are BRW_NIR_BOOLEAN_NEEDS_RESOLVE then we know that both
+    * values will be 0/~0 before we get them and the result of the AND is
+    * also guaranteed to be 0/~0 and does not need a resolve.
+    */
+   BRW_NIR_BOOLEAN_NO_RESOLVE    = 0x3,
+
+   /* A mask to mask the boolean status values off of instr->pass_flags */
+   BRW_NIR_BOOLEAN_MASK          = 0x3,
+};
+
+void brw_nir_analyze_boolean_resolves(nir_shader *nir);
+
+nir_shader *brw_preprocess_nir(const struct brw_compiler *compiler,
+                               nir_shader *nir);
+
+bool brw_nir_lower_intrinsics(nir_shader *nir,
+                              struct brw_stage_prog_data *prog_data);
+void brw_nir_lower_vs_inputs(nir_shader *nir,
+                             bool is_scalar,
+                             bool use_legacy_snorm_formula,
+                             const uint8_t *vs_attrib_wa_flags);
+void brw_nir_lower_vue_inputs(nir_shader *nir, bool is_scalar,
+                              const struct brw_vue_map *vue_map);
+void brw_nir_lower_tes_inputs(nir_shader *nir, const struct brw_vue_map *vue);
+void brw_nir_lower_fs_inputs(nir_shader *nir,
+                             const struct gen_device_info *devinfo,
+                             const struct brw_wm_prog_key *key);
+void brw_nir_lower_vue_outputs(nir_shader *nir, bool is_scalar);
+void brw_nir_lower_tcs_outputs(nir_shader *nir, const struct brw_vue_map *vue,
+                               GLenum tes_primitive_mode);
+void brw_nir_lower_fs_outputs(nir_shader *nir);
+void brw_nir_lower_cs_shared(nir_shader *nir);
+
+nir_shader *brw_postprocess_nir(nir_shader *nir,
+                                const struct brw_compiler *compiler,
+                                bool is_scalar);
+
+bool brw_nir_apply_attribute_workarounds(nir_shader *nir,
+                                         bool use_legacy_snorm_formula,
+                                         const uint8_t *attrib_wa_flags);
+
+bool brw_nir_apply_trig_workarounds(nir_shader *nir);
+
+void brw_nir_apply_tcs_quads_workaround(nir_shader *nir);
+
+nir_shader *brw_nir_apply_sampler_key(nir_shader *nir,
+                                      const struct brw_compiler *compiler,
+                                      const struct brw_sampler_prog_key_data *key,
+                                      bool is_scalar);
+
+enum brw_reg_type brw_type_for_nir_type(const struct gen_device_info *devinfo,
+                                        nir_alu_type type);
+
+enum glsl_base_type brw_glsl_base_type_for_nir_type(nir_alu_type type);
+
+void brw_nir_setup_glsl_uniforms(nir_shader *shader,
+                                 const struct gl_program *prog,
+                                 struct brw_stage_prog_data *stage_prog_data,
+                                 bool is_scalar);
+
+void brw_nir_setup_arb_uniforms(nir_shader *shader, struct gl_program *prog,
+                                struct brw_stage_prog_data *stage_prog_data);
+
+bool brw_nir_opt_peephole_ffma(nir_shader *shader);
+
+#define BRW_NIR_FRAG_OUTPUT_INDEX_SHIFT 0 /* field for var->data.index */
+#define BRW_NIR_FRAG_OUTPUT_INDEX_MASK INTEL_MASK(0, 0)
+#define BRW_NIR_FRAG_OUTPUT_LOCATION_SHIFT 1 /* field for var->data.location */
+#define BRW_NIR_FRAG_OUTPUT_LOCATION_MASK INTEL_MASK(31, 1)
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/intel/compiler/brw_nir_analyze_boolean_resolves.c b/src/intel/compiler/brw_nir_analyze_boolean_resolves.c
new file mode 100644
index 00000000000..4ad26e21103
--- /dev/null
+++ b/src/intel/compiler/brw_nir_analyze_boolean_resolves.c
@@ -0,0 +1,269 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand <jason@jlekstrand.net>
+ */
+
+#include "brw_nir.h"
+
+/*
+ * This file implements an analysis pass that determines when we have to do
+ * a boolean resolve on Gen <= 5.  Instructions that need a boolean resolve
+ * will have the booleans portion of the instr->pass_flags field set to
+ * BRW_NIR_BOOLEAN_NEEDS_RESOLVE.
+ */
+
+
+/** Returns the resolve status for the given source
+ *
+ * If the source has a parent instruction then the resolve status is the
+ * status of the parent instruction.  If the source does not have a parent
+ * instruction then we don't know so we return NON_BOOLEAN.
+ */
+static uint8_t
+get_resolve_status_for_src(nir_src *src)
+{
+   /* No SSA def means there is no parent instruction to consult, so the
+    * status is unknown; report a non-boolean. */
+   if (!src->is_ssa)
+      return BRW_NIR_NON_BOOLEAN;
+
+   nir_instr *parent = src->ssa->parent_instr;
+   uint8_t status = parent->pass_flags & BRW_NIR_BOOLEAN_MASK;
+
+   /* A parent flagged NEEDS_RESOLVE looks like a true boolean from the
+    * perspective of its users. */
+   if (status == BRW_NIR_BOOLEAN_NEEDS_RESOLVE)
+      return BRW_NIR_BOOLEAN_NO_RESOLVE;
+   return status;
+}
+
+/** Marks the given source as needing a resolve
+ *
+ * If the given source corresponds to an unresolved boolean it marks it as
+ * needing a resolve.  Otherwise, we leave it alone.
+ */
+static bool
+src_mark_needs_resolve(nir_src *src, void *void_state)
+{
+   /* Only SSA sources have a parent instruction whose flags can be updated;
+    * anything else is left untouched.
+    */
+   if (!src->is_ssa)
+      return true;
+
+   nir_instr *parent = src->ssa->parent_instr;
+   uint8_t status = parent->pass_flags & BRW_NIR_BOOLEAN_MASK;
+   /* An unresolved producer is switched to NEEDS_RESOLVE; any other status
+    * is kept exactly as it is. */
+   if (status == BRW_NIR_BOOLEAN_UNRESOLVED)
+      parent->pass_flags = (parent->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+                           BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
+
+   return true;
+}
+
+static bool
+analyze_boolean_resolves_block(nir_block *block)
+{
+   nir_foreach_instr(instr, block) {   /* classify each instruction */
+      switch (instr->type) {
+      case nir_instr_type_alu: {
+         /* For ALU instructions, the resolve status is handled in a
+          * three-step process.
+          *
+          * 1) Look at the instruction type and sources and determine if it
+          *    can be left unresolved.
+          *
+          * 2) Look at the destination and see if we have to resolve
+          *    anyway.  (This is the case if this instruction is not the
+          *    only instruction writing to a given register.)
+          *
+          * 3) If the instruction has a resolve status other than
+          *    BRW_NIR_BOOLEAN_UNRESOLVED or BRW_NIR_BOOLEAN_NEEDS_RESOLVE then
+          *    we walk through the sources and ensure that they are also
+          *    resolved.  This ensures that we don't end up with any stray
+          *    unresolved booleans going into ADDs or something like that.
+          */
+
+         uint8_t resolve_status;
+         nir_alu_instr *alu = nir_instr_as_alu(instr);
+         switch (alu->op) {
+         case nir_op_ball_fequal2:
+         case nir_op_ball_iequal2:
+         case nir_op_ball_fequal3:
+         case nir_op_ball_iequal3:
+         case nir_op_ball_fequal4:
+         case nir_op_ball_iequal4:
+         case nir_op_bany_fnequal2:
+         case nir_op_bany_inequal2:
+         case nir_op_bany_fnequal3:
+         case nir_op_bany_inequal3:
+         case nir_op_bany_fnequal4:
+         case nir_op_bany_inequal4:
+            /* These are only implemented by the vec4 backend and its
+             * implementation emits resolved booleans.  At some point in the
+             * future, this may change and we'll have to remove some of the
+             * above cases.
+             */
+            resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+            break;
+
+         case nir_op_imov:
+         case nir_op_inot:
+            /* This is a single-source instruction.  Just copy the resolve
+             * status from the source.
+             */
+            resolve_status = get_resolve_status_for_src(&alu->src[0].src);
+            break;
+
+         case nir_op_iand:
+         case nir_op_ior:
+         case nir_op_ixor: {
+            uint8_t src0_status = get_resolve_status_for_src(&alu->src[0].src);
+            uint8_t src1_status = get_resolve_status_for_src(&alu->src[1].src);
+
+            if (src0_status == src1_status) {
+               resolve_status = src0_status;
+            } else if (src0_status == BRW_NIR_NON_BOOLEAN ||
+                       src1_status == BRW_NIR_NON_BOOLEAN) {
+               /* If one of the sources is a non-boolean then the whole
+                * thing is a non-boolean.
+                */
+               resolve_status = BRW_NIR_NON_BOOLEAN;
+            } else {
+               /* At this point one of them is a true boolean and one is a
+                * boolean that needs a resolve.  We could either resolve the
+                * unresolved source or we could resolve here.  If we resolve
+                * the unresolved source then we get two resolves for the price
+                * of one.  Just set this one to BOOLEAN_NO_RESOLVE and we'll
+                * let the code below force a resolve on the unresolved source.
+                */
+               resolve_status = BRW_NIR_BOOLEAN_NO_RESOLVE;
+            }
+            break;
+         }
+
+         default:
+            if (nir_alu_type_get_base_type(nir_op_infos[alu->op].output_type) == nir_type_bool) {
+               /* These instructions will turn into a CMP when we actually
+                * emit them so the result will have to be resolved before it
+                * can be used.
+                */
+               resolve_status = BRW_NIR_BOOLEAN_UNRESOLVED;
+
+               /* Even though the destination is allowed to be left
+                * unresolved, the sources are treated as regular integers or
+                * floats so they need to be resolved.
+                */
+               nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            } else {
+               resolve_status = BRW_NIR_NON_BOOLEAN;
+            }
+         }
+
+         /* If the destination is SSA, go ahead and allow unresolved booleans.
+          * If the destination register doesn't have a well-defined parent_instr
+          * we need to resolve immediately.
+          */
+         if (!alu->dest.dest.is_ssa &&
+             resolve_status == BRW_NIR_BOOLEAN_UNRESOLVED) {
+            resolve_status = BRW_NIR_BOOLEAN_NEEDS_RESOLVE;
+         }
+
+         instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+                             resolve_status;
+
+         /* Finally, resolve sources if it's needed */
+         switch (resolve_status) {
+         case BRW_NIR_BOOLEAN_NEEDS_RESOLVE:
+         case BRW_NIR_BOOLEAN_UNRESOLVED:
+            /* This instruction is either unresolved or we're doing the
+             * resolve here; leave the sources alone.
+             */
+            break;
+
+         case BRW_NIR_BOOLEAN_NO_RESOLVE:
+         case BRW_NIR_NON_BOOLEAN:
+            nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+            break;
+
+         default:
+            unreachable("Invalid boolean flag");
+         }
+
+         break;
+      }
+
+      case nir_instr_type_load_const: {
+         nir_load_const_instr *load = nir_instr_as_load_const(instr);
+
+         /* For load_const instructions, it's a boolean exactly when it holds
+          * one of the values NIR_TRUE or NIR_FALSE.  Only the first component
+          * (u32[0]) is inspected.
+          *
+          * Since load_const instructions don't have any sources, we don't
+          * have to worry about resolving them.
+          */
+         instr->pass_flags &= ~BRW_NIR_BOOLEAN_MASK;
+         if (load->value.u32[0] == NIR_TRUE || load->value.u32[0] == NIR_FALSE) {
+            instr->pass_flags |= BRW_NIR_BOOLEAN_NO_RESOLVE;
+         } else {
+            instr->pass_flags |= BRW_NIR_NON_BOOLEAN;
+         }
+         continue;
+      }
+
+      default:
+         /* Everything else is an unknown non-boolean value and needs to
+          * have all sources resolved.
+          */
+         instr->pass_flags = (instr->pass_flags & ~BRW_NIR_BOOLEAN_MASK) |
+                             BRW_NIR_NON_BOOLEAN;
+         nir_foreach_src(instr, src_mark_needs_resolve, NULL);
+         continue;
+      }
+   }
+
+   nir_if *following_if = nir_block_get_following_if(block);
+   if (following_if)   /* the producer of an if-condition must be resolved */
+      src_mark_needs_resolve(&following_if->condition, NULL);
+
+   return true;
+}
+
+static void
+analyze_boolean_resolves_impl(nir_function_impl *impl)
+{
+   /* Run the analysis on every block; the per-block result is unused. */
+   nir_foreach_block(blk, impl)
+      analyze_boolean_resolves_block(blk);
+}
+
+void
+brw_nir_analyze_boolean_resolves(nir_shader *shader)
+{
+   /* Functions without an implementation are skipped. */
+   nir_foreach_function(func, shader)
+      if (func->impl)
+         analyze_boolean_resolves_impl(func->impl);
+}
diff --git a/src/intel/compiler/brw_nir_attribute_workarounds.c b/src/intel/compiler/brw_nir_attribute_workarounds.c
new file mode 100644
index 00000000000..d695771f04a
--- /dev/null
+++ b/src/intel/compiler/brw_nir_attribute_workarounds.c
@@ -0,0 +1,176 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Prior to Haswell, the hardware can't natively support GL_FIXED or
+ * 2_10_10_10_REV vertex formats.  This pass inserts extra shader code
+ * to produce the correct values.
+ */
+
+struct attr_wa_state {
+   nir_builder builder;             /* builder used to emit the fix-up code */
+   bool impl_progress;              /* set once a load_input was rewritten */
+   bool use_legacy_snorm_formula;   /* GL 3.2 SNORM formula instead of ES 3.0 */
+   const uint8_t *wa_flags;         /* BRW_ATTRIB_WA_* bits per input index */
+};
+
+static bool
+apply_attr_wa_block(nir_block *block, struct attr_wa_state *state)
+{
+   nir_builder *b = &state->builder;
+
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(instr);
+      if (intrin->intrinsic != nir_intrinsic_load_input)
+         continue;
+
+      uint8_t wa_flags = state->wa_flags[intrin->const_index[0]];
+      if (wa_flags == 0)   /* no workaround requested for this input */
+         continue;
+
+      b->cursor = nir_after_instr(instr);
+
+      nir_ssa_def *val = &intrin->dest.ssa;
+
+      /* Do GL_FIXED rescaling for GLES2.0.  Our GL_FIXED attributes
+       * come in as floating point conversions of the integer values.
+       */
+      if (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK) {
+         nir_ssa_def *scaled =
+            nir_fmul(b, val, nir_imm_float(b, 1.0f / 65536.0f));
+         nir_ssa_def *comps[4];
+         for (int i = 0; i < val->num_components; i++) {
+            bool rescale = i < (wa_flags & BRW_ATTRIB_WA_COMPONENT_MASK);
+            comps[i] = nir_channel(b, rescale ? scaled : val, i);
+         }
+         val = nir_vec(b, comps, val->num_components);
+      }
+
+      /* Do sign recovery for 2101010 formats if required. */
+      if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+         /* sign recovery shift: <22, 22, 22, 30> */
+         nir_ssa_def *shift = nir_imm_ivec4(b, 22, 22, 22, 30);
+         val = nir_ishr(b, nir_ishl(b, val, shift), shift);
+      }
+
+      /* Apply BGRA swizzle if required. */
+      if (wa_flags & BRW_ATTRIB_WA_BGRA) {
+         val = nir_swizzle(b, val, (unsigned[4]){2,1,0,3}, 4, true);
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_NORMALIZE) {
+         /* ES 3.0 has different rules for converting signed normalized
+          * fixed-point numbers than desktop GL.
+          */
+         if ((wa_flags & BRW_ATTRIB_WA_SIGN) &&
+             !state->use_legacy_snorm_formula) {
+            /* According to equation 2.2 of the ES 3.0 specification,
+             * signed normalization conversion is done by:
+             *
+             * f = c / (2^(b-1)-1)
+             */
+            nir_ssa_def *es3_normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 9) - 1), 1.0f / ((1 << 9) - 1),
+                               1.0f / ((1 << 9) - 1), 1.0f / ((1 << 1) - 1));
+            val = nir_fmax(b,
+                           nir_fmul(b, nir_i2f(b, val), es3_normalize_factor),
+                           nir_imm_float(b, -1.0f));
+         } else {
+            /* The following equations are from the OpenGL 3.2 specification:
+             *
+             * 2.1 unsigned normalization
+             * f = c/(2^n-1)
+             *
+             * 2.2 signed normalization
+             * f = (2c+1)/(2^n-1)
+             *
+             * Both of these share a common divisor, which we handle by
+             * multiplying by 1 / (2^n - 1) for n = <10, 10, 10, 2>.
+             */
+            nir_ssa_def *normalize_factor =
+               nir_imm_vec4(b, 1.0f / ((1 << 10) - 1), 1.0f / ((1 << 10) - 1),
+                               1.0f / ((1 << 10) - 1), 1.0f / ((1 << 2)  - 1));
+
+            if (wa_flags & BRW_ATTRIB_WA_SIGN) {
+               /* For signed normalization, the numerator is 2c+1. */
+               nir_ssa_def *two = nir_imm_float(b, 2.0f);
+               nir_ssa_def *one = nir_imm_float(b, 1.0f);
+               val = nir_fadd(b, nir_fmul(b, nir_i2f(b, val), two), one);
+            } else {
+               /* For unsigned normalization, the numerator is just c. */
+               val = nir_u2f(b, val);
+            }
+            val = nir_fmul(b, val, normalize_factor);
+         }
+      }
+
+      if (wa_flags & BRW_ATTRIB_WA_SCALE) {   /* plain int-to-float conversion */
+         val = (wa_flags & BRW_ATTRIB_WA_SIGN) ? nir_i2f(b, val)
+                                               : nir_u2f(b, val);
+      }
+
+      nir_ssa_def_rewrite_uses_after(&intrin->dest.ssa, nir_src_for_ssa(val),
+                                     val->parent_instr);
+      state->impl_progress = true;
+   }
+
+   return true;
+}
+
+bool
+brw_nir_apply_attribute_workarounds(nir_shader *shader,
+                                    bool use_legacy_snorm_formula,
+                                    const uint8_t *attrib_wa_flags)
+{
+   struct attr_wa_state wa_state = {
+      .wa_flags = attrib_wa_flags,
+      .use_legacy_snorm_formula = use_legacy_snorm_formula,
+   };
+   bool any_progress = false;
+
+   nir_foreach_function(function, shader) {
+      nir_function_impl *impl = function->impl;
+      if (impl == NULL)
+         continue;
+      /* The builder and the progress flag are per-implementation state. */
+      wa_state.impl_progress = false;
+      nir_builder_init(&wa_state.builder, impl);
+
+      nir_foreach_block(blk, impl)
+         apply_attr_wa_block(blk, &wa_state);
+
+      if (!wa_state.impl_progress)
+         continue;
+      nir_metadata_preserve(impl, nir_metadata_block_index |
+                                  nir_metadata_dominance);
+      any_progress = true;
+   }
+
+   return any_progress;
+}
diff --git a/src/intel/compiler/brw_nir_intrinsics.c b/src/intel/compiler/brw_nir_intrinsics.c
new file mode 100644
index 00000000000..901a1fb0ab9
--- /dev/null
+++ b/src/intel/compiler/brw_nir_intrinsics.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+struct lower_intrinsics_state {
+   nir_shader *nir;                 /* shader being lowered */
+   union {                          /* written as prog_data, read as cs_prog_data */
+      struct brw_stage_prog_data *prog_data;
+      struct brw_cs_prog_data *cs_prog_data;
+   };
+   nir_function_impl *impl;         /* implementation currently being processed */
+   bool progress;                   /* set when an intrinsic was replaced */
+   nir_builder builder;             /* builder used to emit replacement code */
+   bool cs_thread_id_used;          /* set when the thread-local-ID uniform is read */
+};
+
+static nir_ssa_def *
+read_thread_local_id(struct lower_intrinsics_state *state)
+{
+   nir_builder *build = &state->builder;
+   nir_shader *shader = state->nir;
+   const unsigned *local_size = shader->info->cs.local_size;
+
+   /* When the local_size dimensions multiply to 8 or less, the thread local
+    * ID is always 0, so hand back a constant instead of loading the uniform.
+    */
+   if (local_size[0] * local_size[1] * local_size[2] <= 8)
+      return nir_imm_int(build, 0);
+
+   const int slot = state->cs_prog_data->thread_local_id_index;
+   assert(slot >= 0);
+   state->cs_thread_id_used = true;
+
+   /* One 32-bit component, read at constant offset 0 from the slot's base. */
+   nir_intrinsic_instr *uniform_load =
+      nir_intrinsic_instr_create(shader, nir_intrinsic_load_uniform);
+   uniform_load->num_components = 1;
+   uniform_load->src[0] = nir_src_for_ssa(nir_imm_int(build, 0));
+   nir_intrinsic_set_base(uniform_load, slot * sizeof(uint32_t));
+   nir_intrinsic_set_range(uniform_load, sizeof(uint32_t));
+   nir_ssa_dest_init(&uniform_load->instr, &uniform_load->dest, 1, 32, NULL);
+   nir_builder_instr_insert(build, &uniform_load->instr);
+   return &uniform_load->dest.ssa;
+}
+
+/* Replaces the compute local-invocation intrinsics found in one block.
+ * Returns true, and also sets state->progress, if anything was rewritten. */
+static bool
+lower_cs_intrinsics_convert_block(struct lower_intrinsics_state *state,
+                                  nir_block *block)
+{
+   bool progress = false;
+   nir_builder *b = &state->builder;
+   nir_shader *nir = state->nir;
+
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_intrinsic)
+         continue;
+
+      nir_intrinsic_instr *intrinsic = nir_instr_as_intrinsic(instr);
+
+      b->cursor = nir_after_instr(&intrinsic->instr);
+
+      nir_ssa_def *sysval;
+      switch (intrinsic->intrinsic) {
+      case nir_intrinsic_load_local_invocation_index: {
+         assert(nir->stage == MESA_SHADER_COMPUTE);
+         /* We construct the local invocation index from:
+          *    gl_LocalInvocationIndex = cs_thread_local_id + channel_num;
+          */
+         nir_ssa_def *thread_local_id = read_thread_local_id(state);
+         nir_ssa_def *channel = nir_load_channel_num(b);
+         sysval = nir_iadd(b, channel, thread_local_id);
+         break;
+      }
+
+      case nir_intrinsic_load_local_invocation_id: {
+         assert(nir->stage == MESA_SHADER_COMPUTE);
+         /* We lower gl_LocalInvocationID from gl_LocalInvocationIndex based
+          * on this formula:
+          *
+          *    gl_LocalInvocationID.x =
+          *       gl_LocalInvocationIndex % gl_WorkGroupSize.x;
+          *    gl_LocalInvocationID.y =
+          *       (gl_LocalInvocationIndex / gl_WorkGroupSize.x) %
+          *       gl_WorkGroupSize.y;
+          *    gl_LocalInvocationID.z =
+          *       (gl_LocalInvocationIndex /
+          *        (gl_WorkGroupSize.x * gl_WorkGroupSize.y)) %
+          *       gl_WorkGroupSize.z;
+          */
+         unsigned *size = nir->info->cs.local_size;
+
+         nir_ssa_def *local_index = nir_load_local_invocation_index(b);
+
+         nir_const_value uvec3;
+         uvec3.u32[0] = 1;
+         uvec3.u32[1] = size[0];
+         uvec3.u32[2] = size[0] * size[1];
+         nir_ssa_def *div_val = nir_build_imm(b, 3, 32, uvec3);
+         uvec3.u32[0] = size[0];
+         uvec3.u32[1] = size[1];
+         uvec3.u32[2] = size[2];
+         nir_ssa_def *mod_val = nir_build_imm(b, 3, 32, uvec3);
+
+         sysval = nir_umod(b, nir_udiv(b, local_index, div_val), mod_val);
+         break;
+      }
+
+      default:
+         continue;
+      }
+
+      nir_ssa_def_rewrite_uses(&intrinsic->dest.ssa, nir_src_for_ssa(sysval));
+      nir_instr_remove(&intrinsic->instr);
+      progress = true;
+      state->progress = true;
+   }
+
+   return progress;
+}
+
+static void
+lower_cs_intrinsics_convert_impl(struct lower_intrinsics_state *state)
+{
+   nir_function_impl *impl = state->impl;
+   /* A fresh builder per implementation, then one sweep over its blocks. */
+   nir_builder_init(&state->builder, impl);
+   nir_foreach_block(blk, impl)
+      lower_cs_intrinsics_convert_block(state, blk);
+
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+}
+
+bool
+brw_nir_lower_intrinsics(nir_shader *nir, struct brw_stage_prog_data *prog_data)
+{
+   /* Only compute shaders have anything to lower here. */
+   if (nir->stage != MESA_SHADER_COMPUTE)
+      return false;
+
+   struct lower_intrinsics_state state;
+   memset(&state, 0, sizeof(state));
+   state.prog_data = prog_data;
+   state.nir = nir;
+
+   bool any_progress = false;
+   do {
+      state.progress = false;
+      nir_foreach_function(func, nir) {
+         if (!func->impl)
+            continue;
+         state.impl = func->impl;
+         lower_cs_intrinsics_convert_impl(&state);
+      }
+      any_progress = any_progress || state.progress;
+   } while (state.progress);
+
+   /* No thread-local-ID uniform load was emitted: reset its index to -1. */
+   if (!state.cs_thread_id_used)
+      state.cs_prog_data->thread_local_id_index = -1;
+   return any_progress;
+}
diff --git a/src/intel/compiler/brw_nir_opt_peephole_ffma.c b/src/intel/compiler/brw_nir_opt_peephole_ffma.c
new file mode 100644
index 00000000000..cc225e1847b
--- /dev/null
+++ b/src/intel/compiler/brw_nir_opt_peephole_ffma.c
@@ -0,0 +1,297 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Jason Ekstrand (jason@jlekstrand.net)
+ *
+ */
+
+#include "brw_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/*
+ * Implements a small peephole optimization that looks for a multiply that
+ * is only ever used in an add and replaces both with an fma.
+ */
+
+static inline bool
+are_all_uses_fadd(nir_ssa_def *def)
+{
+   /* An if-condition use is not an fadd use. */
+   if (!list_empty(&def->if_uses))
+      return false;
+
+   nir_foreach_use(use, def) {
+      nir_instr *user = use->parent_instr;
+      if (user->type != nir_instr_type_alu)
+         return false;
+
+      nir_alu_instr *alu = nir_instr_as_alu(user);
+      const nir_op op = alu->op;
+
+      /* A direct fadd user is exactly what we are looking for. */
+      if (op == nir_op_fadd)
+         continue;
+
+      /* Moves, negations and absolute values are looked through: they are
+       * acceptable only if everything consuming *their* result is an fadd.
+       */
+      if (op != nir_op_imov && op != nir_op_fmov &&
+          op != nir_op_fneg && op != nir_op_fabs)
+         return false;
+
+      assert(alu->dest.dest.is_ssa);
+      if (!are_all_uses_fadd(&alu->dest.dest.ssa))
+         return false;
+   }
+
+   return true;
+}
+
+static nir_alu_instr *
+get_mul_for_src(nir_alu_src *src, int num_components,
+                uint8_t swizzle[4], bool *negate, bool *abs)
+{
+   uint8_t swizzle_tmp[4];
+   assert(src->src.is_ssa && !src->abs && !src->negate);
+
+   nir_instr *instr = src->src.ssa->parent_instr;
+   if (instr->type != nir_instr_type_alu)
+      return NULL;
+
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+
+   /* We want to bail if any of the other ALU operations involved is labeled
+    * exact.  One reason for this is that, while the value that is changing is
+    * actually the result of the add and not the multiply, the intention of
+    * the user when they specify an exact multiply is that they want *that*
+    * value and what they don't care about is the add.  Another reason is that
+    * SPIR-V explicitly requires this behaviour.
+    */
+   if (alu->exact)
+      return NULL;
+
+   switch (alu->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+      break;
+
+   case nir_op_fneg:
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+      *negate = !*negate;
+      break;
+
+   case nir_op_fabs:
+      alu = get_mul_for_src(&alu->src[0], num_components, swizzle, negate, abs);
+      *negate = false;   /* the absolute value discards any inner negation */
+      *abs = true;
+      break;
+
+   case nir_op_fmul:
+      /* Only absorb a fmul into a ffma if the fmul is only used in fadd
+       * operations.  This prevents us from being too aggressive with our
+       * fusing which can actually lead to more instructions.
+       */
+      if (!are_all_uses_fadd(&alu->dest.dest.ssa))
+         return NULL;
+      break;
+
+   default:
+      return NULL;
+   }
+
+   if (!alu)   /* the recursion above found no fusable fmul */
+      return NULL;
+
+   /* Copy swizzle data before overwriting it to avoid setting a wrong swizzle.
+    *
+    * Example:
+    *   Former swizzle[] = xyzw
+    *   src->swizzle[] = zyxx
+    *
+    *   Expected output swizzle = zyxx
+    *   If we reuse swizzle in the loop, then output swizzle would be zyzz.
+    */
+   memcpy(swizzle_tmp, swizzle, 4*sizeof(uint8_t));
+   for (int i = 0; i < num_components; i++)
+      swizzle[i] = swizzle_tmp[src->swizzle[i]];
+
+   return alu;
+}
+
+/**
+ * Given a list of (at least two) nir_alu_src's, tells if any of them is a
+ * constant value and is used only once.
+ */
+static bool
+any_alu_src_is_a_constant(nir_alu_src srcs[])
+{
+   /* Only the first two sources are ever examined. */
+   for (const nir_alu_src *s = srcs; s != srcs + 2; s++) {
+      nir_instr *producer = s->src.ssa->parent_instr;
+      if (producer->type != nir_instr_type_load_const)
+         continue;
+
+      /* The constant counts only if it is used once and never by an if. */
+      nir_ssa_def *value = &nir_instr_as_load_const(producer)->def;
+      if (list_is_singular(&value->uses) && list_empty(&value->if_uses))
+         return true;
+   }
+
+   return false;
+}
+
+static bool
+brw_nir_opt_peephole_ffma_block(nir_builder *b, nir_block *block)
+{
+   bool progress = false;
+
+   nir_foreach_instr_safe(instr, block) {
+      if (instr->type != nir_instr_type_alu)
+         continue;
+
+      nir_alu_instr *add = nir_instr_as_alu(instr);
+      if (add->op != nir_op_fadd)
+         continue;
+
+      assert(add->dest.dest.is_ssa);
+      if (add->exact)
+         continue;
+
+      assert(add->src[0].src.is_ssa && add->src[1].src.is_ssa);
+
+      /* This is the case a + a.  We would rather handle this with an
+       * algebraic reduction than fuse it.  Also, we want to only fuse
+       * things where the multiply is used only once and, in this case,
+       * it would be used twice by the same instruction.
+       */
+      if (add->src[0].src.ssa == add->src[1].src.ssa)
+         continue;
+
+      nir_alu_instr *mul;
+      uint8_t add_mul_src, swizzle[4];
+      bool negate, abs;
+      for (add_mul_src = 0; add_mul_src < 2; add_mul_src++) {
+         for (unsigned i = 0; i < 4; i++)
+            swizzle[i] = i;
+
+         negate = false;
+         abs = false;
+
+         mul = get_mul_for_src(&add->src[add_mul_src],
+                               add->dest.dest.ssa.num_components,
+                               swizzle, &negate, &abs);
+
+         if (mul != NULL)
+            break;
+      }
+
+      if (mul == NULL)
+         continue;
+
+      unsigned bit_size = add->dest.dest.ssa.bit_size;
+
+      nir_ssa_def *mul_src[2];
+      mul_src[0] = mul->src[0].src.ssa;
+      mul_src[1] = mul->src[1].src.ssa;
+
+      /* If any of the operands of the fmul and any of the fadd is a constant,
+       * we bypass because it will be more efficient as the constants will be
+       * propagated as operands, potentially saving two load_const instructions.
+       */
+      if (any_alu_src_is_a_constant(mul->src) &&
+          any_alu_src_is_a_constant(add->src)) {
+         continue;
+      }
+
+      b->cursor = nir_before_instr(&add->instr);
+
+      if (abs) {   /* |a * b| == |a| * |b| */
+         for (unsigned i = 0; i < 2; i++)
+            mul_src[i] = nir_fabs(b, mul_src[i]);
+      }
+
+      if (negate)   /* -(a * b) == (-a) * b */
+         mul_src[0] = nir_fneg(b, mul_src[0]);
+
+      nir_alu_instr *ffma = nir_alu_instr_create(b->shader, nir_op_ffma);
+      ffma->dest.saturate = add->dest.saturate;
+      ffma->dest.write_mask = add->dest.write_mask;
+
+      for (unsigned i = 0; i < 2; i++) {
+         ffma->src[i].src = nir_src_for_ssa(mul_src[i]);
+         for (unsigned j = 0; j < add->dest.dest.ssa.num_components; j++)
+            ffma->src[i].swizzle[j] = mul->src[i].swizzle[swizzle[j]];
+      }
+      nir_alu_src_copy(&ffma->src[2], &add->src[1 - add_mul_src], ffma);
+
+      assert(add->dest.dest.is_ssa);
+
+      nir_ssa_dest_init(&ffma->instr, &ffma->dest.dest,
+                        add->dest.dest.ssa.num_components,
+                        bit_size,
+                        add->dest.dest.ssa.name);
+      nir_ssa_def_rewrite_uses(&add->dest.dest.ssa,
+                               nir_src_for_ssa(&ffma->dest.dest.ssa));
+
+      nir_builder_instr_insert(b, &ffma->instr);
+      assert(list_empty(&add->dest.dest.ssa.uses));
+      nir_instr_remove(&add->instr);
+
+      progress = true;
+   }
+
+   return progress;
+}
+
+static bool
+brw_nir_opt_peephole_ffma_impl(nir_function_impl *impl)
+{
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   /* Every block is visited, even after progress has been made. */
+   bool changed = false;
+   nir_foreach_block(blk, impl)
+      if (brw_nir_opt_peephole_ffma_block(&b, blk))
+         changed = true;
+
+   if (!changed)
+      return false;
+   nir_metadata_preserve(impl, nir_metadata_block_index |
+                               nir_metadata_dominance);
+   return true;
+}
+
+bool
+brw_nir_opt_peephole_ffma(nir_shader *shader)
+{
+   bool changed = false;
+   /* Run on every function that has a body; report if any was modified. */
+   nir_foreach_function(func, shader) {
+      if (func->impl && brw_nir_opt_peephole_ffma_impl(func->impl))
+         changed = true;
+   }
+
+   return changed;
+}
diff --git a/src/intel/compiler/brw_nir_tcs_workarounds.c b/src/intel/compiler/brw_nir_tcs_workarounds.c
new file mode 100644
index 00000000000..a85f493c704
--- /dev/null
+++ b/src/intel/compiler/brw_nir_tcs_workarounds.c
@@ -0,0 +1,152 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "compiler/nir/nir_builder.h"
+#include "brw_nir.h"
+
+/**
+ * Implements the WaPreventHSTessLevelsInterference workaround (for Gen7-8).
+ *
+ * From the Broadwell PRM, Volume 7 (3D-Media-GPGPU), Page 494 (below the
+ * definition of the patch header layouts):
+ *
+ *    "HW Bug: The Tessellation stage will incorrectly add domain points
+ *     along patch edges under the following conditions, which may result
+ *     in conformance failures and/or cracking artifacts:
+ *
+ *       * QUAD domain
+ *       * INTEGER partitioning
+ *       * All three TessFactors in a given U or V direction (e.g., V
+ *         direction: UEQ0, InsideV, UEQ1) are all exactly 1.0
+ *       * All three TessFactors in the other direction are > 1.0 and all
+ *         round up to the same integer value (e.g, U direction:
+ *         VEQ0 = 3.1, InsideU = 3.7, VEQ1 = 3.4)
+ *
+ *     The suggested workaround (to be implemented as part of the postamble
+ *     to the HS shader in the HS kernel) is:
+ *
+ *     if (
+ *        (TF[UEQ0] > 1.0) ||
+ *        (TF[VEQ0] > 1.0) ||
+ *        (TF[UEQ1] > 1.0) ||
+ *        (TF[VEQ1] > 1.0) ||
+ *        (TF[INSIDE_U] > 1.0) ||
+ *        (TF[INSIDE_V] > 1.0) )
+ *     {
+ *        TF[INSIDE_U] = (TF[INSIDE_U] == 1.0) ? 2.0 : TF[INSIDE_U];
+ *        TF[INSIDE_V] = (TF[INSIDE_V] == 1.0) ? 2.0 : TF[INSIDE_V];
+ *     }"
+ *
+ * There's a subtlety here.  Intel internal HSD-ES bug 1208668495 notes
+ * that the above workaround fails to fix certain GL/ES CTS tests which
+ * have inside tessellation factors of -1.0.  This can be explained by
+ * a quote from the ARB_tessellation_shader specification:
+ *
+ *    "If "equal_spacing" is used, the floating-point tessellation level is
+ *     first clamped to the range [1,<max>], where <max> is implementation-
+ *     dependent maximum tessellation level (MAX_TESS_GEN_LEVEL)."
+ *
+ * In other words, the actual inner tessellation factor used is
+ * clamp(TF[INSIDE_*], 1.0, 64.0).  So we want to compare the clamped
+ * value against 1.0.  To accomplish this, we change the comparison from
+ * (TF[INSIDE_*] == 1.0) to (TF[INSIDE_*] <= 1.0).
+ */
+
+static inline nir_ssa_def *
+load_output(nir_builder *b, int num_components, int offset, int component)
+{
+   /* Build a load_output of num_components 32-bit values: "offset" becomes
+    * the intrinsic base and the only source is a constant 0. */
+   nir_intrinsic_instr *const intrin =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_output);
+   intrin->num_components = num_components;
+   nir_intrinsic_set_base(intrin, offset);
+   nir_intrinsic_set_component(intrin, component);
+   nir_ssa_dest_init(&intrin->instr, &intrin->dest, num_components, 32, NULL);
+   intrin->src[0] = nir_src_for_ssa(nir_imm_int(b, 0));
+   nir_builder_instr_insert(b, &intrin->instr);
+   return &intrin->dest.ssa;
+}
+
+static void
+emit_quads_workaround(nir_builder *b, nir_block *block)
+{
+   b->cursor = nir_after_block_before_jump(block);
+   /* Read back what the names suggest are the inner (2) and outer (4) levels. */
+   nir_ssa_def *inner = load_output(b, 2, 0, 2);
+   nir_ssa_def *outer = load_output(b, 4, 1, 0);
+   /* "x > 1.0" is written flt(1.0, x); bany presumably reduces over components. */
+   nir_ssa_def *any_greater_than_1 =
+       nir_ior(b, nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), outer)),
+                  nir_bany(b, nir_flt(b, nir_imm_float(b, 1.0f), inner)));
+   /* Wrap the fix-up in an if on that condition, inserted at the cursor. */
+   nir_if *if_stmt = nir_if_create(b->shader);
+   if_stmt->condition = nir_src_for_ssa(any_greater_than_1);
+   nir_builder_cf_insert(b, &if_stmt->cf_node);
+
+   /* Fill out the new then-block */
+   b->cursor = nir_after_cf_list(&if_stmt->then_list);
+   /* fge(1.0, inner) is "inner <= 1.0": those components are replaced by 2.0. */
+   inner = nir_bcsel(b, nir_fge(b, nir_imm_float(b, 1.0f), inner),
+                        nir_imm_float(b, 2.0f), inner);
+   /* Store the two patched values back: write mask XY, component 2, source 0. */
+   nir_intrinsic_instr *store =
+      nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_output);
+   store->num_components = 2;
+   nir_intrinsic_set_write_mask(store, WRITEMASK_XY);
+   nir_intrinsic_set_component(store, 2);
+   store->src[0] = nir_src_for_ssa(inner);
+   store->src[1] = nir_src_for_ssa(nir_imm_int(b, 0));
+   nir_builder_instr_insert(b, &store->instr);
+}
+
+void
+brw_nir_apply_tcs_quads_workaround(nir_shader *nir)
+{
+   assert(nir->stage == MESA_SHADER_TESS_CTRL);
+
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   /* emit_quads_workaround() inserts an if statement into each block,
+    * which splits it in two.  This changes the set of predecessors of
+    * the end block.  We want to process the original set, so to be safe,
+    * save it off to an array first.  The array always has at least one
+    * slot because a zero-length VLA is undefined behavior in C.
+    */
+   const unsigned num_end_preds = impl->end_block->predecessors->entries;
+   nir_block *end_preds[num_end_preds ? num_end_preds : 1];
+   unsigned i = 0;
+   struct set_entry *entry;
+
+   set_foreach(impl->end_block->predecessors, entry) {
+      end_preds[i++] = (nir_block *) entry->key;
+   }
+
+   for (i = 0; i < num_end_preds; i++) {
+      emit_quads_workaround(&b, end_preds[i]);
+   }
+
+   nir_metadata_preserve(impl, 0);   /* 0: no metadata is reported as kept */
+}
diff --git a/src/intel/compiler/brw_nir_trig_workarounds.py b/src/intel/compiler/brw_nir_trig_workarounds.py
new file mode 100644
index 00000000000..6a77d64dbd4
--- /dev/null
+++ b/src/intel/compiler/brw_nir_trig_workarounds.py
@@ -0,0 +1,43 @@
+#
+# Copyright (C) 2016 Intel Corporation
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice (including the next
+# paragraph) shall be included in all copies or substantial portions of the
+# Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+# IN THE SOFTWARE.
+
+import nir_algebraic
+
+# Prior to Kaby Lake, the SIN and COS instructions on Intel hardware can
+# produce values slightly outside of the [-1.0, 1.0] range for a small set of
+# values.  Obviously, this can break everyone's expectations about trig
+# functions.  This appears to be fixed in Kaby Lake.
+#
+# According to an internal presentation, the COS instruction can produce
+# a value up to 1.000027 for inputs in the range (0.08296, 0.09888).  One
+# suggested workaround is to multiply by 0.99997, scaling down the
+# amplitude slightly.  Apparently this also minimizes the error function,
+# reducing the maximum error from 0.00006 to about 0.00003.
+
+trig_workarounds = [
+   (('fsin', 'x'), ('fmul', ('fsin', 'x'), 0.99997)),
+   (('fcos', 'x'), ('fmul', ('fcos', 'x'), 0.99997)),
+]
+
+print '#include "brw_nir.h"'
+print nir_algebraic.AlgebraicPass("brw_nir_apply_trig_workarounds",
+                                  trig_workarounds).render()
diff --git a/src/intel/compiler/brw_packed_float.c b/src/intel/compiler/brw_packed_float.c
new file mode 100644
index 00000000000..9b7687a756f
--- /dev/null
+++ b/src/intel/compiler/brw_packed_float.c
@@ -0,0 +1,75 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+#include "brw_reg.h"
+
+union fu {
+   float f;                  /* the value as a float */
+   unsigned u;               /* the same storage as a raw word */
+   struct {                  /* NOTE(review): assumes 32-bit unsigned and LSB-first bitfields - confirm */
+      unsigned mantissa:23;
+      unsigned exponent:8;
+      unsigned sign:1;
+   } s;
+};
+
+int
+brw_float_to_vf(float f)
+{
+   union fu bits = { .f = f };
+   const unsigned sign_bit = bits.s.sign << 7;
+
+   /* Both zeros encode as exponent 0 / mantissa 0 plus the sign bit. */
+   if (f == 0.0f)
+      return sign_bit;
+
+   /* Re-bias the exponent 127 -> 3; wrap-around is caught by the > 7 test. */
+   const unsigned exp3 = bits.s.exponent - (127 - 3);
+   const unsigned man4 = bits.s.mantissa >> (23 - 4);
+
+   /* The mantissa must fit in 4 bits and the exponent in 3 bits. */
+   if ((bits.u & 0x7ffff) != 0 || exp3 > 7)
+      return -1;
+   /* 0.125 would have had the same representation as 0.0, so reject it. */
+   if (exp3 == 0 && man4 == 0)
+      return -1;
+
+   return sign_bit | (exp3 << 4) | man4;
+}
+
+float
+brw_vf_to_float(unsigned char vf)
+{
+   union fu fu;
+
+   /* ±0.0f is special cased: only the sign (bit 7) moves up to bit 31.
+    * Cast first so that 0x80 << 24 is an unsigned shift, not int overflow. */
+   if (vf == 0x00 || vf == 0x80) {
+      fu.u = (unsigned)vf << 24;
+      return fu.f;
+   }
+
+   fu.s.sign = vf >> 7;                             /* bit 7 */
+   fu.s.exponent = ((vf & 0x70) >> 4) + (127 - 3);  /* bits 6:4, bias 3 -> 127 */
+   fu.s.mantissa = (vf & 0xf) << (23 - 4);          /* bits 3:0, top-aligned */
+   return fu.f;
+}
diff --git a/src/intel/compiler/brw_predicated_break.cpp b/src/intel/compiler/brw_predicated_break.cpp
new file mode 100644
index 00000000000..607715dace4
--- /dev/null
+++ b/src/intel/compiler/brw_predicated_break.cpp
@@ -0,0 +1,148 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_cfg.h"
+
+using namespace brw;
+
+/** @file brw_predicated_break.cpp
+ *
+ * Loops are often structured as
+ *
+ * loop:
+ *    CMP.f0
+ *    (+f0) IF
+ *    BREAK
+ *    ENDIF
+ *    ...
+ *    WHILE loop
+ *
+ * This peephole pass removes the IF and ENDIF instructions and predicates the
+ * BREAK, dropping two instructions from the loop body.
+ *
+ * If the loop was a DO { ... } WHILE loop, it looks like
+ *
+ * loop:
+ *    ...
+ *    CMP.f0
+ *    (+f0) IF
+ *    BREAK
+ *    ENDIF
+ *    WHILE loop
+ *
+ * and we can remove the BREAK instruction and predicate the WHILE.
+ */
+
+bool
+opt_predicated_break(backend_shader *s)
+{
+   bool progress = false;
+   /* Returns true when at least one IF / jump / ENDIF group was rewritten. */
+   foreach_block (block, s->cfg) {
+      if (block->start_ip != block->end_ip)
+         continue;   /* only blocks whose start and end ip coincide qualify */
+
+      /* BREAK and CONTINUE instructions, by definition, can only be found at
+       * the ends of basic blocks.
+       */
+      backend_instruction *jump_inst = block->end();
+      if (jump_inst->opcode != BRW_OPCODE_BREAK &&
+          jump_inst->opcode != BRW_OPCODE_CONTINUE)
+         continue;
+      /* The previous block must end in an IF... */
+      backend_instruction *if_inst = block->prev()->end();
+      if (if_inst->opcode != BRW_OPCODE_IF)
+         continue;
+      /* ...and the next block must start with an ENDIF. */
+      backend_instruction *endif_inst = block->next()->start();
+      if (endif_inst->opcode != BRW_OPCODE_ENDIF)
+         continue;
+
+      bblock_t *jump_block = block;
+      bblock_t *if_block = jump_block->prev();
+      bblock_t *endif_block = jump_block->next();
+      /* The jump takes over the IF's predicate and its inversion flag. */
+      jump_inst->predicate = if_inst->predicate;
+      jump_inst->predicate_inverse = if_inst->predicate_inverse;
+      /* If the IF is alone in its block, link through the block before it. */
+      bblock_t *earlier_block = if_block;
+      if (if_block->start_ip == if_block->end_ip) {
+         earlier_block = if_block->prev();
+      }
+
+      if_inst->remove(if_block);
+      /* Likewise, if the ENDIF is alone in its block, link to the one after. */
+      bblock_t *later_block = endif_block;
+      if (endif_block->start_ip == endif_block->end_ip) {
+         later_block = endif_block->next();
+      }
+      endif_inst->remove(endif_block);
+      /* Re-link the CFG edges around the jump block. */
+      if (!earlier_block->ends_with_control_flow()) {
+         earlier_block->children.make_empty();
+         earlier_block->add_successor(s->cfg->mem_ctx, jump_block);
+      }
+
+      if (!later_block->starts_with_control_flow()) {
+         later_block->parents.make_empty();
+      }
+      jump_block->add_successor(s->cfg->mem_ctx, later_block);
+
+      if (earlier_block->can_combine_with(jump_block)) {
+         earlier_block->combine_with(jump_block);
+
+         block = earlier_block;
+      }
+
+      /* Now look at the first instruction of the block following the BREAK. If
+       * it's a WHILE, we can delete the break, predicate the WHILE, and join
+       * the two basic blocks.
+       */
+      bblock_t *while_block = earlier_block->next();
+      backend_instruction *while_inst = while_block->start();
+      /* Only a BREAK followed by a not-yet-predicated WHILE is folded. */
+      if (jump_inst->opcode == BRW_OPCODE_BREAK &&
+          while_inst->opcode == BRW_OPCODE_WHILE &&
+          while_inst->predicate == BRW_PREDICATE_NONE) {
+         jump_inst->remove(earlier_block);
+         while_inst->predicate = jump_inst->predicate;
+         while_inst->predicate_inverse = !jump_inst->predicate_inverse;   /* note the inversion */
+
+         earlier_block->children.make_empty();
+         earlier_block->add_successor(s->cfg->mem_ctx, while_block);
+
+         assert(earlier_block->can_combine_with(while_block));
+         earlier_block->combine_with(while_block);
+
+         earlier_block->next()->parents.make_empty();
+         earlier_block->add_successor(s->cfg->mem_ctx, earlier_block->next());
+      }
+
+      progress = true;
+   }
+   /* Any rewrite invalidates previously computed live intervals. */
+   if (progress)
+      s->invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_reg.h b/src/intel/compiler/brw_reg.h
new file mode 100644
index 00000000000..f8c3340e452
--- /dev/null
+++ b/src/intel/compiler/brw_reg.h
@@ -0,0 +1,1135 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+/** @file brw_reg.h
+ *
+ * This file defines struct brw_reg, which is our representation for EU
+ * registers.  They're not a hardware specific format, just an abstraction
+ * that intends to capture the full flexibility of the hardware registers.
+ *
+ * The brw_eu_emit.c layer's brw_set_dest/brw_set_src[01] functions encode
+ * the abstract brw_reg type into the actual hardware instruction encoding.
+ */
+
+#ifndef BRW_REG_H
+#define BRW_REG_H
+
+#include <stdbool.h>
+#include "main/compiler.h"
+#include "main/macros.h"
+#include "program/prog_instruction.h"
+#include "brw_eu_defines.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct gen_device_info;
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/**
+ * First GRF used for the MRF hack.
+ *
+ * On gen7, MRFs are no longer used, and contiguous GRFs are used instead.  We
+ * haven't converted our compiler to be aware of this, so it asks for MRFs and
+ * brw_eu_emit.c quietly converts them to be accesses of the top GRFs.  The
+ * register allocators have to be careful of this to avoid corrupting the "MRF"s
+ * with actual GRF allocations.
+ */
+#define GEN7_MRF_HACK_START 112
+
+/** Number of message register file registers */
+#define BRW_MAX_MRF(gen) ((gen) == 6 ? 24 : 16) /* arg parenthesized so expressions expand safely */
+
+#define BRW_SWIZZLE4(a,b,c,d) (((a)<<0) | ((b)<<2) | ((c)<<4) | ((d)<<6)) /* 2 bits per selector, first in bits 1:0 */
+#define BRW_GET_SWZ(swz, idx) (((swz) >> ((idx)*2)) & 0x3) /* extract the 2-bit selector number idx */
+
+#define BRW_SWIZZLE_NOOP      BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XYZW      BRW_SWIZZLE4(0,1,2,3)
+#define BRW_SWIZZLE_XXXX      BRW_SWIZZLE4(0,0,0,0)
+#define BRW_SWIZZLE_YYYY      BRW_SWIZZLE4(1,1,1,1)
+#define BRW_SWIZZLE_ZZZZ      BRW_SWIZZLE4(2,2,2,2)
+#define BRW_SWIZZLE_WWWW      BRW_SWIZZLE4(3,3,3,3)
+#define BRW_SWIZZLE_XYXY      BRW_SWIZZLE4(0,1,0,1)
+#define BRW_SWIZZLE_YXYX      BRW_SWIZZLE4(1,0,1,0)
+#define BRW_SWIZZLE_XZXZ      BRW_SWIZZLE4(0,2,0,2)
+#define BRW_SWIZZLE_YZXW      BRW_SWIZZLE4(1,2,0,3)
+#define BRW_SWIZZLE_YWYW      BRW_SWIZZLE4(1,3,1,3)
+#define BRW_SWIZZLE_ZXYW      BRW_SWIZZLE4(2,0,1,3)
+#define BRW_SWIZZLE_ZWZW      BRW_SWIZZLE4(2,3,2,3)
+#define BRW_SWIZZLE_WZWZ      BRW_SWIZZLE4(3,2,3,2)
+#define BRW_SWIZZLE_WZYX      BRW_SWIZZLE4(3,2,1,0)
+#define BRW_SWIZZLE_XXZZ      BRW_SWIZZLE4(0,0,2,2)
+#define BRW_SWIZZLE_YYWW      BRW_SWIZZLE4(1,1,3,3)
+#define BRW_SWIZZLE_YXWZ      BRW_SWIZZLE4(1,0,3,2)
+
+#define BRW_SWZ_COMP_INPUT(comp) (BRW_SWIZZLE_XYZW >> ((comp)*2))
+#define BRW_SWZ_COMP_OUTPUT(comp) (BRW_SWIZZLE_XYZW << ((comp)*2))
+
+static inline bool
+brw_is_single_value_swizzle(unsigned swiz)
+{
+   /* A replicated swizzle (XXXX, YYYY, ZZZZ or WWWW) is exactly what you
+    * get by rebuilding the swizzle from its first selector alone. */
+   const unsigned first = BRW_GET_SWZ(swiz, 0);
+   return swiz == BRW_SWIZZLE4(first, first, first, first);
+}
+
+/**
+ * Compute the swizzle obtained from the application of \p swz0 on the result
+ * of \p swz1.  The argument ordering is expected to match function
+ * composition.
+ */
+static inline unsigned
+brw_compose_swizzle(unsigned swz0, unsigned swz1)
+{
+   unsigned composed = 0;
+   /* Selector i of the result is swz1's selector at position swz0[i]. */
+   for (unsigned i = 0; i < 4; i++)
+      composed |= BRW_GET_SWZ(swz1, BRW_GET_SWZ(swz0, i)) << (i * 2);
+   return composed;
+}
+
+/**
+ * Return the result of applying swizzle \p swz to shuffle the bits of \p mask
+ * (AKA image).
+ */
+static inline unsigned
+brw_apply_swizzle_to_mask(unsigned swz, unsigned mask)
+{
+   unsigned image = 0;
+
+   /* Output bit ch is set when the channel swz selects for ch is in mask. */
+   for (unsigned ch = 0; ch < 4; ch++) {
+      const unsigned selected = BRW_GET_SWZ(swz, ch);
+      image |= ((mask >> selected) & 1u) << ch;
+   }
+   return image;
+}
+
+/**
+ * Return the result of applying the inverse of swizzle \p swz to shuffle the
+ * bits of \p mask (AKA preimage).  Useful to find out which components are
+ * read from a swizzled source given the instruction writemask.
+ */
+static inline unsigned
+brw_apply_inv_swizzle_to_mask(unsigned swz, unsigned mask)
+{
+   unsigned preimage = 0;
+
+   /* Every channel enabled in mask marks the channel its selector names. */
+   for (unsigned ch = 0; ch < 4; ch++) {
+      if ((mask >> ch) & 1u)
+         preimage |= 1u << BRW_GET_SWZ(swz, ch);
+   }
+   return preimage;
+}
+
+/**
+ * Construct an identity swizzle for the set of enabled channels given by \p
+ * mask.  The result will only reference channels enabled in the provided \p
+ * mask, assuming that \p mask is non-zero.  The constructed swizzle will
+ * satisfy the property that for any instruction OP and any mask:
+ *
+ *    brw_OP(p, brw_writemask(dst, mask),
+ *           brw_swizzle(src, brw_swizzle_for_mask(mask)));
+ *
+ * will be equivalent to the same instruction without swizzle:
+ *
+ *    brw_OP(p, brw_writemask(dst, mask), src);
+ */
+static inline unsigned
+brw_swizzle_for_mask(unsigned mask)
+{
+   /* Disabled channels repeat the previous selector (initially the lowest set bit). */
+   unsigned prev = (mask ? ffs(mask) - 1 : 0), swz = 0;
+   for (unsigned ch = 0; ch < 4; ch++) {
+      prev = (mask & (1 << ch)) ? ch : prev;
+      swz |= prev << (ch * 2);
+   }
+   return swz;
+}
+
+/**
+ * Construct an identity swizzle for the first \p n components of a vector.
+ * When only a subset of channels of a vec4 are used we don't want to
+ * reference the other channels, as that will tell optimization passes that
+ * those other channels are used.
+ */
+static inline unsigned
+brw_swizzle_for_size(unsigned n)
+{
+   return brw_swizzle_for_mask((1 << n) - 1);   /* mask with the n lowest bits set */
+}
+
+/**
+ * Converse of brw_swizzle_for_mask().  Returns the mask of components
+ * accessed by the specified swizzle \p swz.
+ */
+static inline unsigned
+brw_mask_for_swizzle(unsigned swz)
+{
+   return brw_apply_inv_swizzle_to_mask(swz, ~0);   /* ~0: all four selectors count */
+}
+
+enum PACKED brw_reg_type {   /* stored in the 4-bit brw_reg::type field */
+   BRW_REGISTER_TYPE_UD = 0, /* 4 bytes (sizes as reported by type_sz()) */
+   BRW_REGISTER_TYPE_D,      /* 4 bytes */
+   BRW_REGISTER_TYPE_UW,     /* 2 bytes */
+   BRW_REGISTER_TYPE_W,      /* 2 bytes */
+   BRW_REGISTER_TYPE_F,      /* 4 bytes */
+
+   /** Non-immediates only: @{ */
+   BRW_REGISTER_TYPE_UB,     /* 1 byte */
+   BRW_REGISTER_TYPE_B,      /* 1 byte */
+   /** @} */
+
+   /** Immediates only: @{ */
+   BRW_REGISTER_TYPE_UV, /* Gen6+; type_sz() reports 2 */
+   BRW_REGISTER_TYPE_V,      /* type_sz() reports 2 */
+   BRW_REGISTER_TYPE_VF,     /* type_sz() reports 4 */
+   /** @} */
+
+   BRW_REGISTER_TYPE_DF, /* Gen7+ (no immediates until Gen8+); 8 bytes */
+
+   /* Gen8+ */
+   BRW_REGISTER_TYPE_HF,     /* 2 bytes */
+   BRW_REGISTER_TYPE_UQ,     /* 8 bytes */
+   BRW_REGISTER_TYPE_Q,      /* 8 bytes */
+};
+
+unsigned brw_reg_type_to_hw_type(const struct gen_device_info *devinfo,
+                                 enum brw_reg_type type, enum brw_reg_file file);
+
+#define brw_element_size(devinfo, inst, operand)                             \
+   brw_hw_reg_type_to_size(devinfo,                                          \
+                           brw_inst_ ## operand ## _reg_type(devinfo, inst), \
+                           brw_inst_ ## operand ## _reg_file(devinfo, inst))
+unsigned brw_hw_reg_type_to_size(const struct gen_device_info *devinfo,
+                                 unsigned type, enum brw_reg_file file);
+
+const char *brw_reg_type_letters(unsigned brw_reg_type);
+uint32_t brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz);
+
+#define REG_SIZE (8*4)
+
+/* These aren't hardware structs, just something useful for us to pass around:
+ *
+ * Align1 operation has a lot of control over input ranges.  Used in
+ * WM programs to implement shaders decomposed into "channel serial"
+ * or "structure of array" form:
+ */
+struct brw_reg {
+   union {   /* first word: bitfields total 32 bits, aliased by "bits" */
+      struct {
+         enum brw_reg_type type:4;      /* one of BRW_REGISTER_TYPE_x */
+         enum brw_reg_file file:3;      /* :2 hardware format */
+         unsigned negate:1;             /* source only */
+         unsigned abs:1;                /* source only */
+         unsigned address_mode:1;       /* relative addressing, hopefully! */
+         unsigned pad0:1;               /* written as 0 by brw_reg() */
+         unsigned subnr:5;              /* :1 in align16; brw_reg() stores it in bytes */
+         unsigned nr:16;                /* register number */
+      };
+      uint32_t bits;                    /* whole-word view, compared by brw_regs_equal() */
+   };
+   /* Second word: region/addressing bitfields overlaid with the value members. */
+   union {
+      struct {
+         unsigned swizzle:8;      /* src only, align16 only */
+         unsigned writemask:4;    /* dest only, align16 only */
+         int  indirect_offset:10; /* relative addressing offset */
+         unsigned vstride:4;      /* source only */
+         unsigned width:3;        /* src only, align1 only */
+         unsigned hstride:2;      /* align1 only */
+         unsigned pad1:1;         /* written as 0 by brw_reg() */
+      };
+      /* brw_regs_equal() reads u64 for DF immediates and ud for everything else. */
+      double df;
+      uint64_t u64;
+      int64_t d64;
+      float f;
+      int   d;
+      unsigned ud;
+   };
+};
+
+static inline bool
+brw_regs_equal(const struct brw_reg *a, const struct brw_reg *b)
+{  /* NOTE(review): Q/UQ immediates compare only their low 32 bits here - confirm that is intended. */
+   const bool df = a->type == BRW_REGISTER_TYPE_DF && a->file == IMM;   /* 64-bit payload in use */
+   return a->bits == b->bits && (df ? a->u64 == b->u64 : a->ud == b->ud);
+}
+
+struct brw_indirect {
+   unsigned addr_subnr:4;
+   int addr_offset:10;      /* signed, same width as brw_reg::indirect_offset */
+   unsigned pad:18;         /* fills the bitfields out to 32 bits (4 + 10 + 18) */
+};
+
+
+static inline unsigned
+type_sz(unsigned type)
+{
+   switch (type) {   /* element size in bytes, narrowest types first */
+   case BRW_REGISTER_TYPE_B:
+   case BRW_REGISTER_TYPE_UB:
+      return 1;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_HF:
+   case BRW_REGISTER_TYPE_V:
+   case BRW_REGISTER_TYPE_UV:
+      return 2;
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_F:
+   case BRW_REGISTER_TYPE_VF:
+      return 4;
+   case BRW_REGISTER_TYPE_Q:
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_DF:
+      return 8;
+   default:
+      unreachable("not reached");
+   }
+}
+
+/**
+ * Return an integer type of the requested size and signedness.
+ */
+static inline enum brw_reg_type
+brw_int_type(unsigned sz, bool is_signed)
+{
+   /* sz is matched against the sizes type_sz() reports for the integer
+    * types (1, 2, 4, 8); anything else hits unreachable().
+    */
+   if (sz == 1)
+      return is_signed ? BRW_REGISTER_TYPE_B : BRW_REGISTER_TYPE_UB;
+   if (sz == 2)
+      return is_signed ? BRW_REGISTER_TYPE_W : BRW_REGISTER_TYPE_UW;
+   if (sz == 4)
+      return is_signed ? BRW_REGISTER_TYPE_D : BRW_REGISTER_TYPE_UD;
+   if (sz == 8)
+      return is_signed ? BRW_REGISTER_TYPE_Q : BRW_REGISTER_TYPE_UQ;
+   unreachable("Not reached.");
+}
+
+/**
+ * Construct a brw_reg.
+ * \param file      one of the BRW_x_REGISTER_FILE values
+ * \param nr        register number/index
+ * \param subnr     register sub number
+ * \param negate    register negate modifier
+ * \param abs       register abs modifier
+ * \param type      one of BRW_REGISTER_TYPE_x
+ * \param vstride   one of BRW_VERTICAL_STRIDE_x
+ * \param width     one of BRW_WIDTH_x
+ * \param hstride   one of BRW_HORIZONTAL_STRIDE_x
+ * \param swizzle   one of BRW_SWIZZLE_x
+ * \param writemask WRITEMASK_X/Y/Z/W bitfield
+ */
+static inline struct brw_reg
+brw_reg(enum brw_reg_file file,
+        unsigned nr,
+        unsigned subnr,
+        unsigned negate,
+        unsigned abs,
+        enum brw_reg_type type,
+        unsigned vstride,
+        unsigned width,
+        unsigned hstride,
+        unsigned swizzle,
+        unsigned writemask)
+{
+   struct brw_reg reg;
+
+   /* Only GRF and ARF numbers can be range-checked here: checking an MRF
+    * number requires the hardware gen (gen6 has 24 MRF registers), which is
+    * unknown here, so the generators and brw_eu_emit.c assert that instead.
+    */
+   if (file == BRW_GENERAL_REGISTER_FILE)
+      assert(nr < BRW_MAX_GRF);
+   if (file == BRW_ARCHITECTURE_REGISTER_FILE)
+      assert(nr <= BRW_ARF_TIMESTAMP);
+
+   /* First word: register identity and source modifiers. */
+   reg.file = file;
+   reg.nr = nr;
+   reg.subnr = subnr * type_sz(type);   /* kept in bytes, not elements */
+   reg.type = type;
+   reg.negate = negate;
+   reg.abs = abs;
+   reg.address_mode = BRW_ADDRESS_DIRECT;
+   reg.pad0 = 0;
+
+   /* Second word.  Could do better: if the reg is r5.3<0;1,0>, we probably
+    * want swizzle and writemask set to W, as the lower bits of subnr are
+    * lost when converted to align16; too much to keep track of here.
+    */
+   reg.swizzle = swizzle;
+   reg.writemask = writemask;
+   reg.indirect_offset = 0;
+   reg.vstride = vstride;
+   reg.width = width;
+   reg.hstride = hstride;
+   reg.pad1 = 0;
+   return reg;
+}
+
+/** Construct float[16] register */
+static inline struct brw_reg
+brw_vec16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   /* No source modifiers; a float region with vertical stride 16, width 16
+    * and horizontal stride 1, identity swizzle and a full writemask.
+    */
+   const unsigned no_negate = 0;
+   const unsigned no_abs = 0;
+
+   return brw_reg(file, nr, subnr, no_negate, no_abs,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_16, BRW_WIDTH_16,
+                  BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+}
+
+/** Construct float[8] register */
+static inline struct brw_reg
+brw_vec8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   /* No source modifiers; a float region with vertical stride 8, width 8
+    * and horizontal stride 1, identity swizzle and a full writemask.
+    */
+   const unsigned no_negate = 0;
+   const unsigned no_abs = 0;
+
+   return brw_reg(file, nr, subnr, no_negate, no_abs,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_8, BRW_WIDTH_8,
+                  BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+}
+
+/** Construct float[4] register */
+static inline struct brw_reg
+brw_vec4_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   /* No source modifiers; a float region with vertical stride 4, width 4
+    * and horizontal stride 1, identity swizzle and a full writemask.
+    */
+   const unsigned no_negate = 0;
+   const unsigned no_abs = 0;
+
+   return brw_reg(file, nr, subnr, no_negate, no_abs,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_4, BRW_WIDTH_4,
+                  BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
+}
+
+/** Construct float[2] register */
+static inline struct brw_reg
+brw_vec2_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   /* No source modifiers; a float region with vertical stride 2, width 2
+    * and horizontal stride 1, an XYXY swizzle and an XY writemask.
+    */
+   const unsigned no_negate = 0;
+   const unsigned no_abs = 0;
+
+   return brw_reg(file, nr, subnr, no_negate, no_abs,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_2, BRW_WIDTH_2,
+                  BRW_HORIZONTAL_STRIDE_1,
+                  BRW_SWIZZLE_XYXY, WRITEMASK_XY);
+}
+
+/** Construct float[1] register */
+static inline struct brw_reg
+brw_vec1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   /* No source modifiers; a float region with vertical stride 0, width 1
+    * and horizontal stride 0, an XXXX swizzle and an X writemask.
+    */
+   const unsigned no_negate = 0;
+   const unsigned no_abs = 0;
+
+   return brw_reg(file, nr, subnr, no_negate, no_abs,
+                  BRW_REGISTER_TYPE_F,
+                  BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1,
+                  BRW_HORIZONTAL_STRIDE_0,
+                  BRW_SWIZZLE_XXXX, WRITEMASK_X);
+}
+
+static inline struct brw_reg
+brw_vecn_reg(unsigned width, enum brw_reg_file file,
+             unsigned nr, unsigned subnr)
+{
+   /* Dispatch to the fixed-width constructor for this channel count; only
+    * 1, 2, 4, 8 and 16 are handled. */
+   if (width == 1)
+      return brw_vec1_reg(file, nr, subnr);
+   if (width == 2)
+      return brw_vec2_reg(file, nr, subnr);
+   if (width == 4)
+      return brw_vec4_reg(file, nr, subnr);
+   if (width == 8)
+      return brw_vec8_reg(file, nr, subnr);
+   if (width == 16)
+      return brw_vec16_reg(file, nr, subnr);
+
+   unreachable("Invalid register width");
+}
+
+/** Return a copy of \a reg whose type field is replaced by \a type. */
+static inline struct brw_reg
+retype(struct brw_reg reg, enum brw_reg_type type)
+{
+   struct brw_reg retyped = reg;
+
+   retyped.type = type;
+   return retyped;
+}
+
+/** Return \a reg unchanged (the counterpart of sechalf()). */
+static inline struct brw_reg
+firsthalf(struct brw_reg reg)
+{
+   const struct brw_reg first = reg;
+
+   return first;
+}
+
+/**
+ * Return \a reg advanced by one register number when its vertical stride
+ * is non-zero; a register with a zero vertical stride is returned as is.
+ */
+static inline struct brw_reg
+sechalf(struct brw_reg reg)
+{
+   if (!reg.vstride)
+      return reg;
+
+   reg.nr += 1;
+   return reg;
+}
+
+/** Return \a reg with its register number advanced by \a delta. */
+static inline struct brw_reg
+offset(struct brw_reg reg, unsigned delta)
+{
+   struct brw_reg moved = reg;
+
+   moved.nr = moved.nr + delta;
+   return moved;
+}
+
+
+/**
+ * Return \a reg advanced by \a bytes.  The position is flattened to
+ * nr * REG_SIZE + subnr, the byte count is added, and the sum is split back
+ * into a register number and a sub-register offset.
+ */
+static inline struct brw_reg
+byte_offset(struct brw_reg reg, unsigned bytes)
+{
+   const unsigned total_bytes = reg.nr * REG_SIZE + reg.subnr + bytes;
+
+   reg.subnr = total_bytes % REG_SIZE;
+   reg.nr = total_bytes / REG_SIZE;
+   return reg;
+}
+
+/**
+ * Return \a reg advanced by \a delta elements, where one element is
+ * type_sz(reg.type) bytes.
+ */
+static inline struct brw_reg
+suboffset(struct brw_reg reg, unsigned delta)
+{
+   const unsigned elem_bytes = type_sz(reg.type);
+
+   return byte_offset(reg, delta * elem_bytes);
+}
+
+/**
+ * Construct unsigned word[16] register: a 16-wide register at sub-register 0
+ * retyped to UW and then advanced by \a subnr UW elements.
+ */
+static inline struct brw_reg
+brw_uw16_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg base =
+      retype(brw_vec16_reg(file, nr, 0), BRW_REGISTER_TYPE_UW);
+
+   return suboffset(base, subnr);
+}
+
+/**
+ * Construct unsigned word[8] register: an 8-wide register at sub-register 0
+ * retyped to UW and then advanced by \a subnr UW elements.
+ */
+static inline struct brw_reg
+brw_uw8_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg base =
+      retype(brw_vec8_reg(file, nr, 0), BRW_REGISTER_TYPE_UW);
+
+   return suboffset(base, subnr);
+}
+
+/**
+ * Construct unsigned word[1] register: a scalar register at sub-register 0
+ * retyped to UW and then advanced by \a subnr UW elements.
+ */
+static inline struct brw_reg
+brw_uw1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg base =
+      retype(brw_vec1_reg(file, nr, 0), BRW_REGISTER_TYPE_UW);
+
+   return suboffset(base, subnr);
+}
+
+/** Construct unsigned dword[1] register: brw_vec1_reg() retyped to UD. */
+static inline struct brw_reg
+brw_ud1_reg(enum brw_reg_file file, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg scalar = brw_vec1_reg(file, nr, subnr);
+
+   return retype(scalar, BRW_REGISTER_TYPE_UD);
+}
+
+/**
+ * Construct an immediate-file register of the given \a type with a scalar
+ * region (vertical stride 0, width 1, horizontal stride 0).  The callers in
+ * this file store the immediate value into the result afterwards.
+ */
+static inline struct brw_reg
+brw_imm_reg(enum brw_reg_type type)
+{
+   const struct brw_reg imm =
+      brw_reg(BRW_IMMEDIATE_VALUE, 0, 0, 0, 0,
+              type,
+              BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+              0, 0);
+
+   return imm;
+}
+
+/** Construct double-precision float (DF) immediate register */
+static inline struct brw_reg
+brw_imm_df(double df)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_DF);
+
+   reg.df = df;
+   return reg;
+}
+
+/** Construct float (F) immediate register */
+static inline struct brw_reg
+brw_imm_f(float f)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_F);
+
+   reg.f = f;
+   return reg;
+}
+
+/** Construct signed integer (D) immediate register */
+static inline struct brw_reg
+brw_imm_d(int d)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_D);
+
+   reg.d = d;
+   return reg;
+}
+
+/** Construct unsigned integer (UD) immediate register */
+static inline struct brw_reg
+brw_imm_ud(unsigned ud)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_UD);
+
+   reg.ud = ud;
+   return reg;
+}
+
+/**
+ * Construct ushort immediate register.
+ *
+ * The 16-bit value is replicated into both halves of the 32-bit immediate.
+ * The operand is widened to uint32_t before shifting: a plain `uw << 16` is
+ * evaluated in signed int after integer promotion and overflows for values
+ * that have bit 15 set.
+ */
+static inline struct brw_reg
+brw_imm_uw(uint16_t uw)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_UW);
+   imm.ud = (uint32_t)uw | ((uint32_t)uw << 16);
+   return imm;
+}
+
+/**
+ * Construct short immediate register.
+ *
+ * The 16-bit pattern of \a w is replicated into both halves of the 32-bit
+ * immediate, matching brw_imm_uw().  The value is converted to uint16_t
+ * first: OR-ing the promoted int directly would sign-extend a negative
+ * \a w and leave 0xffff in the high half instead of a copy of \a w, and
+ * left-shifting a negative int is undefined.  The result is stored through
+ * the unsigned view, as brw_imm_uw() does.
+ */
+static inline struct brw_reg
+brw_imm_w(int16_t w)
+{
+   struct brw_reg imm = brw_imm_reg(BRW_REGISTER_TYPE_W);
+   const uint32_t bits = (uint16_t)w;
+
+   imm.ud = bits | (bits << 16);
+   return imm;
+}
+
+/* brw_imm_b and brw_imm_ub aren't supported by hardware - the type
+ * numbers alias with _V and _VF below:
+ */
+
+/**
+ * Construct a V immediate: a vector of eight signed half-byte values,
+ * passed already packed in \a v.
+ */
+static inline struct brw_reg
+brw_imm_v(unsigned v)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_V);
+
+   reg.ud = v;
+   return reg;
+}
+
+/**
+ * Construct a UV immediate: a vector of eight unsigned half-byte values,
+ * passed already packed in \a uv.
+ */
+static inline struct brw_reg
+brw_imm_uv(unsigned uv)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_UV);
+
+   reg.ud = uv;
+   return reg;
+}
+
+/**
+ * Construct a VF immediate: a vector of four 8-bit float values, passed
+ * already packed in \a v.
+ */
+static inline struct brw_reg
+brw_imm_vf(unsigned v)
+{
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+
+   reg.ud = v;
+   return reg;
+}
+
+/**
+ * Construct a VF immediate from four separate 8-bit values; \a v0 lands in
+ * the lowest byte and \a v3 in the highest.  The arguments are not masked.
+ *
+ * The field assignments are kept in their original order on purpose.
+ */
+static inline struct brw_reg
+brw_imm_vf4(unsigned v0, unsigned v1, unsigned v2, unsigned v3)
+{
+   const unsigned packed = (v0 << 0) | (v1 << 8) | (v2 << 16) | (v3 << 24);
+   struct brw_reg reg = brw_imm_reg(BRW_REGISTER_TYPE_VF);
+
+   reg.vstride = BRW_VERTICAL_STRIDE_0;
+   reg.width = BRW_WIDTH_4;
+   reg.hstride = BRW_HORIZONTAL_STRIDE_1;
+   reg.ud = packed;
+   return reg;
+}
+
+
+/**
+ * Construct a UW immediate holding the byte position of \a reg, computed as
+ * nr * REG_SIZE + subnr.
+ */
+static inline struct brw_reg
+brw_address(struct brw_reg reg)
+{
+   const unsigned byte_addr = reg.nr * REG_SIZE + reg.subnr;
+
+   return brw_imm_uw(byte_addr);
+}
+
+/** Construct float[1] register in the general register file */
+static inline struct brw_reg
+brw_vec1_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_vec1_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+/** Construct float[2] register in the general register file */
+static inline struct brw_reg
+brw_vec2_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_vec2_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+/** Construct float[4] register in the general register file */
+static inline struct brw_reg
+brw_vec4_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_vec4_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+/** Construct float[8] register in the general register file */
+static inline struct brw_reg
+brw_vec8_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_vec8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+/** Construct float[16] register in the general register file */
+static inline struct brw_reg
+brw_vec16_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_vec16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+/**
+ * Construct a float register of \a width channels in the general register
+ * file; see brw_vecn_reg() for the accepted widths.
+ */
+static inline struct brw_reg
+brw_vecn_grf(unsigned width, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_vecn_reg(width, BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+
+/** Construct unsigned word[8] register in the general register file */
+static inline struct brw_reg
+brw_uw8_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_uw8_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+/** Construct unsigned word[16] register in the general register file */
+static inline struct brw_reg
+brw_uw16_grf(unsigned nr, unsigned subnr)
+{
+   const struct brw_reg grf =
+      brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, nr, subnr);
+
+   return grf;
+}
+
+
+/**
+ * Construct the null register as an 8-wide float register (usually used
+ * for setting condition codes).
+ */
+static inline struct brw_reg
+brw_null_reg(void)
+{
+   const struct brw_reg null_reg =
+      brw_vec8_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0);
+
+   return null_reg;
+}
+
+/**
+ * Construct the null register with \a width channels; see brw_vecn_reg()
+ * for the accepted widths.
+ */
+static inline struct brw_reg
+brw_null_vec(unsigned width)
+{
+   const struct brw_reg null_reg =
+      brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NULL, 0);
+
+   return null_reg;
+}
+
+/** Construct a scalar UW reference to the address register at \a subnr. */
+static inline struct brw_reg
+brw_address_reg(unsigned subnr)
+{
+   const struct brw_reg addr =
+      brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_ADDRESS, subnr);
+
+   return addr;
+}
+
+/**
+ * Construct the IP architecture register as UD.
+ *
+ * If/else instructions break in align16 mode if writemask & swizzle aren't
+ * xyzw, so this register carries XYZW for both.  This goes against the
+ * convention used for the other scalar regs.
+ */
+static inline struct brw_reg
+brw_ip_reg(void)
+{
+   const struct brw_reg ip =
+      brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_IP, 0, 0, 0,
+              BRW_REGISTER_TYPE_UD,
+              BRW_VERTICAL_STRIDE_4, /* ? */
+              BRW_WIDTH_1,
+              BRW_HORIZONTAL_STRIDE_0,
+              BRW_SWIZZLE_XYZW, /* NOTE! */
+              WRITEMASK_XYZW); /* NOTE! */
+
+   return ip;
+}
+
+/**
+ * Construct a scalar UD reference to the notification count architecture
+ * register (scalar region, XXXX swizzle, X writemask).
+ */
+static inline struct brw_reg
+brw_notification_reg(void)
+{
+   const struct brw_reg notify =
+      brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_NOTIFICATION_COUNT,
+              0, 0, 0,
+              BRW_REGISTER_TYPE_UD,
+              BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, BRW_HORIZONTAL_STRIDE_0,
+              BRW_SWIZZLE_XXXX, WRITEMASK_X);
+
+   return notify;
+}
+
+/** Construct a scalar UD reference to the state register at \a subnr. */
+static inline struct brw_reg
+brw_sr0_reg(unsigned subnr)
+{
+   const struct brw_reg sr0 =
+      brw_ud1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_STATE, subnr);
+
+   return sr0;
+}
+
+/**
+ * Construct the accumulator register with \a width channels; see
+ * brw_vecn_reg() for the accepted widths.
+ */
+static inline struct brw_reg
+brw_acc_reg(unsigned width)
+{
+   const struct brw_reg acc =
+      brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE,
+                   BRW_ARF_ACCUMULATOR, 0);
+
+   return acc;
+}
+
+/**
+ * Construct a scalar UW reference to a flag register.  \a reg is added to
+ * BRW_ARF_FLAG to form the register number and \a subreg selects the
+ * sub-register.
+ */
+static inline struct brw_reg
+brw_flag_reg(int reg, int subreg)
+{
+   const struct brw_reg flag =
+      brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_FLAG + reg, subreg);
+
+   return flag;
+}
+
+/**
+ * Construct a scalar UW reference to the mask register at \a subnr.
+ *
+ * This is the mask register present in Gen4-5, or the related register
+ * present in Gen7.5 and later hardware, which the documentation refers to
+ * as the "channel enable" register.
+ */
+static inline struct brw_reg
+brw_mask_reg(unsigned subnr)
+{
+   const struct brw_reg mask =
+      brw_uw1_reg(BRW_ARCHITECTURE_REGISTER_FILE, BRW_ARF_MASK, subnr);
+
+   return mask;
+}
+
+/**
+ * Construct a scalar UD reference to sub-register 3 of the state register
+ * (see brw_sr0_reg()).
+ *
+ * Declared with an explicit (void) parameter list, like brw_null_reg() and
+ * brw_ip_reg(): this header is also compiled as C, where an empty list is
+ * not a prototype.
+ */
+static inline struct brw_reg
+brw_vmask_reg(void)
+{
+   return brw_sr0_reg(3);
+}
+
+/**
+ * Construct a scalar UD reference to sub-register 2 of the state register
+ * (see brw_sr0_reg()).
+ *
+ * Declared with an explicit (void) parameter list, like brw_null_reg() and
+ * brw_ip_reg(): this header is also compiled as C, where an empty list is
+ * not a prototype.
+ */
+static inline struct brw_reg
+brw_dmask_reg(void)
+{
+   return brw_sr0_reg(2);
+}
+
+/** Construct float[8] register \a nr in the message register file. */
+static inline struct brw_reg
+brw_message_reg(unsigned nr)
+{
+   const struct brw_reg mrf = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, nr, 0);
+
+   return mrf;
+}
+
+/**
+ * Construct a UD register of \a width channels in the message register
+ * file; see brw_vecn_reg() for the accepted widths.
+ */
+static inline struct brw_reg
+brw_uvec_mrf(unsigned width, unsigned nr, unsigned subnr)
+{
+   const struct brw_reg mrf =
+      brw_vecn_reg(width, BRW_MESSAGE_REGISTER_FILE, nr, subnr);
+
+   return retype(mrf, BRW_REGISTER_TYPE_UD);
+}
+
+/* Convert an element count to the encoding used by stride(): 1, 2, 4, 8, 16
+ * and 32 map to 1 through 6; 0 and every other value map to 0.
+ *
+ * This is almost always called with a numeric constant argument, so the
+ * mapping is kept as a plain switch that is easy to evaluate at compile
+ * time.
+ */
+static inline unsigned cvt(unsigned val)
+{
+   unsigned encoded = 0;
+
+   switch (val) {
+   case 1:  encoded = 1; break;
+   case 2:  encoded = 2; break;
+   case 4:  encoded = 3; break;
+   case 8:  encoded = 4; break;
+   case 16: encoded = 5; break;
+   case 32: encoded = 6; break;
+   default: break;
+   }
+
+   return encoded;
+}
+
+/**
+ * Return \a reg with its region replaced.  The three arguments are element
+ * counts that are encoded with cvt(); the width field stores
+ * cvt(width) - 1.
+ */
+static inline struct brw_reg
+stride(struct brw_reg reg, unsigned vstride, unsigned width, unsigned hstride)
+{
+   const unsigned vstride_enc = cvt(vstride);
+   const unsigned width_enc = cvt(width) - 1;
+   const unsigned hstride_enc = cvt(hstride);
+
+   reg.vstride = vstride_enc;
+   reg.width = width_enc;
+   reg.hstride = hstride_enc;
+   return reg;
+}
+
+/**
+ * Multiply the vertical and horizontal stride of a register by the given
+ * factor \a s.
+ *
+ * A factor of zero turns \a reg into a scalar region.  Otherwise \a s must
+ * be a power of two, and each non-zero stride field is advanced by
+ * cvt(s) - 1; stride fields that are zero are left alone.
+ */
+static inline struct brw_reg
+spread(struct brw_reg reg, unsigned s)
+{
+   unsigned step;
+
+   if (!s)
+      return stride(reg, 0, 1, 0);
+
+   assert(_mesa_is_pow_two(s));
+   step = cvt(s) - 1;
+
+   if (reg.hstride)
+      reg.hstride += step;
+
+   if (reg.vstride)
+      reg.vstride += step;
+
+   return reg;
+}
+
+/** Return \a reg with its region set via stride(reg, 16, 16, 1). */
+static inline struct brw_reg
+vec16(struct brw_reg reg)
+{
+   const struct brw_reg wide = stride(reg, 16, 16, 1);
+
+   return wide;
+}
+
+/** Return \a reg with its region set via stride(reg, 8, 8, 1). */
+static inline struct brw_reg
+vec8(struct brw_reg reg)
+{
+   const struct brw_reg wide = stride(reg, 8, 8, 1);
+
+   return wide;
+}
+
+/** Return \a reg with its region set via stride(reg, 4, 4, 1). */
+static inline struct brw_reg
+vec4(struct brw_reg reg)
+{
+   const struct brw_reg wide = stride(reg, 4, 4, 1);
+
+   return wide;
+}
+
+/** Return \a reg with its region set via stride(reg, 2, 2, 1). */
+static inline struct brw_reg
+vec2(struct brw_reg reg)
+{
+   const struct brw_reg wide = stride(reg, 2, 2, 1);
+
+   return wide;
+}
+
+/** Return \a reg as a scalar region, via stride(reg, 0, 1, 0). */
+static inline struct brw_reg
+vec1(struct brw_reg reg)
+{
+   const struct brw_reg scalar = stride(reg, 0, 1, 0);
+
+   return scalar;
+}
+
+
+/**
+ * Return a scalar reference to element \a elt of \a reg, counted in units
+ * of the register's current type.
+ */
+static inline struct brw_reg
+get_element(struct brw_reg reg, unsigned elt)
+{
+   const struct brw_reg shifted = suboffset(reg, elt);
+
+   return vec1(shifted);
+}
+
+/**
+ * Return a scalar UD reference to element \a elt of \a reg; the register is
+ * retyped to UD before the element offset is applied.
+ */
+static inline struct brw_reg
+get_element_ud(struct brw_reg reg, unsigned elt)
+{
+   const struct brw_reg as_ud = retype(reg, BRW_REGISTER_TYPE_UD);
+
+   return vec1(suboffset(as_ud, elt));
+}
+
+/**
+ * Return a scalar D reference to element \a elt of \a reg; the register is
+ * retyped to D before the element offset is applied.
+ */
+static inline struct brw_reg
+get_element_d(struct brw_reg reg, unsigned elt)
+{
+   const struct brw_reg as_d = retype(reg, BRW_REGISTER_TYPE_D);
+
+   return vec1(suboffset(as_d, elt));
+}
+
+/**
+ * Apply swizzle \a swz to \a reg.  For an immediate the stored value itself
+ * is rearranged with brw_swizzle_immediate(); for any other register file
+ * \a swz is composed with the register's existing swizzle.
+ */
+static inline struct brw_reg
+brw_swizzle(struct brw_reg reg, unsigned swz)
+{
+   if (reg.file != BRW_IMMEDIATE_VALUE) {
+      reg.swizzle = brw_compose_swizzle(swz, reg.swizzle);
+      return reg;
+   }
+
+   reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swz);
+   return reg;
+}
+
+/**
+ * Return \a reg with its writemask narrowed to the channels also present in
+ * \a mask.  Must not be called on an immediate.
+ */
+static inline struct brw_reg
+brw_writemask(struct brw_reg reg, unsigned mask)
+{
+   assert(reg.file != BRW_IMMEDIATE_VALUE);
+
+   reg.writemask = reg.writemask & mask;
+   return reg;
+}
+
+/**
+ * Return \a reg with its writemask replaced by \a mask.  Must not be called
+ * on an immediate.
+ */
+static inline struct brw_reg
+brw_set_writemask(struct brw_reg reg, unsigned mask)
+{
+   struct brw_reg masked = reg;
+
+   assert(reg.file != BRW_IMMEDIATE_VALUE);
+   masked.writemask = mask;
+   return masked;
+}
+
+/** Return a mask with the \a n lowest bits set. */
+static inline unsigned
+brw_writemask_for_size(unsigned n)
+{
+   const unsigned next_bit = 1 << n;
+
+   return next_bit - 1;
+}
+
+/**
+ * Return a mask of \a n consecutive bits starting at bit
+ * \a first_component.  The selected range must fit in four components.
+ */
+static inline unsigned
+brw_writemask_for_component_packing(unsigned n, unsigned first_component)
+{
+   const unsigned low_bits = (1 << n) - 1;
+
+   assert(first_component + n <= 4);
+   return low_bits << first_component;
+}
+
+/** Return \a reg with its negate flag toggled. */
+static inline struct brw_reg
+negate(struct brw_reg reg)
+{
+   reg.negate = !reg.negate;
+   return reg;
+}
+
+/** Return \a reg with its abs flag set and its negate flag cleared. */
+static inline struct brw_reg
+brw_abs(struct brw_reg reg)
+{
+   struct brw_reg result = reg;
+
+   result.negate = 0;
+   result.abs = 1;
+   return result;
+}
+
+/************************************************************************/
+
+/**
+ * Construct a float[4] register that uses register-indirect addressing,
+ * with \a subnr stored as the sub-register number and \a offset as the
+ * indirect offset.
+ */
+static inline struct brw_reg
+brw_vec4_indirect(unsigned subnr, int offset)
+{
+   struct brw_reg indirect = brw_vec4_grf(0, 0);
+
+   indirect.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   indirect.subnr = subnr;
+   indirect.indirect_offset = offset;
+   return indirect;
+}
+
+/**
+ * Construct a float[1] register that uses register-indirect addressing,
+ * with \a subnr stored as the sub-register number and \a offset as the
+ * indirect offset.
+ */
+static inline struct brw_reg
+brw_vec1_indirect(unsigned subnr, int offset)
+{
+   struct brw_reg indirect = brw_vec1_grf(0, 0);
+
+   indirect.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   indirect.subnr = subnr;
+   indirect.indirect_offset = offset;
+   return indirect;
+}
+
+/**
+ * Construct a register-indirect register like brw_vec1_indirect(), but with
+ * the vertical stride set to BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL.
+ */
+static inline struct brw_reg
+brw_VxH_indirect(unsigned subnr, int offset)
+{
+   struct brw_reg indirect = brw_vec1_grf(0, 0);
+
+   indirect.address_mode = BRW_ADDRESS_REGISTER_INDIRECT_REGISTER;
+   indirect.vstride = BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL;
+   indirect.subnr = subnr;
+   indirect.indirect_offset = offset;
+   return indirect;
+}
+
+/**
+ * Construct a float[4] indirect register through \a ptr, with \a offset
+ * added to the pointer's own offset.
+ */
+static inline struct brw_reg
+deref_4f(struct brw_indirect ptr, int offset)
+{
+   const int total_offset = ptr.addr_offset + offset;
+
+   return brw_vec4_indirect(ptr.addr_subnr, total_offset);
+}
+
+/**
+ * Construct a float[1] indirect register through \a ptr, with \a offset
+ * added to the pointer's own offset.
+ */
+static inline struct brw_reg
+deref_1f(struct brw_indirect ptr, int offset)
+{
+   const int total_offset = ptr.addr_offset + offset;
+
+   return brw_vec1_indirect(ptr.addr_subnr, total_offset);
+}
+
+/** Same as deref_4f(), with the result retyped to B. */
+static inline struct brw_reg
+deref_4b(struct brw_indirect ptr, int offset)
+{
+   const struct brw_reg target = deref_4f(ptr, offset);
+
+   return retype(target, BRW_REGISTER_TYPE_B);
+}
+
+/** Same as deref_1f(), with the result retyped to UW. */
+static inline struct brw_reg
+deref_1uw(struct brw_indirect ptr, int offset)
+{
+   const struct brw_reg target = deref_1f(ptr, offset);
+
+   return retype(target, BRW_REGISTER_TYPE_UW);
+}
+
+/** Same as deref_1f(), with the result retyped to D. */
+static inline struct brw_reg
+deref_1d(struct brw_indirect ptr, int offset)
+{
+   const struct brw_reg target = deref_1f(ptr, offset);
+
+   return retype(target, BRW_REGISTER_TYPE_D);
+}
+
+/** Same as deref_1f(), with the result retyped to UD. */
+static inline struct brw_reg
+deref_1ud(struct brw_indirect ptr, int offset)
+{
+   const struct brw_reg target = deref_1f(ptr, offset);
+
+   return retype(target, BRW_REGISTER_TYPE_UD);
+}
+
+/** Return the address register sub-register that \a ptr refers to. */
+static inline struct brw_reg
+get_addr_reg(struct brw_indirect ptr)
+{
+   const struct brw_reg addr = brw_address_reg(ptr.addr_subnr);
+
+   return addr;
+}
+
+/** Return \a ptr with \a offset added to its address offset. */
+static inline struct brw_indirect
+brw_indirect_offset(struct brw_indirect ptr, int offset)
+{
+   struct brw_indirect moved = ptr;
+
+   moved.addr_offset = moved.addr_offset + offset;
+   return moved;
+}
+
+/**
+ * Construct an indirect pointer from an address sub-register number and an
+ * offset; the padding field is zeroed.
+ */
+static inline struct brw_indirect
+brw_indirect(unsigned addr_subnr, int offset)
+{
+   struct brw_indirect result;
+
+   result.pad = 0;
+   result.addr_offset = offset;
+   result.addr_subnr = addr_subnr;
+   return result;
+}
+
+/**
+ * Return true when the vertical stride, width and horizontal stride of
+ * \a reg all equal \a v, \a w and \a h respectively.
+ */
+static inline bool
+region_matches(struct brw_reg reg, enum brw_vertical_stride v,
+               enum brw_width w, enum brw_horizontal_stride h)
+{
+   if (reg.vstride != v)
+      return false;
+
+   if (reg.width != w)
+      return false;
+
+   return reg.hstride == h;
+}
+
+/** True when \a reg has the scalar region: vertical stride 0, width 1 and
+ * horizontal stride 0 (see region_matches()).
+ */
+#define has_scalar_region(reg) \
+   region_matches(reg, BRW_VERTICAL_STRIDE_0, BRW_WIDTH_1, \
+                  BRW_HORIZONTAL_STRIDE_0)
+
+/* brw_packed_float.c */
+/* NOTE(review): presumably these convert between a 32-bit float and the
+ * 8-bit "VF" encoding used by brw_imm_vf().  The definitions, and the
+ * meaning of the int return value, are not visible here -- confirm in
+ * brw_packed_float.c.
+ */
+int brw_float_to_vf(float f);
+float brw_vf_to_float(unsigned char vf);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/src/intel/compiler/brw_schedule_instructions.cpp b/src/intel/compiler/brw_schedule_instructions.cpp
new file mode 100644
index 00000000000..b3f7e877c80
--- /dev/null
+++ b/src/intel/compiler/brw_schedule_instructions.cpp
@@ -0,0 +1,1753 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_fs.h"
+#include "brw_fs_live_variables.h"
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_shader.h"
+
+using namespace brw;
+
+/** @file brw_schedule_instructions.cpp
+ *
+ * List scheduling of FS instructions.
+ *
+ * The basic model of the list scheduler is to take a basic block,
+ * compute a DAG of the dependencies (RAW ordering with latency, WAW
+ * ordering with latency, WAR ordering), and make a list of the DAG heads.
+ * Heuristically pick a DAG head, then put all the children that are
+ * now DAG heads into the list of things to schedule.
+ *
+ * The heuristic is the important part.  We're trying to be cheap,
+ * since actually computing the optimal scheduling is NP complete.
+ * What we do is track a "current clock".  When we schedule a node, we
+ * update the earliest-unblocked clock time of its children, and
+ * increment the clock.  Then, when trying to schedule, we just pick
+ * the earliest-unblocked instruction to schedule.
+ *
+ * Note that often there will be many things which could execute
+ * immediately, and there are a range of heuristic options to choose
+ * from in picking among those.
+ */
+
+/* File-local debug switch, off by default.  NOTE(review): its users are
+ * outside this excerpt -- presumably it enables diagnostic output; confirm.
+ */
+static bool debug = false;
+
+/* Forward declaration: schedule_node's constructor takes a pointer to it. */
+class instruction_scheduler;
+
+/**
+ * Scheduler bookkeeping for one instruction: the instruction itself, its
+ * links to dependent nodes (children), and the timing values the scheduling
+ * heuristics read.  Derives from exec_node so it can be kept on exec lists.
+ */
+class schedule_node : public exec_node
+{
+public:
+   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
+
+   /* Fill in \c latency from the opcode of \c inst. */
+   void set_latency_gen4();
+   void set_latency_gen7(bool is_haswell);
+
+   /** The instruction this node stands for. */
+   backend_instruction *inst;
+
+   /* Child links.  NOTE(review): presumably children[i] pairs with
+    * child_latency[i] for i < child_count, and child_array_size is the
+    * allocated capacity of both arrays -- confirm where they are filled in.
+    */
+   schedule_node **children;
+   int *child_latency;
+   int child_count;
+   int parent_count;
+   int child_array_size;
+
+   /** Earliest-unblocked clock time, as described in the file comment. */
+   int unblocked_time;
+
+   /** Latency of the instruction, set by set_latency_gen4()/_gen7(). */
+   int latency;
+
+   /**
+    * Which iteration of pushing groups of children onto the candidates list
+    * this node was a part of.
+    */
+   unsigned cand_generation;
+
+   /**
+    * This is the sum of the instruction's latency plus the maximum delay of
+    * its children, or just the issue_time if it's a leaf node.
+    */
+   int delay;
+
+   /**
+    * Preferred exit node among the (direct or indirect) successors of this
+    * node.  Among the scheduler nodes blocked by this node, this will be the
+    * one that may cause earliest program termination, or NULL if none of the
+    * successors is an exit node.
+    */
+   schedule_node *exit;
+
+   /* NOTE(review): set and read outside this excerpt; presumably marks an
+    * instruction that other instructions must not be reordered across --
+    * confirm.
+    */
+   bool is_barrier;
+};
+
+/**
+ * Lower bound of the scheduling time after which one of the instructions
+ * blocked by this node may lead to program termination.  A node without a
+ * preferred exit node yields INT_MAX.
+ *
+ * exit_unblocked_time() determines a strict partial ordering relation '«' on
+ * the set of scheduler nodes as follows:
+ *
+ *   n « m <-> exit_unblocked_time(n) < exit_unblocked_time(m)
+ *
+ * which can be used to heuristically order nodes according to how early they
+ * can unblock an exit node and lead to program termination.
+ */
+static inline int
+exit_unblocked_time(const schedule_node *n)
+{
+   if (n->exit == NULL)
+      return INT_MAX;
+
+   return n->exit->unblocked_time;
+}
+
+/**
+ * Set this node's latency for the gen4 model.  The math opcodes listed
+ * below cost a per-opcode number of rounds multiplied by 8 channels and a
+ * math latency of 22; every other opcode gets a latency of 2.
+ */
+void
+schedule_node::set_latency_gen4()
+{
+   const int chans = 8;
+   const int math_latency = 22;
+   int rounds;
+
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+      rounds = 1;
+      break;
+   case SHADER_OPCODE_RSQ:
+      rounds = 2;
+      break;
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_LOG2:
+      /* full precision log.  partial is 2. */
+      rounds = 3;
+      break;
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_EXP2:
+      /* full precision.  partial is 3, same throughput. */
+      rounds = 4;
+      break;
+   case SHADER_OPCODE_POW:
+      rounds = 8;
+      break;
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* minimum latency, max is 12 rounds. */
+      rounds = 5;
+      break;
+   default:
+      /* Not one of the math opcodes handled above. */
+      this->latency = 2;
+      return;
+   }
+
+   this->latency = rounds * chans * math_latency;
+}
+
+/**
+ * Set this node's latency from the opcode of its instruction, using the
+ * estimates documented case by case below.
+ *
+ * \param is_haswell  selects the alternative value in the cases that
+ *                    distinguish it (MAD, the math opcodes, POW and the
+ *                    surface read/write opcodes).
+ */
+void
+schedule_node::set_latency_gen7(bool is_haswell)
+{
+   switch (inst->opcode) {
+   case BRW_OPCODE_MAD:
+      /* 2 cycles
+       *  (since the last two src operands are in different register banks):
+       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+       *
+       * 3 cycles on IVB, 4 on HSW
+       *  (since the last two src operands are in the same register bank):
+       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+       *
+       * 18 cycles on IVB, 16 on HSW
+       *  (since the last two src operands are in different register banks):
+       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
+       *
+       * 20 cycles on IVB, 18 on HSW
+       *  (since the last two src operands are in the same register bank):
+       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
+       */
+
+      /* Our register allocator doesn't know about register banks, so use the
+       * higher latency.
+       */
+      /* NOTE(review): the values used here are the 18/16 figures listed above
+       * for operands in different banks, not the 20/18 same-bank figures --
+       * confirm which is intended.
+       */
+      latency = is_haswell ? 16 : 18;
+      break;
+
+   case BRW_OPCODE_LRP:
+      /* 2 cycles
+       *  (since the last two src operands are in different register banks):
+       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+       *
+       * 3 cycles on IVB, 4 on HSW
+       *  (since the last two src operands are in the same register bank):
+       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+       *
+       * 16 cycles on IVB, 14 on HSW
+       *  (since the last two src operands are in different register banks):
+       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
+       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
+       *
+       * 16 cycles
+       *  (since the last two src operands are in the same register bank):
+       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
+       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
+       */
+
+      /* Our register allocator doesn't know about register banks, so use the
+       * higher latency.
+       */
+      /* NOTE(review): a single value is used regardless of is_haswell, and it
+       * is below the 16-cycle figures listed above -- confirm this is
+       * intended.
+       */
+      latency = 14;
+      break;
+
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      /* 2 cycles:
+       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
+       *
+       * 18 cycles:
+       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
+       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * Same for exp2, log2, rsq, sqrt, sin, cos.
+       */
+      latency = is_haswell ? 14 : 16;
+      break;
+
+   case SHADER_OPCODE_POW:
+      /* 2 cycles:
+       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
+       *
+       * 26 cycles:
+       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
+       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
+       */
+      latency = is_haswell ? 22 : 24;
+      break;
+
+   case SHADER_OPCODE_TEX:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXL_LZ:
+      /* 18 cycles:
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       *
+       * 697 +/-49 cycles (min 610, n=26):
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * So the latency on our first texture load of the batchbuffer takes
+       * ~700 cycles, since the caches are cold at that point.
+       *
+       * 840 +/- 92 cycles (min 720, n=25):
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * On the second load, it takes just an extra ~140 cycles, and after
+       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
+       *
+       * 683 +/- 49 cycles (min = 602, n=47):
+       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
+       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
+       * send(8) g4<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * send(8) g50<1>UW   g114<8,8,1>F
+       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
+       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
+       *
+       * The unit appears to be pipelined, since this matches up with the
+       * cache-cold case, despite there being two loads here.  If you replace
+       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
+       *
+       * So, take some number between the cache-hot 140 cycles and the
+       * cache-cold 700 cycles.  No particular tuning was done on this.
+       *
+       * I haven't done significant testing of the non-TEX opcodes.  TXL at
+       * least looked about the same as TEX.
+       */
+      latency = 200;
+      break;
+
+   case SHADER_OPCODE_TXS:
+      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
+       * cycles (n=15):
+       * mov(8)   g114<1>UD  0D                        { align1 WE_normal 1Q };
+       * send(8)  g6<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 10, 1) mlen 1 rlen 4        { align1 WE_normal 1Q };
+       * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
+       *
+       *
+       * Two loads was 535 +/- 30 cycles (n=19):
+       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
+       * send(16)  g6<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
+       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
+       * mov(16)   g6<1>F     g6<8,8,1>D               { align1 WE_normal 1H };
+       * send(16)  g8<1>UW    g114<8,8,1>F
+       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
+       * mov(16)   g8<1>F     g8<8,8,1>D               { align1 WE_normal 1H };
+       * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
+       *
+       * Since the only caches that should matter are just the
+       * instruction/state cache containing the surface state, assume that we
+       * always have hot caches.
+       */
+      latency = 100;
+      break;
+
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+      /* testing using varying-index pull constants:
+       *
+       * 16 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       *
+       * ~480 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
+       *
+       * ~620 cycles:
+       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
+       * send(8) g4<1>F  g4<8,8,1>D
+       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
+       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
+       *
+       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
+       * about 460.  We expect to mostly be cache hot, so pick something more
+       * in that direction.
+       */
+      latency = 200;
+      break;
+
+   case SHADER_OPCODE_GEN7_SCRATCH_READ:
+      /* Testing a load from offset 0, that had been previously written:
+       *
+       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
+       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
+       *
+       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
+       * then around 140.  Presumably this is cache hit vs miss.
+       */
+      latency = 50;
+      break;
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+      /* Test code:
+       *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
+       *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>ud         g112<8,8,1>ud
+       *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *
+       * Running it 100 times as fragment shader on a 128x128 quad
+       * gives an average latency of 13867 cycles per atomic op,
+       * standard deviation 3%.  Note that this is a rather
+       * pessimistic estimate, the actual latency in cases with few
+       * collisions between threads and favorable pipelining has been
+       * seen to be reduced by a factor of 100.
+       */
+      latency = 14000;
+      break;
+
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+      /* Test code:
+       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
+       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>UD         g112<8,8,1>UD
+       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *   .
+       *   . [repeats 8 times]
+       *   .
+       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
+       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
+       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
+       *   send(8)   g4<1>UD         g112<8,8,1>UD
+       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
+       *
+       * Running it 100 times as fragment shader on a 128x128 quad
+       * gives an average latency of 583 cycles per surface read,
+       * standard deviation 0.9%.
+       */
+      latency = is_haswell ? 300 : 600;
+      break;
+
+   default:
+      /* 2 cycles:
+       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
+       *
+       * 16 cycles:
+       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
+       * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
+       */
+      latency = 14;
+      break;
+   }
+}
+
+class instruction_scheduler {
+public:
+   /**
+    * Record the shader being scheduled and the register counts, and create
+    * the ralloc context used for the scheduler's own allocations.
+    *
+    * The liveness and register-pressure tables are only allocated when
+    * \p mode is not SCHEDULE_POST; in post-RA mode they all stay NULL.
+    * Note that reg_pressure and block_idx are not initialized here.
+    */
+   instruction_scheduler(backend_shader *s, int grf_count,
+                         int hw_reg_count, int block_count,
+                         instruction_scheduler_mode mode)
+   {
+      const bool pre_ra = (mode != SCHEDULE_POST);
+
+      this->bs = s;
+      this->mode = mode;
+      this->post_reg_alloc = !pre_ra;
+      this->grf_count = grf_count;
+      this->hw_reg_count = hw_reg_count;
+      this->instructions_to_schedule = 0;
+      this->instructions.make_empty();
+      this->mem_ctx = ralloc_context(NULL);
+
+      /* Start with no pressure-tracking tables at all. */
+      this->reg_pressure_in = NULL;
+      this->livein = NULL;
+      this->liveout = NULL;
+      this->hw_liveout = NULL;
+      this->written = NULL;
+      this->reads_remaining = NULL;
+      this->hw_reads_remaining = NULL;
+
+      if (!pre_ra)
+         return;
+
+      /* Zero-initialized per-block and per-register counters. */
+      this->reg_pressure_in = rzalloc_array(mem_ctx, int, block_count);
+      this->written = rzalloc_array(mem_ctx, bool, grf_count);
+      this->reads_remaining = rzalloc_array(mem_ctx, int, grf_count);
+      this->hw_reads_remaining = rzalloc_array(mem_ctx, int, hw_reg_count);
+
+      /* One zeroed bitset per basic block for each of the three sets. */
+      this->livein = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+      this->liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+      this->hw_liveout = ralloc_array(mem_ctx, BITSET_WORD *, block_count);
+      for (int b = 0; b < block_count; b++) {
+         this->livein[b] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                         BITSET_WORDS(grf_count));
+         this->liveout[b] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                          BITSET_WORDS(grf_count));
+         this->hw_liveout[b] = rzalloc_array(mem_ctx, BITSET_WORD,
+                                             BITSET_WORDS(hw_reg_count));
+      }
+   }
+
+   /** Free the ralloc context and everything allocated out of it. */
+   ~instruction_scheduler()
+   {
+      ralloc_free(this->mem_ctx);
+   }
+
+   /* Dependency graph construction. */
+   void add_barrier_deps(schedule_node *n);
+   void add_dep(schedule_node *before, schedule_node *after, int latency);
+   void add_dep(schedule_node *before, schedule_node *after);
+
+   void run(cfg_t *cfg);
+   void add_insts_from_block(bblock_t *block);
+   void compute_delays();
+   void compute_exits();
+   virtual void calculate_deps() = 0;
+   virtual schedule_node *choose_instruction_to_schedule() = 0;
+
+   /**
+    * Number of cycles the given instruction needs in order to issue.
+    *
+    * Gen hardware processes instructions one SIMD4 vector at a time at a
+    * rate of one vector per cycle, so a SIMD8 pixel shader instruction needs
+    * 2 cycles to dispatch and a SIMD16 (compressed) instruction needs 4.
+    */
+   virtual int issue_time(backend_instruction *inst) = 0;
+
+   virtual void count_reads_remaining(backend_instruction *inst) = 0;
+   virtual void setup_liveness(cfg_t *cfg) = 0;
+   virtual void update_register_pressure(backend_instruction *inst) = 0;
+   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
+
+   void schedule_instructions(bblock_t *block);
+
+   /** ralloc context the scheduler allocates from. */
+   void *mem_ctx;
+
+   /** True when the scheduler was constructed with SCHEDULE_POST. */
+   bool post_reg_alloc;
+   /** Instruction count of the block loaded by add_insts_from_block(). */
+   int instructions_to_schedule;
+   /** Number of entries in written[] and reads_remaining[]. */
+   int grf_count;
+   /** Number of entries in hw_reads_remaining[]. */
+   int hw_reg_count;
+   int reg_pressure;
+   /** Block index used to look up livein/liveout/hw_liveout. */
+   int block_idx;
+   /** schedule_node list filled in by add_insts_from_block(). */
+   exec_list instructions;
+   /** Shader whose instructions are being scheduled. */
+   backend_shader *bs;
+
+   /** Scheduling mode passed to the constructor. */
+   instruction_scheduler_mode mode;
+
+   /** Register pressure at the start of each basic block. */
+   int *reg_pressure_in;
+
+   /** Per block: virtual GRFs whose range overlaps the block's start. */
+   BITSET_WORD **livein;
+
+   /** Per block: virtual GRFs whose range overlaps the block's end. */
+   BITSET_WORD **liveout;
+
+   /** Per block: hardware GRFs whose range overlaps the block's end. */
+   BITSET_WORD **hw_liveout;
+
+   /** Per virtual GRF: whether a write to it has been scheduled yet. */
+   bool *written;
+
+   /** Per virtual GRF: reads of it that have not been scheduled yet. */
+   int *reads_remaining;
+
+   /** Per hardware GRF: reads of it that have not been scheduled yet. */
+   int *hw_reads_remaining;
+};
+
+class fs_instruction_scheduler : public instruction_scheduler
+{
+public:
+   fs_instruction_scheduler(fs_visitor *v, int grf_count, int hw_reg_count,
+                            int block_count, instruction_scheduler_mode mode);
+   /* instruction_scheduler hooks, implemented for fs_inst. */
+   void calculate_deps();
+   schedule_node *choose_instruction_to_schedule();
+   int issue_time(backend_instruction *inst);
+   void count_reads_remaining(backend_instruction *inst);
+   void setup_liveness(cfg_t *cfg);
+   void update_register_pressure(backend_instruction *inst);
+   int get_register_pressure_benefit(backend_instruction *inst);
+
+   bool is_compressed(fs_inst *inst);
+   fs_visitor *v; /**< Visitor this scheduler was created for. */
+};
+
+fs_instruction_scheduler::fs_instruction_scheduler(
+   fs_visitor *v, int grf_count, int hw_reg_count, int block_count,
+   instruction_scheduler_mode mode)
+   : instruction_scheduler(v, grf_count, hw_reg_count, block_count, mode),
+     v(v)
+{
+   /* All state is set up by the base class and the member initializer. */
+}
+
+/* Does source \p src repeat one of the sources that come before it? */
+static bool
+is_src_duplicate(fs_inst *inst, int src)
+{
+   bool seen_before = false;
+   for (int prev = 0; prev < src && !seen_before; prev++)
+      seen_before = inst->src[prev].equals(inst->src[src]);
+   return seen_before;
+}
+
+/* Add the registers read by \p be to the outstanding-read counters. */
+void
+fs_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+{
+   /* Nothing to count when the read counters were never allocated. */
+   if (reads_remaining == NULL)
+      return;
+
+   fs_inst *inst = (fs_inst *)be;
+   for (int s = 0; s < inst->sources; s++) {
+      /* A source repeating an earlier one must not be counted twice. */
+      if (is_src_duplicate(inst, s))
+         continue;
+
+      if (inst->src[s].file == VGRF) {
+         reads_remaining[inst->src[s].nr]++;
+      } else if (inst->src[s].file == FIXED_GRF &&
+                 inst->src[s].nr < hw_reg_count) {
+         for (unsigned r = 0; r < regs_read(inst, s); r++)
+            hw_reads_remaining[inst->src[s].nr + r]++;
+      }
+   }
+}
+
+/* Compute the per-block liveness sets and incoming register pressure. */
+void
+fs_instruction_scheduler::setup_liveness(cfg_t *cfg)
+{
+   /* Step 1: translate the per-variable in/out sets of the liveness
+    * analysis into per-VGRF sets, charging each live-in VGRF only once.
+    */
+   for (int b = 0; b < cfg->num_blocks; b++) {
+      for (int var = 0; var < v->live_intervals->num_vars; var++) {
+         const int vgrf = v->live_intervals->vgrf_from_var[var];
+         if (BITSET_TEST(v->live_intervals->block_data[b].livein, var) &&
+             !BITSET_TEST(livein[b], vgrf)) {
+            BITSET_SET(livein[b], vgrf);
+            reg_pressure_in[b] += v->alloc.sizes[vgrf];
+         }
+         if (BITSET_TEST(v->live_intervals->block_data[b].liveout, var))
+            BITSET_SET(liveout[b], vgrf);
+      }
+   }
+
+   /* Step 2: a range that spans the boundary between two consecutive blocks
+    * is made live out of the first and live into the second, which matches
+    * what the register allocator/interference code does to account for
+    * force_writemask_all and incompatible exec_masks.
+    */
+   for (int b = 1; b < cfg->num_blocks; b++) {
+      for (int vgrf = 0; vgrf < grf_count; vgrf++) {
+         if (v->virtual_grf_start[vgrf] > cfg->blocks[b - 1]->end_ip ||
+             v->virtual_grf_end[vgrf] < cfg->blocks[b]->start_ip)
+            continue;
+
+         if (!BITSET_TEST(livein[b], vgrf)) {
+            BITSET_SET(livein[b], vgrf);
+            reg_pressure_in[b] += v->alloc.sizes[vgrf];
+         }
+         BITSET_SET(liveout[b - 1], vgrf);
+      }
+   }
+
+   /* Step 3: account for the payload registers, using their last use IP. */
+   int payload_last_use_ip[hw_reg_count];
+   v->calculate_payload_ranges(hw_reg_count, payload_last_use_ip);
+
+   for (int reg = 0; reg < hw_reg_count; reg++) {
+      const int last_use = payload_last_use_ip[reg];
+      if (last_use == -1)
+         continue;
+      for (int b = 0; b < cfg->num_blocks; b++) {
+         if (cfg->blocks[b]->start_ip <= last_use)
+            reg_pressure_in[b]++;
+         if (cfg->blocks[b]->end_ip <= last_use)
+            BITSET_SET(hw_liveout[b], reg);
+      }
+   }
+}
+
+/* Update the written flags and outstanding-read counters for \p be. */
+void
+fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
+{
+   if (reads_remaining == NULL)
+      return;
+
+   fs_inst *inst = (fs_inst *)be;
+   if (inst->dst.file == VGRF)
+      written[inst->dst.nr] = true;
+
+   for (int s = 0; s < inst->sources; s++) {
+      if (is_src_duplicate(inst, s))
+         continue;
+
+      if (inst->src[s].file == VGRF) {
+         reads_remaining[inst->src[s].nr]--;
+      } else if (inst->src[s].file == FIXED_GRF) {
+         if (inst->src[s].nr >= hw_reg_count)
+            continue;
+         for (unsigned r = 0; r < regs_read(inst, s); r++)
+            hw_reads_remaining[inst->src[s].nr + r]--;
+      }
+   }
+}
+
+/* Net register pressure change expected from \p be: freed minus defined. */
+int
+fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+   fs_inst *inst = (fs_inst *)be;
+   int freed = 0, defined = 0;
+
+   /* Destination VGRF that is not live-in and has not been written yet. */
+   if (inst->dst.file == VGRF &&
+       !BITSET_TEST(livein[block_idx], inst->dst.nr) &&
+       !written[inst->dst.nr])
+      defined = v->alloc.sizes[inst->dst.nr];
+
+   for (int s = 0; s < inst->sources; s++) {
+      if (is_src_duplicate(inst, s))
+         continue;
+
+      if (inst->src[s].file == VGRF) {
+         /* Last outstanding read of a VGRF that is not live out. */
+         if (!BITSET_TEST(liveout[block_idx], inst->src[s].nr) &&
+             reads_remaining[inst->src[s].nr] == 1)
+            freed += v->alloc.sizes[inst->src[s].nr];
+      } else if (inst->src[s].file == FIXED_GRF &&
+                 inst->src[s].nr < hw_reg_count) {
+         for (unsigned r = 0; r < regs_read(inst, s); r++) {
+            const int reg = inst->src[s].nr + r;
+            if (!BITSET_TEST(hw_liveout[block_idx], reg) &&
+                hw_reads_remaining[reg] == 1)
+               freed++;
+         }
+      }
+   }
+
+   return freed - defined;
+}
+
+class vec4_instruction_scheduler : public instruction_scheduler
+{
+public:
+   vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
+   /* instruction_scheduler hooks, implemented for vec4_instruction. */
+   void calculate_deps();
+   schedule_node *choose_instruction_to_schedule();
+   int issue_time(backend_instruction *inst);
+   void count_reads_remaining(backend_instruction *inst);
+   void setup_liveness(cfg_t *cfg);
+   void update_register_pressure(backend_instruction *inst);
+   int get_register_pressure_benefit(backend_instruction *inst);
+   vec4_visitor *v; /**< Visitor this scheduler was created for. */
+};
+
+/* The vec4 scheduler always runs in SCHEDULE_POST mode. */
+vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
+                                                       int grf_count)
+   : instruction_scheduler(v, grf_count, 0, 0, SCHEDULE_POST), v(v)
+{
+}
+
+void
+vec4_instruction_scheduler::count_reads_remaining(backend_instruction *be)
+{ /* No-op: the vec4 scheduler's read counters are never allocated. */
+}
+
+void
+vec4_instruction_scheduler::setup_liveness(cfg_t *cfg)
+{ /* No-op: no liveness tables are allocated for the vec4 scheduler. */
+}
+
+void
+vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
+{ /* No-op: register pressure is not tracked by the vec4 scheduler. */
+}
+
+int
+vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
+{
+   return 0; /* register pressure is not tracked by the vec4 scheduler */
+}
+
+/* Build a dependency-free scheduling node wrapping \p inst. */
+schedule_node::schedule_node(backend_instruction *inst,
+                             instruction_scheduler *sched)
+{
+   const struct gen_device_info *devinfo = sched->bs->devinfo;
+
+   this->inst = inst;
+   this->is_barrier = false;
+   this->exit = NULL;
+   this->children = NULL;
+   this->child_latency = NULL;
+   this->child_array_size = 0;
+   this->child_count = 0;
+   this->parent_count = 0;
+   this->unblocked_time = 0;
+   this->cand_generation = 0;
+   this->delay = 0;
+
+   /* Gen6 timings can't be measured directly, but they are expected to be
+    * much closer to Gen7 than to Gen4, so Gen6 uses the Gen7 latencies. */
+   if (sched->post_reg_alloc && devinfo->gen >= 6)
+      set_latency_gen7(devinfo->is_haswell);
+   else if (sched->post_reg_alloc)
+      set_latency_gen4();
+   else
+      this->latency = 1;
+}
+
+/* Wrap every instruction of \p block in a node and queue it, in order. */
+void
+instruction_scheduler::add_insts_from_block(bblock_t *block)
+{
+   this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
+
+   foreach_inst_in_block(backend_instruction, inst, block) {
+      schedule_node *node = new(mem_ctx) schedule_node(inst, this);
+      instructions.push_tail(node);
+   }
+}
+
+/** Compute the delay member of every node, from the last one backwards. */
+void
+instruction_scheduler::compute_delays()
+{
+   foreach_in_list_reverse(schedule_node, node, &instructions) {
+      if (node->child_count == 0) {
+         node->delay = issue_time(node->inst);
+         continue;
+      }
+      for (int c = 0; c < node->child_count; c++) {
+         assert(node->children[c]->delay);
+         node->delay = MAX2(node->delay, node->latency + node->children[c]->delay);
+      }
+   }
+}
+
+void
+instruction_scheduler::compute_exits()
+{
+   /* Forward pass: lower bound of the time at which each node can be
+    * scheduled, i.e. its critical path measured from the top of the block.
+    */
+   foreach_in_list(schedule_node, node, &instructions) {
+      for (int c = 0; c < node->child_count; c++) {
+         schedule_node *child = node->children[c];
+         const int ready = node->unblocked_time + issue_time(node->inst) +
+                           node->child_latency[c];
+         if (child->unblocked_time < ready)
+            child->unblocked_time = ready;
+      }
+   }
+
+   /* Backward pass: the exit of a node is found by induction from the exits
+    * of its children; among those, prefer the one the optimistic estimate
+    * above says can be unblocked first.
+    */
+   foreach_in_list_reverse(schedule_node, node, &instructions) {
+      node->exit = node->inst->opcode == FS_OPCODE_DISCARD_JUMP ? node : NULL;
+      for (int c = 0; c < node->child_count; c++) {
+         schedule_node *child = node->children[c];
+         if (exit_unblocked_time(child) < exit_unblocked_time(node))
+            node->exit = child->exit;
+      }
+   }
+}
+
+/**
+ * Record that @after depends on @before.
+ *
+ * @after will be scheduled after @before, preferably at least @latency
+ * cycles later, although that is not guaranteed.  If the edge already
+ * exists, only its latency is raised to @latency when that is larger.
+ */
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
+                               int latency)
+{
+   if (before == NULL || after == NULL)
+      return;
+
+   assert(before != after);
+
+   /* Existing edge: keep the larger of the two latencies. */
+   for (int c = 0; c < before->child_count; c++) {
+      if (before->children[c] != after)
+         continue;
+      if (before->child_latency[c] < latency)
+         before->child_latency[c] = latency;
+      return;
+   }
+
+   /* Grow both child arrays together: 16 entries first, then doubling. */
+   if (before->child_count >= before->child_array_size) {
+      before->child_array_size = before->child_array_size < 16 ?
+                                 16 : before->child_array_size * 2;
+      before->children = reralloc(mem_ctx, before->children,
+                                  schedule_node *, before->child_array_size);
+      before->child_latency = reralloc(mem_ctx, before->child_latency,
+                                       int, before->child_array_size);
+   }
+
+   const int slot = before->child_count++;
+   before->children[slot] = after;
+   before->child_latency[slot] = latency;
+   after->parent_count++;
+}
+
+/* Add a dependency whose latency is the latency of @before itself. */
+void
+instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
+{
+   /* Checked here because before->latency is read below. */
+   if (before != NULL)
+      add_dep(before, after, before->latency);
+}
+
+/**
+ * Turn @n into a scheduling barrier.
+ *
+ * @n is made to execute after everything that precedes it and before
+ * everything that follows it.  In each direction the walk stops once a node
+ * that is already a barrier has been linked (presumably because that node
+ * already orders whatever lies beyond it).
+ */
+void
+instruction_scheduler::add_barrier_deps(schedule_node *n)
+{
+   n->is_barrier = true;
+
+   /* Walk backwards from n, stopping after the closest earlier barrier. */
+   for (schedule_node *before = (schedule_node *)n->prev;
+        before && !before->is_head_sentinel();
+        before = (schedule_node *)before->prev) {
+      add_dep(before, n, 0);
+      if (before->is_barrier)
+         break;
+   }
+
+   /* Walk forwards from n, stopping after the closest later barrier. */
+   for (schedule_node *later = (schedule_node *)n->next;
+        later && !later->is_tail_sentinel();
+        later = (schedule_node *)later->next) {
+      add_dep(n, later, 0);
+      if (later->is_barrier)
+         break;
+   }
+}
+
+/* Tell the scheduler when an MRF write actually covers 2 MRFs, which is
+ * taken to be the case for instructions whose execution size is 16. */
+bool
+fs_instruction_scheduler::is_compressed(fs_inst *inst)
+{
+   const bool is_simd16 = (inst->exec_size == 16);
+   return is_simd16;
+}
+
+static bool
+is_scheduling_barrier(const fs_inst *inst)
+{
+   /* Placeholder halts, control flow and side effects must stay put. */
+   const bool halt = inst->opcode == FS_OPCODE_PLACEHOLDER_HALT;
+   return halt || inst->is_control_flow() || inst->has_side_effects();
+}
+
+void
+fs_instruction_scheduler::calculate_deps()
+{
+   /* Build the dependency graph of the instructions queued in the list.
+    * last_grf_write is indexed by nr * 16 + register offset before register
+    * allocation, and directly by GRF number after register allocation.
+    */
+   schedule_node *last_grf_write[grf_count * 16];
+   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
+   schedule_node *last_conditional_mod[4] = {}; /* per flag-mask bit */
+   schedule_node *last_accumulator_write = NULL;
+   /* Fixed HW registers are assumed to be separate from the virtual
+    * GRFs, so they can be tracked separately.  We don't really write
+    * to fixed GRFs much, so don't bother tracking them on a more
+    * granular level.
+    */
+   schedule_node *last_fixed_grf_write = NULL;
+
+   memset(last_grf_write, 0, sizeof(last_grf_write));
+   memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+   /* Top-to-bottom pass: read-after-write and write-after-write deps. */
+   foreach_in_list(schedule_node, n, &instructions) {
+      fs_inst *inst = (fs_inst *)n->inst;
+
+      if (is_scheduling_barrier(inst))
+         add_barrier_deps(n);
+
+      /* Read-after-write: depend on the last writer of each source. */
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF) {
+            if (post_reg_alloc) {
+               for (unsigned r = 0; r < regs_read(inst, i); r++)
+                  add_dep(last_grf_write[inst->src[i].nr + r], n);
+            } else {
+               for (unsigned r = 0; r < regs_read(inst, i); r++) {
+                  add_dep(last_grf_write[inst->src[i].nr * 16 +
+                                         inst->src[i].offset / REG_SIZE + r], n);
+               }
+            }
+         } else if (inst->src[i].file == FIXED_GRF) {
+            if (post_reg_alloc) {
+               for (unsigned r = 0; r < regs_read(inst, i); r++)
+                  add_dep(last_grf_write[inst->src[i].nr + r], n);
+            } else {
+               add_dep(last_fixed_grf_write, n);
+            }
+         } else if (inst->src[i].is_accumulator()) {
+            add_dep(last_accumulator_write, n);
+         } else if (inst->src[i].file == ARF) {
+            add_barrier_deps(n);
+         }
+      }
+
+      if (inst->base_mrf != -1) {
+         for (int i = 0; i < inst->mlen; i++) {
+            /* It looks like the MRF regs are released in the send
+             * instruction once it's sent, not when the result comes
+             * back.
+             */
+            add_dep(last_mrf_write[inst->base_mrf + i], n);
+         }
+      }
+
+      if (const unsigned mask = inst->flags_read(v->devinfo)) {
+         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+            if (mask & (1 << i))
+               add_dep(last_conditional_mod[i], n);
+         }
+      }
+
+      if (inst->reads_accumulator_implicitly()) {
+         add_dep(last_accumulator_write, n);
+      }
+
+      /* Write-after-write: depend on, then become, the last writer. */
+      if (inst->dst.file == VGRF) {
+         if (post_reg_alloc) {
+            for (unsigned r = 0; r < regs_written(inst); r++) {
+               add_dep(last_grf_write[inst->dst.nr + r], n);
+               last_grf_write[inst->dst.nr + r] = n;
+            }
+         } else {
+            for (unsigned r = 0; r < regs_written(inst); r++) {
+               add_dep(last_grf_write[inst->dst.nr * 16 +
+                                      inst->dst.offset / REG_SIZE + r], n);
+               last_grf_write[inst->dst.nr * 16 +
+                              inst->dst.offset / REG_SIZE + r] = n;
+            }
+         }
+      } else if (inst->dst.file == MRF) {
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
+
+         add_dep(last_mrf_write[reg], n);
+         last_mrf_write[reg] = n;
+         if (is_compressed(inst)) {
+            if (inst->dst.nr & BRW_MRF_COMPR4)
+               reg += 4;
+            else
+               reg++;
+            add_dep(last_mrf_write[reg], n);
+            last_mrf_write[reg] = n;
+         }
+      } else if (inst->dst.file == FIXED_GRF) { /* NOTE(review): no WAW dep here */
+         if (post_reg_alloc) {
+            for (unsigned r = 0; r < regs_written(inst); r++)
+               last_grf_write[inst->dst.nr + r] = n;
+         } else {
+            last_fixed_grf_write = n;
+         }
+      } else if (inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+         add_barrier_deps(n);
+      }
+
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
+         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+            add_dep(last_mrf_write[inst->base_mrf + i], n);
+            last_mrf_write[inst->base_mrf + i] = n;
+         }
+      }
+
+      if (const unsigned mask = inst->flags_written()) {
+         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+            if (mask & (1 << i)) {
+               add_dep(last_conditional_mod[i], n, 0);
+               last_conditional_mod[i] = n;
+            }
+         }
+      }
+
+      if (inst->writes_accumulator_implicitly(v->devinfo) &&
+          !inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      }
+   }
+
+   /* Bottom-to-top pass: write-after-read deps, with trackers reset. */
+   memset(last_grf_write, 0, sizeof(last_grf_write));
+   memset(last_mrf_write, 0, sizeof(last_mrf_write));
+   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
+   last_accumulator_write = NULL;
+   last_fixed_grf_write = NULL;
+
+   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
+      fs_inst *inst = (fs_inst *)n->inst;
+
+      /* Write-after-read: the next writer of each source must follow n. */
+      for (int i = 0; i < inst->sources; i++) {
+         if (inst->src[i].file == VGRF) {
+            if (post_reg_alloc) {
+               for (unsigned r = 0; r < regs_read(inst, i); r++)
+                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
+            } else {
+               for (unsigned r = 0; r < regs_read(inst, i); r++) {
+                  add_dep(n, last_grf_write[inst->src[i].nr * 16 +
+                                            inst->src[i].offset / REG_SIZE + r], 0);
+               }
+            }
+         } else if (inst->src[i].file == FIXED_GRF) {
+            if (post_reg_alloc) {
+               for (unsigned r = 0; r < regs_read(inst, i); r++)
+                  add_dep(n, last_grf_write[inst->src[i].nr + r], 0);
+            } else {
+               add_dep(n, last_fixed_grf_write, 0);
+            }
+         } else if (inst->src[i].is_accumulator()) {
+            add_dep(n, last_accumulator_write, 0);
+         } else if (inst->src[i].file == ARF) {
+            add_barrier_deps(n);
+         }
+      }
+
+      if (inst->base_mrf != -1) {
+         for (int i = 0; i < inst->mlen; i++) {
+            /* It looks like the MRF regs are released in the send
+             * instruction once it's sent, not when the result comes
+             * back.
+             */
+            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+         }
+      }
+
+      if (const unsigned mask = inst->flags_read(v->devinfo)) {
+         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+            if (mask & (1 << i))
+               add_dep(n, last_conditional_mod[i]);
+         }
+      }
+
+      if (inst->reads_accumulator_implicitly()) {
+         add_dep(n, last_accumulator_write);
+      }
+
+      /* Record what this instruction writes, so that the reads found
+       * earlier in the list can add a WAR dependency on it.
+       */
+      if (inst->dst.file == VGRF) {
+         if (post_reg_alloc) {
+            for (unsigned r = 0; r < regs_written(inst); r++)
+               last_grf_write[inst->dst.nr + r] = n;
+         } else {
+            for (unsigned r = 0; r < regs_written(inst); r++) {
+               last_grf_write[inst->dst.nr * 16 +
+                              inst->dst.offset / REG_SIZE + r] = n;
+            }
+         }
+      } else if (inst->dst.file == MRF) {
+         int reg = inst->dst.nr & ~BRW_MRF_COMPR4;
+
+         last_mrf_write[reg] = n;
+
+         if (is_compressed(inst)) {
+            if (inst->dst.nr & BRW_MRF_COMPR4)
+               reg += 4;
+            else
+               reg++;
+
+            last_mrf_write[reg] = n;
+         }
+      } else if (inst->dst.file == FIXED_GRF) {
+         if (post_reg_alloc) {
+            for (unsigned r = 0; r < regs_written(inst); r++)
+               last_grf_write[inst->dst.nr + r] = n;
+         } else {
+            last_fixed_grf_write = n;
+         }
+      } else if (inst->dst.is_accumulator()) {
+         last_accumulator_write = n;
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+         add_barrier_deps(n);
+      }
+
+      if (inst->mlen > 0 && inst->base_mrf != -1) {
+         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+            last_mrf_write[inst->base_mrf + i] = n;
+         }
+      }
+
+      if (const unsigned mask = inst->flags_written()) {
+         assert(mask < (1 << ARRAY_SIZE(last_conditional_mod)));
+
+         for (unsigned i = 0; i < ARRAY_SIZE(last_conditional_mod); i++) {
+            if (mask & (1 << i))
+               last_conditional_mod[i] = n;
+         }
+      }
+
+      if (inst->writes_accumulator_implicitly(v->devinfo)) {
+         last_accumulator_write = n;
+      }
+   }
+}
+
+/* Control flow and instructions with side effects must stay put. */
+static bool
+is_scheduling_barrier(const vec4_instruction *inst)
+{
+   return inst->is_control_flow() || inst->has_side_effects();
+}
+
+void
+vec4_instruction_scheduler::calculate_deps()
+{ /* Build the dependency graph of the vec4 instructions in the list. */
+   schedule_node *last_grf_write[grf_count]; /* indexed by GRF number */
+   schedule_node *last_mrf_write[BRW_MAX_MRF(v->devinfo->gen)];
+   schedule_node *last_conditional_mod = NULL;
+   schedule_node *last_accumulator_write = NULL;
+   /* Fixed HW registers are assumed to be separate from the virtual
+    * GRFs, so they can be tracked separately.  We don't really write
+    * to fixed GRFs much, so don't bother tracking them on a more
+    * granular level.
+    */
+   schedule_node *last_fixed_grf_write = NULL;
+
+   memset(last_grf_write, 0, sizeof(last_grf_write));
+   memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+   /* Top-to-bottom pass: read-after-write and write-after-write deps. */
+   foreach_in_list(schedule_node, n, &instructions) {
+      vec4_instruction *inst = (vec4_instruction *)n->inst;
+
+      if (is_scheduling_barrier(inst))
+         add_barrier_deps(n);
+
+      /* Read-after-write: depend on the last writer of each source. */
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file == VGRF) {
+            for (unsigned j = 0; j < regs_read(inst, i); ++j)
+               add_dep(last_grf_write[inst->src[i].nr + j], n);
+         } else if (inst->src[i].file == FIXED_GRF) {
+            add_dep(last_fixed_grf_write, n);
+         } else if (inst->src[i].is_accumulator()) {
+            assert(last_accumulator_write);
+            add_dep(last_accumulator_write, n);
+         } else if (inst->src[i].file == ARF) {
+            add_barrier_deps(n);
+         }
+      }
+
+      if (!inst->is_send_from_grf()) {
+         for (int i = 0; i < inst->mlen; i++) {
+            /* It looks like the MRF regs are released in the send
+             * instruction once it's sent, not when the result comes
+             * back.
+             */
+            add_dep(last_mrf_write[inst->base_mrf + i], n);
+         }
+      }
+
+      if (inst->reads_flag()) {
+         assert(last_conditional_mod);
+         add_dep(last_conditional_mod, n);
+      }
+
+      if (inst->reads_accumulator_implicitly()) {
+         assert(last_accumulator_write);
+         add_dep(last_accumulator_write, n);
+      }
+
+      /* Write-after-write: depend on, then become, the last writer. */
+      if (inst->dst.file == VGRF) {
+         for (unsigned j = 0; j < regs_written(inst); ++j) {
+            add_dep(last_grf_write[inst->dst.nr + j], n);
+            last_grf_write[inst->dst.nr + j] = n;
+         }
+      } else if (inst->dst.file == MRF) {
+         add_dep(last_mrf_write[inst->dst.nr], n);
+         last_mrf_write[inst->dst.nr] = n;
+     } else if (inst->dst.file == FIXED_GRF) { /* NOTE(review): no WAW dep here */
+         last_fixed_grf_write = n;
+      } else if (inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+         add_barrier_deps(n);
+      }
+
+      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
+         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+            add_dep(last_mrf_write[inst->base_mrf + i], n);
+            last_mrf_write[inst->base_mrf + i] = n;
+         }
+      }
+
+      if (inst->writes_flag()) {
+         add_dep(last_conditional_mod, n, 0);
+         last_conditional_mod = n;
+      }
+
+      if (inst->writes_accumulator_implicitly(v->devinfo) &&
+          !inst->dst.is_accumulator()) {
+         add_dep(last_accumulator_write, n);
+         last_accumulator_write = n;
+      }
+   }
+
+   /* Bottom-to-top pass: write-after-read deps, with trackers reset. */
+   memset(last_grf_write, 0, sizeof(last_grf_write));
+   memset(last_mrf_write, 0, sizeof(last_mrf_write));
+   last_conditional_mod = NULL;
+   last_accumulator_write = NULL;
+   last_fixed_grf_write = NULL;
+
+   foreach_in_list_reverse_safe(schedule_node, n, &instructions) {
+      vec4_instruction *inst = (vec4_instruction *)n->inst;
+
+      /* Write-after-read: the next writer of each source must follow n. */
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file == VGRF) {
+            for (unsigned j = 0; j < regs_read(inst, i); ++j)
+               add_dep(n, last_grf_write[inst->src[i].nr + j]);
+         } else if (inst->src[i].file == FIXED_GRF) {
+            add_dep(n, last_fixed_grf_write);
+         } else if (inst->src[i].is_accumulator()) {
+            add_dep(n, last_accumulator_write);
+         } else if (inst->src[i].file == ARF) {
+            add_barrier_deps(n);
+         }
+      }
+
+      if (!inst->is_send_from_grf()) {
+         for (int i = 0; i < inst->mlen; i++) {
+            /* It looks like the MRF regs are released in the send
+             * instruction once it's sent, not when the result comes
+             * back.
+             */
+            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
+         }
+      }
+
+      if (inst->reads_flag()) {
+         add_dep(n, last_conditional_mod);
+      }
+
+      if (inst->reads_accumulator_implicitly()) {
+         add_dep(n, last_accumulator_write);
+      }
+
+      /* Record what this instruction writes, so that the reads found
+       * earlier in the list can add a WAR dependency on it.
+       */
+      if (inst->dst.file == VGRF) {
+         for (unsigned j = 0; j < regs_written(inst); ++j)
+            last_grf_write[inst->dst.nr + j] = n;
+      } else if (inst->dst.file == MRF) {
+         last_mrf_write[inst->dst.nr] = n;
+      } else if (inst->dst.file == FIXED_GRF) {
+         last_fixed_grf_write = n;
+      } else if (inst->dst.is_accumulator()) {
+         last_accumulator_write = n;
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
+         add_barrier_deps(n);
+      }
+
+      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
+         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
+            last_mrf_write[inst->base_mrf + i] = n;
+         }
+      }
+
+      if (inst->writes_flag()) {
+         last_conditional_mod = n;
+      }
+
+      if (inst->writes_accumulator_implicitly(v->devinfo)) {
+         last_accumulator_write = n;
+      }
+   }
+}
+/* FS: pick the next node to schedule from the ready list (NULL if empty). */
+schedule_node *
+fs_instruction_scheduler::choose_instruction_to_schedule()
+{
+   schedule_node *chosen = NULL;
+
+   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
+      int chosen_time = 0;
+
+      /* Pick the node with the smallest exit_unblocked_time() (the one most
+       * likely to unblock an early program exit); break ties on the smallest
+       * unblocked_time, and on a complete tie keep the node seen first.
+       */
+      foreach_in_list(schedule_node, n, &instructions) {
+         if (!chosen ||
+             exit_unblocked_time(n) < exit_unblocked_time(chosen) ||
+             (exit_unblocked_time(n) == exit_unblocked_time(chosen) &&
+              n->unblocked_time < chosen_time)) {
+            chosen = n;
+            chosen_time = n->unblocked_time;
+         }
+      }
+   } else {
+      /* Before register allocation, we don't care about the latencies of
+       * instructions.  All we care about is reducing live intervals of
+       * variables so that we can avoid register spilling, or get SIMD16
+       * shaders which naturally do a better job of hiding instruction
+       * latency.
+       */
+      foreach_in_list(schedule_node, n, &instructions) {
+         fs_inst *inst = (fs_inst *)n->inst;   /* only read by the gen < 7 LIFO test below */
+
+         if (!chosen) {
+            chosen = n;
+            continue;
+         }
+
+         /* Most important: If we can definitely reduce register pressure, do
+          * so immediately.
+          */
+         int register_pressure_benefit = get_register_pressure_benefit(n->inst);
+         int chosen_register_pressure_benefit =
+            get_register_pressure_benefit(chosen->inst);
+
+         if (register_pressure_benefit > 0 &&
+             register_pressure_benefit > chosen_register_pressure_benefit) {
+            chosen = n;
+            continue;
+         } else if (chosen_register_pressure_benefit > 0 &&
+                    (register_pressure_benefit <
+                     chosen_register_pressure_benefit)) {
+            continue;
+         }
+
+         if (mode == SCHEDULE_PRE_LIFO) {
+            /* Prefer instructions that recently became available for
+             * scheduling.  These are the things that are most likely to
+             * (eventually) make a variable dead and reduce register pressure.
+             * Typical register pressure estimates don't work for us because
+             * most of our pressure comes from texturing, where no single
+             * instruction to schedule will make a vec4 value dead.
+             */
+            if (n->cand_generation > chosen->cand_generation) {
+               chosen = n;
+               continue;
+            } else if (n->cand_generation < chosen->cand_generation) {
+               continue;
+            }
+
+            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
+             * do this, then because we prefer instructions that just became
+             * candidates, we'll end up in a pattern of scheduling a SEND,
+             * then the MRFs for the next SEND, then the next SEND, then the
+             * MRFs, etc., without ever consuming the results of a send.
+             */
+            if (v->devinfo->gen < 7) {
+               fs_inst *chosen_inst = (fs_inst *)chosen->inst;
+
+               /* We use size_written > 4 * exec_size as our test for the kind
+                * of send instruction to avoid -- only sends generate many
+                * regs, and a single-result send is probably actually reducing
+                * register pressure.
+                */
+               if (inst->size_written <= 4 * inst->exec_size &&
+                   chosen_inst->size_written > 4 * chosen_inst->exec_size) {
+                  chosen = n;
+                  continue;
+               } else if (inst->size_written > chosen_inst->size_written) {
+                  continue;
+               }
+            }
+         }
+
+         /* For instructions pushed on the cands list at the same time, prefer
+          * the one with the highest delay to the end of the program.  This is
+          * most likely to have its values able to be consumed first (such as
+          * for a large tree of lowered ubo loads, which appear reversed in
+          * the instruction stream with respect to when they can be consumed).
+          */
+         if (n->delay > chosen->delay) {
+            chosen = n;
+            continue;
+         } else if (n->delay < chosen->delay) {
+            continue;
+         }
+
+         /* Prefer the node most likely to unblock an early program exit.
+          */
+         if (exit_unblocked_time(n) < exit_unblocked_time(chosen)) {
+            chosen = n;
+            continue;
+         } else if (exit_unblocked_time(n) > exit_unblocked_time(chosen)) {
+            continue;
+         }
+
+         /* If all other metrics are equal, we prefer the first instruction in
+          * the list (program execution).
+          */
+      }
+   }
+
+   return chosen;
+}
+/* vec4: pick the ready node with the smallest unblocked_time (NULL if empty). */
+schedule_node *
+vec4_instruction_scheduler::choose_instruction_to_schedule()
+{
+   schedule_node *chosen = NULL;
+   int chosen_time = 0;
+
+   /* Of the instructions ready to execute or the closest to being ready,
+    * choose the one unblocked earliest; ties keep the first node in the list.
+    */
+   foreach_in_list(schedule_node, n, &instructions) {
+      if (!chosen || n->unblocked_time < chosen_time) {
+         chosen = n;
+         chosen_time = n->unblocked_time;
+      }
+   }
+
+   return chosen;
+}
+/* FS issue cost added to the scheduler clock: 4 if compressed, otherwise 2. */
+int
+fs_instruction_scheduler::issue_time(backend_instruction *inst)
+{
+   if (is_compressed((fs_inst *)inst))
+      return 4;
+   else
+      return 2;
+}
+/* vec4 issue cost added to the scheduler clock: a constant 2; inst is unused. */
+int
+vec4_instruction_scheduler::issue_time(backend_instruction *inst)
+{
+   /* We always execute as two vec4s in parallel. */
+   return 2;
+}
+/* List-schedule one block: emit ready nodes one at a time while tracking a clock. */
+void
+instruction_scheduler::schedule_instructions(bblock_t *block)
+{
+   const struct gen_device_info *devinfo = bs->devinfo;
+   int time = 0;   /* running clock estimate; stored in block->cycle_count */
+   if (!post_reg_alloc)
+      reg_pressure = reg_pressure_in[block->num];
+   block_idx = block->num;
+
+   /* Remove non-DAG heads: only nodes with no unscheduled parents stay listed. */
+   foreach_in_list_safe(schedule_node, n, &instructions) {
+      if (n->parent_count != 0)
+         n->remove();
+   }
+
+   unsigned cand_generation = 1;
+   while (!instructions.is_empty()) {
+      schedule_node *chosen = choose_instruction_to_schedule();
+
+      /* Schedule this instruction. */
+      assert(chosen);
+      chosen->remove();
+      chosen->inst->exec_node::remove();
+      block->instructions.push_tail(chosen->inst);   /* append in scheduled order */
+      instructions_to_schedule--;
+
+      if (!post_reg_alloc) {
+         reg_pressure -= get_register_pressure_benefit(chosen->inst);
+         update_register_pressure(chosen->inst);
+      }
+
+      /* If we expected a delay for scheduling, then bump the clock to reflect
+       * that.  In reality, the hardware will switch to another hyperthread
+       * and may not return to dispatching our thread for a while even after
+       * we're unblocked.  After this, we have the time when the chosen
+       * instruction will start executing.
+       */
+      time = MAX2(time, chosen->unblocked_time);
+
+      /* Update the clock for how soon an instruction could start after the
+       * chosen one.
+       */
+      time += issue_time(chosen->inst);
+
+      if (debug) {
+         fprintf(stderr, "clock %4d, scheduled: ", time);
+         bs->dump_instruction(chosen->inst);
+         if (!post_reg_alloc)
+            fprintf(stderr, "(register pressure %d)\n", reg_pressure);
+      }
+
+      /* Now that we've scheduled a new instruction, some of its
+       * children can be promoted to the list of instructions ready to
+       * be scheduled.  Update the children's unblocked time for this
+       * DAG edge as we do so.
+       */
+      for (int i = chosen->child_count - 1; i >= 0; i--) {
+         schedule_node *child = chosen->children[i];
+
+         child->unblocked_time = MAX2(child->unblocked_time,
+                                      time + chosen->child_latency[i]);
+
+         if (debug) {
+            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
+            bs->dump_instruction(child->inst);
+         }
+
+         child->cand_generation = cand_generation;   /* stamped even if not ready yet */
+         child->parent_count--;
+         if (child->parent_count == 0) {
+            if (debug) {
+               fprintf(stderr, "\t\tnow available\n");
+            }
+            instructions.push_head(child);   /* newly ready nodes go to the list head */
+         }
+      }
+      cand_generation++;
+
+      /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
+       * but it's more limited pre-gen6, so if we send something off to it then
+       * the next math instruction isn't going to make progress until the first
+       * is done.
+       */
+      if (devinfo->gen < 6 && chosen->inst->is_math()) {
+         foreach_in_list(schedule_node, n, &instructions) {
+            if (n->inst->is_math())
+               n->unblocked_time = MAX2(n->unblocked_time,
+                                        time + chosen->latency);
+         }
+      }
+   }
+
+   assert(instructions_to_schedule == 0);
+
+   block->cycle_count = time;
+}
+/* Sum per-block cycle counts, scaling blocks inside loops by 10 per nesting level. */
+static unsigned get_cycle_count(cfg_t *cfg)
+{
+   unsigned count = 0, multiplier = 1;
+   foreach_block(block, cfg) {
+      if (block->start()->opcode == BRW_OPCODE_DO)
+         multiplier *= 10; /* assume that loops execute ~10 times */
+
+      count += block->cycle_count * multiplier;
+
+      if (block->end()->opcode == BRW_OPCODE_WHILE)
+         multiplier /= 10; /* leaving the loop: undo the scaling applied at DO */
+   }
+
+   return count;
+}
+/* Schedule every block of cfg in turn, then store the estimated total cycle count. */
+void
+instruction_scheduler::run(cfg_t *cfg)
+{
+   if (debug && !post_reg_alloc) {   /* the dump is only emitted before register allocation */
+      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
+              post_reg_alloc);
+         bs->dump_instructions();
+   }
+
+   if (!post_reg_alloc)
+      setup_liveness(cfg);
+
+   foreach_block(block, cfg) {
+      if (reads_remaining) {   /* reset the per-block read/write bookkeeping when allocated */
+         memset(reads_remaining, 0,
+                grf_count * sizeof(*reads_remaining));
+         memset(hw_reads_remaining, 0,
+                hw_reg_count * sizeof(*hw_reads_remaining));
+         memset(written, 0, grf_count * sizeof(*written));
+
+         foreach_inst_in_block(fs_inst, inst, block)
+            count_reads_remaining(inst);
+      }
+
+      add_insts_from_block(block);
+
+      calculate_deps();
+
+      compute_delays();
+      compute_exits();
+
+      schedule_instructions(block);
+   }
+
+   if (debug && !post_reg_alloc) {
+      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
+              post_reg_alloc);
+      bs->dump_instructions();
+   }
+
+   cfg->cycle_count = get_cycle_count(cfg);
+}
+/* Run the FS instruction scheduler over the whole CFG in the requested mode. */
+void
+fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
+{
+   if (mode != SCHEDULE_POST)
+      calculate_live_intervals();
+
+   int grf_count;
+   if (mode == SCHEDULE_POST)
+      grf_count = grf_used;      /* SCHEDULE_POST sizes the scheduler by grf_used */
+   else
+      grf_count = alloc.count;   /* every other mode sizes it by alloc.count */
+
+   fs_instruction_scheduler sched(this, grf_count, first_non_payload_grf,
+                                  cfg->num_blocks, mode);
+   sched.run(cfg);
+
+   invalidate_live_intervals();   /* instruction order changed, so drop cached intervals */
+}
+/* Run the vec4 instruction scheduler over the whole CFG. */
+void
+vec4_visitor::opt_schedule_instructions()
+{
+   vec4_instruction_scheduler sched(this, prog_data->total_grf);
+   sched.run(cfg);
+
+   invalidate_live_intervals();   /* instruction order changed, so drop cached intervals */
+}
diff --git a/src/intel/compiler/brw_shader.cpp b/src/intel/compiler/brw_shader.cpp
new file mode 100644
index 00000000000..bfaa5e7bfe2
--- /dev/null
+++ b/src/intel/compiler/brw_shader.cpp
@@ -0,0 +1,1273 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_cfg.h"
+#include "brw_eu.h"
+#include "brw_fs.h"
+#include "brw_nir.h"
+#include "brw_vec4_tes.h"
+#include "common/gen_debug.h"
+#include "main/uniforms.h"
+#include "util/macros.h"
+/* Map a GLSL type to a brw_reg_type; arrays resolve to their element type. */
+enum brw_reg_type
+brw_type_for_base_type(const struct glsl_type *type)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_FLOAT:
+      return BRW_REGISTER_TYPE_F;
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_SUBROUTINE:
+      return BRW_REGISTER_TYPE_D;
+   case GLSL_TYPE_UINT:
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_ARRAY:
+      return brw_type_for_base_type(type->fields.array);
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_SAMPLER:
+   case GLSL_TYPE_ATOMIC_UINT:
+      /* These should be overridden with the type of the member when
+       * dereferenced into.  BRW_REGISTER_TYPE_UD seems like a likely
+       * way to trip up if we don't.
+       */
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_IMAGE:
+      return BRW_REGISTER_TYPE_UD;
+   case GLSL_TYPE_DOUBLE:
+      return BRW_REGISTER_TYPE_DF;
+   case GLSL_TYPE_UINT64:
+      return BRW_REGISTER_TYPE_UQ;
+   case GLSL_TYPE_INT64:
+      return BRW_REGISTER_TYPE_Q;
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("not reached");
+   }
+
+   return BRW_REGISTER_TYPE_F;   /* fallback for base_type values not listed above */
+}
+/* Map an ir_binop_* comparison to the matching BRW_CONDITIONAL_* modifier. */
+enum brw_conditional_mod
+brw_conditional_for_comparison(unsigned int op)
+{
+   switch (op) {
+   case ir_binop_less:
+      return BRW_CONDITIONAL_L;
+   case ir_binop_greater:
+      return BRW_CONDITIONAL_G;
+   case ir_binop_lequal:
+      return BRW_CONDITIONAL_LE;
+   case ir_binop_gequal:
+      return BRW_CONDITIONAL_GE;
+   case ir_binop_equal:
+   case ir_binop_all_equal: /* same as equal for scalars */
+      return BRW_CONDITIONAL_Z;
+   case ir_binop_nequal:
+   case ir_binop_any_nequal: /* same as nequal for scalars */
+      return BRW_CONDITIONAL_NZ;
+   default:
+      unreachable("not reached: bad operation for comparison");
+   }
+}
+/* Map a SHADER_OPCODE_* math opcode to its BRW_MATH_FUNCTION_* value. */
+uint32_t
+brw_math_function(enum opcode op)
+{
+   switch (op) {
+   case SHADER_OPCODE_RCP:
+      return BRW_MATH_FUNCTION_INV;
+   case SHADER_OPCODE_RSQ:
+      return BRW_MATH_FUNCTION_RSQ;
+   case SHADER_OPCODE_SQRT:
+      return BRW_MATH_FUNCTION_SQRT;
+   case SHADER_OPCODE_EXP2:
+      return BRW_MATH_FUNCTION_EXP;
+   case SHADER_OPCODE_LOG2:
+      return BRW_MATH_FUNCTION_LOG;
+   case SHADER_OPCODE_POW:
+      return BRW_MATH_FUNCTION_POW;
+   case SHADER_OPCODE_SIN:
+      return BRW_MATH_FUNCTION_SIN;
+   case SHADER_OPCODE_COS:
+      return BRW_MATH_FUNCTION_COS;
+   case SHADER_OPCODE_INT_QUOTIENT:
+      return BRW_MATH_FUNCTION_INT_DIV_QUOTIENT;
+   case SHADER_OPCODE_INT_REMAINDER:
+      return BRW_MATH_FUNCTION_INT_DIV_REMAINDER;
+   default:
+      unreachable("not reached: unknown math function");   /* any other opcode is a caller bug */
+   }
+}
+/* Pack constant texel offsets into 4-bit fields of *offset_bits; false if unusable. */
+bool
+brw_texture_offset(int *offsets, unsigned num_components, uint32_t *offset_bits)
+{
+   if (!offsets) return false;  /* nonconstant offset; caller will handle it. */
+
+   /* each offset must fit a signed 4-bit field, [-8, 7]; else caller handles it. */
+   for (unsigned i = 0; i < num_components; i++)
+      if (offsets[i] > 7 || offsets[i] < -8)
+         return false;
+
+   /* Combine the offsets (assumes num_components <= 3) into one dword:
+    *
+    *    bits 11:8 - U Offset (X component)
+    *    bits  7:4 - V Offset (Y component)
+    *    bits  3:0 - R Offset (Z component)
+    */
+   *offset_bits = 0;
+   for (unsigned i = 0; i < num_components; i++) {
+      const unsigned shift = 4 * (2 - i);   /* unsigned: wraps if i > 2 */
+      *offset_bits |= (offsets[i] << shift) & (0xF << shift);   /* NOTE(review): shifts a possibly negative int */
+   }
+   return true;
+}
+/* Return a human-readable name string for opcode op on the given device. */
+const char *
+brw_instruction_name(const struct gen_device_info *devinfo, enum opcode op)
+{
+   switch (op) {
+   case BRW_OPCODE_ILLEGAL ... BRW_OPCODE_NOP:   /* GNU case-range extension */
+      /* The DO instruction doesn't exist on Gen6+, but we use it to mark the
+       * start of a loop in the IR.
+       */
+      if (devinfo->gen >= 6 && op == BRW_OPCODE_DO)
+         return "do";
+
+      assert(brw_opcode_desc(devinfo, op)->name);
+      return brw_opcode_desc(devinfo, op)->name;
+   case FS_OPCODE_FB_WRITE:
+      return "fb_write";
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+      return "fb_write_logical";
+   case FS_OPCODE_REP_FB_WRITE:
+      return "rep_fb_write";
+   case FS_OPCODE_FB_READ:
+      return "fb_read";
+   case FS_OPCODE_FB_READ_LOGICAL:
+      return "fb_read_logical";
+
+   case SHADER_OPCODE_RCP:
+      return "rcp";
+   case SHADER_OPCODE_RSQ:
+      return "rsq";
+   case SHADER_OPCODE_SQRT:
+      return "sqrt";
+   case SHADER_OPCODE_EXP2:
+      return "exp2";
+   case SHADER_OPCODE_LOG2:
+      return "log2";
+   case SHADER_OPCODE_POW:
+      return "pow";
+   case SHADER_OPCODE_INT_QUOTIENT:
+      return "int_quot";
+   case SHADER_OPCODE_INT_REMAINDER:
+      return "int_rem";
+   case SHADER_OPCODE_SIN:
+      return "sin";
+   case SHADER_OPCODE_COS:
+      return "cos";
+
+   case SHADER_OPCODE_TEX:
+      return "tex";
+   case SHADER_OPCODE_TEX_LOGICAL:
+      return "tex_logical";
+   case SHADER_OPCODE_TXD:
+      return "txd";
+   case SHADER_OPCODE_TXD_LOGICAL:
+      return "txd_logical";
+   case SHADER_OPCODE_TXF:
+      return "txf";
+   case SHADER_OPCODE_TXF_LOGICAL:
+      return "txf_logical";
+   case SHADER_OPCODE_TXF_LZ:
+      return "txf_lz";
+   case SHADER_OPCODE_TXL:
+      return "txl";
+   case SHADER_OPCODE_TXL_LOGICAL:
+      return "txl_logical";
+   case SHADER_OPCODE_TXL_LZ:
+      return "txl_lz";
+   case SHADER_OPCODE_TXS:
+      return "txs";
+   case SHADER_OPCODE_TXS_LOGICAL:
+      return "txs_logical";
+   case FS_OPCODE_TXB:
+      return "txb";
+   case FS_OPCODE_TXB_LOGICAL:
+      return "txb_logical";
+   case SHADER_OPCODE_TXF_CMS:
+      return "txf_cms";
+   case SHADER_OPCODE_TXF_CMS_LOGICAL:
+      return "txf_cms_logical";
+   case SHADER_OPCODE_TXF_CMS_W:
+      return "txf_cms_w";
+   case SHADER_OPCODE_TXF_CMS_W_LOGICAL:
+      return "txf_cms_w_logical";
+   case SHADER_OPCODE_TXF_UMS:
+      return "txf_ums";
+   case SHADER_OPCODE_TXF_UMS_LOGICAL:
+      return "txf_ums_logical";
+   case SHADER_OPCODE_TXF_MCS:
+      return "txf_mcs";
+   case SHADER_OPCODE_TXF_MCS_LOGICAL:
+      return "txf_mcs_logical";
+   case SHADER_OPCODE_LOD:
+      return "lod";
+   case SHADER_OPCODE_LOD_LOGICAL:
+      return "lod_logical";
+   case SHADER_OPCODE_TG4:
+      return "tg4";
+   case SHADER_OPCODE_TG4_LOGICAL:
+      return "tg4_logical";
+   case SHADER_OPCODE_TG4_OFFSET:
+      return "tg4_offset";
+   case SHADER_OPCODE_TG4_OFFSET_LOGICAL:
+      return "tg4_offset_logical";
+   case SHADER_OPCODE_SAMPLEINFO:
+      return "sampleinfo";
+   case SHADER_OPCODE_SAMPLEINFO_LOGICAL:
+      return "sampleinfo_logical";
+
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+      return "shader_time_add";
+
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+      return "untyped_atomic";
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+      return "untyped_atomic_logical";
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+      return "untyped_surface_read";
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+      return "untyped_surface_read_logical";
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+      return "untyped_surface_write";
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+      return "untyped_surface_write_logical";
+   case SHADER_OPCODE_TYPED_ATOMIC:
+      return "typed_atomic";
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+      return "typed_atomic_logical";
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+      return "typed_surface_read";
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+      return "typed_surface_read_logical";
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+      return "typed_surface_write";
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+      return "typed_surface_write_logical";
+   case SHADER_OPCODE_MEMORY_FENCE:
+      return "memory_fence";
+
+   case SHADER_OPCODE_LOAD_PAYLOAD:
+      return "load_payload";
+   case FS_OPCODE_PACK:
+      return "pack";
+
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+      return "gen4_scratch_read";
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+      return "gen4_scratch_write";
+   case SHADER_OPCODE_GEN7_SCRATCH_READ:
+      return "gen7_scratch_read";
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+      return "gen8_urb_write_simd8";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+      return "gen8_urb_write_simd8_per_slot";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+      return "gen8_urb_write_simd8_masked";
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+      return "gen8_urb_write_simd8_masked_per_slot";
+   case SHADER_OPCODE_URB_READ_SIMD8:
+      return "urb_read_simd8";
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+      return "urb_read_simd8_per_slot";
+
+   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+      return "find_live_channel";
+   case SHADER_OPCODE_BROADCAST:
+      return "broadcast";
+
+   case VEC4_OPCODE_MOV_BYTES:
+      return "mov_bytes";
+   case VEC4_OPCODE_PACK_BYTES:
+      return "pack_bytes";
+   case VEC4_OPCODE_UNPACK_UNIFORM:
+      return "unpack_uniform";
+   case VEC4_OPCODE_FROM_DOUBLE:
+      return "double_to_single";
+   case VEC4_OPCODE_TO_DOUBLE:
+      return "single_to_double";
+   case VEC4_OPCODE_PICK_LOW_32BIT:
+      return "pick_low_32bit";
+   case VEC4_OPCODE_PICK_HIGH_32BIT:
+      return "pick_high_32bit";
+   case VEC4_OPCODE_SET_LOW_32BIT:
+      return "set_low_32bit";
+   case VEC4_OPCODE_SET_HIGH_32BIT:
+      return "set_high_32bit";
+
+   case FS_OPCODE_DDX_COARSE:
+      return "ddx_coarse";
+   case FS_OPCODE_DDX_FINE:
+      return "ddx_fine";
+   case FS_OPCODE_DDY_COARSE:
+      return "ddy_coarse";
+   case FS_OPCODE_DDY_FINE:
+      return "ddy_fine";
+
+   case FS_OPCODE_CINTERP:
+      return "cinterp";
+   case FS_OPCODE_LINTERP:
+      return "linterp";
+
+   case FS_OPCODE_PIXEL_X:
+      return "pixel_x";
+   case FS_OPCODE_PIXEL_Y:
+      return "pixel_y";
+
+   case FS_OPCODE_GET_BUFFER_SIZE:
+      return "fs_get_buffer_size";
+
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
+      return "uniform_pull_const";
+   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
+      return "uniform_pull_const_gen7";
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN4:
+      return "varying_pull_const_gen4";
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
+      return "varying_pull_const_gen7";
+   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL:
+      return "varying_pull_const_logical";
+
+   case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
+      return "mov_dispatch_to_flags";
+   case FS_OPCODE_DISCARD_JUMP:
+      return "discard_jump";
+
+   case FS_OPCODE_SET_SAMPLE_ID:
+      return "set_sample_id";
+
+   case FS_OPCODE_PACK_HALF_2x16_SPLIT:
+      return "pack_half_2x16_split";
+   case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
+      return "unpack_half_2x16_split_x";
+   case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
+      return "unpack_half_2x16_split_y";
+
+   case FS_OPCODE_PLACEHOLDER_HALT:
+      return "placeholder_halt";
+
+   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
+      return "interp_sample";
+   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
+      return "interp_shared_offset";
+   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
+      return "interp_per_slot_offset";
+
+   case VS_OPCODE_URB_WRITE:
+      return "vs_urb_write";
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+      return "pull_constant_load";
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+      return "pull_constant_load_gen7";
+
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+      return "set_simd4x2_header_gen9";
+
+   case VS_OPCODE_GET_BUFFER_SIZE:
+      return "vs_get_buffer_size";
+
+   case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
+      return "unpack_flags_simd4x2";
+
+   case GS_OPCODE_URB_WRITE:
+      return "gs_urb_write";
+   case GS_OPCODE_URB_WRITE_ALLOCATE:
+      return "gs_urb_write_allocate";
+   case GS_OPCODE_THREAD_END:
+      return "gs_thread_end";
+   case GS_OPCODE_SET_WRITE_OFFSET:
+      return "set_write_offset";
+   case GS_OPCODE_SET_VERTEX_COUNT:
+      return "set_vertex_count";
+   case GS_OPCODE_SET_DWORD_2:
+      return "set_dword_2";
+   case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+      return "prepare_channel_masks";
+   case GS_OPCODE_SET_CHANNEL_MASKS:
+      return "set_channel_masks";
+   case GS_OPCODE_GET_INSTANCE_ID:
+      return "get_instance_id";
+   case GS_OPCODE_FF_SYNC:
+      return "ff_sync";
+   case GS_OPCODE_SET_PRIMITIVE_ID:
+      return "set_primitive_id";
+   case GS_OPCODE_SVB_WRITE:
+      return "gs_svb_write";
+   case GS_OPCODE_SVB_SET_DST_INDEX:
+      return "gs_svb_set_dst_index";
+   case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
+      return "gs_ff_sync_set_primitives";
+   case CS_OPCODE_CS_TERMINATE:
+      return "cs_terminate";
+   case SHADER_OPCODE_BARRIER:
+      return "barrier";
+   case SHADER_OPCODE_MULH:
+      return "mulh";
+   case SHADER_OPCODE_MOV_INDIRECT:
+      return "mov_indirect";
+
+   case VEC4_OPCODE_URB_READ:
+      return "urb_read";
+   case TCS_OPCODE_GET_INSTANCE_ID:
+      return "tcs_get_instance_id";
+   case TCS_OPCODE_URB_WRITE:
+      return "tcs_urb_write";
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+      return "tcs_set_input_urb_offsets";
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return "tcs_set_output_urb_offsets";
+   case TCS_OPCODE_GET_PRIMITIVE_ID:
+      return "tcs_get_primitive_id";
+   case TCS_OPCODE_CREATE_BARRIER_HEADER:
+      return "tcs_create_barrier_header";
+   case TCS_OPCODE_SRC0_010_IS_ZERO:
+      return "tcs_src0<0,1,0>_is_zero";
+   case TCS_OPCODE_RELEASE_INPUT:
+      return "tcs_release_input";
+   case TCS_OPCODE_THREAD_END:
+      return "tcs_thread_end";
+   case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+      return "tes_create_input_read_header";
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+      return "tes_add_indirect_urb_offset";
+   case TES_OPCODE_GET_PRIMITIVE_ID:
+      return "tes_get_primitive_id";
+   }
+
+   unreachable("not reached");   /* every case above returns; only an unlisted op gets here */
+}
+/* Clamp an F/DF immediate in *reg to [0, 1]; true when a new value was stored. */
+bool
+brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg)
+{
+   union {
+      unsigned ud;
+      int d;
+      float f;
+      double df;
+   } imm, sat_imm = { 0 };   /* imm: incoming value, sat_imm: clamped result */
+
+   const unsigned size = type_sz(type);
+
+   /* We want to either do a 32-bit or 64-bit data copy, the type is otherwise
+    * irrelevant, so just check the size of the type and copy from/to an
+    * appropriately sized field.
+    */
+   if (size < 8)
+      imm.ud = reg->ud;
+   else
+      imm.df = reg->df;
+
+   switch (type) {
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_Q:
+      /* Nothing to do. */
+      return false;
+   case BRW_REGISTER_TYPE_F:
+      sat_imm.f = CLAMP(imm.f, 0.0f, 1.0f);
+      break;
+   case BRW_REGISTER_TYPE_DF:
+      sat_imm.df = CLAMP(imm.df, 0.0, 1.0);
+      break;
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_B:
+      unreachable("no UB/B immediates");
+   case BRW_REGISTER_TYPE_V:
+   case BRW_REGISTER_TYPE_UV:
+   case BRW_REGISTER_TYPE_VF:
+      unreachable("unimplemented: saturate vector immediate");
+   case BRW_REGISTER_TYPE_HF:
+      unreachable("unimplemented: saturate HF immediate");
+   }
+
+   if (size < 8) {
+      if (imm.ud != sat_imm.ud) {   /* narrow case compares the raw bits */
+         reg->ud = sat_imm.ud;
+         return true;
+      }
+   } else {
+      if (imm.df != sat_imm.df) {   /* 8-byte case compares as double values */
+         reg->df = sat_imm.df;
+         return true;
+      }
+   }
+   return false;
+}
+/* Negate the immediate in *reg in place according to type; true when handled. */
+bool
+brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+      reg->d = -reg->d;
+      return true;
+   case BRW_REGISTER_TYPE_W:
+   case BRW_REGISTER_TYPE_UW:
+      reg->d = -(int16_t)reg->ud;   /* negates the low 16 bits read as a signed value */
+      return true;
+   case BRW_REGISTER_TYPE_F:
+      reg->f = -reg->f;
+      return true;
+   case BRW_REGISTER_TYPE_VF:
+      reg->ud ^= 0x80808080;   /* flips the top bit of each byte (presumably the per-component sign) */
+      return true;
+   case BRW_REGISTER_TYPE_DF:
+      reg->df = -reg->df;
+      return true;
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_Q:
+      reg->d64 = -reg->d64;
+      return true;
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_B:
+      unreachable("no UB/B immediates");
+   case BRW_REGISTER_TYPE_UV:
+   case BRW_REGISTER_TYPE_V:
+      assert(!"unimplemented: negate UV/V immediate");   /* no break: falls through below */
+   case BRW_REGISTER_TYPE_HF:
+      assert(!"unimplemented: negate HF immediate");
+   }
+
+   return false;   /* reached for the unimplemented types when asserts are compiled out */
+}
+/* Replace the immediate in *reg by its absolute value per type; true when handled. */
+bool
+brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg)
+{
+   switch (type) {
+   case BRW_REGISTER_TYPE_D:
+      reg->d = abs(reg->d);
+      return true;
+   case BRW_REGISTER_TYPE_W:
+      reg->d = abs((int16_t)reg->ud);   /* uses the low 16 bits read as a signed value */
+      return true;
+   case BRW_REGISTER_TYPE_F:
+      reg->f = fabsf(reg->f);
+      return true;
+   case BRW_REGISTER_TYPE_DF:
+      reg->df = fabs(reg->df);
+      return true;
+   case BRW_REGISTER_TYPE_VF:
+      reg->ud &= ~0x80808080;   /* clears the top bit of each byte (presumably the per-component sign) */
+      return true;
+   case BRW_REGISTER_TYPE_Q:
+      reg->d64 = imaxabs(reg->d64);
+      return true;
+   case BRW_REGISTER_TYPE_UB:
+   case BRW_REGISTER_TYPE_B:
+      unreachable("no UB/B immediates");
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_UD:
+   case BRW_REGISTER_TYPE_UW:
+   case BRW_REGISTER_TYPE_UV:
+      /* Presumably the absolute value modifier on an unsigned source is a
+       * nop, but it would be nice to confirm.
+       */
+      assert(!"unimplemented: abs unsigned immediate");   /* no break: falls through below */
+   case BRW_REGISTER_TYPE_V:
+      assert(!"unimplemented: abs V immediate");
+   case BRW_REGISTER_TYPE_HF:
+      assert(!"unimplemented: abs HF immediate");
+   }
+
+   return false;   /* reached for the unimplemented types when asserts are compiled out */
+}
+
+/**
+ * Get the appropriate BRW_AOP_* atomic op for an atomic counter intrinsic.
+ */
+unsigned
+get_atomic_counter_op(nir_intrinsic_op op)
+{
+   switch (op) {
+   case nir_intrinsic_atomic_counter_inc:
+      return BRW_AOP_INC;
+   case nir_intrinsic_atomic_counter_dec:
+      return BRW_AOP_PREDEC;
+   case nir_intrinsic_atomic_counter_add:
+      return BRW_AOP_ADD;
+   case nir_intrinsic_atomic_counter_min:
+      return BRW_AOP_UMIN;
+   case nir_intrinsic_atomic_counter_max:
+      return BRW_AOP_UMAX;
+   case nir_intrinsic_atomic_counter_and:
+      return BRW_AOP_AND;
+   case nir_intrinsic_atomic_counter_or:
+      return BRW_AOP_OR;
+   case nir_intrinsic_atomic_counter_xor:
+      return BRW_AOP_XOR;
+   case nir_intrinsic_atomic_counter_exchange:
+      return BRW_AOP_MOV;
+   case nir_intrinsic_atomic_counter_comp_swap:
+      return BRW_AOP_CMPWR;
+   default:
+      unreachable("Not reachable.");   /* any other intrinsic is a caller bug */
+   }
+}
+
+/**
+ * Initialize the state common to a back-end shader compile.
+ *
+ * The pointers are stored as given; devinfo is taken from compiler->devinfo
+ * and stage from shader->stage.  No CFG exists yet: cfg stays NULL until
+ * calculate_cfg() builds one.
+ */
+backend_shader::backend_shader(const struct brw_compiler *compiler,
+                               void *log_data,
+                               void *mem_ctx,
+                               const nir_shader *shader,
+                               struct brw_stage_prog_data *stage_prog_data)
+   : compiler(compiler),
+     log_data(log_data),
+     devinfo(compiler->devinfo),
+     nir(shader),
+     stage_prog_data(stage_prog_data),
+     mem_ctx(mem_ctx),
+     cfg(NULL),
+     stage(shader->stage)
+{
+   /* Debug output is on when INTEL_DEBUG contains the flag for this stage. */
+   debug_enabled = INTEL_DEBUG & intel_debug_flag_for_shader_stage(stage);
+   stage_name = _mesa_shader_stage_to_string(stage);
+   stage_abbrev = _mesa_shader_stage_to_abbrev(stage);
+}
+
+/**
+ * Compare this register with r, byte offset included.
+ *
+ * True only when brw_regs_equal() accepts the pair and both offsets match.
+ */
+bool
+backend_reg::equals(const backend_reg &r) const
+{
+   if (!brw_regs_equal(this, &r))
+      return false;
+
+   return offset == r.offset;
+}
+
+/**
+ * True if this is an immediate whose value is zero.
+ *
+ * Only F, DF, D, UD, Q and UQ immediates are inspected; every other type,
+ * and every non-immediate register, yields false.
+ */
+bool
+backend_reg::is_zero() const
+{
+   if (file == IMM) {
+      switch (type) {
+      case BRW_REGISTER_TYPE_F:
+         return f == 0;
+      case BRW_REGISTER_TYPE_DF:
+         return df == 0;
+      case BRW_REGISTER_TYPE_D:
+      case BRW_REGISTER_TYPE_UD:
+         return d == 0;
+      case BRW_REGISTER_TYPE_UQ:
+      case BRW_REGISTER_TYPE_Q:
+         return u64 == 0;
+      default:
+         break;
+      }
+   }
+
+   return false;
+}
+
+/**
+ * True if this is an immediate whose value is one.
+ *
+ * Only F, DF, D, UD, Q and UQ immediates are inspected; every other type,
+ * and every non-immediate register, yields false.
+ */
+bool
+backend_reg::is_one() const
+{
+   if (file != IMM)
+      return false;
+
+   bool matches;
+
+   switch (type) {
+   case BRW_REGISTER_TYPE_F:
+      matches = (f == 1.0f);
+      break;
+   case BRW_REGISTER_TYPE_DF:
+      matches = (df == 1.0);
+      break;
+   case BRW_REGISTER_TYPE_D:
+   case BRW_REGISTER_TYPE_UD:
+      matches = (d == 1);
+      break;
+   case BRW_REGISTER_TYPE_UQ:
+   case BRW_REGISTER_TYPE_Q:
+      matches = (u64 == 1);
+      break;
+   default:
+      matches = false;
+      break;
+   }
+
+   return matches;
+}
+
+/**
+ * True if this is an immediate whose value is minus one.
+ *
+ * Only the signed types F, DF, D and Q are inspected; every other type,
+ * and every non-immediate register, yields false.
+ */
+bool
+backend_reg::is_negative_one() const
+{
+   if (file != IMM)
+      return false;
+
+   if (type == BRW_REGISTER_TYPE_F)
+      return f == -1.0;
+
+   if (type == BRW_REGISTER_TYPE_DF)
+      return df == -1.0;
+
+   if (type == BRW_REGISTER_TYPE_D)
+      return d == -1;
+
+   if (type == BRW_REGISTER_TYPE_Q)
+      return d64 == -1;
+
+   return false;
+}
+
+/** True if this register is in the ARF file with number BRW_ARF_NULL. */
+bool
+backend_reg::is_null() const
+{
+   return file == ARF && nr == BRW_ARF_NULL;
+}
+
+
+/** True if this register is in the ARF file with number BRW_ARF_ACCUMULATOR. */
+bool
+backend_reg::is_accumulator() const
+{
+   return file == ARF && nr == BRW_ARF_ACCUMULATOR;
+}
+
+/**
+ * True if the opcode is treated as commutative.
+ *
+ * AND, OR, XOR, ADD, MUL and MULH always qualify.  SEL qualifies only with
+ * the GE or L conditional modifier, i.e. when it implements MAX or MIN.
+ */
+bool
+backend_instruction::is_commutative() const
+{
+   switch (opcode) {
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+      return true;
+   case BRW_OPCODE_SEL:
+      /* MIN and MAX are commutative. */
+      return conditional_mod == BRW_CONDITIONAL_GE ||
+             conditional_mod == BRW_CONDITIONAL_L;
+   default:
+      return false;
+   }
+}
+
+/**
+ * Member-function wrapper around the global ::is_3src() helper, called with
+ * this instruction's opcode.
+ */
+bool
+backend_instruction::is_3src(const struct gen_device_info *devinfo) const
+{
+   return ::is_3src(devinfo, opcode);
+}
+
+/** True if the opcode is one of the texturing opcodes listed below. */
+bool
+backend_instruction::is_tex() const
+{
+   switch (opcode) {
+   case SHADER_OPCODE_TEX:
+   case FS_OPCODE_TXB:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_LZ:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_UMS:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXL_LZ:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_LOD:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_SAMPLEINFO:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/** True if the opcode is one of the math opcodes listed below. */
+bool
+backend_instruction::is_math() const
+{
+   switch (opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_POW:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/**
+ * True for DO, WHILE, IF, ELSE, ENDIF, BREAK and CONTINUE; false for every
+ * other opcode.
+ */
+bool
+backend_instruction::is_control_flow() const
+{
+   return opcode == BRW_OPCODE_DO ||
+          opcode == BRW_OPCODE_WHILE ||
+          opcode == BRW_OPCODE_IF ||
+          opcode == BRW_OPCODE_ELSE ||
+          opcode == BRW_OPCODE_ENDIF ||
+          opcode == BRW_OPCODE_BREAK ||
+          opcode == BRW_OPCODE_CONTINUE;
+}
+
+/**
+ * Report whether source modifiers are permitted for this opcode.
+ *
+ * Everything is permitted except the opcodes listed below.
+ */
+bool
+backend_instruction::can_do_source_mods() const
+{
+   bool permitted = true;
+
+   switch (opcode) {
+   case BRW_OPCODE_ADDC:
+   case BRW_OPCODE_BFE:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_BFI2:
+   case BRW_OPCODE_BFREV:
+   case BRW_OPCODE_CBIT:
+   case BRW_OPCODE_FBH:
+   case BRW_OPCODE_FBL:
+   case BRW_OPCODE_SUBB:
+      permitted = false;
+      break;
+   default:
+      break;
+   }
+
+   return permitted;
+}
+
+/**
+ * Report whether the saturate modifier is permitted for this opcode.
+ *
+ * Only the opcodes listed below qualify; everything else returns false.
+ */
+bool
+backend_instruction::can_do_saturate() const
+{
+   bool permitted = false;
+
+   switch (opcode) {
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_DP2:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_LINE:
+   case BRW_OPCODE_LRP:
+   case BRW_OPCODE_MAC:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_MATH:
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_PLN:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case FS_OPCODE_LINTERP:
+   case SHADER_OPCODE_COS:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_SQRT:
+      permitted = true;
+      break;
+   default:
+      break;
+   }
+
+   return permitted;
+}
+
+/**
+ * Report whether a conditional modifier is permitted for this opcode.
+ *
+ * Only the opcodes listed below qualify; everything else returns false.
+ */
+bool
+backend_instruction::can_do_cmod() const
+{
+   bool permitted = false;
+
+   switch (opcode) {
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_ADDC:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_AVG:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_DP2:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_F16TO32:
+   case BRW_OPCODE_F32TO16:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_LINE:
+   case BRW_OPCODE_LRP:
+   case BRW_OPCODE_LZD:
+   case BRW_OPCODE_MAC:
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_MUL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_PLN:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_SAD2:
+   case BRW_OPCODE_SADA2:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SUBB:
+   case BRW_OPCODE_XOR:
+   case FS_OPCODE_CINTERP:
+   case FS_OPCODE_LINTERP:
+      permitted = true;
+      break;
+   default:
+      break;
+   }
+
+   return permitted;
+}
+
+/** True for MAC, MACH and SADA2; false for every other opcode. */
+bool
+backend_instruction::reads_accumulator_implicitly() const
+{
+   return opcode == BRW_OPCODE_MAC ||
+          opcode == BRW_OPCODE_MACH ||
+          opcode == BRW_OPCODE_SADA2;
+}
+
+/**
+ * Report whether the instruction writes the accumulator as a side effect.
+ *
+ * That is the case when writes_accumulator is set, and additionally, on
+ * hardware with gen < 6, for opcodes inside the two ranges tested below.
+ * The range tests rely on the numeric ordering of enum opcode.
+ */
+bool
+backend_instruction::writes_accumulator_implicitly(const struct gen_device_info *devinfo) const
+{
+   if (writes_accumulator)
+      return true;
+
+   if (devinfo->gen >= 6)
+      return false;
+
+   /* [BRW_OPCODE_ADD, BRW_OPCODE_NOP) */
+   const bool in_brw_range =
+      opcode >= BRW_OPCODE_ADD && opcode < BRW_OPCODE_NOP;
+
+   /* [FS_OPCODE_DDX_COARSE, FS_OPCODE_LINTERP] without FS_OPCODE_CINTERP */
+   const bool in_fs_range =
+      opcode >= FS_OPCODE_DDX_COARSE && opcode <= FS_OPCODE_LINTERP &&
+      opcode != FS_OPCODE_CINTERP;
+
+   return in_brw_range || in_fs_range;
+}
+
+/**
+ * True if the opcode is one of those listed below, which are treated as
+ * having effects beyond writing their destination; false otherwise.
+ */
+bool
+backend_instruction::has_side_effects() const
+{
+   bool side_effects = false;
+
+   switch (opcode) {
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_ATOMIC_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE_LOGICAL:
+   case SHADER_OPCODE_MEMORY_FENCE:
+   case SHADER_OPCODE_URB_WRITE_SIMD8:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_PER_SLOT:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED:
+   case SHADER_OPCODE_URB_WRITE_SIMD8_MASKED_PER_SLOT:
+   case FS_OPCODE_FB_WRITE:
+   case FS_OPCODE_FB_WRITE_LOGICAL:
+   case SHADER_OPCODE_BARRIER:
+   case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
+      side_effects = true;
+      break;
+   default:
+      break;
+   }
+
+   return side_effects;
+}
+
+/**
+ * True if the opcode is one of the surface or URB read opcodes listed
+ * below; false otherwise.
+ */
+bool
+backend_instruction::is_volatile() const
+{
+   bool result = false;
+
+   switch (opcode) {
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_READ_LOGICAL:
+   case SHADER_OPCODE_URB_READ_SIMD8:
+   case SHADER_OPCODE_URB_READ_SIMD8_PER_SLOT:
+   case VEC4_OPCODE_URB_READ:
+      result = true;
+      break;
+   default:
+      break;
+   }
+
+   return result;
+}
+
+#ifndef NDEBUG
+/**
+ * Debug-only helper for the assertions in this file: report whether inst is
+ * one of the instructions of block.
+ *
+ * The walk stops at the first match instead of always visiting every
+ * instruction in the block.
+ */
+static bool
+inst_is_in_block(const bblock_t *block, const backend_instruction *inst)
+{
+   foreach_inst_in_block (backend_instruction, i, block) {
+      if (inst == i)
+         return true;
+   }
+
+   return false;
+}
+#endif
+
+/**
+ * Add ip_adjustment to start_ip and end_ip of every block that follows
+ * start_block.  start_block itself is left untouched.
+ */
+static void
+adjust_later_block_ips(bblock_t *start_block, int ip_adjustment)
+{
+   bblock_t *later = start_block->next();
+
+   while (later) {
+      later->start_ip += ip_adjustment;
+      later->end_ip += ip_adjustment;
+      later = later->next();
+   }
+}
+
+/**
+ * Link inst into the list directly after this instruction.
+ *
+ * Unless this is the head sentinel, it must already be part of block
+ * (checked in debug builds).  The block grows by one instruction and every
+ * later block has its IP range shifted up by one.
+ */
+void
+backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
+{
+   assert(this != inst);
+   assert(this->is_head_sentinel() || inst_is_in_block(block, this) ||
+          !"Instruction not in block");
+
+   block->end_ip += 1;
+   adjust_later_block_ips(block, 1);
+
+   exec_node::insert_after(inst);
+}
+
+/**
+ * Link inst into the list directly before this instruction.
+ *
+ * Unless this is the tail sentinel, it must already be part of block
+ * (checked in debug builds).  The block grows by one instruction and every
+ * later block has its IP range shifted up by one.
+ */
+void
+backend_instruction::insert_before(bblock_t *block, backend_instruction *inst)
+{
+   assert(this != inst);
+   assert(this->is_tail_sentinel() || inst_is_in_block(block, this) ||
+          !"Instruction not in block");
+
+   block->end_ip += 1;
+   adjust_later_block_ips(block, 1);
+
+   exec_node::insert_before(inst);
+}
+
+/**
+ * Link every instruction of list into the block directly before this
+ * instruction, which must already be part of block (checked in debug
+ * builds).
+ *
+ * The block grows by the length of the list and every later block has its
+ * IP range shifted up by the same amount.
+ */
+void
+backend_instruction::insert_before(bblock_t *block, exec_list *list)
+{
+   assert(inst_is_in_block(block, this) || !"Instruction not in block");
+
+   /* Measure the list before the insertion below consumes it. */
+   const unsigned added = list->length();
+
+   adjust_later_block_ips(block, added);
+   block->end_ip += added;
+
+   exec_node::insert_before(list);
+}
+
+/**
+ * Unlink this instruction from block, which must contain it (checked in
+ * debug builds).
+ *
+ * Every later block has its IP range shifted down by one.  When the block's
+ * start_ip equals its end_ip (presumably meaning this was its only
+ * instruction) the block itself is removed from its CFG; otherwise the
+ * block's end_ip shrinks by one.
+ */
+void
+backend_instruction::remove(bblock_t *block)
+{
+   assert(inst_is_in_block(block, this) || !"Instruction not in block");
+
+   adjust_later_block_ips(block, -1);
+
+   if (block->start_ip == block->end_ip) {
+      block->cfg->remove_block(block);
+   } else {
+      block->end_ip--;
+   }
+
+   /* Finally unlink the instruction from the list it lives in. */
+   exec_node::remove();
+}
+
+/** Dump all instructions to stderr (no file name is passed on). */
+void
+backend_shader::dump_instructions()
+{
+   dump_instructions(NULL);
+}
+
+/**
+ * Dump every instruction through dump_instruction().
+ *
+ * Output goes to the file called name when name is non-NULL, the effective
+ * user ID is not 0 and the file can be opened for writing; in every other
+ * case it goes to stderr.  Each instruction is prefixed with a running
+ * index unless DEBUG_OPTIMIZER is set in INTEL_DEBUG.  The CFG is walked
+ * when one exists, otherwise the flat instruction list is used.
+ */
+void
+backend_shader::dump_instructions(const char *name)
+{
+   FILE *out = stderr;
+
+   if (name != NULL && geteuid() != 0) {
+      FILE *named = fopen(name, "w");
+      if (named != NULL)
+         out = named;
+   }
+
+   int ip = 0;
+
+   if (cfg != NULL) {
+      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
+         if (!(INTEL_DEBUG & DEBUG_OPTIMIZER))
+            fprintf(out, "%4d: ", ip++);
+         dump_instruction(inst, out);
+      }
+   } else {
+      foreach_in_list(backend_instruction, inst, &instructions) {
+         if (!(INTEL_DEBUG & DEBUG_OPTIMIZER))
+            fprintf(out, "%4d: ", ip++);
+         dump_instruction(inst, out);
+      }
+   }
+
+   /* Only close what was opened here. */
+   if (out != stderr)
+      fclose(out);
+}
+
+/**
+ * Build the CFG from the instruction list unless one already exists.
+ * The cfg_t is allocated out of mem_ctx.
+ */
+void
+backend_shader::calculate_cfg()
+{
+   if (this->cfg == NULL)
+      this->cfg = new(mem_ctx) cfg_t(&this->instructions);
+}
+
+/**
+ * Compile a tessellation evaluation shader.
+ *
+ * The function clones src_shader, applies the key-dependent NIR lowering,
+ * fills in prog_data (VUE map, URB entry size, clip/cull distance masks,
+ * partitioning, domain and output topology) and then generates code with
+ * either the scalar or the vec4 back-end, as selected by
+ * compiler->scalar_stage[MESA_SHADER_TESS_EVAL].
+ *
+ * Returns whatever the code generator returns for the assembly (it is also
+ * handed final_assembly_size), or NULL on failure.  On failure a message
+ * allocated out of mem_ctx is stored in *error_str when error_str is
+ * non-NULL.
+ *
+ * NOTE(review): the prog parameter is not referenced in this function.
+ */
+extern "C" const unsigned *
+brw_compile_tes(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tes_prog_key *key,
+                const struct brw_vue_map *input_vue_map,
+                struct brw_tes_prog_data *prog_data,
+                const nir_shader *src_shader,
+                struct gl_program *prog,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_EVAL];
+
+   /* All modifications below are made to a clone of src_shader. */
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader);
+   nir->info->inputs_read = key->inputs_read;
+   nir->info->patch_inputs_read = key->patch_inputs_read;
+
+   nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
+   brw_nir_lower_tes_inputs(nir, input_vue_map);
+   brw_nir_lower_vue_outputs(nir, is_scalar);
+   nir = brw_postprocess_nir(nir, compiler, is_scalar);
+
+   brw_compute_vue_map(devinfo, &prog_data->base.vue_map,
+                       nir->info->outputs_written,
+                       nir->info->separate_shader);
+
+   /* 16 bytes per VUE slot (presumably one vec4 of 32-bit values). */
+   unsigned output_size_bytes = prog_data->base.vue_map.num_slots * 4 * 4;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_DS_URB_ENTRY_SIZE_BYTES) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, "DS outputs exceed maximum size");
+      return NULL;
+   }
+
+   /* The cull distance bits sit directly above the clip distance bits. */
+   prog_data->base.clip_distance_mask =
+      ((1 << nir->info->clip_distance_array_size) - 1);
+   prog_data->base.cull_distance_mask =
+      ((1 << nir->info->cull_distance_array_size) - 1) <<
+      nir->info->clip_distance_array_size;
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+   prog_data->base.urb_read_length = 0;
+
+   /* These asserts are what makes the "spacing - 1" conversion below valid. */
+   STATIC_ASSERT(BRW_TESS_PARTITIONING_INTEGER == TESS_SPACING_EQUAL - 1);
+   STATIC_ASSERT(BRW_TESS_PARTITIONING_ODD_FRACTIONAL ==
+                 TESS_SPACING_FRACTIONAL_ODD - 1);
+   STATIC_ASSERT(BRW_TESS_PARTITIONING_EVEN_FRACTIONAL ==
+                 TESS_SPACING_FRACTIONAL_EVEN - 1);
+
+   prog_data->partitioning =
+      (enum brw_tess_partitioning) (nir->info->tess.spacing - 1);
+
+   switch (nir->info->tess.primitive_mode) {
+   case GL_QUADS:
+      prog_data->domain = BRW_TESS_DOMAIN_QUAD;
+      break;
+   case GL_TRIANGLES:
+      prog_data->domain = BRW_TESS_DOMAIN_TRI;
+      break;
+   case GL_ISOLINES:
+      prog_data->domain = BRW_TESS_DOMAIN_ISOLINE;
+      break;
+   default:
+      unreachable("invalid domain shader primitive mode");
+   }
+
+   /* Point mode takes precedence over the primitive mode. */
+   if (nir->info->tess.point_mode) {
+      prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_POINT;
+   } else if (nir->info->tess.primitive_mode == GL_ISOLINES) {
+      prog_data->output_topology = BRW_TESS_OUTPUT_TOPOLOGY_LINE;
+   } else {
+      /* Hardware winding order is backwards from OpenGL */
+      prog_data->output_topology =
+         nir->info->tess.ccw ? BRW_TESS_OUTPUT_TOPOLOGY_TRI_CW
+                             : BRW_TESS_OUTPUT_TOPOLOGY_TRI_CCW;
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+      fprintf(stderr, "TES Input ");
+      brw_print_vue_map(stderr, input_vue_map);
+      fprintf(stderr, "TES Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   if (is_scalar) {
+      /* Scalar back-end: run the fs_visitor with a width of 8. */
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, NULL, nir, 8,
+                   shader_time_index, input_vue_map);
+      if (!v.run_tes()) {
+         if (error_str)
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     MESA_SHADER_TESS_EVAL);
+      if (unlikely(INTEL_DEBUG & DEBUG_TES)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation evaluation shader %s",
+                                        nir->info->label ? nir->info->label
+                                                        : "unnamed",
+                                        nir->info->name));
+      }
+
+      g.generate_code(v.cfg, 8);
+
+      return g.get_assembly(final_assembly_size);
+   } else {
+      /* vec4 back-end. */
+      brw::vec4_tes_visitor v(compiler, log_data, key, prog_data,
+			      nir, mem_ctx, shader_time_index);
+      if (!v.run()) {
+	 if (error_str)
+	    *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+	 return NULL;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_TES))
+	 v.dump_instructions();
+
+      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+					&prog_data->base, v.cfg,
+					final_assembly_size);
+   }
+}
diff --git a/src/intel/compiler/brw_shader.h b/src/intel/compiler/brw_shader.h
new file mode 100644
index 00000000000..5a253e66570
--- /dev/null
+++ b/src/intel/compiler/brw_shader.h
@@ -0,0 +1,295 @@
+/*
+ * Copyright © 2010 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#pragma once
+
+#include <stdint.h>
+#include "brw_reg.h"
+#include "brw_compiler.h"
+#include "brw_eu_defines.h"
+#include "brw_inst.h"
+#include "compiler/nir/nir.h"
+
+#ifdef __cplusplus
+#include "brw_ir_allocator.h"
+#endif
+
+#define MAX_SAMPLER_MESSAGE_SIZE 11
+#define MAX_VGRF_SIZE 16
+
+#ifdef __cplusplus
+/**
+ * A brw_reg extended with a byte offset.
+ *
+ * brw_reg is a private base, so from outside only the fields re-exported by
+ * the using-declarations at the bottom of the struct are reachable; use
+ * as_brw_reg() to get at the underlying brw_reg.
+ */
+struct backend_reg : private brw_reg
+{
+   /** Leaves every field, offset included, uninitialized. */
+   backend_reg() {}
+
+   /**
+    * Wrap an existing brw_reg.  The offset starts out at zero so that
+    * equals() and as_brw_reg() never read an indeterminate value.
+    */
+   backend_reg(const struct brw_reg &reg) : brw_reg(reg), offset(0) {}
+
+   /**
+    * View this register as a plain brw_reg.  Asserts that the file is ARF,
+    * FIXED_GRF, MRF or IMM and that the offset is zero.
+    */
+   const brw_reg &as_brw_reg() const
+   {
+      assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
+      assert(offset == 0);
+      return static_cast<const brw_reg &>(*this);
+   }
+
+   /** Non-const variant of as_brw_reg(), with the same assertions. */
+   brw_reg &as_brw_reg()
+   {
+      assert(file == ARF || file == FIXED_GRF || file == MRF || file == IMM);
+      assert(offset == 0);
+      return static_cast<brw_reg &>(*this);
+   }
+
+   bool equals(const backend_reg &r) const;
+
+   bool is_zero() const;
+   bool is_one() const;
+   bool is_negative_one() const;
+   bool is_null() const;
+   bool is_accumulator() const;
+
+   /** Offset from the start of the (virtual) register in bytes. */
+   uint16_t offset;
+
+   using brw_reg::type;
+   using brw_reg::file;
+   using brw_reg::negate;
+   using brw_reg::abs;
+   using brw_reg::address_mode;
+   using brw_reg::subnr;
+   using brw_reg::nr;
+
+   using brw_reg::swizzle;
+   using brw_reg::writemask;
+   using brw_reg::indirect_offset;
+   using brw_reg::vstride;
+   using brw_reg::width;
+   using brw_reg::hstride;
+
+   using brw_reg::df;
+   using brw_reg::f;
+   using brw_reg::d;
+   using brw_reg::ud;
+};
+#endif
+
+struct cfg_t;
+struct bblock_t;
+
+/*
+ * IR instruction data shared by C and C++ code.  When compiled as C++ the
+ * struct derives from exec_node and carries the query and list-editing
+ * member functions; when compiled as C it starts with an exec_node member
+ * named link instead.  The data members that follow the #endif are common
+ * to both variants.
+ */
+#ifdef __cplusplus
+struct backend_instruction : public exec_node {
+   bool is_3src(const struct gen_device_info *devinfo) const;
+   bool is_tex() const;
+   bool is_math() const;
+   bool is_control_flow() const;
+   bool is_commutative() const;
+   bool can_do_source_mods() const;
+   bool can_do_saturate() const;
+   bool can_do_cmod() const;
+   bool reads_accumulator_implicitly() const;
+   bool writes_accumulator_implicitly(const struct gen_device_info *devinfo) const;
+
+   /* List editing.  Each of these also updates the IP ranges of the given
+    * block and of the blocks that follow it.
+    */
+   void remove(bblock_t *block);
+   void insert_after(bblock_t *block, backend_instruction *inst);
+   void insert_before(bblock_t *block, backend_instruction *inst);
+   void insert_before(bblock_t *block, exec_list *list);
+
+   /**
+    * True if the instruction has side effects other than writing to
+    * its destination registers.  You are expected not to reorder or
+    * optimize these out unless you know what you are doing.
+    */
+   bool has_side_effects() const;
+
+   /**
+    * True if the instruction might be affected by side effects of other
+    * instructions.
+    */
+   bool is_volatile() const;
+#else
+struct backend_instruction {
+   struct exec_node link;
+#endif
+   /** @{
+    * Annotation for the generated IR.  One of the two can be set.
+    */
+   const void *ir;
+   const char *annotation;
+   /** @} */
+
+   /**
+    * Execution size of the instruction.  This is used by the generator to
+    * generate the correct binary for the given instruction.  Current valid
+    * values are 1, 4, 8, 16, 32.
+    */
+   uint8_t exec_size;
+
+   /**
+    * Channel group from the hardware execution and predication mask that
+    * should be applied to the instruction.  The subset of channel enable
+    * signals (calculated from the EU control flow and predication state)
+    * given by [group, group + exec_size) will be used to mask GRF writes and
+    * any other side effects of the instruction.
+    */
+   uint8_t group;
+
+   uint32_t offset; /**< spill/unspill offset or texture offset bitfield */
+   uint8_t mlen; /**< SEND message length */
+   int8_t base_mrf; /**< First MRF in the SEND message, if mlen is nonzero. */
+   uint8_t target; /**< MRT target. */
+   unsigned size_written; /**< Data written to the destination register in bytes. */
+
+   /** BRW_OPCODE_*, FS_OPCODE_*, SHADER_OPCODE_*, TCS_OPCODE_* or VEC4_OPCODE_* */
+   enum opcode opcode;
+   enum brw_conditional_mod conditional_mod; /**< BRW_CONDITIONAL_* */
+   enum brw_predicate predicate;
+   bool predicate_inverse:1; /* presumably inverts the sense of predicate -- confirm */
+   bool writes_accumulator:1; /**< instruction implicitly writes accumulator */
+   bool force_writemask_all:1;
+   bool no_dd_clear:1;
+   bool no_dd_check:1;
+   bool saturate:1;
+   bool shadow_compare:1;
+
+   /* Chooses which flag subregister (f0.0 or f0.1) is used for conditional
+    * mod and predication.
+    */
+   unsigned flag_subreg:1;
+
+   /** The number of hardware registers used for a message header. */
+   uint8_t header_size;
+};
+
+#ifdef __cplusplus
+
+/**
+ * Selects the mode the instruction scheduler runs in.
+ *
+ * NOTE(review): judging by the names, the SCHEDULE_PRE* values are meant for
+ * scheduling before register allocation (with different heuristics) and
+ * SCHEDULE_POST for scheduling after it -- confirm against the scheduler.
+ */
+enum instruction_scheduler_mode {
+   SCHEDULE_PRE,
+   SCHEDULE_PRE_NON_LIFO,
+   SCHEDULE_PRE_LIFO,
+   SCHEDULE_POST,
+};
+
+/**
+ * Abstract base holding the state of one back-end shader compile: the
+ * compiler and device info, the NIR shader, the instruction list and its
+ * optional CFG.  It can only be constructed by derived classes, which must
+ * implement dump_instruction() and invalidate_live_intervals().
+ */
+struct backend_shader {
+protected:
+
+   backend_shader(const struct brw_compiler *compiler,
+                  void *log_data,
+                  void *mem_ctx,
+                  const nir_shader *shader,
+                  struct brw_stage_prog_data *stage_prog_data);
+
+public:
+
+   const struct brw_compiler *compiler;
+   void *log_data; /* Passed to compiler->*_log functions */
+
+   /** Taken from compiler->devinfo at construction time. */
+   const struct gen_device_info * const devinfo;
+   const nir_shader *nir;
+   struct brw_stage_prog_data * const stage_prog_data;
+
+   /** ralloc context for temporary data used during compile */
+   void *mem_ctx;
+
+   /**
+    * List of either fs_inst or vec4_instruction (inheriting from
+    * backend_instruction)
+    */
+   exec_list instructions;
+
+   /** NULL until calculate_cfg() has been called. */
+   cfg_t *cfg;
+
+   gl_shader_stage stage;
+   /** Set when INTEL_DEBUG contains the debug flag for this stage. */
+   bool debug_enabled;
+   const char *stage_name;
+   const char *stage_abbrev;
+
+   brw::simple_allocator alloc;
+
+   virtual void dump_instruction(backend_instruction *inst) = 0;
+   virtual void dump_instruction(backend_instruction *inst, FILE *file) = 0;
+   /** Dump all instructions to stderr. */
+   virtual void dump_instructions();
+   /**
+    * Dump all instructions to the file called name.  stderr is used instead
+    * when name is NULL, when the effective user ID is 0 or when the file
+    * cannot be opened.
+    */
+   virtual void dump_instructions(const char *name);
+
+   /** Build cfg from the instruction list unless it already exists. */
+   void calculate_cfg();
+
+   virtual void invalidate_live_intervals() = 0;
+};
+
+bool brw_texture_offset(int *offsets,
+                        unsigned num_components,
+                        uint32_t *offset_bits);
+
+void brw_setup_image_uniform_values(gl_shader_stage stage,
+                                    struct brw_stage_prog_data *stage_prog_data,
+                                    unsigned param_start_index,
+                                    const gl_uniform_storage *storage);
+
+#else
+struct backend_shader;
+#endif /* __cplusplus */
+
+enum brw_reg_type brw_type_for_base_type(const struct glsl_type *type);
+enum brw_conditional_mod brw_conditional_for_comparison(unsigned int op);
+uint32_t brw_math_function(enum opcode op);
+const char *brw_instruction_name(const struct gen_device_info *devinfo,
+                                 enum opcode op);
+bool brw_saturate_immediate(enum brw_reg_type type, struct brw_reg *reg);
+bool brw_negate_immediate(enum brw_reg_type type, struct brw_reg *reg);
+bool brw_abs_immediate(enum brw_reg_type type, struct brw_reg *reg);
+
+bool opt_predicated_break(struct backend_shader *s);
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* brw_fs_reg_allocate.cpp */
+void brw_fs_alloc_reg_sets(struct brw_compiler *compiler);
+
+/* brw_vec4_reg_allocate.cpp */
+void brw_vec4_alloc_reg_set(struct brw_compiler *compiler);
+
+/* brw_disasm.c */
+extern const char *const conditional_modifier[16];
+extern const char *const pred_ctrl_align16[16];
+
+/* Per-thread scratch space is a power-of-two multiple of 1KB.
+ *
+ * Returns the larger of 1024 and util_next_power_of_two(size).
+ */
+static inline int
+brw_get_scratch_size(int size)
+{
+   return MAX2(1024, util_next_power_of_two(size));
+}
+
+/**
+ * Scratch data used when compiling a GLSL geometry shader.
+ */
+struct brw_gs_compile
+{
+   /* presumably the program key the shader is being compiled for -- confirm */
+   struct brw_gs_prog_key key;
+   /* presumably the VUE layout of the geometry shader's inputs -- confirm */
+   struct brw_vue_map input_vue_map;
+
+   /* Sizes are in bits, going by the member names. */
+   unsigned control_data_bits_per_vertex;
+   unsigned control_data_header_size_bits;
+};
+
+unsigned get_atomic_counter_op(nir_intrinsic_op op);
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/src/intel/compiler/brw_vec4.cpp b/src/intel/compiler/brw_vec4.cpp
new file mode 100644
index 00000000000..d7c09093032
--- /dev/null
+++ b/src/intel/compiler/brw_vec4.cpp
@@ -0,0 +1,2851 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "brw_nir.h"
+#include "brw_vec4_builder.h"
+#include "brw_vec4_live_variables.h"
+#include "brw_vec4_vs.h"
+#include "brw_dead_control_flow.h"
+#include "common/gen_debug.h"
+#include "program/prog_parameter.h"
+
+#define MAX_INSTRUCTION (1 << 30)
+
+using namespace brw;
+
+namespace brw {
+
+void
+src_reg::init()
+{
+   /* Zero every field, then tag the register as not belonging to a file. */
+   memset(this, 0, sizeof(src_reg));
+   file = BAD_FILE;
+}
+
+src_reg::src_reg(enum brw_reg_file file, int nr, const glsl_type *type)
+{
+   init();
+   this->file = file;
+   this->nr = nr;
+   /* Sized numeric types get a swizzle for their width, else identity. */
+   const bool sized =
+      type && (type->is_scalar() || type->is_vector() || type->is_matrix());
+   this->swizzle = sized ? brw_swizzle_for_size(type->vector_elements)
+                         : BRW_SWIZZLE_XYZW;
+   if (type != NULL)
+      this->type = brw_type_for_base_type(type);
+}
+
+/** Default constructor: produces an unset register via init(). */
+src_reg::src_reg()
+{
+   this->init();
+}
+
+src_reg::src_reg(struct ::brw_reg reg)
+   : backend_reg(reg)   /* wrap the given register as-is */
+{
+   reladdr = NULL;      /* no relative addressing */
+   offset = 0;
+}
+
+src_reg::src_reg(const dst_reg &reg)
+   : backend_reg(reg)
+{
+   swizzle = brw_swizzle_for_mask(reg.writemask); /* from dst's writemask */
+   reladdr = reg.reladdr;
+}
+
+void
+dst_reg::init()
+{
+   memset(this, 0, sizeof(dst_reg));   /* start from an all-zero register */
+   file = BAD_FILE;
+   writemask = WRITEMASK_XYZW;         /* default to all four channels */
+}
+
+dst_reg::dst_reg()
+{
+   this->init();   /* unset register: BAD_FILE with an XYZW writemask */
+}
+
+dst_reg::dst_reg(enum brw_reg_file file, int nr)
+{
+   /* Everything except file and nr keeps the state set up by init(). */
+   init();
+   this->nr = nr;
+   this->file = file;
+}
+
+dst_reg::dst_reg(enum brw_reg_file file, int nr, const glsl_type *type,
+                 unsigned writemask)
+{
+   /* The register type is derived from the GLSL type. */
+   init();
+   this->writemask = writemask;
+   this->type = brw_type_for_base_type(type);
+   this->nr = nr;
+   this->file = file;
+}
+
+dst_reg::dst_reg(enum brw_reg_file file, int nr, brw_reg_type type,
+                 unsigned writemask)
+{
+   /* The caller supplies the register type directly. */
+   init();
+   this->writemask = writemask;
+   this->type = type;
+   this->nr = nr;
+   this->file = file;
+}
+
+dst_reg::dst_reg(struct ::brw_reg reg)
+   : backend_reg(reg)   /* wrap the given register as-is */
+{
+   reladdr = NULL;      /* no relative addressing */
+   offset = 0;
+}
+
+dst_reg::dst_reg(const src_reg &reg)
+   : backend_reg(reg)
+{
+   reladdr = reg.reladdr;
+   writemask = brw_mask_for_swizzle(reg.swizzle); /* from src's swizzle */
+}
+
+bool
+dst_reg::equals(const dst_reg &r) const
+{
+   const bool same_base = backend_reg::equals(r);
+   return same_base && (reladdr == r.reladdr ||
+      (reladdr && r.reladdr && reladdr->equals(*r.reladdr)));
+}
+
+bool
+vec4_instruction::is_send_from_grf()
+{
+   switch (opcode) {   /* opcodes reported as sends from the GRF */
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case VEC4_OPCODE_URB_READ:
+   case TCS_OPCODE_URB_WRITE:
+   case TCS_OPCODE_RELEASE_INPUT:
+   case SHADER_OPCODE_BARRIER:
+      return true;
+   default:
+      return false;   /* every opcode not listed above */
+   }
+}
+
+/**
+ * Returns true if this instruction's sources and destinations cannot
+ * safely be the same register.
+ *
+ * In most cases, a register can be written over safely by the same
+ * instruction that is its last use.  For a single instruction, the
+ * sources are dereferenced before writing of the destination starts
+ * (naturally).
+ *
+ * However, there are a few cases where this can be problematic:
+ *
+ * - Virtual opcodes that translate to multiple instructions in the code
+ *   generator: if src == dst and one instruction writes the destination
+ *   before a later instruction reads the source, src has been clobbered.
+ * - Instructions that write more than one register (see the default case).
+ *
+ * The register allocator uses this information to set up conflicts between
+ * GRF sources and the destination.
+ */
+bool
+vec4_instruction::has_source_and_destination_hazard() const
+{
+   switch (opcode) {
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+      return true;
+   default:
+      /* 8-wide compressed DF operations are executed as two 4-wide operations,
+       * so we have a src/dst hazard if the first half of the instruction
+       * overwrites the source of the second half. Prevent this by marking
+       * compressed instructions as having src/dst hazards, so the register
+       * allocator assigns safe register regions for dst and srcs.
+       */
+      return size_written > REG_SIZE;
+   }
+}
+
+unsigned
+vec4_instruction::size_read(unsigned arg) const
+{
+   switch (opcode) {   /* message sources are sized by mlen, not by type */
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+   case SHADER_OPCODE_UNTYPED_ATOMIC:
+   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+   case SHADER_OPCODE_TYPED_ATOMIC:
+   case SHADER_OPCODE_TYPED_SURFACE_READ:
+   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+   case TCS_OPCODE_URB_WRITE:
+      if (arg == 0)
+         return mlen * REG_SIZE;
+      break;
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+      if (arg == 1)   /* for this opcode it is the second source */
+         return mlen * REG_SIZE;
+      break;
+   default:
+      break;
+   }
+   /* Every other source is sized from its register file and type. */
+   switch (src[arg].file) {
+   case BAD_FILE:
+      return 0;
+   case IMM:
+   case UNIFORM:
+      return 4 * type_sz(src[arg].type);
+   default:
+      /* XXX - Represent actual vertical stride. */
+      return exec_size * type_sz(src[arg].type);
+   }
+}
+
+bool
+vec4_instruction::can_do_source_mods(const struct gen_device_info *devinfo)
+{
+   /* Source modifiers are unavailable for:
+    *  - math instructions on Gen6,
+    *  - instructions for which is_send_from_grf() holds,
+    *  - anything the generic backend_instruction check rejects.
+    */
+   const bool gen6_math = devinfo->gen == 6 && is_math();
+   if (gen6_math || is_send_from_grf())
+      return false;
+
+   return backend_instruction::can_do_source_mods();
+}
+
+bool
+vec4_instruction::can_do_writemask(const struct gen_device_info *devinfo)
+{
+   switch (opcode) {   /* opcodes listed here never take a partial writemask */
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+   case VEC4_OPCODE_FROM_DOUBLE:
+   case VEC4_OPCODE_TO_DOUBLE:
+   case VEC4_OPCODE_PICK_LOW_32BIT:
+   case VEC4_OPCODE_PICK_HIGH_32BIT:
+   case VEC4_OPCODE_SET_LOW_32BIT:
+   case VEC4_OPCODE_SET_HIGH_32BIT:
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+   case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+   case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+   case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+   case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+   case VEC4_OPCODE_URB_READ:
+   case SHADER_OPCODE_MOV_INDIRECT:
+      return false;
+   default:
+      /* The MATH instruction on Gen6 only executes in align1 mode, which does
+       * not support writemasking.
+       */
+      if (devinfo->gen == 6 && is_math())
+         return false;
+      /* Texturing instructions are likewise reported as unable to do so. */
+      if (is_tex())
+         return false;
+
+      return true;
+   }
+}
+
+bool
+vec4_instruction::can_change_types() const
+{
+   /* Only raw copies qualify: same type, no modifiers, no saturate. */
+   if (dst.type != src[0].type || src[0].abs || src[0].negate || saturate)
+      return false;
+   if (opcode == BRW_OPCODE_MOV)
+      return true;
+   return opcode == BRW_OPCODE_SEL && dst.type == src[1].type &&
+          predicate != BRW_PREDICATE_NONE && !src[1].abs && !src[1].negate;
+}
+
+/**
+ * Returns how many MRFs an opcode will write over; 0 if the instruction
+ * has no message (mlen == 0) or is_send_from_grf() holds for it.
+ *
+ * Note that this is not the 0 or 1 implied writes in an actual gen
+ * instruction -- the generate_* functions generate additional MOVs for setup.
+ */
+int
+vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
+{
+   if (inst->mlen == 0 || inst->is_send_from_grf())
+      return 0;
+
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      return 1;
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_POW:
+   case TCS_OPCODE_THREAD_END:
+      return 2;
+   case VS_OPCODE_URB_WRITE:
+      return 1;
+   case VS_OPCODE_PULL_CONSTANT_LOAD:
+      return 2;
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+      return 2;
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+      return 3;
+   case GS_OPCODE_URB_WRITE:
+   case GS_OPCODE_URB_WRITE_ALLOCATE:
+   case GS_OPCODE_THREAD_END:
+      return 0;
+   case GS_OPCODE_FF_SYNC:
+      return 1;
+   case TCS_OPCODE_URB_WRITE:
+      return 0;
+   case SHADER_OPCODE_SHADER_TIME_ADD:
+      return 0;
+   case SHADER_OPCODE_TEX:
+   case SHADER_OPCODE_TXL:
+   case SHADER_OPCODE_TXD:
+   case SHADER_OPCODE_TXF:
+   case SHADER_OPCODE_TXF_CMS:
+   case SHADER_OPCODE_TXF_CMS_W:
+   case SHADER_OPCODE_TXF_MCS:
+   case SHADER_OPCODE_TXS:
+   case SHADER_OPCODE_TG4:
+   case SHADER_OPCODE_TG4_OFFSET:
+   case SHADER_OPCODE_SAMPLEINFO:
+   case VS_OPCODE_GET_BUFFER_SIZE:
+      return inst->header_size;   /* these report their header size */
+   default:
+      unreachable("not reached");   /* any opcode with mlen not listed */
+   }
+}
+
+bool
+src_reg::equals(const src_reg &r) const
+{
+   /* A register with a relative address never compares equal here. */
+   return backend_reg::equals(r) && !(reladdr || r.reladdr);
+}
+
+/* Merges runs of partial-writemask immediate MOVs to one register into a
+ * single MOV from a packed vector-float (VF) immediate.  Returns true if
+ * any instructions were combined.
+ */
+bool
+vec4_visitor::opt_vector_float()
+{
+   bool progress = false;
+
+   foreach_block(block, cfg) {
+      int last_reg = -1, last_offset = -1;
+      enum brw_reg_file last_reg_file = BAD_FILE;
+
+      uint8_t imm[4] = { 0 };
+      int inst_count = 0;
+      vec4_instruction *imm_inst[4];
+      unsigned writemask = 0;
+      enum brw_reg_type dest_type = BRW_REGISTER_TYPE_F;
+
+      foreach_inst_in_block_safe(vec4_instruction, inst, block) {
+         int vf = -1;
+         enum brw_reg_type need_type;
+
+         /* Look for unconditional MOVs from an immediate with a partial
+          * writemask.  Skip type-conversion MOVs other than integer 0,
+          * where the type doesn't matter.  See if the immediate can be
+          * represented as a VF.
+          */
+         if (inst->opcode == BRW_OPCODE_MOV &&
+             inst->src[0].file == IMM &&
+             inst->predicate == BRW_PREDICATE_NONE &&
+             inst->dst.writemask != WRITEMASK_XYZW &&
+             type_sz(inst->src[0].type) < 8 &&
+             (inst->src[0].type == inst->dst.type || inst->src[0].d == 0)) {
+            vf = brw_float_to_vf(inst->src[0].d);
+            need_type = BRW_REGISTER_TYPE_D;
+            if (vf == -1) {
+               vf = brw_float_to_vf(inst->src[0].f);
+               need_type = BRW_REGISTER_TYPE_F;
+            }
+         } else {
+            last_reg = -1;
+         }
+
+         /* A non-MOV, a different destination register, a destination type
+          * switch, or a write to a channel we already collected (merging
+          * would reorder those writes and could overrun imm_inst) breaks our
+          * sequence.  Combine anything we've accumulated so far.
+          */
+         if (last_reg != inst->dst.nr ||
+             last_offset != inst->dst.offset ||
+             last_reg_file != inst->dst.file ||
+             (vf > 0 && dest_type != need_type) ||
+             (inst->dst.writemask & writemask) != 0) {
+            if (inst_count > 1) {
+               unsigned bits;
+               memcpy(&bits, imm, sizeof(bits));
+               vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(bits));
+               mov->dst.type = dest_type;
+               mov->dst.writemask = writemask;
+               inst->insert_before(block, mov);
+
+               for (int i = 0; i < inst_count; i++)
+                  imm_inst[i]->remove(block);
+
+               progress = true;
+            }
+
+            inst_count = 0;
+            last_reg = -1;
+            writemask = 0;
+            dest_type = BRW_REGISTER_TYPE_F;
+
+            for (int i = 0; i < 4; i++)
+               imm[i] = 0;
+         }
+
+         /* Record this instruction's value (if it was representable). */
+         if (vf != -1) {
+            if ((inst->dst.writemask & WRITEMASK_X) != 0)
+               imm[0] = vf;
+            if ((inst->dst.writemask & WRITEMASK_Y) != 0)
+               imm[1] = vf;
+            if ((inst->dst.writemask & WRITEMASK_Z) != 0)
+               imm[2] = vf;
+            if ((inst->dst.writemask & WRITEMASK_W) != 0)
+               imm[3] = vf;
+            writemask |= inst->dst.writemask;
+            imm_inst[inst_count++] = inst;
+
+            last_reg = inst->dst.nr;
+            last_offset = inst->dst.offset;
+            last_reg_file = inst->dst.file;
+            if (vf > 0)
+               dest_type = need_type;
+         }
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/* Replaces unused channels of a swizzle with channels that are used.
+ *
+ * For instance, this pass transforms
+ *
+ *    mov vgrf4.yz, vgrf5.wxzy
+ *
+ * into
+ *
+ *    mov vgrf4.yz, vgrf5.xxzx
+ *
+ * This eliminates false uses of some channels, letting dead code elimination
+ * remove the instructions that wrote them.  Returns true on any change.
+ */
+bool
+vec4_visitor::opt_reduce_swizzle()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == BAD_FILE ||
+          inst->dst.file == ARF ||
+          inst->dst.file == FIXED_GRF ||
+          inst->is_send_from_grf())
+         continue;
+
+      unsigned swizzle;   /* selects the source channels this opcode reads */
+
+      /* Determine which channels of the sources are read. */
+      switch (inst->opcode) {
+      case VEC4_OPCODE_PACK_BYTES:
+      case BRW_OPCODE_DP4:
+      case BRW_OPCODE_DPH: /* FINISHME: DPH reads only three channels of src0,
+                            *           but all four of src1.
+                            */
+         swizzle = brw_swizzle_for_size(4);
+         break;
+      case BRW_OPCODE_DP3:
+         swizzle = brw_swizzle_for_size(3);
+         break;
+      case BRW_OPCODE_DP2:
+         swizzle = brw_swizzle_for_size(2);
+         break;
+
+      case VEC4_OPCODE_TO_DOUBLE:
+      case VEC4_OPCODE_FROM_DOUBLE:
+      case VEC4_OPCODE_PICK_LOW_32BIT:
+      case VEC4_OPCODE_PICK_HIGH_32BIT:
+      case VEC4_OPCODE_SET_LOW_32BIT:
+      case VEC4_OPCODE_SET_HIGH_32BIT:
+         swizzle = brw_swizzle_for_size(4);
+         break;
+
+      default:
+         swizzle = brw_swizzle_for_mask(inst->dst.writemask);
+         break;
+      }
+
+      /* Update sources' swizzles. */
+      for (int i = 0; i < 3; i++) {
+         if (inst->src[i].file != VGRF &&
+             inst->src[i].file != ATTR &&
+             inst->src[i].file != UNIFORM)
+            continue;
+
+         const unsigned new_swizzle =
+            brw_compose_swizzle(swizzle, inst->src[i].swizzle);
+         if (inst->src[i].swizzle != new_swizzle) {
+            inst->src[i].swizzle = new_swizzle;
+            progress = true;
+         }
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+void
+vec4_visitor::split_uniform_registers()
+{
+   /* Up to this point a uniform aggregate is referenced through a single
+    * .nr plus an offset, so the array of vector uniforms is only sparsely
+    * filled.  Fold every whole multiple of 16 in the offset into .nr so
+    * that each index names exactly one vector; this makes eliminating
+    * unused uniform components easier later on.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         src_reg &src = inst->src[i];
+         if (src.file != UNIFORM)
+            continue;
+
+         assert(!src.reladdr);
+
+         src.nr += src.offset / 16;
+         src.offset %= 16;
+      }
+   }
+}
+
+void
+vec4_visitor::pack_uniform_registers()
+{
+   uint8_t chans_used[this->uniforms];   /* channels used in each vector */
+   int new_loc[this->uniforms];          /* vector each one is moved to */
+   int new_chan[this->uniforms];         /* first channel within new_loc */
+
+   memset(chans_used, 0, sizeof(chans_used));
+   memset(new_loc, 0, sizeof(new_loc));
+   memset(new_chan, 0, sizeof(new_chan));
+
+   /* Find which uniform vectors are actually used by the program.  We
+    * expect unused vector elements when we've moved array access out
+    * to pull constants, and from some GLSL code generators like wine.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      unsigned readmask;
+      switch (inst->opcode) {
+      case VEC4_OPCODE_PACK_BYTES:
+      case BRW_OPCODE_DP4:
+      case BRW_OPCODE_DPH:
+         readmask = 0xf;
+         break;
+      case BRW_OPCODE_DP3:
+         readmask = 0x7;
+         break;
+      case BRW_OPCODE_DP2:
+         readmask = 0x3;
+         break;
+      default:
+         readmask = inst->dst.writemask;
+         break;
+      }
+
+      for (int i = 0 ; i < 3; i++) {
+         if (inst->src[i].file != UNIFORM)
+            continue;
+
+         assert(type_sz(inst->src[i].type) % 4 == 0);
+         unsigned channel_size = type_sz(inst->src[i].type) / 4;
+
+         int reg = inst->src[i].nr;
+         for (int c = 0; c < 4; c++) {
+            if (!(readmask & (1 << c)))
+               continue;
+
+            unsigned channel = BRW_GET_SWZ(inst->src[i].swizzle, c) + 1;
+            unsigned used = MAX2(chans_used[reg], channel * channel_size);
+            if (used <= 4)
+               chans_used[reg] = used;
+            else
+               chans_used[reg + 1] = used - 4;   /* spills into next vector */
+         }
+      }
+
+      if (inst->opcode == SHADER_OPCODE_MOV_INDIRECT &&
+          inst->src[0].file == UNIFORM) {
+         assert(inst->src[2].file == BRW_IMMEDIATE_VALUE);
+         assert(inst->src[0].subnr == 0);
+
+         unsigned bytes_read = inst->src[2].ud;
+         assert(bytes_read % 4 == 0);
+         unsigned vec4s_read = DIV_ROUND_UP(bytes_read, 16);
+
+         /* We just mark every register touched by a MOV_INDIRECT as being
+          * fully used.  This ensures that it isn't broken up piecewise by
+          * the next part of our packing algorithm.
+          */
+         int reg = inst->src[0].nr;
+         for (unsigned i = 0; i < vec4s_read; i++)
+            chans_used[reg + i] = 4;
+      }
+   }
+
+   int new_uniform_count = 0;
+
+   /* Now, figure out a packing of the live uniform vectors into our
+    * push constants.
+    */
+   for (int src = 0; src < uniforms; src++) {
+      int size = chans_used[src];
+
+      if (size == 0)
+         continue;
+
+      int dst;
+      /* Find the lowest place we can slot this uniform in. */
+      for (dst = 0; dst < src; dst++) {
+         if (chans_used[dst] + size <= 4)
+            break;
+      }
+
+      if (src == dst) {
+         new_loc[src] = dst;
+         new_chan[src] = 0;
+      } else {
+         new_loc[src] = dst;
+         new_chan[src] = chans_used[dst];
+
+         /* Move the references to the data */
+         for (int j = 0; j < size; j++) {
+            stage_prog_data->param[dst * 4 + new_chan[src] + j] =
+               stage_prog_data->param[src * 4 + j];
+         }
+
+         chans_used[dst] += size;
+         chans_used[src] = 0;
+      }
+
+      new_uniform_count = MAX2(new_uniform_count, dst + 1);
+   }
+
+   this->uniforms = new_uniform_count;
+
+   /* Now, update the instructions for our repacked uniforms. */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0 ; i < 3; i++) {
+         int src = inst->src[i].nr;
+
+         if (inst->src[i].file != UNIFORM)
+            continue;
+
+         inst->src[i].nr = new_loc[src];
+         inst->src[i].swizzle += BRW_SWIZZLE4(new_chan[src], new_chan[src],
+                                              new_chan[src], new_chan[src]);
+      }
+   }
+}
+
+/**
+ * Algebraic simplifications (a * 0 = 0, a * 1 = a, a * -1 = -a, a + 0 = a).
+ *
+ * While GLSL IR also performs this optimization, we end up with it in
+ * our instruction stream for a couple of reasons.  One is that we
+ * sometimes generate silly instructions, for example in array access
+ * where we'll generate "ADD offset, index, base" even if base is 0.
+ * The other is that GLSL IR's constant propagation doesn't track the
+ * components of aggregates, so some VS patterns (initialize matrix to
+ * 0, accumulate in vertex blending factors) end up breaking down to
+ * instructions involving 0.
+ */
+bool
+vec4_visitor::opt_algebraic()
+{
+   bool progress = false;
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      switch (inst->opcode) {
+      case BRW_OPCODE_MOV:
+         /* Fold a saturate modifier into the immediate being moved. */
+         if (inst->src[0].file == IMM && inst->saturate) {
+            if (inst->dst.type != inst->src[0].type)
+               assert(!"unimplemented: saturate mixed types");
+
+            if (brw_saturate_immediate(inst->dst.type,
+                                       &inst->src[0].as_brw_reg())) {
+               inst->saturate = false;
+               progress = true;
+            }
+         }
+         break;
+
+      case VEC4_OPCODE_UNPACK_UNIFORM:
+         if (inst->src[0].file != UNIFORM) {
+            inst->opcode = BRW_OPCODE_MOV;
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_ADD:
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_MUL:
+         if (inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            switch (inst->src[0].type) {
+            case BRW_REGISTER_TYPE_F:
+               inst->src[0] = brw_imm_f(0.0f);
+               break;
+            case BRW_REGISTER_TYPE_D:
+               inst->src[0] = brw_imm_d(0);
+               break;
+            case BRW_REGISTER_TYPE_UD:
+               inst->src[0] = brw_imm_ud(0u);
+               break;
+            default:
+               unreachable("not reached");
+            }
+            inst->src[1] = src_reg();
+            progress = true;
+         } else if (inst->src[1].is_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            progress = true;
+         } else if (inst->src[1].is_negative_one()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[0].negate = !inst->src[0].negate;
+            inst->src[1] = src_reg();
+            progress = true;
+         }
+         break;
+
+      case BRW_OPCODE_CMP:
+         /* -|a| >= 0 is rewritten as a == 0 */
+         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
+             inst->src[0].abs &&
+             inst->src[0].negate &&
+             inst->src[1].is_zero()) {
+            inst->src[0].abs = false;
+            inst->src[0].negate = false;
+            inst->conditional_mod = BRW_CONDITIONAL_Z;
+            progress = true;
+         }
+         break;
+
+      case SHADER_OPCODE_BROADCAST:
+         if (is_uniform(inst->src[0]) ||
+             inst->src[1].is_zero()) {
+            inst->opcode = BRW_OPCODE_MOV;
+            inst->src[1] = src_reg();
+            inst->force_writemask_all = true;
+            progress = true;
+         }
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Only a limited number of hardware registers may be used for push
+ * constants, so this turns access to the overflowed constants into
+ * pull constants.
+ */
+void
+vec4_visitor::move_push_constants_to_pull_constants()
+{
+   int pull_constant_loc[this->uniforms];   /* per vec4; -1 = stays pushed */
+
+   /* Only allow 32 registers (256 uniform components) as push constants,
+    * which is the limit on gen6.
+    *
+    * If changing this value, note the limitation about total_regs in
+    * brw_curbe.c.
+    */
+   int max_uniform_components = 32 * 8;
+   if (this->uniforms * 4 <= max_uniform_components)
+      return;
+
+   /* Make some sort of choice as to which uniforms get sent to pull
+    * constants.  We could potentially do something clever here like
+    * look for the most infrequently used uniform vec4s, but leave
+    * that for later.
+    */
+   for (int i = 0; i < this->uniforms * 4; i += 4) {
+      pull_constant_loc[i / 4] = -1;
+
+      if (i >= max_uniform_components) {
+         const gl_constant_value **values = &stage_prog_data->param[i];
+
+         /* Try to find an existing copy of this uniform in the pull
+          * constants if it was part of an array access already.
+          */
+         for (unsigned int j = 0; j < stage_prog_data->nr_pull_params; j += 4) {
+            int matches;
+
+            for (matches = 0; matches < 4; matches++) {
+               if (stage_prog_data->pull_param[j + matches] != values[matches])
+                  break;
+            }
+
+            if (matches == 4) {
+               pull_constant_loc[i / 4] = j / 4;
+               break;
+            }
+         }
+
+         if (pull_constant_loc[i / 4] == -1) {   /* none found: append it */
+            assert(stage_prog_data->nr_pull_params % 4 == 0);
+            pull_constant_loc[i / 4] = stage_prog_data->nr_pull_params / 4;
+
+            for (int j = 0; j < 4; j++) {
+               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
+                  values[j];
+            }
+         }
+      }
+   }
+
+   /* Now actually rewrite usage of the things we've moved to pull
+    * constants.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      for (int i = 0 ; i < 3; i++) {
+         if (inst->src[i].file != UNIFORM ||
+             pull_constant_loc[inst->src[i].nr] == -1)
+            continue;
+
+         int uniform = inst->src[i].nr;
+
+         const glsl_type *temp_type = type_sz(inst->src[i].type) == 8 ?
+            glsl_type::dvec4_type : glsl_type::vec4_type;
+         dst_reg temp = dst_reg(this, temp_type);
+
+         emit_pull_constant_load(block, inst, temp, inst->src[i],
+                                 pull_constant_loc[uniform], src_reg());
+
+         inst->src[i].file = temp.file;   /* read the loaded temporary */
+         inst->src[i].nr = temp.nr;
+         inst->src[i].offset %= 16;
+         inst->src[i].reladdr = NULL;
+      }
+   }
+
+   /* Repack push constants to remove the now-unused ones. */
+   pack_uniform_registers();
+}
+
+/* Conditions for which we want to avoid setting the dependency control bits */
+bool
+vec4_visitor::is_dep_ctrl_unsafe(const vec4_instruction *inst)
+{
+#define IS_DWORD(reg) \
+   ((reg).type == BRW_REGISTER_TYPE_UD || \
+    (reg).type == BRW_REGISTER_TYPE_D)
+
+#define IS_64BIT(reg) ((reg).file != BAD_FILE && type_sz((reg).type) == 8)
+
+   /* From the Cherryview and Broadwell PRMs:
+    *
+    * "When source or destination datatype is 64b or operation is integer DWord
+    * multiply, DepCtrl must not be used."
+    *
+    * SKL PRMs don't include this restriction, however, gen7 seems to be
+    * affected, at least by the 64b restriction, since DepCtrl with double
+    * precision instructions seems to produce GPU hangs in some cases.
+    */
+   /* Integer DWord multiply: checked on Gen8 and on Broxton (which the
+    * code below treats the same way).
+    */
+   const bool dword_mul = inst->opcode == BRW_OPCODE_MUL &&
+                          IS_DWORD(inst->src[0]) && IS_DWORD(inst->src[1]);
+   if ((devinfo->gen == 8 || devinfo->is_broxton) && dword_mul)
+      return true;
+
+   /* 64-bit destination or sources: checked on Gen7 through Gen8. */
+   const bool any_64bit = IS_64BIT(inst->dst) || IS_64BIT(inst->src[0]) ||
+                          IS_64BIT(inst->src[1]) || IS_64BIT(inst->src[2]);
+   if (devinfo->gen >= 7 && devinfo->gen <= 8 && any_64bit)
+      return true;
+
+#undef IS_64BIT
+#undef IS_DWORD
+
+   /* F32TO16 is likewise never given dependency control on Gen8+. */
+   if (devinfo->gen >= 8 && inst->opcode == BRW_OPCODE_F32TO16)
+      return true;
+
+   /*
+    * mlen:
+    * In the presence of send messages, totally interrupt dependency
+    * control. They're long enough that the chance of dependency
+    * control around them just doesn't matter.
+    *
+    * predicate:
+    * From the Ivy Bridge PRM, volume 4 part 3.7, page 80:
+    * When a sequence of NoDDChk and NoDDClr are used, the last instruction that
+    * completes the scoreboard clear must have a non-zero execution mask. This
+    * means, if any kind of predication can change the execution mask or channel
+    * enable of the last instruction, the optimization must be avoided. This is
+    * to avoid instructions being shot down the pipeline when no writes are
+    * required.
+    *
+    * math:
+    * Dependency control does not work well over math instructions.
+    * NB: Discovered empirically
+    */
+   const bool unsafe = inst->mlen || inst->predicate || inst->is_math();
+   return unsafe;
+}
+
+/**
+ * Sets the dependency control fields on instructions after register
+ * allocation and before the generator is run.
+ *
+ * When you have a sequence of instructions like:
+ *
+ * DP4 temp.x vertex uniform[0]
+ * DP4 temp.y vertex uniform[0]
+ * DP4 temp.z vertex uniform[0]
+ * DP4 temp.w vertex uniform[0]
+ *
+ * The hardware doesn't know that it can actually run the later instructions
+ * while the previous ones are in flight, producing stalls.  However, we have
+ * manual fields we can set in the instructions that let it do so.
+ */
+void
+vec4_visitor::opt_set_dependency_control()
+{
+   vec4_instruction *last_grf_write[BRW_MAX_GRF];
+   uint8_t grf_channels_written[BRW_MAX_GRF];   /* read only if entry set */
+   vec4_instruction *last_mrf_write[BRW_MAX_GRF];
+   uint8_t mrf_channels_written[BRW_MAX_GRF];   /* read only if entry set */
+
+   assert(prog_data->total_grf ||
+          !"Must be called after register allocation");
+
+   foreach_block (block, cfg) {
+      memset(last_grf_write, 0, sizeof(last_grf_write));
+      memset(last_mrf_write, 0, sizeof(last_mrf_write));
+
+      foreach_inst_in_block (vec4_instruction, inst, block) {
+         /* If we read from a register that we were doing dependency control
+          * on, don't do dependency control across the read.
+          */
+         for (int i = 0; i < 3; i++) {
+            int reg = inst->src[i].nr + inst->src[i].offset / REG_SIZE;
+            if (inst->src[i].file == VGRF) {
+               last_grf_write[reg] = NULL;
+            } else if (inst->src[i].file == FIXED_GRF) {
+               memset(last_grf_write, 0, sizeof(last_grf_write));
+               break;
+            }
+            assert(inst->src[i].file != MRF);
+         }
+
+         if (is_dep_ctrl_unsafe(inst)) {   /* forget all tracked writes */
+            memset(last_grf_write, 0, sizeof(last_grf_write));
+            memset(last_mrf_write, 0, sizeof(last_mrf_write));
+            continue;
+         }
+
+         /* Now, see if we can do dependency control for this instruction
+          * against a previous one writing to its destination.
+          */
+         int reg = inst->dst.nr + inst->dst.offset / REG_SIZE;
+         if (inst->dst.file == VGRF || inst->dst.file == FIXED_GRF) {
+            if (last_grf_write[reg] &&
+                last_grf_write[reg]->dst.offset == inst->dst.offset &&
+                !(inst->dst.writemask & grf_channels_written[reg])) {
+               last_grf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               grf_channels_written[reg] = 0;   /* start a new chain */
+            }
+
+            last_grf_write[reg] = inst;
+            grf_channels_written[reg] |= inst->dst.writemask;
+         } else if (inst->dst.file == MRF) {
+            if (last_mrf_write[reg] &&
+                last_mrf_write[reg]->dst.offset == inst->dst.offset &&
+                !(inst->dst.writemask & mrf_channels_written[reg])) {
+               last_mrf_write[reg]->no_dd_clear = true;
+               inst->no_dd_check = true;
+            } else {
+               mrf_channels_written[reg] = 0;   /* start a new chain */
+            }
+
+            last_mrf_write[reg] = inst;
+            mrf_channels_written[reg] |= inst->dst.writemask;
+         }
+      }
+   }
+}
+
+/**
+ * Report whether this instruction may be rewritten by reswizzle().
+ *
+ * \param devinfo        device description; the generation number is read
+ *                       here and the pointer is handed to can_do_writemask().
+ * \param dst_writemask  writemask the rewritten instruction would end up with.
+ * \param swizzle        swizzle that would be applied to the instruction.
+ * \param swizzle_mask   channels that are referenced by \p swizzle.
+ *
+ * \return false as soon as one of the checks below fails, true otherwise.
+ */
+bool
+vec4_instruction::can_reswizzle(const struct gen_device_info *devinfo,
+                                int dst_writemask,
+                                int swizzle,
+                                int swizzle_mask)
+{
+   /* Gen6 MATH instructions can not execute in align16 mode, so only the
+    * identity swizzle is acceptable for them.
+    */
+   const bool gen6_math = devinfo->gen == 6 && is_math();
+   if (gen6_math && swizzle != BRW_SWIZZLE_XYZW)
+      return false;
+
+   /* An instruction that cannot take a writemask can only keep writing all
+    * four channels.
+    */
+   const bool writemask_supported = can_do_writemask(devinfo);
+   if (!writemask_supported && dst_writemask != WRITEMASK_XYZW)
+      return false;
+
+   /* Channels written by this instruction that the swizzle never references
+    * would be lost by the rewrite, and instructions with a non-zero message
+    * length are not handled at all.
+    */
+   const int unreferenced_channels = dst.writemask & ~swizzle_mask;
+   if (unreferenced_channels != 0 || mlen > 0)
+      return false;
+
+   /* Accumulator sources are not reswizzled. */
+   for (unsigned s = 0; s < 3; s++) {
+      if (src[s].is_accumulator())
+         return false;
+   }
+
+   return true;
+}
+
+/**
+ * Rewrite this instruction so that it produces its result directly in the
+ * channels selected by \p swizzle, restricted to \p dst_writemask.
+ *
+ * For most opcodes every register source has \p swizzle composed onto its
+ * existing swizzle.  The dot-product opcodes and VEC4_OPCODE_PACK_BYTES keep
+ * their source swizzles because their destination writemask does not
+ * correspond to the source channels.  In all cases the destination writemask
+ * is remapped through \p swizzle and then masked with \p dst_writemask.
+ *
+ * Callers are expected to have checked can_reswizzle() first.
+ */
+void
+vec4_instruction::reswizzle(int dst_writemask, int swizzle)
+{
+   switch (opcode) {
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP2:
+   case VEC4_OPCODE_PACK_BYTES:
+      /* Destination write mask doesn't correspond to source swizzle for
+       * these, so the sources are left alone.
+       */
+      break;
+
+   default:
+      for (unsigned s = 0; s < 3; s++) {
+         /* Unused and immediate sources carry no swizzle to compose. */
+         if (src[s].file != BAD_FILE && src[s].file != IMM)
+            src[s].swizzle = brw_compose_swizzle(swizzle, src[s].swizzle);
+      }
+      break;
+   }
+
+   /* Move the set of written components through the swizzle, then clip it
+    * to the writemask requested by the caller.
+    */
+   const unsigned swizzled_mask =
+      brw_apply_swizzle_to_mask(swizzle, dst.writemask);
+   dst.writemask = dst_writemask & swizzled_mask;
+}
+
+/*
+ * Tries to reduce extra MOV instructions by taking temporary GRFs that get
+ * just written and then MOVed into another reg and making the original write
+ * of the GRF write directly to the final destination instead.
+ *
+ * Only unpredicated MOVs whose source is a VGRF of the same type as the
+ * destination (with no abs, negate or reladdr on the source) and whose
+ * destination is a VGRF or an MRF are considered.  A MOV that copies a
+ * register onto itself with an identity swizzle on every written channel is
+ * simply removed.
+ *
+ * Returns true if any MOV was removed.
+ */
+bool
+vec4_visitor::opt_register_coalesce()
+{
+   bool progress = false;
+   int next_ip = 0;
+
+   calculate_live_intervals();
+
+   foreach_block_and_inst_safe (block, vec4_instruction, inst, cfg) {
+      /* Position of this instruction in iteration order.  The counter also
+       * advances for instructions removed earlier in this pass; it is
+       * compared against var_range_end() below.
+       */
+      int ip = next_ip;
+      next_ip++;
+
+      /* Skip everything that is not a plain copy this pass knows how to
+       * coalesce.
+       */
+      if (inst->opcode != BRW_OPCODE_MOV ||
+          (inst->dst.file != VGRF && inst->dst.file != MRF) ||
+	  inst->predicate ||
+	  inst->src[0].file != VGRF ||
+	  inst->dst.type != inst->src[0].type ||
+	  inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
+	 continue;
+
+      /* Remove no-op MOVs */
+      if (inst->dst.file == inst->src[0].file &&
+          inst->dst.nr == inst->src[0].nr &&
+          inst->dst.offset == inst->src[0].offset) {
+         bool is_nop_mov = true;
+
+         /* Every channel that is actually written must read the same
+          * channel of the source for the MOV to be a no-op.
+          */
+         for (unsigned c = 0; c < 4; c++) {
+            if ((inst->dst.writemask & (1 << c)) == 0)
+               continue;
+
+            if (BRW_GET_SWZ(inst->src[0].swizzle, c) != c) {
+               is_nop_mov = false;
+               break;
+            }
+         }
+
+         if (is_nop_mov) {
+            inst->remove(block);
+            progress = true;
+            continue;
+         }
+      }
+
+      bool to_mrf = (inst->dst.file == MRF);
+
+      /* Can't coalesce this GRF if someone else was going to
+       * read it later.
+       */
+      if (var_range_end(var_from_reg(alloc, dst_reg(inst->src[0])), 8) > ip)
+	 continue;
+
+      /* We need to check interference with the final destination between this
+       * instruction and the earliest instruction involved in writing the GRF
+       * we're eliminating.  To do that, keep track of which of our source
+       * channels we've seen initialized.
+       */
+      const unsigned chans_needed =
+         brw_apply_inv_swizzle_to_mask(inst->src[0].swizzle,
+                                       inst->dst.writemask);
+      unsigned chans_remaining = chans_needed;
+
+      /* Now walk up the instruction stream trying to see if we can rewrite
+       * everything writing to the temporary to write into the destination
+       * instead.
+       *
+       * _scan_inst remembers the last instruction visited by the backward
+       * scan so that the rewrite loop further down knows where to start.
+       */
+      vec4_instruction *_scan_inst = (vec4_instruction *)inst->prev;
+      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst,
+                                                  inst) {
+         _scan_inst = scan_inst;
+
+         if (regions_overlap(inst->src[0], inst->size_read(0),
+                             scan_inst->dst, scan_inst->size_written)) {
+            /* Found something writing to the reg we want to coalesce away. */
+            if (to_mrf) {
+               /* SEND instructions can't have MRF as a destination. */
+               if (scan_inst->mlen)
+                  break;
+
+               if (devinfo->gen == 6) {
+                  /* gen6 math instructions must have the destination be
+                   * VGRF, so no compute-to-MRF for them.
+                   */
+                  if (scan_inst->is_math()) {
+                     break;
+                  }
+               }
+            }
+
+            /* This doesn't handle saturation on the instruction we
+             * want to coalesce away if the register types do not match.
+             * But if scan_inst is a non type-converting 'mov', we can fix
+             * the types later.
+             */
+            if (inst->saturate &&
+                inst->dst.type != scan_inst->dst.type &&
+                !(scan_inst->opcode == BRW_OPCODE_MOV &&
+                  scan_inst->dst.type == scan_inst->src[0].type))
+               break;
+
+            /* Only allow coalescing between registers of the same type size.
+             * Otherwise we would need to make the pass aware of the fact that
+             * channel sizes are different for single and double precision.
+             */
+            if (type_sz(inst->src[0].type) != type_sz(scan_inst->src[0].type))
+               break;
+
+            /* Check that scan_inst writes the same amount of data as the
+             * instruction, otherwise coalescing would lead to writing a
+             * different (larger or smaller) region of the destination
+             */
+            if (scan_inst->size_written != inst->size_written)
+               break;
+
+            /* If we can't handle the swizzle, bail. */
+            if (!scan_inst->can_reswizzle(devinfo, inst->dst.writemask,
+                                          inst->src[0].swizzle,
+                                          chans_needed)) {
+               break;
+            }
+
+            /* This only handles coalescing writes of 8 channels (1 register
+             * for single-precision and 2 registers for double-precision)
+             * starting at the source offset of the copy instruction.
+             */
+            if (DIV_ROUND_UP(scan_inst->size_written,
+                             type_sz(scan_inst->dst.type)) > 8 ||
+                scan_inst->dst.offset != inst->src[0].offset)
+               break;
+
+	    /* Mark which channels we found unconditional writes for. */
+	    if (!scan_inst->predicate)
+               chans_remaining &= ~scan_inst->dst.writemask;
+
+	    /* All needed channels are accounted for: the scan succeeded. */
+	    if (chans_remaining == 0)
+	       break;
+	 }
+
+         /* You can't read from an MRF, so if someone else reads our MRF's
+          * source GRF that we wanted to rewrite, that stops us.  If it's a
+          * GRF we're trying to coalesce to, we don't actually handle
+          * rewriting sources so bail in that case as well.
+          */
+	 bool interfered = false;
+	 for (int i = 0; i < 3; i++) {
+            if (regions_overlap(inst->src[0], inst->size_read(0),
+                                scan_inst->src[i], scan_inst->size_read(i)))
+	       interfered = true;
+	 }
+	 if (interfered)
+	    break;
+
+         /* If somebody else writes the same channels of our destination here,
+          * we can't coalesce before that.
+          */
+         if (regions_overlap(inst->dst, inst->size_written,
+                             scan_inst->dst, scan_inst->size_written) &&
+             (inst->dst.writemask & scan_inst->dst.writemask) != 0) {
+            break;
+         }
+
+         /* Check for reads of the register we're trying to coalesce into.  We
+          * can't go rewriting instructions above that to put some other value
+          * in the register instead.
+          */
+         if (to_mrf && scan_inst->mlen > 0) {
+            if (inst->dst.nr >= scan_inst->base_mrf &&
+                inst->dst.nr < scan_inst->base_mrf + scan_inst->mlen) {
+               break;
+            }
+         } else {
+            for (int i = 0; i < 3; i++) {
+               if (regions_overlap(inst->dst, inst->size_written,
+                                   scan_inst->src[i], scan_inst->size_read(i)))
+                  interfered = true;
+            }
+            if (interfered)
+               break;
+         }
+      }
+
+      /* Any early exit from the scan above leaves chans_remaining non-zero,
+       * in which case nothing is rewritten and the MOV stays in place.
+       */
+      if (chans_remaining == 0) {
+	 /* If we've made it here, we have an MOV we want to coalesce out, and
+	  * a scan_inst pointing to the earliest instruction involved in
+	  * computing the value.  Now go rewrite the instruction stream
+	  * between the two.
+	  */
+         vec4_instruction *scan_inst = _scan_inst;
+	 while (scan_inst != inst) {
+	    if (scan_inst->dst.file == VGRF &&
+                scan_inst->dst.nr == inst->src[0].nr &&
+		scan_inst->dst.offset == inst->src[0].offset) {
+               scan_inst->reswizzle(inst->dst.writemask,
+                                    inst->src[0].swizzle);
+	       scan_inst->dst.file = inst->dst.file;
+               scan_inst->dst.nr = inst->dst.nr;
+	       scan_inst->dst.offset = inst->dst.offset;
+               if (inst->saturate &&
+                   inst->dst.type != scan_inst->dst.type) {
+                  /* If we have reached this point, scan_inst is a non
+                   * type-converting 'mov' and we can modify its register types
+                   * to match the ones in inst. Otherwise, we could have an
+                   * incorrect saturation result.
+                   */
+                  scan_inst->dst.type = inst->dst.type;
+                  scan_inst->src[0].type = inst->src[0].type;
+               }
+	       scan_inst->saturate |= inst->saturate;
+	    }
+	    scan_inst = (vec4_instruction *)scan_inst->next;
+	 }
+	 inst->remove(block);
+	 progress = true;
+      }
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/**
+ * Replace every FIND_LIVE_CHANNEL that sits outside of all IF/ENDIF and
+ * DO/WHILE nesting with a NoMask MOV of the immediate 0.  Some form of
+ * divergence analysis could probably catch more cases.
+ *
+ * \return true if at least one instruction was replaced.
+ */
+bool
+vec4_visitor::eliminate_find_live_channel()
+{
+   /* The optimization below assumes that channel zero is live on thread
+    * dispatch, which may not be the case if the fixed function dispatches
+    * threads sparsely.
+    */
+   if (!brw_stage_has_packed_dispatch(devinfo, stage, stage_prog_data))
+      return false;
+
+   bool replaced_any = false;
+   unsigned nesting = 0;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      if (inst->opcode == BRW_OPCODE_IF ||
+          inst->opcode == BRW_OPCODE_DO) {
+         /* Entering a conditional or a loop. */
+         nesting++;
+      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
+                 inst->opcode == BRW_OPCODE_WHILE) {
+         /* Leaving a conditional or a loop. */
+         nesting--;
+      } else if (inst->opcode == SHADER_OPCODE_FIND_LIVE_CHANNEL &&
+                 nesting == 0) {
+         /* Outside control flow the answer is taken to be channel zero. */
+         inst->opcode = BRW_OPCODE_MOV;
+         inst->src[0] = brw_imm_d(0);
+         inst->force_writemask_all = true;
+         replaced_any = true;
+      }
+   }
+
+   return replaced_any;
+}
+
+/**
+ * Splits virtual GRFs requesting more than one contiguous physical register.
+ *
+ * We initially create large virtual GRFs for temporary structures, arrays,
+ * and matrices, so that the visitor functions can add offsets to work their
+ * way down to the actual member being accessed.  But when it comes to
+ * optimization, we'd like to treat each register as individual storage if
+ * possible.
+ *
+ * A virtual GRF is left alone when any instruction writes or reads more than
+ * one register of it at once.  For a split register, the first register stays
+ * in the original virtual GRF and each following one moves to a freshly
+ * allocated, consecutively numbered single-register virtual GRF.
+ */
+void
+vec4_visitor::split_virtual_grfs()
+{
+   int vgrf_count = this->alloc.count;
+   int first_new_nr[vgrf_count];
+   bool can_split[vgrf_count];
+
+   memset(first_new_nr, 0, sizeof(first_new_nr));
+
+   /* Start by treating every virtual GRF that is not exactly one register
+    * in size as a candidate.
+    */
+   for (int v = 0; v < vgrf_count; v++)
+      can_split[v] = (this->alloc.sizes[v] != 1);
+
+   /* Drop the candidates that some instruction accesses more than one
+    * register at a time.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF && regs_written(inst) > 1)
+         can_split[inst->dst.nr] = false;
+
+      for (int s = 0; s < 3; s++) {
+         if (inst->src[s].file == VGRF && regs_read(inst, s) > 1)
+            can_split[inst->src[s].nr] = false;
+      }
+   }
+
+   /* Allocate the replacement registers.  The new virtual numbers of one
+    * split register are expected to be contiguous.
+    */
+   for (int v = 0; v < vgrf_count; v++) {
+      if (can_split[v]) {
+         first_new_nr[v] = alloc.allocate(1);
+         for (unsigned k = 2; k < this->alloc.sizes[v]; k++) {
+            unsigned nr = alloc.allocate(1);
+            assert(nr == first_new_nr[v] + k - 1);
+            (void) nr;
+         }
+         this->alloc.sizes[v] = 1;
+      }
+   }
+
+   /* Redirect every access past the first register of a split virtual GRF
+    * to the matching new register, keeping only the sub-register part of
+    * the offset.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF && split_reg_index_is_nonzero_placeholder)
+         ;
+   }
+   invalidate_live_intervals();
+}
+
+/**
+ * Print \p be_inst to stderr using the FILE-taking overload.
+ */
+void
+vec4_visitor::dump_instruction(backend_instruction *be_inst)
+{
+   FILE *const out = stderr;
+   dump_instruction(be_inst, out);
+}
+
+/**
+ * Print a one-line textual form of \p be_inst to \p file.
+ *
+ * The line consists of, in order: the predicate (if any), the opcode name
+ * with the execution size, the saturate and conditional modifiers, the
+ * destination, the sources that are in use, and the NoMask / group
+ * annotations, followed by a newline.
+ */
+void
+vec4_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
+{
+   vec4_instruction *inst = (vec4_instruction *)be_inst;
+
+   /* Predicate: sign shows inversion, followed by the flag subregister and
+    * the align16 predicate control string.
+    */
+   if (inst->predicate) {
+      fprintf(file, "(%cf0.%d%s) ",
+              inst->predicate_inverse ? '-' : '+',
+              inst->flag_subreg,
+              pred_ctrl_align16[inst->predicate]);
+   }
+
+   fprintf(file, "%s(%d)", brw_instruction_name(devinfo, inst->opcode),
+           inst->exec_size);
+   if (inst->saturate)
+      fprintf(file, ".sat");
+   if (inst->conditional_mod) {
+      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
+      /* The flag register is only spelled out for unpredicated
+       * instructions, and on gen5+ not for SEL, IF and WHILE.
+       */
+      if (!inst->predicate &&
+          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
+                                inst->opcode != BRW_OPCODE_IF &&
+                                inst->opcode != BRW_OPCODE_WHILE))) {
+         fprintf(file, ".f0.%d", inst->flag_subreg);
+      }
+   }
+   fprintf(file, " ");
+
+   /* Destination register name. */
+   switch (inst->dst.file) {
+   case VGRF:
+      fprintf(file, "vgrf%d", inst->dst.nr);
+      break;
+   case FIXED_GRF:
+      fprintf(file, "g%d", inst->dst.nr);
+      break;
+   case MRF:
+      fprintf(file, "m%d", inst->dst.nr);
+      break;
+   case ARF:
+      switch (inst->dst.nr) {
+      case BRW_ARF_NULL:
+         fprintf(file, "null");
+         break;
+      case BRW_ARF_ADDRESS:
+         fprintf(file, "a0.%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_ACCUMULATOR:
+         fprintf(file, "acc%d", inst->dst.subnr);
+         break;
+      case BRW_ARF_FLAG:
+         fprintf(file, "f%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      default:
+         fprintf(file, "arf%d.%d", inst->dst.nr & 0xf, inst->dst.subnr);
+         break;
+      }
+      break;
+   case BAD_FILE:
+      fprintf(file, "(null)");
+      break;
+   case IMM:
+   case ATTR:
+   case UNIFORM:
+      unreachable("not reached");
+   }
+   /* Print "+register.byte" when the destination has an offset, or when a
+    * VGRF destination is not written in its entirety.
+    */
+   if (inst->dst.offset ||
+       (inst->dst.file == VGRF &&
+        alloc.sizes[inst->dst.nr] * REG_SIZE != inst->size_written)) {
+      const unsigned reg_size = (inst->dst.file == UNIFORM ? 16 : REG_SIZE);
+      fprintf(file, "+%d.%d", inst->dst.offset / reg_size,
+              inst->dst.offset % reg_size);
+   }
+   /* A full XYZW writemask is left implicit. */
+   if (inst->dst.writemask != WRITEMASK_XYZW) {
+      fprintf(file, ".");
+      if (inst->dst.writemask & 1)
+         fprintf(file, "x");
+      if (inst->dst.writemask & 2)
+         fprintf(file, "y");
+      if (inst->dst.writemask & 4)
+         fprintf(file, "z");
+      if (inst->dst.writemask & 8)
+         fprintf(file, "w");
+   }
+   fprintf(file, ":%s", brw_reg_type_letters(inst->dst.type));
+
+   if (inst->src[0].file != BAD_FILE)
+      fprintf(file, ", ");
+
+   /* Sources: printing stops at the first unused (BAD_FILE) source. */
+   for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
+      if (inst->src[i].negate)
+         fprintf(file, "-");
+      if (inst->src[i].abs)
+         fprintf(file, "|");
+      switch (inst->src[i].file) {
+      case VGRF:
+         fprintf(file, "vgrf%d", inst->src[i].nr);
+         break;
+      case FIXED_GRF:
+         fprintf(file, "g%d.%d", inst->src[i].nr, inst->src[i].subnr);
+         break;
+      case ATTR:
+         fprintf(file, "attr%d", inst->src[i].nr);
+         break;
+      case UNIFORM:
+         fprintf(file, "u%d", inst->src[i].nr);
+         break;
+      case IMM:
+         /* Immediates are printed by value with a type suffix. */
+         switch (inst->src[i].type) {
+         case BRW_REGISTER_TYPE_F:
+            fprintf(file, "%fF", inst->src[i].f);
+            break;
+         case BRW_REGISTER_TYPE_DF:
+            fprintf(file, "%fDF", inst->src[i].df);
+            break;
+         case BRW_REGISTER_TYPE_D:
+            fprintf(file, "%dD", inst->src[i].d);
+            break;
+         case BRW_REGISTER_TYPE_UD:
+            fprintf(file, "%uU", inst->src[i].ud);
+            break;
+         case BRW_REGISTER_TYPE_VF:
+            /* Four packed 8-bit values, each converted with
+             * brw_vf_to_float().
+             */
+            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
+                    brw_vf_to_float((inst->src[i].ud >>  0) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >>  8) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 16) & 0xff),
+                    brw_vf_to_float((inst->src[i].ud >> 24) & 0xff));
+            break;
+         default:
+            fprintf(file, "???");
+            break;
+         }
+         break;
+      case ARF:
+         switch (inst->src[i].nr) {
+         case BRW_ARF_NULL:
+            fprintf(file, "null");
+            break;
+         case BRW_ARF_ADDRESS:
+            fprintf(file, "a0.%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_ACCUMULATOR:
+            fprintf(file, "acc%d", inst->src[i].subnr);
+            break;
+         case BRW_ARF_FLAG:
+            fprintf(file, "f%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         default:
+            fprintf(file, "arf%d.%d", inst->src[i].nr & 0xf, inst->src[i].subnr);
+            break;
+         }
+         break;
+      case BAD_FILE:
+         fprintf(file, "(null)");
+         break;
+      case MRF:
+         unreachable("not reached");
+      }
+
+      /* Same "+register.byte" suffix as for the destination; uniforms use
+       * a 16-byte unit instead of REG_SIZE.
+       */
+      if (inst->src[i].offset ||
+          (inst->src[i].file == VGRF &&
+           alloc.sizes[inst->src[i].nr] * REG_SIZE != inst->size_read(i))) {
+         const unsigned reg_size = (inst->src[i].file == UNIFORM ? 16 : REG_SIZE);
+         fprintf(file, "+%d.%d", inst->src[i].offset / reg_size,
+                 inst->src[i].offset % reg_size);
+      }
+
+      /* Register sources always show all four swizzle components. */
+      if (inst->src[i].file != IMM) {
+         static const char *chans[4] = {"x", "y", "z", "w"};
+         fprintf(file, ".");
+         for (int c = 0; c < 4; c++) {
+            fprintf(file, "%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
+         }
+      }
+
+      /* Closing bar of the |abs| notation opened above. */
+      if (inst->src[i].abs)
+         fprintf(file, "|");
+
+      if (inst->src[i].file != IMM) {
+         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
+      }
+
+      if (i < 2 && inst->src[i + 1].file != BAD_FILE)
+         fprintf(file, ", ");
+   }
+
+   if (inst->force_writemask_all)
+      fprintf(file, " NoMask");
+
+   /* The channel group is only shown for execution sizes other than 8. */
+   if (inst->exec_size != 8)
+      fprintf(file, " group%d", inst->group);
+
+   fprintf(file, "\n");
+}
+
+
+/**
+ * Build the hardware GRF region for attribute register \p attr.
+ *
+ * \param attr         register index of the attribute in the payload.
+ * \param type         register type to give the result.
+ * \param interleaved  when true, two attributes share one register:
+ *                     attribute \p attr lives in register attr / 2 at
+ *                     sub-register (attr % 2) * 4, and the region gets a
+ *                     vertical stride of 0.
+ */
+static inline struct brw_reg
+attribute_to_hw_reg(int attr, brw_reg_type type, bool interleaved)
+{
+   /* Half a register's worth of channels, each at least 4 bytes wide. */
+   const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(type));
+
+   struct brw_reg hw_reg = interleaved ?
+      stride(brw_vecn_grf(width, attr / 2, (attr % 2) * 4), 0, width, 1) :
+      brw_vecn_grf(width, attr, 0);
+
+   hw_reg.type = type;
+   return hw_reg;
+}
+
+
+/**
+ * Replace each source register of file ATTR in the program's CFG with a
+ * reference to a fixed HW register.
+ *
+ * \p attribute_map is indexed by attribute register number (plus the whole
+ * register part of the source offset) and yields the payload register that
+ * was assigned to it; an entry of 0 means "not assigned".
+ *
+ * If interleaved is true, then each attribute takes up half a register, with
+ * register N containing attribute 2*N in its first half and attribute 2*N+1
+ * in its second half (this corresponds to the payload setup used by geometry
+ * shaders in "single" or "dual instanced" dispatch mode).  If interleaved is
+ * false, then each attribute takes up a whole register, with register N
+ * containing attribute N (this corresponds to the payload setup used by
+ * vertex shaders, and by geometry shaders in "dual object" dispatch mode).
+ */
+void
+vec4_visitor::lower_attributes_to_hw_regs(const int *attribute_map,
+                                          bool interleaved)
+{
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int s = 0; s < 3; s++) {
+         if (inst->src[s].file == ATTR) {
+            const int payload_grf =
+               attribute_map[inst->src[s].nr +
+                             inst->src[s].offset / REG_SIZE];
+
+            /* Only whole-register offsets are supported here. */
+            assert(inst->src[s].offset % REG_SIZE == 0);
+
+            /* All attributes used in the shader need to have been assigned
+             * a hardware register by the caller
+             */
+            assert(payload_grf != 0);
+
+            struct brw_reg hw_reg =
+               attribute_to_hw_reg(payload_grf, inst->src[s].type,
+                                   interleaved);
+
+            /* Carry the swizzle and the source modifiers over before the
+             * ATTR source is overwritten.
+             */
+            hw_reg.swizzle = inst->src[s].swizzle;
+            if (inst->src[s].abs)
+               hw_reg = brw_abs(hw_reg);
+            if (inst->src[s].negate)
+               hw_reg = negate(hw_reg);
+
+            inst->src[s] = hw_reg;
+         }
+      }
+   }
+}
+
+/**
+ * Assign consecutive payload registers, starting at \p payload_reg, to the
+ * vertex shader inputs and lower all ATTR sources to those registers.
+ *
+ * \return \p payload_reg plus vs_prog_data->nr_attribute_slots.
+ */
+int
+vec4_vs_visitor::setup_attributes(int payload_reg)
+{
+   int attribute_map[VERT_ATTRIB_MAX + 2];
+   memset(attribute_map, 0, sizeof(attribute_map));
+
+   int nr_attributes = 0;
+
+   /* Walk the set bits of inputs_read from the lowest one up.  An input
+    * flagged in double_inputs_read takes two consecutive slots, and both
+    * bits are cleared from the working mask.
+    */
+   GLbitfield64 pending_inputs = vs_prog_data->inputs_read;
+   while (pending_inputs) {
+      GLuint first = ffsll(pending_inputs) - 1;
+
+      int needed_slots = 1;
+      if (vs_prog_data->double_inputs_read & BITFIELD64_BIT(first))
+         needed_slots = 2;
+
+      for (int c = 0; c < needed_slots; c++) {
+         attribute_map[first + c] = payload_reg + nr_attributes++;
+         pending_inputs &= ~BITFIELD64_BIT(first + c);
+      }
+   }
+
+   /* VertexID is stored by the VF as the last vertex element, but we
+    * don't represent it with a flag in inputs_read, so we call it
+    * VERT_ATTRIB_MAX.
+    */
+   const bool uses_vertex_element =
+      vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid ||
+      vs_prog_data->uses_basevertex || vs_prog_data->uses_baseinstance;
+   if (uses_vertex_element)
+      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes++;
+
+   /* The draw ID gets the slot after that one. */
+   if (vs_prog_data->uses_drawid)
+      attribute_map[VERT_ATTRIB_MAX + 1] = payload_reg + nr_attributes++;
+
+   lower_attributes_to_hw_regs(attribute_map, false /* interleaved */);
+
+   return payload_reg + vs_prog_data->nr_attribute_slots;
+}
+
+/**
+ * Lay out the push constants starting at register \p reg.
+ *
+ * Records the starting register in dispatch_grf_start_reg, the parameter
+ * count in nr_params and the number of registers used in curb_read_length.
+ *
+ * \return the first register after the push constants.
+ */
+int
+vec4_visitor::setup_uniforms(int reg)
+{
+   prog_data->base.dispatch_grf_start_reg = reg;
+
+   const bool needs_dummy_constants =
+      devinfo->gen < 6 && this->uniforms == 0;
+
+   if (!needs_dummy_constants) {
+      /* Two uniform slots fit in each register. */
+      reg += ALIGN(uniforms, 2) / 2;
+   } else {
+      /* The pre-gen6 VS requires that some push constants get loaded no
+       * matter what, or the GPU would hang.  Provide one vec4 of zeros.
+       */
+      static gl_constant_value zero = { 0.0 };
+
+      stage_prog_data->param =
+         reralloc(NULL, stage_prog_data->param, const gl_constant_value *, 4);
+      for (unsigned int comp = 0; comp < 4; comp++)
+         stage_prog_data->param[this->uniforms * 4 + comp] = &zero;
+
+      this->uniforms++;
+      reg++;
+   }
+
+   stage_prog_data->nr_params = this->uniforms * 4;
+
+   prog_data->base.curb_read_length =
+      reg - prog_data->base.dispatch_grf_start_reg;
+
+   return reg;
+}
+
+/**
+ * Lay out the thread payload: g0, then the push constants, then the vertex
+ * attributes.  The first register after the payload is recorded in
+ * first_non_payload_grf.
+ */
+void
+vec4_vs_visitor::setup_payload(void)
+{
+   /* The payload always contains important data in g0, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.  So, we always start push constants at g1.
+    */
+   const int first_push_constant_reg = 1;
+
+   const int first_attribute_reg = setup_uniforms(first_push_constant_reg);
+   const int first_free_reg = setup_attributes(first_attribute_reg);
+
+   this->first_non_payload_grf = first_free_reg;
+}
+
+/**
+ * Lower every unpredicated SEL into a CMP that carries the SEL's
+ * conditional modifier, followed by the same SEL predicated on the result.
+ * Asserts that the device generation is below 6.
+ *
+ * \return true if any SEL was lowered.
+ */
+bool
+vec4_visitor::lower_minmax()
+{
+   assert(devinfo->gen < 6);
+
+   bool lowered_any = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const vec4_builder ibld(this, block, inst);
+
+      const bool is_unpredicated_sel =
+         inst->opcode == BRW_OPCODE_SEL &&
+         inst->predicate == BRW_PREDICATE_NONE;
+      if (!is_unpredicated_sel)
+         continue;
+
+      /* FIXME: Using CMP doesn't preserve the NaN propagation semantics of
+       *        the original SEL.L/GE instruction
+       */
+      ibld.CMP(ibld.null_reg_d(), inst->src[0], inst->src[1],
+               inst->conditional_mod);
+
+      /* The SEL now selects on the flag produced by the CMP. */
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_NONE;
+
+      lowered_any = true;
+   }
+
+   if (lowered_any)
+      invalidate_live_intervals();
+
+   return lowered_any;
+}
+
+/**
+ * Emit a NoMask MOV that copies the timestamp architecture register into a
+ * new uvec4 temporary and return that temporary as a source.  Asserts that
+ * the device generation is at least 7.
+ */
+src_reg
+vec4_visitor::get_timestamp()
+{
+   assert(devinfo->gen >= 7);
+
+   const struct brw_reg timestamp_arf =
+      brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
+              BRW_ARF_TIMESTAMP,
+              0,
+              0,
+              0,
+              BRW_REGISTER_TYPE_UD,
+              BRW_VERTICAL_STRIDE_0,
+              BRW_WIDTH_4,
+              BRW_HORIZONTAL_STRIDE_4,
+              BRW_SWIZZLE_XYZW,
+              WRITEMASK_XYZW);
+   src_reg ts = src_reg(timestamp_arf);
+
+   dst_reg result = dst_reg(this, glsl_type::uvec4_type);
+
+   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
+    * even if it's not enabled in the dispatch.
+    */
+   emit(MOV(result, ts))->force_writemask_all = true;
+
+   return src_reg(result);
+}
+
+/**
+ * Read the timestamp at the start of the shader and keep it in
+ * shader_start_time for emit_shader_time_end().
+ */
+void
+vec4_visitor::emit_shader_time_begin()
+{
+   this->current_annotation = "shader time start";
+   this->shader_start_time = get_timestamp();
+}
+
+/**
+ * Read the timestamp again and emit the code that records the elapsed time.
+ *
+ * When bit 0 of the timestamp's Z component is clear, the difference to
+ * shader_start_time (minus 2) is written to sub-index 0 and the value 1 to
+ * sub-index 1; otherwise the value 1 is written to sub-index 2.  Note that
+ * this sets the negate flag on the shader_start_time member.
+ */
+void
+vec4_visitor::emit_shader_time_end()
+{
+   current_annotation = "shader time end";
+   src_reg end_time = get_timestamp();
+
+   /* Check that there weren't any timestamp reset events (assuming these
+    * were the only two timestamp reads that happened).
+    */
+   src_reg reset_flag = end_time;
+   reset_flag.swizzle = BRW_SWIZZLE_ZZZZ;
+   emit(AND(dst_null_ud(), reset_flag, brw_imm_ud(1u)))->conditional_mod =
+      BRW_CONDITIONAL_Z;
+
+   emit(IF(BRW_PREDICATE_NORMAL));
+   {
+      /* Take the current timestamp and get the delta. */
+      shader_start_time.negate = true;
+      dst_reg elapsed = dst_reg(this, glsl_type::uint_type);
+      emit(ADD(elapsed, shader_start_time, end_time));
+
+      /* If there were no instructions between the two timestamp gets, the
+       * diff is 2 cycles.  Remove that overhead, so I can forget about that
+       * when trying to determine the time taken for single instructions.
+       */
+      emit(ADD(elapsed, src_reg(elapsed), brw_imm_ud(-2u)));
+
+      emit_shader_time_write(0, src_reg(elapsed));
+      emit_shader_time_write(1, brw_imm_ud(1u));
+   }
+   emit(BRW_OPCODE_ELSE);
+   {
+      emit_shader_time_write(2, brw_imm_ud(1u));
+   }
+   emit(BRW_OPCODE_ENDIF);
+}
+
+/**
+ * Emit a SHADER_TIME_ADD of \p value for the given sub-index.
+ *
+ * A two-register temporary is built for the instruction: the first register
+ * receives the byte offset
+ * (shader_time_index * 3 + shader_time_subindex) * BRW_SHADER_TIME_STRIDE
+ * and the second one receives \p value.
+ */
+void
+vec4_visitor::emit_shader_time_write(int shader_time_subindex, src_reg value)
+{
+   dst_reg payload =
+      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
+
+   /* First register of the payload: where to accumulate. */
+   dst_reg offset_reg = payload;
+   offset_reg.type = BRW_REGISTER_TYPE_UD;
+
+   /* Second register of the payload: what to add. */
+   dst_reg value_reg = payload;
+   value_reg.offset += REG_SIZE;
+   value_reg.type = BRW_REGISTER_TYPE_UD;
+
+   const int slot = shader_time_index * 3 + shader_time_subindex;
+   emit(MOV(offset_reg, brw_imm_d(slot * BRW_SHADER_TIME_STRIDE)));
+   emit(MOV(value_reg, value));
+
+   vec4_instruction *add =
+      emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(payload));
+   add->mlen = 2;
+}
+
+/**
+ * Rewrite the sources and the destination of every instruction in the CFG
+ * as hardware registers (struct brw_reg).
+ *
+ * ARF, IMM and FIXED_GRF sources of less than 8 bytes per channel are left
+ * untouched.  ATTR sources, MRF sources and IMM/ATTR/UNIFORM destinations
+ * are not expected at this point.
+ */
+void
+vec4_visitor::convert_to_hw_regs()
+{
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) {
+         struct src_reg &src = inst->src[i];
+         struct brw_reg reg;
+         switch (src.file) {
+         case VGRF: {
+            /* Half a register's worth of channels, each at least 4 bytes
+             * wide.
+             */
+            const unsigned type_size = type_sz(src.type);
+            const unsigned width = REG_SIZE / 2 / MAX2(4, type_size);
+            reg = byte_offset(brw_vecn_grf(width, src.nr, 0), src.offset);
+            reg.type = src.type;
+            reg.abs = src.abs;
+            reg.negate = src.negate;
+            break;
+         }
+
+         case UNIFORM: {
+            /* Push constants start at dispatch_grf_start_reg, with two
+             * uniform slots per register; the region is given a vertical
+             * stride of 0.
+             */
+            const unsigned width = REG_SIZE / 2 / MAX2(4, type_sz(src.type));
+            reg = stride(byte_offset(brw_vec4_grf(
+                                        prog_data->base.dispatch_grf_start_reg +
+                                        src.nr / 2, src.nr % 2 * 4),
+                                     src.offset),
+                         0, width, 1);
+            reg.type = src.type;
+            reg.abs = src.abs;
+            reg.negate = src.negate;
+
+            /* This should have been moved to pull constants. */
+            assert(!src.reladdr);
+            break;
+         }
+
+         case FIXED_GRF:
+            /* Sources with 8-byte channels still go through
+             * apply_logical_swizzle() below; everything else is already in
+             * its final form.
+             */
+            if (type_sz(src.type) == 8) {
+               reg = src.as_brw_reg();
+               break;
+            }
+            /* fallthrough */
+         case ARF:
+         case IMM:
+            continue;
+
+         case BAD_FILE:
+            /* Probably unused. */
+            reg = brw_null_reg();
+            break;
+
+         case MRF:
+         case ATTR:
+            unreachable("not reached");
+         }
+
+         /* src aliases inst->src[i], so this assignment replaces the source
+          * in place.
+          */
+         apply_logical_swizzle(&reg, inst, i);
+         src = reg;
+      }
+
+      if (inst->is_3src(devinfo)) {
+         /* 3-src instructions with scalar sources support arbitrary subnr,
+          * but don't actually use swizzles.  Convert swizzle into subnr.
+          * Skip this for double-precision instructions: RepCtrl=1 is not
+          * allowed for them and needs special handling.
+          */
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].vstride == BRW_VERTICAL_STRIDE_0 &&
+                type_sz(inst->src[i].type) < 8) {
+               assert(brw_is_single_value_swizzle(inst->src[i].swizzle));
+               inst->src[i].subnr += 4 * BRW_GET_SWZ(inst->src[i].swizzle, 0);
+            }
+         }
+      }
+
+      /* Now the destination. */
+      dst_reg &dst = inst->dst;
+      struct brw_reg reg;
+
+      switch (inst->dst.file) {
+      case VGRF:
+         reg = byte_offset(brw_vec8_grf(dst.nr, 0), dst.offset);
+         reg.type = dst.type;
+         reg.writemask = dst.writemask;
+         break;
+
+      case MRF:
+         reg = byte_offset(brw_message_reg(dst.nr), dst.offset);
+         /* The register number, ignoring the COMPR4 bit, must be a valid
+          * MRF for this generation.
+          */
+         assert((reg.nr & ~BRW_MRF_COMPR4) < BRW_MAX_MRF(devinfo->gen));
+         reg.type = dst.type;
+         reg.writemask = dst.writemask;
+         break;
+
+      case ARF:
+      case FIXED_GRF:
+         reg = dst.as_brw_reg();
+         break;
+
+      case BAD_FILE:
+         reg = brw_null_reg();
+         break;
+
+      case IMM:
+      case ATTR:
+      case UNIFORM:
+         unreachable("not reached");
+      }
+
+      dst = reg;
+   }
+}
+
+static bool   /* true if `stage` uses interleaved attributes in this mode */
+stage_uses_interleaved_attributes(unsigned stage,
+                                  enum shader_dispatch_mode dispatch_mode)
+{
+   switch (stage) {
+   case MESA_SHADER_TESS_EVAL:
+      return true;   /* always, whatever the dispatch mode */
+   case MESA_SHADER_GEOMETRY:
+      return dispatch_mode != DISPATCH_MODE_4X2_DUAL_OBJECT; /* all but 4x2 */
+   default:
+      return false;  /* every other stage */
+   }
+}
+
+/**
+ * Get the closest native SIMD width supported by the hardware for instruction
+ * \p inst.  The result never exceeds inst->exec_size, and the instruction
+ * will be left untouched by vec4_visitor::lower_simd_width() if the returned
+ * value matches the instruction's original execution size.
+ */
+static unsigned
+get_lowered_simd_width(const struct gen_device_info *devinfo,
+                       enum shader_dispatch_mode dispatch_mode,
+                       unsigned stage, const vec4_instruction *inst)
+{
+   /* Gen4 scratch reads/writes require special handling: never split them */
+   switch (inst->opcode) {
+   case SHADER_OPCODE_GEN4_SCRATCH_READ:
+   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+      return inst->exec_size;
+   default:
+      break;
+   }
+
+   unsigned lowered_width = MIN2(16, inst->exec_size);   /* cap at 16 */
+
+   /* We need to split some cases of double-precision instructions that write
+    * 2 registers. We only need to care about this in gen7 because that is the
+    * only hardware that implements fp64 in Align16.
+    */
+   if (devinfo->gen == 7 && inst->size_written > REG_SIZE) {
+      /* Align16 8-wide double-precision SEL does not work well. Verified
+       * empirically.
+       */
+      if (inst->opcode == BRW_OPCODE_SEL && type_sz(inst->dst.type) == 8)
+         lowered_width = MIN2(lowered_width, 4);
+
+      /* HSW PRM, 3D Media GPGPU Engine, Region Alignment Rules for Direct
+       * Register Addressing:
+       *
+       *    "When destination spans two registers, the source MUST span two
+       *     registers."
+       */
+      for (unsigned i = 0; i < 3; i++) {
+         if (inst->src[i].file == BAD_FILE)
+            continue;
+         if (inst->size_read(i) <= REG_SIZE)   /* src fits in one register */
+            lowered_width = MIN2(lowered_width, 4);
+
+         /* Interleaved attribute setups use a vertical stride of 0, which
+          * makes them hit the associated instruction decompression bug in gen7.
+          * Split them to prevent this.
+          */
+         if (inst->src[i].file == ATTR &&
+             stage_uses_interleaved_attributes(stage, dispatch_mode))
+            lowered_width = MIN2(lowered_width, 4);
+      }
+   }
+
+   return lowered_width;
+}
+
+static bool   /* does the written dst range intersect any same-register src? */
+dst_src_regions_overlap(vec4_instruction *inst)
+{
+   if (inst->size_written == 0)
+      return false;   /* nothing is written, so nothing can overlap */
+
+   unsigned dst_start = inst->dst.offset;
+   unsigned dst_end = dst_start + inst->size_written - 1;   /* inclusive */
+   for (int i = 0; i < 3; i++) {
+      if (inst->src[i].file == BAD_FILE)
+         continue;
+
+      if (inst->dst.file != inst->src[i].file ||
+          inst->dst.nr != inst->src[i].nr)
+         continue;   /* different file or register number: no overlap */
+
+      unsigned src_start = inst->src[i].offset;
+      unsigned src_end = src_start + inst->size_read(i) - 1;   /* inclusive */
+
+      if ((dst_start >= src_start && dst_start <= src_end) || /* starts in src */
+          (dst_end >= src_start && dst_end <= src_end) ||     /* ends in src */
+          (dst_start <= src_start && dst_end >= src_end)) {   /* covers src */
+         return true;
+      }
+   }
+
+   return false;
+}
+
+bool   /* returns true if at least one instruction was split */
+vec4_visitor::lower_simd_width()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      const unsigned lowered_width =
+         get_lowered_simd_width(devinfo, prog_data->dispatch_mode, stage, inst);
+      assert(lowered_width <= inst->exec_size);
+      if (lowered_width == inst->exec_size)
+         continue;   /* already at a native width */
+
+      /* We need to deal with source / destination overlaps when splitting.
+       * The hardware supports reading from and writing to the same register
+       * in the same instruction, but we need to be careful that each split
+       * instruction we produce does not corrupt the source of the next.
+       *
+       * The easiest way to handle this is to make the split instructions write
+       * to temporaries if there is an src/dst overlap and then move from the
+       * temporaries to the original destination. We also need to consider
+       * instructions that do partial writes via align1 opcodes, in which case
+       * we need to make sure that we initialize the temporary with the
+       * value of the instruction's dst.
+       */
+      bool needs_temp = dst_src_regions_overlap(inst);
+      for (unsigned n = 0; n < inst->exec_size / lowered_width; n++)  {
+         unsigned channel_offset = lowered_width * n;
+
+         unsigned size_written = lowered_width * type_sz(inst->dst.type);
+
+         /* Create the split instruction from the original so that we copy all
+          * relevant instruction fields, then set the width and calculate the
+          * new dst/src regions.
+          */
+         vec4_instruction *linst = new(mem_ctx) vec4_instruction(*inst);
+         linst->exec_size = lowered_width;
+         linst->group = channel_offset;
+         linst->size_written = size_written;
+
+         /* Compute split dst region */
+         dst_reg dst;
+         if (needs_temp) {
+            unsigned num_regs = DIV_ROUND_UP(size_written, REG_SIZE);
+            dst = retype(dst_reg(VGRF, alloc.allocate(num_regs)),
+                         inst->dst.type);
+            if (inst->is_align1_partial_write()) {
+               vec4_instruction *copy = MOV(dst, src_reg(inst->dst));
+               copy->exec_size = lowered_width;
+               copy->group = channel_offset;
+               copy->size_written = size_written;
+               inst->insert_before(block, copy);
+            }
+         } else {
+            dst = horiz_offset(inst->dst, channel_offset);
+         }
+         linst->dst = dst;
+
+         /* Compute split source regions */
+         for (int i = 0; i < 3; i++) {
+            if (linst->src[i].file == BAD_FILE)
+               continue;
+
+            if (!is_uniform(linst->src[i]))   /* uniform sources are not offset */
+               linst->src[i] = horiz_offset(linst->src[i], channel_offset);
+         }
+
+         inst->insert_before(block, linst);
+
+         /* If we used a temporary to store the result of the split
+          * instruction, copy the result to the original destination
+          */
+         if (needs_temp) {
+            vec4_instruction *mov =
+               MOV(offset(inst->dst, lowered_width, n), src_reg(dst));
+            mov->exec_size = lowered_width;
+            mov->group = channel_offset;
+            mov->size_written = size_written;
+            mov->predicate = inst->predicate;   /* keep the original predication */
+            inst->insert_before(block, mov);
+         }
+      }
+
+      inst->remove(block);   /* the splits inserted above replace it */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+static bool   /* true for the DF conversion / 32-bit pick and set opcodes */
+is_align1_df(vec4_instruction *inst)
+{
+   switch (inst->opcode) {
+   case VEC4_OPCODE_FROM_DOUBLE:
+   case VEC4_OPCODE_TO_DOUBLE:
+   case VEC4_OPCODE_PICK_LOW_32BIT:
+   case VEC4_OPCODE_PICK_HIGH_32BIT:
+   case VEC4_OPCODE_SET_LOW_32BIT:
+   case VEC4_OPCODE_SET_HIGH_32BIT:
+      return true;
+   default:
+      return false;   /* any other opcode */
+   }
+}
+
+static brw_predicate   /* predicate to use for a single-channel split */
+scalarize_predicate(brw_predicate predicate, unsigned writemask)
+{
+   if (predicate != BRW_PREDICATE_NORMAL)
+      return predicate;   /* only NORMAL is remapped; others pass through */
+
+   switch (writemask) {   /* must select exactly one channel */
+   case WRITEMASK_X:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_X;
+   case WRITEMASK_Y:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+   case WRITEMASK_Z:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+   case WRITEMASK_W:
+      return BRW_PREDICATE_ALIGN16_REPLICATE_W;
+   default:
+      unreachable("invalid writemask");
+   }
+}
+
+/* Gen7 has a hardware decompression bug that we can exploit to represent a
+ * handful of additional swizzles natively.
+ */
+static bool   /* checks the swizzle only; callers test devinfo->gen themselves */
+is_gen7_supported_64bit_swizzle(vec4_instruction *inst, unsigned arg)
+{
+   switch (inst->src[arg].swizzle) {
+   case BRW_SWIZZLE_XXXX:
+   case BRW_SWIZZLE_YYYY:
+   case BRW_SWIZZLE_ZZZZ:
+   case BRW_SWIZZLE_WWWW:
+   case BRW_SWIZZLE_XYXY:
+   case BRW_SWIZZLE_YXYX:
+   case BRW_SWIZZLE_ZWZW:
+   case BRW_SWIZZLE_WZWZ:
+      return true;
+   default:
+      return false;
+   }
+}
+
+/* 64-bit sources use regions with a width of 2. These 2 elements in each row
+ * can be addressed using 32-bit swizzles (which is what the hardware supports)
+ * but it also means that the swizzle we apply on the first two components of a
+ * dvec4 is coupled with the swizzle we use for the last 2. In other words,
+ * only some specific swizzle combinations can be natively supported.
+ *
+ * FIXME: we can go a step further and implement even more swizzle
+ *        variations using only partial scalarization.
+ *
+ * For more details see:
+ * https://bugs.freedesktop.org/show_bug.cgi?id=92760#c82
+ */
+bool
+vec4_visitor::is_supported_64bit_region(vec4_instruction *inst, unsigned arg)
+{
+   const src_reg &src = inst->src[arg];
+   assert(type_sz(src.type) == 8);   /* only meaningful for 64-bit sources */
+
+   /* Uniform regions have a vstride=0. Because we use 2-wide rows with
+    * 64-bit regions it means that we cannot access components Z/W, so
+    * return false for any such case. Interleaved attributes will also be
+    * mapped to GRF registers with a vstride of 0, so apply the same
+    * treatment.
+    */
+   if ((is_uniform(src) ||
+        (stage_uses_interleaved_attributes(stage, prog_data->dispatch_mode) &&
+         src.file == ATTR)) &&
+       (brw_mask_for_swizzle(src.swizzle) & 12))   /* 12: the Z|W mask bits */
+      return false;
+
+   switch (src.swizzle) {
+   case BRW_SWIZZLE_XYZW:
+   case BRW_SWIZZLE_XXZZ:
+   case BRW_SWIZZLE_YYWW:
+   case BRW_SWIZZLE_YXWZ:
+      return true;   /* supported on every generation */
+   default:
+      return devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg);
+   }
+}
+
+bool   /* returns true if at least one instruction was scalarized */
+vec4_visitor::scalarize_df()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* Skip DF instructions that operate in Align1 mode */
+      if (is_align1_df(inst))
+         continue;
+
+      /* Check if this is a double-precision instruction */
+      bool is_double = type_sz(inst->dst.type) == 8;
+      for (int arg = 0; !is_double && arg < 3; arg++) {
+         is_double = inst->src[arg].file != BAD_FILE &&
+                     type_sz(inst->src[arg].type) == 8;
+      }
+
+      if (!is_double)
+         continue;   /* neither dst nor any used source is 64-bit */
+
+      /* Skip the lowering for specific regioning scenarios that we can
+       * support natively.
+       */
+      bool skip_lowering = true;
+
+      /* XY and ZW writemasks operate in 32-bit, which means that they don't
+       * have a native 64-bit representation and they should always be split.
+       */
+      if (inst->dst.writemask == WRITEMASK_XY ||
+          inst->dst.writemask == WRITEMASK_ZW) {
+         skip_lowering = false;
+      } else {
+         for (unsigned i = 0; i < 3; i++) {   /* every 64-bit src must qualify */
+            if (inst->src[i].file == BAD_FILE || type_sz(inst->src[i].type) < 8)
+               continue;
+            skip_lowering = skip_lowering && is_supported_64bit_region(inst, i);
+         }
+      }
+
+      if (skip_lowering)
+         continue;
+
+      /* Generate scalar instructions for each enabled channel */
+      for (unsigned chan = 0; chan < 4; chan++) {
+         unsigned chan_mask = 1 << chan;
+         if (!(inst->dst.writemask & chan_mask))
+            continue;
+
+         vec4_instruction *scalar_inst = new(mem_ctx) vec4_instruction(*inst);
+
+         for (unsigned i = 0; i < 3; i++) {   /* broadcast chan's selector */
+            unsigned swz = BRW_GET_SWZ(inst->src[i].swizzle, chan);
+            scalar_inst->src[i].swizzle = BRW_SWIZZLE4(swz, swz, swz, swz);
+         }
+
+         scalar_inst->dst.writemask = chan_mask;   /* write this channel only */
+
+         if (inst->predicate != BRW_PREDICATE_NONE) {
+            scalar_inst->predicate =
+               scalarize_predicate(inst->predicate, chan_mask);
+         }
+
+         inst->insert_before(block, scalar_inst);
+      }
+
+      inst->remove(block);   /* replaced by the per-channel copies above */
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+bool   /* returns true if at least one 64-bit MAD was replaced */
+vec4_visitor::lower_64bit_mad_to_mul_add()
+{
+   bool progress = false;
+
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      if (inst->opcode != BRW_OPCODE_MAD)
+         continue;
+
+      if (type_sz(inst->dst.type) != 8)
+         continue;   /* only MADs with a 64-bit destination are lowered */
+
+      dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type); /* product temp */
+
+      /* Use the copy constructor so we copy all relevant instruction fields
+       * from the original mad into the add and mul instructions
+       */
+      vec4_instruction *mul = new(mem_ctx) vec4_instruction(*inst);
+      mul->opcode = BRW_OPCODE_MUL;
+      mul->dst = mul_dst;
+      mul->src[0] = inst->src[1];   /* mul_dst = src[1] * src[2] */
+      mul->src[1] = inst->src[2];
+      mul->src[2].file = BAD_FILE;
+
+      vec4_instruction *add = new(mem_ctx) vec4_instruction(*inst);
+      add->opcode = BRW_OPCODE_ADD;   /* dst stays that of the copied MAD */
+      add->src[0] = src_reg(mul_dst);   /* dst = mul_dst + src[0] */
+      add->src[1] = inst->src[0];
+      add->src[2].file = BAD_FILE;
+
+      inst->insert_before(block, mul);
+      inst->insert_before(block, add);
+      inst->remove(block);
+
+      progress = true;
+   }
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+/* The align16 hardware can only do 32-bit swizzle channels, so we need to
+ * translate the logical 64-bit swizzle channels that we use in the Vec4 IR
+ * to 32-bit swizzle channels in hardware registers.
+ *
+ * @inst and @arg identify the original vec4 IR source operand we need to
+ * translate the swizzle for and @hw_reg is the hardware register where we
+ * will write the hardware swizzle to use.
+ *
+ * This pass assumes that Align16/DF instructions have been fully scalarized
+ * previously so there is just one 64-bit swizzle channel to deal with for any
+ * given Vec4 IR source.
+ */
+void
+vec4_visitor::apply_logical_swizzle(struct brw_reg *hw_reg,
+                                    vec4_instruction *inst, int arg)
+{
+   src_reg reg = inst->src[arg];
+
+   if (reg.file == BAD_FILE || reg.file == BRW_IMMEDIATE_VALUE)
+      return;   /* hw_reg is left untouched for these */
+
+   /* If this is not a 64-bit operand or this is an Align1 DF instruction we
+    * don't need to translate anything: pass the IR swizzle through unchanged.
+    */
+   if(type_sz(reg.type) < 8 || is_align1_df(inst)) {
+      hw_reg->swizzle = reg.swizzle;
+      return;
+   }
+
+   /* Take the 64-bit logical swizzle channel and translate it to 32-bit */
+   assert(brw_is_single_value_swizzle(reg.swizzle) ||
+          is_supported_64bit_region(inst, arg));
+
+   if (is_supported_64bit_region(inst, arg) &&
+       !is_gen7_supported_64bit_swizzle(inst, arg)) {
+      /* Supported 64-bit swizzles are those such that their first two
+       * components, when expanded to 32-bit swizzles, match the semantics
+       * of the original 64-bit swizzle with 2-wide row regioning.
+       */
+      unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0);
+      unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1);
+      hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
+                                     swizzle1 * 2, swizzle1 * 2 + 1);
+   } else {
+      /* If we got here then we have one of the following:
+       *
+       * 1. An unsupported swizzle, which should be single-value thanks to the
+       *    scalarization pass.
+       *
+       * 2. A gen7 supported swizzle. These can be single-value or double-value
+       *    swizzles. If the latter, they are never cross-dvec2 channels. For
+       *    these we always need to activate the gen7 vstride=0 exploit.
+       */
+      unsigned swizzle0 = BRW_GET_SWZ(reg.swizzle, 0);
+      unsigned swizzle1 = BRW_GET_SWZ(reg.swizzle, 1);
+      assert((swizzle0 < 2) == (swizzle1 < 2));   /* both in XY or both in ZW */
+
+      /* To gain access to Z/W components we need to select the second half
+       * of the register and then use a X/Y swizzle to select Z/W respectively.
+       */
+      if (swizzle0 >= 2) {
+         *hw_reg = suboffset(*hw_reg, 2);
+         swizzle0 -= 2;
+         swizzle1 -= 2;
+      }
+
+      /* All gen7-specific supported swizzles require the vstride=0 exploit */
+      if (devinfo->gen == 7 && is_gen7_supported_64bit_swizzle(inst, arg))
+         hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+
+      /* Any 64-bit source with an offset at 16B is intended to address the
+       * second half of a register and needs a vertical stride of 0 so we:
+       *
+       * 1. Don't violate register region restrictions.
+       * 2. Activate the gen7 instruction decompression bug exploit when
+       *    execsize > 4
+       */
+      if (hw_reg->subnr % REG_SIZE == 16) {
+         assert(devinfo->gen == 7);
+         hw_reg->vstride = BRW_VERTICAL_STRIDE_0;
+      }
+
+      hw_reg->swizzle = BRW_SWIZZLE4(swizzle0 * 2, swizzle0 * 2 + 1,
+                                     swizzle1 * 2, swizzle1 * 2 + 1);
+   }
+}
+
+bool   /* returns false if compilation failed (see `failed`) */
+vec4_visitor::run()
+{
+   if (shader_time_index >= 0)
+      emit_shader_time_begin();
+
+   emit_prolog();
+
+   emit_nir_code();
+   if (failed)
+      return false;
+   base_ir = NULL;
+
+   emit_thread_end();
+
+   calculate_cfg();
+
+   /* Before any optimization, push array accesses out to scratch
+    * space where we need them to be.  This pass may allocate new
+    * virtual GRFs, so we want to do it early.  It also makes sure
+    * that we have reladdr computations available for CSE, since we'll
+    * often do repeated subexpressions for those.
+    */
+   move_grf_array_access_to_scratch();
+   move_uniform_array_access_to_pull_constants();
+
+   pack_uniform_registers();
+   move_push_constants_to_pull_constants();
+   split_virtual_grfs();
+
+#define OPT(pass, args...) ({                                          \
+      pass_num++;                                                      \
+      bool this_progress = pass(args);                                 \
+                                                                       \
+      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {  \
+         char filename[64];                                            \
+         snprintf(filename, 64, "%s-%s-%02d-%02d-" #pass,              \
+                  stage_abbrev, nir->info->name, iteration, pass_num); \
+                                                                       \
+         backend_shader::dump_instructions(filename);                  \
+      }                                                                \
+                                                                       \
+      progress = progress || this_progress;                            \
+      this_progress;                                                   \
+   })
+
+
+   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
+      char filename[64];
+      snprintf(filename, 64, "%s-%s-00-00-start",
+               stage_abbrev, nir->info->name);
+
+      backend_shader::dump_instructions(filename);
+   }
+
+   bool progress;
+   int iteration = 0;
+   int pass_num = 0;
+   do {
+      progress = false;
+      pass_num = 0;
+      iteration++;
+
+      OPT(opt_predicated_break, this);
+      OPT(opt_reduce_swizzle);
+      OPT(dead_code_eliminate);
+      OPT(dead_control_flow_eliminate, this);
+      OPT(opt_copy_propagation);
+      OPT(opt_cmod_propagation);
+      OPT(opt_cse);
+      OPT(opt_algebraic);
+      OPT(opt_register_coalesce);
+      OPT(eliminate_find_live_channel);
+   } while (progress);   /* repeat until no pass reports progress */
+
+   pass_num = 0;
+
+   if (OPT(opt_vector_float)) {
+      OPT(opt_cse);
+      OPT(opt_copy_propagation, false);
+      OPT(opt_copy_propagation, true);
+      OPT(dead_code_eliminate);
+   }
+
+   if (devinfo->gen <= 5 && OPT(lower_minmax)) {
+      OPT(opt_cmod_propagation);
+      OPT(opt_cse);
+      OPT(opt_copy_propagation);
+      OPT(dead_code_eliminate);
+   }
+
+   if (OPT(lower_simd_width)) {
+      OPT(opt_copy_propagation);
+      OPT(dead_code_eliminate);
+   }
+
+   if (failed)
+      return false;
+
+   OPT(lower_64bit_mad_to_mul_add);
+
+   /* Run this before payload setup because tessellation shaders
+    * rely on it to prevent cross dvec2 regioning on DF attributes
+    * that are setup so that XY are on the second half of register and
+    * ZW are in the first half of the next.
+    */
+   OPT(scalarize_df);
+
+   setup_payload();
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SPILL_VEC4)) {
+      /* Debug of register spilling: Go spill everything. */
+      const int grf_count = alloc.count;
+      float spill_costs[alloc.count];
+      bool no_spill[alloc.count];
+      evaluate_spill_costs(spill_costs, no_spill);
+      for (int i = 0; i < grf_count; i++) {
+         if (no_spill[i])
+            continue;
+         spill_reg(i);
+      }
+
+      /* We want to run this after spilling because 64-bit (un)spills need to
+       * emit code to shuffle 64-bit data for the 32-bit scratch read/write
+       * messages that can produce unsupported 64-bit swizzle regions.
+       */
+      OPT(scalarize_df);
+   }
+
+   bool allocated_without_spills = reg_allocate();
+
+   if (!allocated_without_spills) {
+      compiler->shader_perf_log(log_data,
+                                "%s shader triggered register spilling.  "
+                                "Try reducing the number of live vec4 values "
+                                "to improve performance.\n",
+                                stage_name);
+
+      while (!reg_allocate()) {   /* retry until it succeeds or we fail */
+         if (failed)
+            return false;
+      }
+
+      /* We want to run this after spilling because 64-bit (un)spills need to
+       * emit code to shuffle 64-bit data for the 32-bit scratch read/write
+       * messages that can produce unsupported 64-bit swizzle regions.
+       */
+      OPT(scalarize_df);
+   }
+
+   opt_schedule_instructions();
+
+   opt_set_dependency_control();
+
+   convert_to_hw_regs();
+
+   if (last_scratch > 0) {
+      prog_data->base.total_scratch =
+         brw_get_scratch_size(last_scratch * REG_SIZE);
+   }
+
+   return !failed;
+}
+
+} /* namespace brw */
+
+extern "C" {
+
+/**
+ * Compile a vertex shader, trying the scalar (SIMD8) backend first when it is
+ * enabled for the vertex stage and using the vec4 backend otherwise.  Returns
+ * the final assembly and the program's size (via \p final_assembly_size).
+ */
+const unsigned *
+brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_vs_prog_key *key,
+               struct brw_vs_prog_data *prog_data,
+               const nir_shader *src_shader,
+               gl_clip_plane *clip_planes,
+               bool use_legacy_snorm_formula,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_VERTEX];
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
+   brw_nir_lower_vs_inputs(shader, is_scalar,
+                           use_legacy_snorm_formula, key->gl_attrib_wa_flags);
+   brw_nir_lower_vue_outputs(shader, is_scalar);
+   shader = brw_postprocess_nir(shader, compiler, is_scalar);
+
+   const unsigned *assembly = NULL;
+
+   prog_data->base.clip_distance_mask =
+      ((1 << shader->info->clip_distance_array_size) - 1);
+   prog_data->base.cull_distance_mask =
+      ((1 << shader->info->cull_distance_array_size) - 1) <<
+      shader->info->clip_distance_array_size;   /* cull bits sit above clip */
+
+   unsigned nr_attribute_slots = _mesa_bitcount_64(prog_data->inputs_read);
+
+   /* gl_VertexID and gl_InstanceID are system values, but arrive via an
+    * incoming vertex attribute.  So, add an extra slot.
+    */
+   if (shader->info->system_values_read &
+       (BITFIELD64_BIT(SYSTEM_VALUE_BASE_VERTEX) |
+        BITFIELD64_BIT(SYSTEM_VALUE_BASE_INSTANCE) |
+        BITFIELD64_BIT(SYSTEM_VALUE_VERTEX_ID_ZERO_BASE) |
+        BITFIELD64_BIT(SYSTEM_VALUE_INSTANCE_ID))) {
+      nr_attribute_slots++;
+   }
+
+   /* gl_DrawID has its very own vec4 */
+   if (shader->info->system_values_read &
+       BITFIELD64_BIT(SYSTEM_VALUE_DRAW_ID)) {
+      nr_attribute_slots++;
+   }
+
+   unsigned nr_attributes = nr_attribute_slots -
+      DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2);
+
+   /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
+    * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
+    * vec4 mode, the hardware appears to wedge unless we read something.
+    */
+   if (is_scalar)
+      prog_data->base.urb_read_length =
+         DIV_ROUND_UP(nr_attribute_slots, 2);
+   else
+      prog_data->base.urb_read_length =
+         DIV_ROUND_UP(MAX2(nr_attribute_slots, 1), 2);
+
+   prog_data->nr_attributes = nr_attributes;
+   prog_data->nr_attribute_slots = nr_attribute_slots;
+
+   /* Since vertex shaders reuse the same VUE entry for inputs and outputs
+    * (overwriting the original contents), we need to make sure the size is
+    * the larger of the two.
+    */
+   const unsigned vue_entries =
+      MAX2(nr_attribute_slots, (unsigned)prog_data->base.vue_map.num_slots);
+
+   if (compiler->devinfo->gen == 6)
+      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 8);
+   else
+      prog_data->base.urb_entry_size = DIV_ROUND_UP(vue_entries, 4);
+
+   if (INTEL_DEBUG & DEBUG_VS) {
+      fprintf(stderr, "VS Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   if (is_scalar) {
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_visitor v(compiler, log_data, mem_ctx, key, &prog_data->base.base,
+                   NULL, /* prog; Only used for TEXTURE_RECTANGLE on gen < 8 */
+                   shader, 8, shader_time_index);
+      if (!v.run_vs(clip_planes)) {
+         if (error_str)   /* error_str is optional */
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+
+         return NULL;
+      }
+
+      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants,
+                     v.runtime_check_aads_emit, MESA_SHADER_VERTEX);
+      if (INTEL_DEBUG & DEBUG_VS) {
+         const char *debug_name =
+            ralloc_asprintf(mem_ctx, "%s vertex shader %s",
+                            shader->info->label ? shader->info->label :
+                               "unnamed",
+                            shader->info->name);
+
+         g.enable_debug(debug_name);
+      }
+      g.generate_code(v.cfg, 8);
+      assembly = g.get_assembly(final_assembly_size);
+   }
+
+   if (!assembly) {   /* scalar backend not used, or it yielded no assembly */
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+      vec4_vs_visitor v(compiler, log_data, key, prog_data,
+                        shader, clip_planes, mem_ctx,
+                        shader_time_index, use_legacy_snorm_formula);
+      if (!v.run()) {
+         if (error_str)   /* error_str is optional */
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+
+         return NULL;
+      }
+
+      assembly = brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
+                                            shader, &prog_data->base, v.cfg,
+                                            final_assembly_size);
+   }
+
+   return assembly;
+}
+
+} /* extern "C" */
diff --git a/src/intel/compiler/brw_vec4.h b/src/intel/compiler/brw_vec4.h
new file mode 100644
index 00000000000..a84048d8c6a
--- /dev/null
+++ b/src/intel/compiler/brw_vec4.h
@@ -0,0 +1,399 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_H
+#define BRW_VEC4_H
+
+#include "brw_shader.h"
+
+#ifdef __cplusplus
+#include "brw_ir_vec4.h"
+#endif
+
+#include "compiler/glsl/ir.h"
+#include "compiler/nir/nir.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+/* Returns the vec4 assembly; presumably its size via out_assembly_size. */
+const unsigned *
+brw_vec4_generate_assembly(const struct brw_compiler *compiler,
+                           void *log_data,
+                           void *mem_ctx,
+                           const nir_shader *nir,
+                           struct brw_vue_prog_data *prog_data,
+                           const struct cfg_t *cfg,
+                           unsigned *out_assembly_size);
+
+#ifdef __cplusplus
+} /* extern "C" */
+
+namespace brw {
+
+class vec4_live_variables;
+
+/**
+ * Abstract base of the vec4 shader visitors.
+ *
+ * Consumes a NIR shader via the nir_emit_*() methods and emits vec4 IR
+ * (vec4_instruction); stage-specific pieces are pure virtual hooks.
+ */
+class vec4_visitor : public backend_shader
+{
+public:
+   vec4_visitor(const struct brw_compiler *compiler,
+                void *log_data,
+                const struct brw_sampler_prog_key_data *key,
+                struct brw_vue_prog_data *prog_data,
+                const nir_shader *shader,
+		void *mem_ctx,
+                bool no_spills,
+                int shader_time_index);
+   virtual ~vec4_visitor();
+
+   dst_reg dst_null_f()
+   {
+      return dst_reg(brw_null_reg());
+   }
+
+   dst_reg dst_null_df()
+   {
+      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_DF));
+   }
+
+   dst_reg dst_null_d()
+   {
+      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
+   }
+
+   dst_reg dst_null_ud()
+   {
+      return dst_reg(retype(brw_null_reg(), BRW_REGISTER_TYPE_UD));
+   }
+
+   const struct brw_sampler_prog_key_data * const key_tex;
+   struct brw_vue_prog_data * const prog_data;
+   char *fail_msg;
+   bool failed;
+
+   /**
+    * Source IR currently being processed (kept as an opaque pointer), which
+    * is associated with our driver IR instructions for debugging purposes.
+    */
+   const void *base_ir;
+   const char *current_annotation;
+
+   int first_non_payload_grf;
+   unsigned int max_grf;
+   int *virtual_grf_start;
+   int *virtual_grf_end;
+   brw::vec4_live_variables *live_intervals;
+   dst_reg userplane[MAX_CLIP_PLANES];
+
+   bool need_all_constants_in_pull_buffer;
+
+   /* Regs for vertex results, indexed by varying slot.  The four entries
+    * per slot are presumably per-component pieces -- TODO confirm.
+    */
+   dst_reg output_reg[VARYING_SLOT_TESS_MAX][4];
+   unsigned output_num_components[VARYING_SLOT_TESS_MAX][4];
+   const char *output_reg_annotation[VARYING_SLOT_TESS_MAX];
+   int uniforms;
+
+   src_reg shader_start_time;
+
+   bool run();
+   void fail(const char *msg, ...);
+
+   int setup_uniforms(int payload_reg);
+
+   bool reg_allocate_trivial();
+   bool reg_allocate();
+   void evaluate_spill_costs(float *spill_costs, bool *no_spill);
+   int choose_spill_reg(struct ra_graph *g);
+   void spill_reg(int spill_reg);
+   void move_grf_array_access_to_scratch();
+   void move_uniform_array_access_to_pull_constants();
+   void move_push_constants_to_pull_constants();
+   void split_uniform_registers();
+   void pack_uniform_registers();
+   void calculate_live_intervals();
+   void invalidate_live_intervals();
+   void split_virtual_grfs();
+   bool opt_vector_float();
+   bool opt_reduce_swizzle();
+   bool dead_code_eliminate();
+   int var_range_start(unsigned v, unsigned n) const;
+   int var_range_end(unsigned v, unsigned n) const;
+   bool virtual_grf_interferes(int a, int b);
+   bool opt_cmod_propagation();
+   bool opt_copy_propagation(bool do_constant_prop = true);
+   bool opt_cse_local(bblock_t *block);
+   bool opt_cse();
+   bool opt_algebraic();
+   bool opt_register_coalesce();
+   bool eliminate_find_live_channel();
+   bool is_dep_ctrl_unsafe(const vec4_instruction *inst);
+   void opt_set_dependency_control();
+   void opt_schedule_instructions();
+   void convert_to_hw_regs();
+
+   bool is_supported_64bit_region(vec4_instruction *inst, unsigned arg);
+   bool lower_simd_width();
+   bool scalarize_df();
+   bool lower_64bit_mad_to_mul_add();
+   void apply_logical_swizzle(struct brw_reg *hw_reg,
+                              vec4_instruction *inst, int arg);
+
+   vec4_instruction *emit(vec4_instruction *inst);
+
+   vec4_instruction *emit(enum opcode opcode);
+   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst);
+   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
+                          const src_reg &src0);
+   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
+                          const src_reg &src0, const src_reg &src1);
+   vec4_instruction *emit(enum opcode opcode, const dst_reg &dst,
+                          const src_reg &src0, const src_reg &src1,
+                          const src_reg &src2);
+
+   vec4_instruction *emit_before(bblock_t *block,
+                                 vec4_instruction *inst,
+				 vec4_instruction *new_inst);
+
+#define EMIT1(op) vec4_instruction *op(const dst_reg &, const src_reg &);
+#define EMIT2(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &);
+#define EMIT3(op) vec4_instruction *op(const dst_reg &, const src_reg &, const src_reg &, const src_reg &);
+   EMIT1(MOV)
+   EMIT1(NOT)
+   EMIT1(RNDD)
+   EMIT1(RNDE)
+   EMIT1(RNDZ)
+   EMIT1(FRC)
+   EMIT1(F32TO16)
+   EMIT1(F16TO32)
+   EMIT2(ADD)
+   EMIT2(MUL)
+   EMIT2(MACH)
+   EMIT2(MAC)
+   EMIT2(AND)
+   EMIT2(OR)
+   EMIT2(XOR)
+   EMIT2(DP3)
+   EMIT2(DP4)
+   EMIT2(DPH)
+   EMIT2(SHL)
+   EMIT2(SHR)
+   EMIT2(ASR)
+   vec4_instruction *CMP(dst_reg dst, src_reg src0, src_reg src1,
+			 enum brw_conditional_mod condition);
+   vec4_instruction *IF(src_reg src0, src_reg src1,
+                        enum brw_conditional_mod condition);
+   vec4_instruction *IF(enum brw_predicate predicate);
+   EMIT1(SCRATCH_READ)
+   EMIT2(SCRATCH_WRITE)
+   EMIT3(LRP)
+   EMIT1(BFREV)
+   EMIT3(BFE)
+   EMIT2(BFI1)
+   EMIT3(BFI2)
+   EMIT1(FBH)
+   EMIT1(FBL)
+   EMIT1(CBIT)
+   EMIT3(MAD)
+   EMIT2(ADDC)
+   EMIT2(SUBB)
+   EMIT1(DIM)
+
+#undef EMIT1
+#undef EMIT2
+#undef EMIT3
+
+   int implied_mrf_writes(vec4_instruction *inst);
+
+   vec4_instruction *emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
+                                 src_reg src0, src_reg src1);
+
+   vec4_instruction *emit_lrp(const dst_reg &dst, const src_reg &x,
+                              const src_reg &y, const src_reg &a);
+
+   /**
+    * Copy any live channel from \p src to the first channel of the
+    * result.
+    */
+   src_reg emit_uniformize(const src_reg &src);
+
+   src_reg fix_3src_operand(const src_reg &src);
+   src_reg resolve_source_modifiers(const src_reg &src);
+
+   vec4_instruction *emit_math(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                               const src_reg &src1 = src_reg());
+
+   src_reg fix_math_operand(const src_reg &src);
+
+   void emit_pack_half_2x16(dst_reg dst, src_reg src0);
+   void emit_unpack_half_2x16(dst_reg dst, src_reg src0);
+   void emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0);
+   void emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0);
+   void emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0);
+   void emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0);
+
+   void emit_texture(ir_texture_opcode op,
+                     dst_reg dest,
+                     const glsl_type *dest_type,
+                     src_reg coordinate,
+                     int coord_components,
+                     src_reg shadow_comparator,
+                     src_reg lod, src_reg lod2,
+                     src_reg sample_index,
+                     uint32_t constant_offset,
+                     src_reg offset_value,
+                     src_reg mcs,
+                     uint32_t surface, src_reg surface_reg,
+                     src_reg sampler_reg);
+
+   src_reg emit_mcs_fetch(const glsl_type *coordinate_type, src_reg coordinate,
+                          src_reg surface);
+   void emit_gen6_gather_wa(uint8_t wa, dst_reg dst);
+
+   void emit_ndc_computation();
+   void emit_psiz_and_flags(dst_reg reg);
+   vec4_instruction *emit_generic_urb_slot(dst_reg reg, int varying, int comp);
+   virtual void emit_urb_slot(dst_reg reg, int varying);
+
+   void emit_shader_time_begin();
+   void emit_shader_time_end();
+   void emit_shader_time_write(int shader_time_subindex, src_reg value);
+
+   src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst,
+			      src_reg *reladdr, int reg_offset);
+   void emit_scratch_read(bblock_t *block, vec4_instruction *inst,
+			  dst_reg dst,
+			  src_reg orig_src,
+			  int base_offset);
+   void emit_scratch_write(bblock_t *block, vec4_instruction *inst,
+			   int base_offset);
+   void emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
+				dst_reg dst,
+				src_reg orig_src,
+                                int base_offset,
+                                src_reg indirect);
+   void emit_pull_constant_load_reg(dst_reg dst,
+                                    src_reg surf_index,
+                                    src_reg offset,
+                                    bblock_t *before_block,
+                                    vec4_instruction *before_inst);
+   src_reg emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
+                                vec4_instruction *inst, src_reg src);
+
+   void resolve_ud_negate(src_reg *reg);
+
+   bool lower_minmax();
+
+   src_reg get_timestamp();
+
+   void dump_instruction(backend_instruction *inst);
+   void dump_instruction(backend_instruction *inst, FILE *file);
+
+   bool is_high_sampler(src_reg sampler);
+
+   bool optimize_predicate(nir_alu_instr *instr, enum brw_predicate *predicate);
+
+   void emit_conversion_from_double(dst_reg dst, src_reg src, bool saturate,
+                                    brw_reg_type single_type);
+   void emit_conversion_to_double(dst_reg dst, src_reg src, bool saturate,
+                                  brw_reg_type single_type);
+
+   src_reg setup_imm_df(double v);
+
+   vec4_instruction *shuffle_64bit_data(dst_reg dst, src_reg src,
+                                        bool for_write,
+                                        bblock_t *block = NULL,
+                                        vec4_instruction *ref = NULL);
+
+   virtual void emit_nir_code();
+   virtual void nir_setup_uniforms();
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_setup_system_values();
+   virtual void nir_emit_impl(nir_function_impl *impl);
+   virtual void nir_emit_cf_list(exec_list *list);
+   virtual void nir_emit_if(nir_if *if_stmt);
+   virtual void nir_emit_loop(nir_loop *loop);
+   virtual void nir_emit_block(nir_block *block);
+   virtual void nir_emit_instr(nir_instr *instr);
+   virtual void nir_emit_load_const(nir_load_const_instr *instr);
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_emit_alu(nir_alu_instr *instr);
+   virtual void nir_emit_jump(nir_jump_instr *instr);
+   virtual void nir_emit_texture(nir_tex_instr *instr);
+   virtual void nir_emit_undef(nir_ssa_undef_instr *instr);
+   virtual void nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr);
+
+   dst_reg get_nir_dest(const nir_dest &dest, enum brw_reg_type type);
+   dst_reg get_nir_dest(const nir_dest &dest, nir_alu_type type);
+   dst_reg get_nir_dest(const nir_dest &dest);
+   src_reg get_nir_src(const nir_src &src, enum brw_reg_type type,
+                       unsigned num_components = 4);
+   src_reg get_nir_src(const nir_src &src, nir_alu_type type,
+                       unsigned num_components = 4);
+   src_reg get_nir_src(const nir_src &src,
+                       unsigned num_components = 4);
+   src_reg get_indirect_offset(nir_intrinsic_instr *instr);
+
+   virtual dst_reg *make_reg_for_system_value(int location) = 0;
+
+   dst_reg *nir_locals;
+   dst_reg *nir_ssa_values;
+   dst_reg *nir_system_values;
+
+protected:
+   void emit_vertex();
+   void lower_attributes_to_hw_regs(const int *attribute_map,
+                                    bool interleaved);
+   void setup_payload_interference(struct ra_graph *g, int first_payload_node,
+                                   int reg_node_count);
+   virtual void setup_payload() = 0;
+   virtual void emit_prolog() = 0;
+   virtual void emit_thread_end() = 0;
+   virtual void emit_urb_write_header(int mrf) = 0;
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete) = 0;
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
+
+private:
+   /**
+    * If true, then register allocation should fail instead of spilling.
+    */
+   const bool no_spills;
+
+   int shader_time_index;
+
+   unsigned last_scratch; /**< measured in 32-byte (register size) units */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_H */
diff --git a/src/intel/compiler/brw_vec4_builder.h b/src/intel/compiler/brw_vec4_builder.h
new file mode 100644
index 00000000000..4c3efe8457b
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_builder.h
@@ -0,0 +1,634 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2010-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_BUILDER_H
+#define BRW_VEC4_BUILDER_H
+
+#include "brw_ir_vec4.h"
+#include "brw_ir_allocator.h"
+
+namespace brw {
+   /**
+    * Toolbox to assemble a VEC4 IR program out of individual instructions.
+    *
+    * This object is meant to have an interface consistent with
+    * brw::fs_builder.  They cannot be fully interchangeable because
+    * brw::fs_builder generates scalar code while brw::vec4_builder generates
+    * vector code.
+    */
+   class vec4_builder {
+   public:
+      /** Type used in this IR to represent a source of an instruction. */
+      typedef brw::src_reg src_reg;
+
+      /** Type used in this IR to represent the destination of an instruction. */
+      typedef brw::dst_reg dst_reg;
+
+      /** Type used in this IR to represent an instruction. */
+      typedef vec4_instruction instruction;
+
+      /**
+       * Construct a vec4_builder that inserts instructions into \p shader.
+       */
+      vec4_builder(backend_shader *shader, unsigned dispatch_width = 8) :
+         shader(shader), block(NULL), cursor(NULL), /* see at(), at_end() */
+         _dispatch_width(dispatch_width), _group(0),
+         force_writemask_all(false),
+         annotation()
+      {
+      }
+
+      /**
+       * Construct a vec4_builder that inserts instructions into \p shader
+       * before instruction \p inst in basic block \p block.  The default
+       * execution controls and debug annotation are initialized from the
+       * instruction passed as argument.
+       */
+      vec4_builder(backend_shader *shader, bblock_t *block, instruction *inst) :
+         shader(shader), block(block), cursor(inst),
+         _dispatch_width(inst->exec_size), _group(inst->group),
+         force_writemask_all(inst->force_writemask_all)
+      {
+         annotation.str = inst->annotation;
+         annotation.ir = inst->ir;
+      }
+
+      /**
+       * Construct a vec4_builder that inserts instructions before \p cursor
+       * in basic block \p block, inheriting other code generation parameters
+       * from this.
+       */
+      vec4_builder
+      at(bblock_t *block, exec_node *cursor) const
+      {
+         vec4_builder bld = *this;
+         bld.block = block;
+         bld.cursor = cursor;
+         return bld;
+      }
+
+      /**
+       * Construct a vec4_builder appending instructions at the end of the
+       * instruction list of the shader, inheriting other code generation
+       * parameters from this.
+       */
+      vec4_builder
+      at_end() const
+      {
+         return at(NULL, (exec_node *)&shader->instructions.tail_sentinel);
+      }
+
+      /**
+       * Construct a builder specifying the default SIMD width and group of
+       * channel enable signals, inheriting other code generation parameters
+       * from this.
+       *
+       * \p n gives the default SIMD width, \p i gives the slot group used for
+       * predication and control flow masking in multiples of \p n channels.
+       */
+      vec4_builder
+      group(unsigned n, unsigned i) const
+      {
+         assert(force_writemask_all ||
+                (n <= dispatch_width() && i < dispatch_width() / n));
+         vec4_builder bld = *this;
+         bld._dispatch_width = n;
+         bld._group += i * n;
+         return bld;
+      }
+
+      /**
+       * Construct a builder with per-channel control flow execution masking
+       * disabled if \p b is true.  If control flow execution masking is
+       * already disabled this has no effect.
+       */
+      vec4_builder
+      exec_all(bool b = true) const
+      {
+         vec4_builder bld = *this;
+         if (b)
+            bld.force_writemask_all = true;
+         return bld;
+      }
+
+      /**
+       * Construct a builder with the given debug annotation info.
+       */
+      vec4_builder
+      annotate(const char *str, const void *ir = NULL) const
+      {
+         vec4_builder bld = *this;
+         bld.annotation.str = str;
+         bld.annotation.ir = ir;
+         return bld;
+      }
+
+      /**
+       * Get the SIMD width in use.
+       */
+      unsigned
+      dispatch_width() const
+      {
+         return _dispatch_width;
+      }
+
+      /**
+       * Get the channel group in use.
+       */
+      unsigned
+      group() const
+      {
+         return _group;
+      }
+
+      /**
+       * Allocate a virtual register of natural vector size (four for this IR)
+       * and SIMD width.  \p n gives the amount of space to allocate in
+       * dispatch_width units (which is just enough space for four logical
+       * components in this IR).
+       */
+      dst_reg
+      vgrf(enum brw_reg_type type, unsigned n = 1) const
+      {
+         assert(dispatch_width() <= 32);
+
+         if (n > 0)
+            return retype(dst_reg(VGRF, shader->alloc.allocate(
+                                     n * DIV_ROUND_UP(type_sz(type), 4))),
+                           type);
+         else
+            return retype(null_reg_ud(), type);
+      }
+
+      /**
+       * Create a null register of floating type.
+       */
+      dst_reg
+      null_reg_f() const
+      {
+         return dst_reg(retype(brw_null_vec(dispatch_width()),
+                               BRW_REGISTER_TYPE_F));
+      }
+
+      /**
+       * Create a null register of signed integer type.
+       */
+      dst_reg
+      null_reg_d() const
+      {
+         return dst_reg(retype(brw_null_vec(dispatch_width()),
+                               BRW_REGISTER_TYPE_D));
+      }
+
+      /**
+       * Create a null register of unsigned integer type.
+       */
+      dst_reg
+      null_reg_ud() const
+      {
+         return dst_reg(retype(brw_null_vec(dispatch_width()),
+                               BRW_REGISTER_TYPE_UD));
+      }
+
+      /**
+       * Insert an instruction into the program.
+       */
+      instruction *
+      emit(const instruction &inst) const
+      {
+         return emit(new(shader->mem_ctx) instruction(inst));
+      }
+
+      /**
+       * Create and insert a nullary control instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode) const
+      {
+         return emit(instruction(opcode));
+      }
+
+      /**
+       * Create and insert an instruction with a destination and no sources.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst) const
+      {
+         return emit(instruction(opcode, dst));
+      }
+
+      /**
+       * Create and insert a unary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_RCP:
+         case SHADER_OPCODE_RSQ:
+         case SHADER_OPCODE_SQRT:
+         case SHADER_OPCODE_EXP2:
+         case SHADER_OPCODE_LOG2:
+         case SHADER_OPCODE_SIN:
+         case SHADER_OPCODE_COS:
+            return fix_math_instruction(
+               emit(instruction(opcode, dst,
+                                fix_math_operand(src0))));
+
+         default:
+            return emit(instruction(opcode, dst, src0));
+         }
+      }
+
+      /**
+       * Create and insert a binary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1) const
+      {
+         switch (opcode) {
+         case SHADER_OPCODE_POW:
+         case SHADER_OPCODE_INT_QUOTIENT:
+         case SHADER_OPCODE_INT_REMAINDER:
+            return fix_math_instruction(
+               emit(instruction(opcode, dst,
+                                fix_math_operand(src0),
+                                fix_math_operand(src1))));
+
+         default:
+            return emit(instruction(opcode, dst, src0, src1));
+         }
+      }
+
+      /**
+       * Create and insert a ternary instruction into the program.
+       */
+      instruction *
+      emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+           const src_reg &src1, const src_reg &src2) const
+      {
+         switch (opcode) {
+         case BRW_OPCODE_BFE:
+         case BRW_OPCODE_BFI2:
+         case BRW_OPCODE_MAD:
+         case BRW_OPCODE_LRP:
+            return emit(instruction(opcode, dst,
+                                    fix_3src_operand(src0),
+                                    fix_3src_operand(src1),
+                                    fix_3src_operand(src2)));
+
+         default:
+            return emit(instruction(opcode, dst, src0, src1, src2));
+         }
+      }
+
+      /**
+       * Insert a preallocated instruction into the program.
+       */
+      instruction *
+      emit(instruction *inst) const
+      {
+         inst->exec_size = dispatch_width();
+         inst->group = group();
+         inst->force_writemask_all = force_writemask_all;
+         inst->size_written = inst->exec_size * type_sz(inst->dst.type);
+         inst->annotation = annotation.str;
+         inst->ir = annotation.ir;
+
+         if (block)
+            static_cast<instruction *>(cursor)->insert_before(block, inst);
+         else /* no block (e.g. at_end()): link before the bare list node */
+            cursor->insert_before(inst);
+
+         return inst;
+      }
+
+      /**
+       * Select \p src0 if the comparison of both sources with the given
+       * conditional mod evaluates to true, otherwise select \p src1.
+       *
+       * Generally useful to get the minimum or maximum of two values.
+       */
+      instruction *
+      emit_minmax(const dst_reg &dst, const src_reg &src0,
+                  const src_reg &src1, brw_conditional_mod mod) const
+      {
+         assert(mod == BRW_CONDITIONAL_GE || mod == BRW_CONDITIONAL_L);
+
+         return set_condmod(mod, SEL(dst, fix_unsigned_negate(src0),
+                                     fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Copy any live channel from \p src to the first channel of the result.
+       */
+      src_reg
+      emit_uniformize(const src_reg &src) const
+      {
+         const vec4_builder ubld = exec_all();
+         const dst_reg chan_index =
+            writemask(vgrf(BRW_REGISTER_TYPE_UD), WRITEMASK_X);
+         const dst_reg dst = vgrf(src.type);
+
+         ubld.emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, chan_index);
+         ubld.emit(SHADER_OPCODE_BROADCAST, dst, src, src_reg(chan_index));
+
+         return src_reg(dst);
+      }
+
+      /**
+       * Assorted arithmetic ops.
+       * @{
+       */
+#define ALU1(op)                                        \
+      instruction *                                     \
+      op(const dst_reg &dst, const src_reg &src0) const \
+      {                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0);       \
+      }
+
+#define ALU2(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1);                 \
+      }
+
+#define ALU2_ACC(op)                                                    \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1) const \
+      {                                                                 \
+         instruction *inst = emit(BRW_OPCODE_##op, dst, src0, src1);    \
+         inst->writes_accumulator = true;                               \
+         return inst;                                                   \
+      }
+
+#define ALU3(op)                                                        \
+      instruction *                                                     \
+      op(const dst_reg &dst, const src_reg &src0, const src_reg &src1,  \
+         const src_reg &src2) const                                     \
+      {                                                                 \
+         return emit(BRW_OPCODE_##op, dst, src0, src1, src2);           \
+      }
+
+      ALU2(ADD)
+      ALU2_ACC(ADDC)
+      ALU2(AND)
+      ALU2(ASR)
+      ALU2(AVG)
+      ALU3(BFE)
+      ALU2(BFI1)
+      ALU3(BFI2)
+      ALU1(BFREV)
+      ALU1(CBIT)
+      ALU2(CMPN)
+      ALU3(CSEL)
+      ALU1(DIM)
+      ALU2(DP2)
+      ALU2(DP3)
+      ALU2(DP4)
+      ALU2(DPH)
+      ALU1(F16TO32)
+      ALU1(F32TO16)
+      ALU1(FBH)
+      ALU1(FBL)
+      ALU1(FRC)
+      ALU2(LINE)
+      ALU1(LZD)
+      ALU2(MAC)
+      ALU2_ACC(MACH)
+      ALU3(MAD)
+      ALU1(MOV)
+      ALU2(MUL)
+      ALU1(NOT)
+      ALU2(OR)
+      ALU2(PLN)
+      ALU1(RNDD)
+      ALU1(RNDE)
+      ALU1(RNDU)
+      ALU1(RNDZ)
+      ALU2(SAD2)
+      ALU2_ACC(SADA2)
+      ALU2(SEL)
+      ALU2(SHL)
+      ALU2(SHR)
+      ALU2_ACC(SUBB)
+      ALU2(XOR)
+
+#undef ALU3
+#undef ALU2_ACC
+#undef ALU2
+#undef ALU1
+      /** @} */
+
+      /**
+       * CMP: Sets the low bit of the destination channels with the result
+       * of the comparison, while the upper bits are undefined, and updates
+       * the flag register with the packed 16 bits of the result.
+       */
+      instruction *
+      CMP(const dst_reg &dst, const src_reg &src0, const src_reg &src1,
+          brw_conditional_mod condition) const
+      {
+         /* Take the instruction:
+          *
+          * CMP null<d> src0<f> src1<f>
+          *
+          * Original gen4 does type conversion to the destination type
+          * before comparison, producing garbage results for floating
+          * point comparisons.
+          *
+          * The destination type doesn't matter on newer generations,
+          * so we set the type to match src0 so we can compact the
+          * instruction.
+          */
+         return set_condmod(condition,
+                            emit(BRW_OPCODE_CMP, retype(dst, src0.type),
+                                 fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Gen4 predicated IF.
+       */
+      instruction *
+      IF(brw_predicate predicate) const
+      {
+         return set_predicate(predicate, emit(BRW_OPCODE_IF));
+      }
+
+      /**
+       * Gen6 IF with embedded comparison.
+       */
+      instruction *
+      IF(const src_reg &src0, const src_reg &src1,
+         brw_conditional_mod condition) const
+      {
+         assert(shader->devinfo->gen == 6);
+         return set_condmod(condition,
+                            emit(BRW_OPCODE_IF,
+                                 null_reg_d(),
+                                 fix_unsigned_negate(src0),
+                                 fix_unsigned_negate(src1)));
+      }
+
+      /**
+       * Emit a linear interpolation instruction.
+       */
+      instruction *
+      LRP(const dst_reg &dst, const src_reg &x, const src_reg &y,
+          const src_reg &a) const
+      {
+         if (shader->devinfo->gen >= 6) {
+            /* The LRP instruction actually does op1 * op0 + op2 * (1 - op0), so
+             * we need to reorder the operands.
+             */
+            return emit(BRW_OPCODE_LRP, dst, a, y, x);
+
+         } else {
+            /* We can't use the LRP instruction.  Emit x*(1-a) + y*a. */
+            const dst_reg y_times_a = vgrf(dst.type);
+            const dst_reg one_minus_a = vgrf(dst.type);
+            const dst_reg x_times_one_minus_a = vgrf(dst.type);
+
+            MUL(y_times_a, y, a);
+            ADD(one_minus_a, negate(a), brw_imm_f(1.0f));
+            MUL(x_times_one_minus_a, x, src_reg(one_minus_a));
+            return ADD(dst, src_reg(x_times_one_minus_a), src_reg(y_times_a));
+         }
+      }
+
+      backend_shader *shader;
+
+   protected:
+      /**
+       * Workaround for negation of UD registers.  See comment in
+       * fs_generator::generate_code() for the details.
+       */
+      src_reg
+      fix_unsigned_negate(const src_reg &src) const
+      {
+         if (src.type == BRW_REGISTER_TYPE_UD && src.negate) {
+            dst_reg temp = vgrf(BRW_REGISTER_TYPE_UD);
+            MOV(temp, src);
+            return src_reg(temp);
+         } else {
+            return src;
+         }
+      }
+
+      /**
+       * Workaround for register access modes not supported by the ternary
+       * instruction encoding.
+       */
+      src_reg
+      fix_3src_operand(const src_reg &src) const
+      {
+         /* Using vec4 uniforms in SIMD4x2 programs is difficult. You'd like to be
+          * able to use vertical stride of zero to replicate the vec4 uniform, like
+          *
+          *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
+          *
+          * But you can't, since vertical stride is always four in three-source
+          * instructions. Instead, unpack the operand into a temporary VGRF so
+          * that the three-source instruction can consume it.
+          */
+
+         /* Only immediates and uniforms with a multi-value swizzle get expanded. */
+         if (src.file != UNIFORM && src.file != IMM)
+            return src;
+
+         if (src.file == UNIFORM && brw_is_single_value_swizzle(src.swizzle))
+            return src;
+
+         const dst_reg expanded = vgrf(src.type);
+         emit(VEC4_OPCODE_UNPACK_UNIFORM, expanded, src);
+         return src_reg(expanded);
+      }
+
+      /**
+       * Workaround for register access modes not supported by the math
+       * instruction.
+       */
+      src_reg
+      fix_math_operand(const src_reg &src) const
+      {
+         /* The gen6 math instruction ignores the source modifiers --
+          * swizzle, abs, negate, and at least some parts of the register
+          * region description.
+          *
+          * Rather than trying to enumerate all these cases, *always* expand the
+          * operand to a temp GRF for gen6.
+          *
+          * For gen7, keep the operand as-is, except if immediate, which gen7 still
+          * can't use.
+          */
+         if (shader->devinfo->gen == 6 ||
+             (shader->devinfo->gen == 7 && src.file == IMM)) {
+            const dst_reg tmp = vgrf(src.type);
+            MOV(tmp, src);
+            return src_reg(tmp);
+         } else {
+            return src;
+         }
+      }
+
+      /**
+       * Workaround other weirdness of the math instruction.
+       */
+      instruction *
+      fix_math_instruction(instruction *inst) const
+      {
+         if (shader->devinfo->gen == 6 &&
+             inst->dst.writemask != WRITEMASK_XYZW) {
+            const dst_reg tmp = vgrf(inst->dst.type);
+            MOV(inst->dst, src_reg(tmp));
+            inst->dst = tmp;
+
+         } else if (shader->devinfo->gen < 6) {
+            const unsigned sources = (inst->src[1].file == BAD_FILE ? 1 : 2);
+            inst->base_mrf = 1;
+            inst->mlen = sources;
+         }
+
+         return inst;
+      }
+
+      bblock_t *block;
+      exec_node *cursor;
+
+      unsigned _dispatch_width;
+      unsigned _group;
+      bool force_writemask_all;
+
+      /** Debug annotation info. */
+      struct {
+         const char *str;
+         const void *ir;
+      } annotation;
+   };
+}
+
+#endif
diff --git a/src/intel/compiler/brw_vec4_cmod_propagation.cpp b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
new file mode 100644
index 00000000000..4454cdbfc94
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_cmod_propagation.cpp
@@ -0,0 +1,172 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+/** @file brw_vec4_cmod_propagation.cpp
+ *
+ * Really similar to brw_fs_cmod_propagation but adapted to vec4 needs. Check
+ * brw_fs_cmod_propagation for further details on the rationale behind this
+ * optimization.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+static bool
+opt_cmod_propagation_local(bblock_t *block)
+{
+   /* Remove flag-only AND/CMP/MOV tests when the instruction that produced
+    * their source can supply the flag.  Returns true if any was removed. */
+   bool progress = false;
+
+   foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
+      /* Candidates are unpredicated, write null and read a VGRF without abs. */
+      if ((inst->opcode != BRW_OPCODE_AND &&
+           inst->opcode != BRW_OPCODE_CMP &&
+           inst->opcode != BRW_OPCODE_MOV) ||
+          inst->predicate != BRW_PREDICATE_NONE ||
+          !inst->dst.is_null() ||
+          inst->src[0].file != VGRF ||
+          inst->src[0].abs)
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_AND &&
+          !(inst->src[1].is_one() &&
+            inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+            !inst->src[0].negate))
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_CMP && !inst->src[1].is_zero())
+         continue;
+
+      if (inst->opcode == BRW_OPCODE_MOV &&
+          inst->conditional_mod != BRW_CONDITIONAL_NZ)
+         continue;
+
+      bool read_flag = false;   /* an instruction in between read the flag */
+      foreach_inst_in_block_reverse_starting_from(vec4_instruction, scan_inst, inst) {
+         if (regions_overlap(inst->src[0], inst->size_read(0),
+                             scan_inst->dst, scan_inst->size_written)) {
+            if ((scan_inst->predicate && scan_inst->opcode != BRW_OPCODE_SEL) ||
+                scan_inst->dst.offset != inst->src[0].offset ||
+                (scan_inst->dst.writemask != WRITEMASK_X &&
+                 scan_inst->dst.writemask != WRITEMASK_XYZW) ||
+                (scan_inst->dst.writemask == WRITEMASK_XYZW &&
+                 inst->src[0].swizzle != BRW_SWIZZLE_XYZW) ||
+                (inst->dst.writemask & ~scan_inst->dst.writemask) != 0 ||
+                scan_inst->exec_size != inst->exec_size ||
+                scan_inst->group != inst->group) {
+               break;
+            }
+
+            /* CMP's result is the same regardless of dest type. */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                scan_inst->opcode == BRW_OPCODE_CMP &&
+                (inst->dst.type == BRW_REGISTER_TYPE_D ||
+                 inst->dst.type == BRW_REGISTER_TYPE_UD)) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* If the AND wasn't handled by the previous case, it isn't safe
+             * to remove it.
+             */
+            if (inst->opcode == BRW_OPCODE_AND)
+               break;
+
+            /* Comparisons operate differently for ints and floats */
+            if (scan_inst->dst.type != inst->dst.type &&
+                (scan_inst->dst.type == BRW_REGISTER_TYPE_F ||
+                 inst->dst.type == BRW_REGISTER_TYPE_F))
+               break;
+
+            /* If the instruction generating inst's source also wrote the
+             * flag, and inst is doing a simple .nz comparison, then inst
+             * is redundant - the appropriate value is already in the flag
+             * register.  Delete inst.
+             */
+            if (inst->conditional_mod == BRW_CONDITIONAL_NZ &&
+                !inst->src[0].negate &&
+                scan_inst->writes_flag()) {
+               inst->remove(block);
+               progress = true;
+               break;
+            }
+
+            /* The conditional mod of the CMP/CMPN instructions behaves
+             * specially because the flag output is not calculated from the
+             * result of the instruction, but the other way around, which
+             * means that even if the condmod to propagate and the condmod
+             * from the CMP instruction are the same they will in general give
+             * different results because they are evaluated based on different
+             * inputs.
+             */
+            if (scan_inst->opcode == BRW_OPCODE_CMP ||
+                scan_inst->opcode == BRW_OPCODE_CMPN)
+               break;
+
+            /* Otherwise, try propagating the conditional. */
+            enum brw_conditional_mod cond =
+               inst->src[0].negate ? brw_swap_cmod(inst->conditional_mod)
+                                   : inst->conditional_mod;
+
+            if (scan_inst->can_do_cmod() &&
+                ((!read_flag && scan_inst->conditional_mod == BRW_CONDITIONAL_NONE) ||
+                 scan_inst->conditional_mod == cond)) {
+               scan_inst->conditional_mod = cond;
+               inst->remove(block);
+               progress = true;
+            }
+            break;
+         }
+
+         if (scan_inst->writes_flag())   /* flag clobbered before reaching inst */
+            break;
+
+         read_flag = read_flag || scan_inst->reads_flag();
+      }
+   }
+
+   return progress;
+}
+
+bool
+vec4_visitor::opt_cmod_propagation()
+{
+   bool changed = false;
+   /* Run the block-local pass on every block, visiting them in reverse. */
+   foreach_block_reverse(block, cfg) {
+      if (opt_cmod_propagation_local(block))
+         changed = true;
+   }
+
+   if (changed)   /* some block reported progress */
+      invalidate_live_intervals();
+   return changed;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_copy_propagation.cpp b/src/intel/compiler/brw_vec4_copy_propagation.cpp
new file mode 100644
index 00000000000..e7f6f93f8bd
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_copy_propagation.cpp
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_vec4_copy_propagation.cpp
+ *
+ * Implements tracking of values copied between registers, and
+ * optimizations based on that: copy propagation and constant
+ * propagation.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+struct copy_entry {
+   src_reg *value[4];   /* per-channel origin of the copy, NULL if unknown */
+   int saturatemask;    /* channels last written by a saturating direct copy */
+};
+
+static bool
+is_direct_copy(vec4_instruction *inst)
+{
+   /* Must be an unpredicated, directly addressed MOV to an aligned VGRF. */
+   if (inst->opcode != BRW_OPCODE_MOV || inst->predicate ||
+       inst->dst.file != VGRF || inst->dst.offset % REG_SIZE != 0 ||
+       inst->dst.reladdr || inst->src[0].reladdr)
+      return false;
+
+   return inst->dst.type == inst->src[0].type ||   /* same type, or VF -> F */
+          (inst->dst.type == BRW_REGISTER_TYPE_F &&
+           inst->src[0].type == BRW_REGISTER_TYPE_VF);
+}
+
+static bool
+is_dominated_by_previous_instruction(vec4_instruction *inst)
+{
+   /* DO, WHILE, ELSE and ENDIF are the opcodes reported as not dominated. */
+   const enum opcode op = inst->opcode;
+   return !(op == BRW_OPCODE_DO || op == BRW_OPCODE_WHILE ||
+            op == BRW_OPCODE_ELSE || op == BRW_OPCODE_ENDIF);
+}
+
+static bool
+is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch)
+{
+   const src_reg *from = values[ch];
+
+   /* Only VGRF destinations are handled, and only VGRF origins matter. */
+   assert(inst->dst.file == VGRF);
+   if (from == NULL || from->file != VGRF ||
+       !regions_overlap(*from, REG_SIZE, inst->dst, inst->size_written))
+      return false;
+
+   return inst->dst.offset != from->offset ||
+          (inst->dst.writemask & (1 << BRW_GET_SWZ(from->swizzle, ch)));
+}
+
+static bool
+is_logic_op(enum opcode opcode)
+{
+   /* True for the bitwise opcodes AND, OR, XOR and NOT. */
+   const bool and_or_xor = opcode == BRW_OPCODE_AND ||
+                           opcode == BRW_OPCODE_OR || opcode == BRW_OPCODE_XOR;
+   return and_or_xor || opcode == BRW_OPCODE_NOT;
+}
+
+/**
+ * Return the origin of a copy as one register when every component selected
+ * by \p readmask was copied from the same register with compatible regions.
+ * A BAD_FILE register is returned otherwise.
+ */
+static src_reg
+get_copy_value(const copy_entry &entry, unsigned readmask)
+{
+   unsigned swz[4] = {};
+   src_reg value;
+
+   for (unsigned chan = 0; chan < 4; chan++) {
+      /* Channels outside the readmask place no constraint on the result. */
+      if ((readmask & (1 << chan)) == 0)
+         continue;
+
+      /* A channel that is read but has no recorded origin defeats the copy. */
+      if (entry.value[chan] == NULL)
+         return src_reg();
+
+      src_reg origin = *entry.value[chan];
+
+      if (origin.file != IMM) {
+         swz[chan] = BRW_GET_SWZ(origin.swizzle, chan);
+         /* Neutralize the swizzle so that src_reg::equals() below ignores
+          * it; the real swizzle is composed once every channel is known.
+          */
+         origin.swizzle = BRW_SWIZZLE_XYZW;
+      } else {
+         swz[chan] = chan;
+      }
+
+      /* The first origin seen is the reference all others must equal. */
+      if (value.file == BAD_FILE)
+         value = origin;
+      else if (!value.equals(origin))
+         return src_reg();
+   }
+
+   return swizzle(value, brw_compose_swizzle(
+                            brw_swizzle_for_mask(readmask),
+                            BRW_SWIZZLE4(swz[0], swz[1], swz[2], swz[3])));
+}
+
+static bool
+try_constant_propagate(const struct gen_device_info *devinfo,
+                       vec4_instruction *inst,
+                       int arg, const copy_entry *entry)
+{
+   /* Replace source \p arg of \p inst with the immediate recorded in
+    * \p entry if possible, returning true on success.  We only handle the
+    * same constant across all the channels read.  Some day, we should
+    * handle the 8-bit float vector format, which would let us constant
+    * propagate vectors better.  We could also be more aggressive here --
+    * some channels might not get used based on the destination writemask.
+    */
+   src_reg value =
+      get_copy_value(*entry,
+                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
+                                                   WRITEMASK_XYZW));
+
+   if (value.file != IMM)
+      return false;
+
+   /* 64-bit types can't be used except for one-source instructions, which
+    * higher levels should have constant folded away, so there's no point in
+    * propagating immediates here.
+    */
+   if (type_sz(value.type) == 8 || type_sz(inst->src[arg].type) == 8)
+      return false;
+
+   if (value.type == BRW_REGISTER_TYPE_VF) {
+      /* The result of bit-casting the component values of a vector float
+       * cannot in general be represented as an immediate.
+       */
+      if (inst->src[arg].type != BRW_REGISTER_TYPE_F)
+         return false;
+   } else {
+      value.type = inst->src[arg].type;
+   }
+
+   if (inst->src[arg].abs) {   /* fold the abs modifier into the immediate */
+      if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
+          !brw_abs_immediate(value.type, &value.as_brw_reg())) {
+         return false;
+      }
+   }
+
+   if (inst->src[arg].negate) {   /* likewise for the negate modifier */
+      if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
+          !brw_negate_immediate(value.type, &value.as_brw_reg())) {
+         return false;
+      }
+   }
+
+   value = swizzle(value, inst->src[arg].swizzle);
+
+   switch (inst->opcode) {
+   case BRW_OPCODE_MOV:
+   case SHADER_OPCODE_BROADCAST:
+      inst->src[arg] = value;
+      return true;
+
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+      if (devinfo->gen < 8)   /* no immediates for these before gen8 */
+         break;
+      /* fallthrough */
+   case BRW_OPCODE_DP2:
+   case BRW_OPCODE_DP3:
+   case BRW_OPCODE_DP4:
+   case BRW_OPCODE_DPH:
+   case BRW_OPCODE_BFI1:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SUBB:
+      if (arg == 1) {   /* only the second source may become an immediate */
+         inst->src[arg] = value;
+         return true;
+      }
+      break;
+
+   case BRW_OPCODE_MACH:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_ADDC:
+      if (arg == 1) {
+	 inst->src[arg] = value;
+	 return true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+	 /* Fit this constant in by commuting the operands.  Exception: we
+	  * can't do this for 32-bit integer MUL/MACH because it's asymmetric.
+	  */
+	 if ((inst->opcode == BRW_OPCODE_MUL ||
+              inst->opcode == BRW_OPCODE_MACH) &&
+	     (inst->src[1].type == BRW_REGISTER_TYPE_D ||
+	      inst->src[1].type == BRW_REGISTER_TYPE_UD))
+	    break;
+	 inst->src[0] = inst->src[1];
+	 inst->src[1] = value;
+	 return true;
+      }
+      break;
+   case GS_OPCODE_SET_WRITE_OFFSET:
+      /* This is just a multiply by a constant with special strides.
+       * The generator will handle immediates in both arguments (generating
+       * a single MOV of the product).  So feel free to propagate in src0.
+       */
+      inst->src[arg] = value;
+      return true;
+
+   case BRW_OPCODE_CMP:
+      if (arg == 1) {
+	 inst->src[arg] = value;
+	 return true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+	 enum brw_conditional_mod new_cmod;
+
+	 new_cmod = brw_swap_cmod(inst->conditional_mod);
+	 if (new_cmod != BRW_CONDITIONAL_NONE) {
+	    /* Fit this constant in by swapping the operands and
+	     * flipping the test.
+	     */
+	    inst->src[0] = inst->src[1];
+	    inst->src[1] = value;
+	    inst->conditional_mod = new_cmod;
+	    return true;
+	 }
+      }
+      break;
+
+   case BRW_OPCODE_SEL:
+      if (arg == 1) {
+	 inst->src[arg] = value;
+	 return true;
+      } else if (arg == 0 && inst->src[1].file != IMM) {
+	 inst->src[0] = inst->src[1];
+	 inst->src[1] = value;
+
+	 /* If this was predicated, flipping operands means
+	  * we also need to flip the predicate.
+	  */
+	 if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
+	    inst->predicate_inverse = !inst->predicate_inverse;
+	 }
+	 return true;
+      }
+      break;
+
+   default:
+      break;
+   }
+
+   return false;   /* the opcode or operand slot cannot take the immediate */
+}
+
+static bool
+is_align1_opcode(unsigned opcode)
+{
+   /* The double conversion opcodes and the opcodes that pick or set one
+    * 32-bit half are the ones reported as ALIGN1 here.
+    */
+   const bool converts = opcode == VEC4_OPCODE_FROM_DOUBLE ||
+                         opcode == VEC4_OPCODE_TO_DOUBLE;
+   const bool picks = opcode == VEC4_OPCODE_PICK_LOW_32BIT ||
+                      opcode == VEC4_OPCODE_PICK_HIGH_32BIT;
+   const bool sets = opcode == VEC4_OPCODE_SET_LOW_32BIT ||
+                     opcode == VEC4_OPCODE_SET_HIGH_32BIT;
+
+   return converts || picks || sets;
+}
+
+static bool
+try_copy_propagate(const struct gen_device_info *devinfo,
+                   vec4_instruction *inst, int arg,
+                   const copy_entry *entry, int attributes_per_reg)
+{
+   /* Try to make source \p arg of \p inst read the origin of its copy
+    * directly.  First build the value as if it came from a single MOV.
+    */
+   src_reg value =
+      get_copy_value(*entry,
+                     brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle,
+                                                   WRITEMASK_XYZW));
+
+   /* Check that we can propagate that value */
+   if (value.file != UNIFORM &&
+       value.file != VGRF &&
+       value.file != ATTR)
+      return false;
+
+   /* In gen < 8 instructions that write 2 registers also need to read 2
+    * registers. Make sure we don't break that restriction by copy
+    * propagating from a uniform.
+    */
+   if (devinfo->gen < 8 && inst->size_written > REG_SIZE && is_uniform(value))
+      return false;
+
+   /* There is a regioning restriction such that if execsize == width
+    * and hstride != 0 then the vstride can't be 0. When we split instructions
+    * that take a single-precision source (like F->DF conversions) we end up
+    * with a 4-wide source on an instruction with an execution size of 4.
+    * If we then copy-propagate the source from a uniform we also end up with a
+    * vstride of 0 and we violate the restriction.
+    */
+   if (inst->exec_size == 4 && value.file == UNIFORM &&
+       type_sz(value.type) == 4)
+      return false;
+
+   /* If the type of the copy value is different from the type of the
+    * instruction then the swizzles and writemasks involved don't have the same
+    * meaning and simply replacing the source would produce different semantics.
+    */
+   if (type_sz(value.type) != type_sz(inst->src[arg].type))
+      return false;
+
+   if (devinfo->gen >= 8 && (value.negate || value.abs) &&
+       is_logic_op(inst->opcode)) {
+      return false;
+   }
+
+   if (inst->src[arg].offset % REG_SIZE || value.offset % REG_SIZE)
+      return false;
+
+   bool has_source_modifiers = value.negate || value.abs;
+
+   /* gen6 math and gen7+ SENDs from GRFs ignore source modifiers on
+    * instructions.
+    */
+   if ((has_source_modifiers || value.file == UNIFORM ||
+        value.swizzle != BRW_SWIZZLE_XYZW) && !inst->can_do_source_mods(devinfo))
+      return false;
+
+   if (has_source_modifiers &&
+       value.type != inst->src[arg].type &&
+       !inst->can_change_types())
+      return false;
+
+   if (has_source_modifiers &&
+       inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
+      return false;
+
+   unsigned composed_swizzle = brw_compose_swizzle(inst->src[arg].swizzle,
+                                                   value.swizzle);
+
+   /* Instructions that operate on vectors in ALIGN1 mode will ignore swizzles
+    * so copy-propagation won't be safe if the composed swizzle is anything
+    * other than the identity.
+    */
+   if (is_align1_opcode(inst->opcode) && composed_swizzle != BRW_SWIZZLE_XYZW)
+      return false;
+
+   if (inst->is_3src(devinfo) &&
+       (value.file == UNIFORM ||
+        (value.file == ATTR && attributes_per_reg != 1)) &&
+       !brw_is_single_value_swizzle(composed_swizzle))
+      return false;
+
+   if (inst->is_send_from_grf())
+      return false;
+
+   /* we can't generally copy-propagate UD negations because we
+    * end up accessing the resulting values as signed integers
+    * instead. See also resolve_ud_negate().
+    */
+   if (value.negate &&
+       value.type == BRW_REGISTER_TYPE_UD)
+      return false;
+
+   /* Don't report progress if this is a noop. */
+   if (value.equals(inst->src[arg]))
+      return false;
+
+   const unsigned dst_saturate_mask = inst->dst.writemask &
+      brw_apply_swizzle_to_mask(inst->src[arg].swizzle, entry->saturatemask);
+
+   if (dst_saturate_mask) {
+      /* We either saturate all or nothing. */
+      if (dst_saturate_mask != inst->dst.writemask)
+         return false;
+
+      /* Limit saturate propagation only to SEL with src1 bounded within 0.0
+       * and 1.0, otherwise skip copy propagate altogether.
+       */
+      switch(inst->opcode) {
+      case BRW_OPCODE_SEL:
+         if (arg != 0 ||
+             inst->src[0].type != BRW_REGISTER_TYPE_F ||
+             inst->src[1].file != IMM ||
+             inst->src[1].type != BRW_REGISTER_TYPE_F ||
+             inst->src[1].f < 0.0 ||
+             inst->src[1].f > 1.0) {
+            return false;
+         }
+         if (!inst->saturate)
+            inst->saturate = true;
+         break;
+      default:
+         return false;
+      }
+   }
+
+   /* Build the final value */
+   if (inst->src[arg].abs) {   /* an outer abs discards the copy's negate */
+      value.negate = false;
+      value.abs = true;
+   }
+   if (inst->src[arg].negate)
+      value.negate = !value.negate;
+
+   value.swizzle = composed_swizzle;
+   if (has_source_modifiers &&
+       value.type != inst->src[arg].type) {
+      assert(inst->can_change_types());
+      for (int i = 0; i < 3; i++) {   /* retype all three sources and the dst */
+         inst->src[i].type = value.type;
+      }
+      inst->dst.type = value.type;
+   } else {
+      value.type = inst->src[arg].type;
+   }
+
+   inst->src[arg] = value;
+   return true;
+}
+
+bool
+vec4_visitor::opt_copy_propagation(bool do_constant_prop)
+{
+   /* Unless we dispatch in 4x2 dual-object mode (i.e. in dual instanced or
+    * single mode) attributes are going to be interleaved, so one register
+    * contains two attribute slots.
+    */
+   const int attributes_per_reg =
+      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+   bool progress = false;
+   struct copy_entry entries[alloc.total_size];   /* NOTE(review): VLA */
+
+   memset(&entries, 0, sizeof(entries));
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      /* This pass only works on basic blocks.  If there's flow
+       * control, throw out all our information and start from
+       * scratch.
+       *
+       * This should really be fixed by using a structure like in
+       * src/glsl/opt_copy_propagation.cpp to track available copies.
+       */
+      if (!is_dominated_by_previous_instruction(inst)) {
+	 memset(&entries, 0, sizeof(entries));
+	 continue;
+      }
+
+      /* For each source arg, see if each component comes from a copy
+       * from the same type file (IMM, VGRF, UNIFORM), and try
+       * optimizing out access to the copy result
+       */
+      for (int i = 2; i >= 0; i--) {
+	 /* Copied values end up in GRFs, and we don't track reladdr
+	  * accesses.
+	  */
+	 if (inst->src[i].file != VGRF ||
+	     inst->src[i].reladdr)
+	    continue;
+
+         /* We only handle register-aligned single GRF copies. */
+         if (inst->size_read(i) != REG_SIZE ||
+             inst->src[i].offset % REG_SIZE)
+            continue;
+
+         const unsigned reg = (alloc.offsets[inst->src[i].nr] +
+                               inst->src[i].offset / REG_SIZE);
+         const copy_entry &entry = entries[reg];
+
+         if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry))
+            progress = true;
+         else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg))
+	    progress = true;
+      }
+
+      /* Track available source registers. */
+      if (inst->dst.file == VGRF) {
+	 const int reg =
+            alloc.offsets[inst->dst.nr] + inst->dst.offset / REG_SIZE;
+
+	 /* Update our destination's current channel values.  For a direct copy,
+	  * the value is the newly propagated source.  Otherwise, we don't know
+	  * the new value, so clear it.
+	  */
+	 bool direct_copy = is_direct_copy(inst);
+         entries[reg].saturatemask &= ~inst->dst.writemask;
+	 for (int i = 0; i < 4; i++) {
+	    if (inst->dst.writemask & (1 << i)) {
+               entries[reg].value[i] = direct_copy ? &inst->src[0] : NULL;
+               entries[reg].saturatemask |=
+                  inst->saturate && direct_copy ? 1 << i : 0;
+            }
+	 }
+
+	 /* Clear the records for any registers whose current value came from
+	  * our destination's updated channels, as the two are no longer equal.
+	  */
+	 if (inst->dst.reladdr)
+	    memset(&entries, 0, sizeof(entries));
+	 else {
+	    for (unsigned i = 0; i < alloc.total_size; i++) {
+	       for (int j = 0; j < 4; j++) {
+		  if (is_channel_updated(inst, entries[i].value, j)) {
+		     entries[i].value[j] = NULL;
+		     entries[i].saturatemask &= ~(1 << j);
+                  }
+	       }
+	    }
+	 }
+      }
+   }
+
+   if (progress)   /* at least one source was rewritten */
+      invalidate_live_intervals();
+
+   return progress;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_cse.cpp b/src/intel/compiler/brw_vec4_cse.cpp
new file mode 100644
index 00000000000..2e65ef78548
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_cse.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2012, 2013, 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_vec4_live_variables.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+/** @file brw_vec4_cse.cpp
+ *
+ * Support for local common subexpression elimination.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 13.1 (p378).
+ */
+
+namespace {
+struct aeb_entry : public exec_node {   /* element of opt_cse_local()'s aeb list */
+   /** The instruction that generates the expression value. */
+   vec4_instruction *generator;
+
+   /** The temporary where the value is stored; BAD_FILE until one is made. */
+   src_reg tmp;
+};
+}
+
+static bool
+is_expression(const vec4_instruction *const inst)
+{
+   switch (inst->opcode) {
+   case SHADER_OPCODE_RCP:
+   case SHADER_OPCODE_RSQ:
+   case SHADER_OPCODE_SQRT:
+   case SHADER_OPCODE_EXP2:
+   case SHADER_OPCODE_LOG2:
+   case SHADER_OPCODE_POW:
+   case SHADER_OPCODE_INT_QUOTIENT:
+   case SHADER_OPCODE_INT_REMAINDER:
+   case SHADER_OPCODE_SIN:
+   case SHADER_OPCODE_COS:
+      return inst->mlen == 0;   /* math opcodes count only when mlen is zero */
+   case BRW_OPCODE_MOV:
+   case BRW_OPCODE_SEL:
+   case BRW_OPCODE_NOT:
+   case BRW_OPCODE_AND:
+   case BRW_OPCODE_OR:
+   case BRW_OPCODE_XOR:
+   case BRW_OPCODE_SHR:
+   case BRW_OPCODE_SHL:
+   case BRW_OPCODE_ASR:
+   case BRW_OPCODE_CMP:
+   case BRW_OPCODE_CMPN:
+   case BRW_OPCODE_ADD:
+   case BRW_OPCODE_MUL:
+   case SHADER_OPCODE_MULH:
+   case BRW_OPCODE_FRC:
+   case BRW_OPCODE_RNDU:
+   case BRW_OPCODE_RNDD:
+   case BRW_OPCODE_RNDE:
+   case BRW_OPCODE_RNDZ:
+   case BRW_OPCODE_LINE:
+   case BRW_OPCODE_PLN:
+   case BRW_OPCODE_MAD:
+   case BRW_OPCODE_LRP:
+   case VEC4_OPCODE_UNPACK_UNIFORM:
+   case SHADER_OPCODE_FIND_LIVE_CHANNEL:
+   case SHADER_OPCODE_BROADCAST:
+   case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+   case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+      return true;              /* always treated as an expression */
+   default:
+      return false;             /* everything else is left alone */
+   }
+}
+
+static bool
+operands_match(const vec4_instruction *a, const vec4_instruction *b)
+{
+   const src_reg *sa = a->src;
+   const src_reg *sb = b->src;
+   /* MAD: src0 must match, src1 and src2 may appear in either order. */
+   if (a->opcode == BRW_OPCODE_MAD)
+      return sa[0].equals(sb[0]) &&
+             ((sa[1].equals(sb[1]) && sa[2].equals(sb[2])) ||
+              (sa[2].equals(sb[1]) && sa[1].equals(sb[2])));
+
+   if (a->is_commutative())   /* src0 and src1 may appear in either order */
+      return (sa[0].equals(sb[0]) && sa[1].equals(sb[1])) ||
+             (sa[1].equals(sb[0]) && sa[0].equals(sb[1]));
+
+   return sa[0].equals(sb[0]) && sa[1].equals(sb[1]) && sa[2].equals(sb[2]);
+}
+
+static bool
+instructions_match(vec4_instruction *a, vec4_instruction *b)
+{
+   /* Opcode, modifiers, flag usage and destination type must agree... */
+   if (a->opcode != b->opcode || a->saturate != b->saturate ||
+       a->predicate != b->predicate ||
+       a->predicate_inverse != b->predicate_inverse ||
+       a->conditional_mod != b->conditional_mod ||
+       a->flag_subreg != b->flag_subreg || a->dst.type != b->dst.type ||
+       a->offset != b->offset || a->mlen != b->mlen ||
+       a->base_mrf != b->base_mrf || a->header_size != b->header_size ||
+       a->shadow_compare != b->shadow_compare)
+      return false;
+
+   /* ...as must the writemask, written size and execution controls... */
+   if (a->dst.writemask != b->dst.writemask ||
+       a->force_writemask_all != b->force_writemask_all ||
+       a->size_written != b->size_written ||
+       a->exec_size != b->exec_size || a->group != b->group)
+      return false;
+   return operands_match(a, b);   /* ...and finally the sources. */
+}
+
+bool
+vec4_visitor::opt_cse_local(bblock_t *block)
+{
+   bool progress = false;   /* set when a repeated expression is found */
+   exec_list aeb;           /* expressions already seen in this block */
+
+   void *cse_ctx = ralloc_context(NULL);   /* ralloc parent of every aeb_entry */
+
+   int ip = block->start_ip;   /* incremented once per visited instruction */
+   foreach_inst_in_block (vec4_instruction, inst, block) {
+      /* Candidates: unpredicated, mlen == 0, dst not ARF/FIXED_GRF or null. */
+      if (is_expression(inst) && !inst->predicate && inst->mlen == 0 &&
+          ((inst->dst.file != ARF && inst->dst.file != FIXED_GRF) ||
+           inst->dst.is_null()))
+      {
+         bool found = false;
+
+         foreach_in_list_use_after(aeb_entry, entry, &aeb) {
+            /* Match current instruction's expression against those in AEB. */
+            if (!(entry->generator->dst.is_null() && !inst->dst.is_null()) &&
+                instructions_match(inst, entry->generator)) {
+               found = true;
+               progress = true;
+               break;
+            }
+         }
+
+         if (!found) {
+            if (inst->opcode != BRW_OPCODE_MOV ||
+                (inst->opcode == BRW_OPCODE_MOV &&
+                 inst->src[0].file == IMM &&
+                 inst->src[0].type == BRW_REGISTER_TYPE_VF)) {
+               /* Our first sighting of this expression.  Create an entry. */
+               aeb_entry *entry = ralloc(cse_ctx, aeb_entry);
+               entry->tmp = src_reg(); /* file will be BAD_FILE */
+               entry->generator = inst;
+               aeb.push_tail(entry);
+            }
+         } else {
+            /* This is at least our second sighting of this expression.
+             * If we don't have a temporary already, make one.
+             */
+            bool no_existing_temp = entry->tmp.file == BAD_FILE;
+            if (no_existing_temp && !entry->generator->dst.is_null()) {
+               entry->tmp = retype(src_reg(VGRF, alloc.allocate(
+                                              regs_written(entry->generator)),
+                                           NULL), inst->dst.type);
+
+               const unsigned width = entry->generator->exec_size;
+               unsigned component_size = width * type_sz(entry->tmp.type);
+               unsigned num_copy_movs =
+                  DIV_ROUND_UP(entry->generator->size_written, component_size);
+               for (unsigned i = 0; i < num_copy_movs; ++i) {
+                  vec4_instruction *copy =
+                     MOV(offset(entry->generator->dst, width, i),
+                         offset(entry->tmp, width, i));
+                  copy->exec_size = width;
+                  copy->group = entry->generator->group;
+                  copy->force_writemask_all =
+                     entry->generator->force_writemask_all;
+                  entry->generator->insert_after(block, copy);
+               }
+
+               entry->generator->dst = dst_reg(entry->tmp);
+            }
+
+            /* dest <- temp */
+            if (!inst->dst.is_null()) {
+               assert(inst->dst.type == entry->tmp.type);
+               const unsigned width = inst->exec_size;
+               unsigned component_size = width * type_sz(inst->dst.type);
+               unsigned num_copy_movs =
+                  DIV_ROUND_UP(inst->size_written, component_size);
+               for (unsigned i = 0; i < num_copy_movs; ++i) {
+                  vec4_instruction *copy =
+                     MOV(offset(inst->dst, width, i),
+                         offset(entry->tmp, width, i));
+                  copy->exec_size = inst->exec_size;
+                  copy->group = inst->group;
+                  copy->force_writemask_all = inst->force_writemask_all;
+                  inst->insert_before(block, copy);
+               }
+            }
+
+            /* Set our iterator so that next time through the loop inst->next
+             * will get the instruction in the basic block after the one we've
+             * removed.
+             */
+            vec4_instruction *prev = (vec4_instruction *)inst->prev;
+
+            inst->remove(block);
+            inst = prev;
+         }
+      }
+
+      foreach_in_list_safe(aeb_entry, entry, &aeb) {
+         /* Kill all AEB entries that write a different value to or read from
+          * the flag register if we just wrote it.
+          */
+         if (inst->writes_flag()) {
+            if (entry->generator->reads_flag() ||
+                (entry->generator->writes_flag() &&
+                 !instructions_match(inst, entry->generator))) {
+               entry->remove();
+               ralloc_free(entry);
+               continue;
+            }
+         }
+
+         for (int i = 0; i < 3; i++) {
+            src_reg *src = &entry->generator->src[i];
+
+            /* Kill all AEB entries that use the destination we just
+             * overwrote.
+             */
+            if (inst->dst.file == entry->generator->src[i].file &&
+                inst->dst.nr == entry->generator->src[i].nr) {
+               entry->remove();
+               ralloc_free(entry);
+               break;
+            }
+
+            /* Kill any AEB entries using registers that don't get reused any
+             * more -- a sure sign they'll fail operands_match().
+             */
+            if (src->file == VGRF) {
+               if (var_range_end(var_from_reg(alloc, dst_reg(*src)), 8) < ip) {
+                  entry->remove();
+                  ralloc_free(entry);
+                  break;
+               }
+            }
+         }
+      }
+
+      ip++;
+   }
+
+   ralloc_free(cse_ctx);   /* frees the context and any entries still in it */
+
+   return progress;
+}
+
+bool
+vec4_visitor::opt_cse()
+{
+   /* Live intervals are computed before any block is processed. */
+   calculate_live_intervals();
+
+   bool changed = false;
+   foreach_block (block, cfg) {
+      if (opt_cse_local(block))
+         changed = true;
+   }
+
+   if (changed)   /* some block reported progress */
+      invalidate_live_intervals();
+   return changed;
+}
diff --git a/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp
new file mode 100644
index 00000000000..5b22a096dd1
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_dead_code_eliminate.cpp
@@ -0,0 +1,160 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_vec4_live_variables.h"
+#include "brw_cfg.h"
+
+/** @file brw_vec4_dead_code_eliminate.cpp
+ *
+ * Dataflow-aware dead code elimination.
+ *
+ * Walks the instruction list from the bottom, removing instructions that
+ * have results that both aren't used in later blocks and haven't been read
+ * yet in the tail end of this block.
+ */
+
+using namespace brw;
+
+bool
+vec4_visitor::dead_code_eliminate()
+{
+   bool progress = false;
+   /* Drop dead instructions and dead writemask channels, bottom-up. */
+   calculate_live_intervals();
+   /* Scratch liveness sets: one bit per variable, plus the flag channels. */
+   int num_vars = live_intervals->num_vars;
+   BITSET_WORD *live = rzalloc_array(NULL, BITSET_WORD, BITSET_WORDS(num_vars));
+   BITSET_WORD *flag_live = rzalloc_array(NULL, BITSET_WORD, 1);
+   /* Each block starts from a copy of its own live-out sets. */
+   foreach_block_reverse_safe(block, cfg) {
+      memcpy(live, live_intervals->block_data[block->num].liveout,
+             sizeof(BITSET_WORD) * BITSET_WORDS(num_vars));
+      memcpy(flag_live, live_intervals->block_data[block->num].flag_liveout,
+             sizeof(BITSET_WORD));
+
+      foreach_inst_in_block_reverse_safe(vec4_instruction, inst, block) {
+         if ((inst->dst.file == VGRF && !inst->has_side_effects()) ||
+             (inst->dst.is_null() && inst->writes_flag())){
+            bool result_live[4] = { false };
+            if (inst->dst.file == VGRF) {
+               for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+                  for (int c = 0; c < 4; c++) {
+                     const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+                     result_live[c] |= BITSET_TEST(live, v);
+                  }
+               }
+            } else {
+               for (unsigned c = 0; c < 4; c++)
+                  result_live[c] = BITSET_TEST(flag_live, c);
+            }
+
+            /* If the instruction can't do writemasking, then it's all or
+             * nothing.
+             */
+            if (!inst->can_do_writemask(devinfo)) {
+               bool result = result_live[0] | result_live[1] |
+                             result_live[2] | result_live[3];
+               result_live[0] = result;
+               result_live[1] = result;
+               result_live[2] = result;
+               result_live[3] = result;
+            }
+            /* Strip channels nobody reads from the writemask. */
+            for (int c = 0; c < 4; c++) {
+               if (!result_live[c] && inst->dst.writemask & (1 << c)) {
+                  inst->dst.writemask &= ~(1 << c);
+                  progress = true;
+
+                  if (inst->dst.writemask == 0) {
+                     if (inst->writes_accumulator || inst->writes_flag()) {
+                        inst->dst = dst_reg(retype(brw_null_reg(), inst->dst.type));
+                     } else {
+                        inst->opcode = BRW_OPCODE_NOP;
+                        break;
+                     }
+                  }
+               }
+            }
+         }
+         /* A flag-only write is dead once no flag channel is live. */
+         if (inst->dst.is_null() && inst->writes_flag()) {
+            bool combined_live = false;
+            for (unsigned c = 0; c < 4; c++)
+               combined_live |= BITSET_TEST(flag_live, c);
+
+            if (!combined_live) {
+               inst->opcode = BRW_OPCODE_NOP;
+               progress = true;
+            }
+         }
+         /* Unpredicated, non-partial writes end the liveness of their dst. */
+         if (inst->dst.file == VGRF && !inst->predicate &&
+             !inst->is_align1_partial_write()) {
+            for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+               for (int c = 0; c < 4; c++) {
+                  if (inst->dst.writemask & (1 << c)) {
+                     const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+                     BITSET_CLEAR(live, v);
+                  }
+               }
+            }
+         }
+         /* Likewise for an unpredicated write of the flag register. */
+         if (inst->writes_flag() && !inst->predicate) {
+            for (unsigned c = 0; c < 4; c++)
+               BITSET_CLEAR(flag_live, c);
+         }
+         /* Instructions turned into NOPs above are deleted outright. */
+         if (inst->opcode == BRW_OPCODE_NOP) {
+            inst->remove(block);
+            continue;
+         }
+         /* Whatever a surviving instruction reads becomes live. */
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].file == VGRF) {
+               for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
+                  for (int c = 0; c < 4; c++) {
+                     const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
+                     BITSET_SET(live, v);
+                  }
+               }
+            }
+         }
+
+         for (unsigned c = 0; c < 4; c++) {
+            if (inst->reads_flag(c)) {
+               BITSET_SET(flag_live, c);
+            }
+         }
+      }
+   }
+
+   ralloc_free(live);
+   ralloc_free(flag_live);
+
+   if (progress)
+      invalidate_live_intervals();
+
+   return progress;
+}
diff --git a/src/intel/compiler/brw_vec4_generator.cpp b/src/intel/compiler/brw_vec4_generator.cpp
new file mode 100644
index 00000000000..2ac287f17fa
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_generator.cpp
@@ -0,0 +1,2217 @@
+/* Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+#include "common/gen_debug.h"
+
+using namespace brw;
+
+static void
+generate_math1_gen4(struct brw_codegen *p,
+                    vec4_instruction *inst,
+                    struct brw_reg dst,
+                    struct brw_reg src)
+{
+   /* One-operand math: the math function is looked up from the IR opcode
+    * and the call is made with inst->base_mrf at full precision.
+    */
+   gen4_math(p, dst,
+             brw_math_function(inst->opcode),
+             inst->base_mrf, src, BRW_MATH_PRECISION_FULL);
+}
+
+static void
+check_gen6_math_src_arg(struct brw_reg src)
+{
+   /* Operand must be plain: no abs/negate modifiers, and an identity
+    * swizzle because source swizzles are ignored. */
+   assert(!(src.abs || src.negate));
+   assert(src.swizzle == BRW_SWIZZLE_XYZW);
+}
+
+static void
+generate_math_gen6(struct brw_codegen *p,
+                   vec4_instruction *inst,
+                   struct brw_reg dst,
+                   struct brw_reg src0,
+                   struct brw_reg src1)
+{
+   const bool src1_is_grf = src1.file == BRW_GENERAL_REGISTER_FILE;
+   /* Math can't be align16, so no writemask and no source swizzles. */
+   assert(dst.writemask == WRITEMASK_XYZW);
+   check_gen6_math_src_arg(src0);
+   if (src1_is_grf)
+      check_gen6_math_src_arg(src1);
+   /* Emit in align1 mode, then switch the default back to align16. */
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   gen6_math(p, dst, brw_math_function(inst->opcode), src0, src1);
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+}
+
+static void
+generate_math2_gen4(struct brw_codegen *p,
+                    vec4_instruction *inst,
+                    struct brw_reg dst,
+                    struct brw_reg src0,
+                    struct brw_reg src1)
+{
+   /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
+    * "Message Payload":
+    *
+    * "Operand0[7].  For the INT DIV functions, this operand is the
+    *  denominator."
+    *  ...
+    * "Operand1[7].  For the INT DIV functions, this operand is the
+    *  numerator."
+    *
+    * So everything except POW is sent with its two sources exchanged.
+    */
+   struct brw_reg opnd0 = src0, opnd1 = src1;
+   if (inst->opcode != SHADER_OPCODE_POW) {
+      opnd0 = src1;
+      opnd1 = src0;
+   }
+
+   brw_push_insn_state(p);
+   brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+   brw_set_default_saturate(p, false);
+   brw_MOV(p, retype(brw_message_reg(inst->base_mrf + 1), opnd1.type), opnd1);
+   brw_pop_insn_state(p);
+
+   gen4_math(p, dst, brw_math_function(inst->opcode), inst->base_mrf, opnd0,
+             BRW_MATH_PRECISION_FULL);
+}
+
+/* Emit the SIMD4x2 sampler message for a vec4 texturing instruction, with
+ * the surface and sampler given as immediates or through a0.0.
+ */
+static void
+generate_tex(struct brw_codegen *p,
+             struct brw_vue_prog_data *prog_data,
+             gl_shader_stage stage,
+             vec4_instruction *inst,
+             struct brw_reg dst,
+             struct brw_reg src,
+             struct brw_reg surface_index,
+             struct brw_reg sampler_index)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   int msg_type = -1;
+
+   if (devinfo->gen >= 5) {
+      switch (inst->opcode) {
+      case SHADER_OPCODE_TEX:
+      case SHADER_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
+	 } else {
+	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
+	 }
+	 break;
+      case SHADER_OPCODE_TXD:
+         if (inst->shadow_compare) {
+            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
+            assert(devinfo->gen >= 8 || devinfo->is_haswell);
+            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
+         } else {
+            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
+         }
+	 break;
+      case SHADER_OPCODE_TXF:
+	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+	 break;
+      case SHADER_OPCODE_TXF_CMS_W:
+         assert(devinfo->gen >= 9);
+         msg_type = GEN9_SAMPLER_MESSAGE_SAMPLE_LD2DMS_W;
+         break;
+      case SHADER_OPCODE_TXF_CMS:
+         if (devinfo->gen >= 7)
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
+         else
+            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
+         break;
+      case SHADER_OPCODE_TXF_MCS:
+         assert(devinfo->gen >= 7);
+         msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD_MCS;
+         break;
+      case SHADER_OPCODE_TXS:
+	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
+	 break;
+      case SHADER_OPCODE_TG4:
+         if (inst->shadow_compare) {
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_C;
+         } else {
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4;
+         }
+         break;
+      case SHADER_OPCODE_TG4_OFFSET:
+         if (inst->shadow_compare) {
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO_C;
+         } else {
+            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_GATHER4_PO;
+         }
+         break;
+      case SHADER_OPCODE_SAMPLEINFO:
+         msg_type = GEN6_SAMPLER_MESSAGE_SAMPLE_SAMPLEINFO;
+         break;
+      default:
+	 unreachable("should not get here: invalid vec4 texture opcode");
+      }
+   } else {
+      switch (inst->opcode) {
+      case SHADER_OPCODE_TEX:
+      case SHADER_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_COMPARE;
+	    assert(inst->mlen == 3);
+	 } else {
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD;
+	    assert(inst->mlen == 2);
+	 }
+	 break;
+      case SHADER_OPCODE_TXD:
+	 /* There is no sample_d_c message; comparisons are done manually. */
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_GRADIENTS;
+	 assert(inst->mlen == 4);
+	 break;
+      case SHADER_OPCODE_TXF:
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_LD;
+	 assert(inst->mlen == 2);
+	 break;
+      case SHADER_OPCODE_TXS:
+	 msg_type = BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO;
+	 assert(inst->mlen == 2);
+	 break;
+      default:
+	 unreachable("should not get here: invalid vec4 texture opcode");
+      }
+   }
+
+   assert(msg_type != -1);
+   assert(sampler_index.type == BRW_REGISTER_TYPE_UD);
+
+   /* Load the message header if present.  If there's a texture offset, we need
+    * to set it up explicitly and load the offset bitfield.  Otherwise, we can
+    * use an implied move from g0 to the first message register.
+    */
+   if (inst->header_size != 0) {
+      if (devinfo->gen < 6 && !inst->offset) {
+         /* Set up an implied move from g0 to the MRF. */
+         src = brw_vec8_grf(0, 0);
+      } else {
+         struct brw_reg header =
+            retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+         uint32_t dw2 = 0;
+
+         /* Explicitly set up the message header by copying g0 to the MRF. */
+         brw_push_insn_state(p);
+         brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+         brw_MOV(p, header, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         if (inst->offset)
+            /* Set the texel offset bits in DWord 2. */
+            dw2 = inst->offset;
+
+         if (devinfo->gen >= 9)
+            /* SKL+ overloads BRW_SAMPLER_SIMD_MODE_SIMD4X2 to also do SIMD8D,
+             * based on bit 22 in the header.
+             */
+            dw2 |= GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;
+
+         /* The VS, DS, and FS stages have the g0.2 payload delivered as 0,
+          * so header0.2 is 0 when g0 is copied.  The HS and GS stages do
+          * not, so we must set it to 0 to avoid setting undesirable bits
+          * in the message header.
+          */
+         if (dw2 ||
+             stage == MESA_SHADER_TESS_CTRL ||
+             stage == MESA_SHADER_GEOMETRY) {
+            brw_MOV(p, get_element_ud(header, 2), brw_imm_ud(dw2));
+         }
+
+         brw_adjust_sampler_state_pointer(p, header, sampler_index);
+         brw_pop_insn_state(p);
+      }
+   }
+
+   uint32_t return_format;
+   switch (dst.type) {
+   case BRW_REGISTER_TYPE_D:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
+      break;
+   case BRW_REGISTER_TYPE_UD:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
+      break;
+   default:
+      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
+      break;
+   }
+
+   uint32_t base_binding_table_index = (inst->opcode == SHADER_OPCODE_TG4 ||
+         inst->opcode == SHADER_OPCODE_TG4_OFFSET)
+         ? prog_data->base.binding_table.gather_texture_start
+         : prog_data->base.binding_table.texture_start;
+
+   if (surface_index.file == BRW_IMMEDIATE_VALUE &&
+       sampler_index.file == BRW_IMMEDIATE_VALUE) {
+      uint32_t surface = surface_index.ud;
+      uint32_t sampler = sampler_index.ud;
+      brw_SAMPLE(p,
+                 dst,
+                 inst->base_mrf,
+                 src,
+                 surface + base_binding_table_index,
+                 sampler % 16,
+                 msg_type,
+                 1, /* response length */
+                 inst->mlen,
+                 inst->header_size != 0,
+                 BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                 return_format);
+      /* Mark the binding table entry the message actually references: it
+       * is indexed by the surface, which may differ from the sampler. */
+      brw_mark_surface_used(&prog_data->base, surface + base_binding_table_index);
+   } else {
+      /* Non-constant sampler index. */
+      struct brw_reg addr = vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+      struct brw_reg surface_reg = vec1(retype(surface_index, BRW_REGISTER_TYPE_UD));
+      struct brw_reg sampler_reg = vec1(retype(sampler_index, BRW_REGISTER_TYPE_UD));
+
+      brw_push_insn_state(p);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+      if (brw_regs_equal(&surface_reg, &sampler_reg)) {
+         brw_MUL(p, addr, sampler_reg, brw_imm_uw(0x101));
+      } else {
+         if (sampler_reg.file == BRW_IMMEDIATE_VALUE) {
+            brw_OR(p, addr, surface_reg, brw_imm_ud(sampler_reg.ud << 8));
+         } else {
+            brw_SHL(p, addr, sampler_reg, brw_imm_ud(8));
+            brw_OR(p, addr, addr, surface_reg);
+         }
+      }
+      if (base_binding_table_index)
+         brw_ADD(p, addr, addr, brw_imm_ud(base_binding_table_index));
+      brw_AND(p, addr, addr, brw_imm_ud(0xfff));
+
+      brw_pop_insn_state(p);
+
+      if (inst->base_mrf != -1)
+         gen6_resolve_implied_move(p, &src, inst->base_mrf);
+
+      /* dst = send(offset, a0.0 | <descriptor>) */
+      brw_inst *insn = brw_send_indirect_message(
+         p, BRW_SFID_SAMPLER, dst, src, addr);
+      brw_set_sampler_message(p, insn,
+                              0 /* surface */,
+                              0 /* sampler */,
+                              msg_type,
+                              1 /* rlen */,
+                              inst->mlen /* mlen */,
+                              inst->header_size != 0 /* header */,
+                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                              return_format);
+
+      /* visitor knows more than we do about the surface limit required,
+       * so has already done marking.
+       */
+   }
+}
+
+static void
+generate_vs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
+{
+   /* URB write with a null destination, no response, and g0 as the source;
+    * message start, length, flags and offset all come from the instruction.
+    */
+   brw_urb_WRITE(p, brw_null_reg(), inst->base_mrf, brw_vec8_grf(0, 0),
+                 inst->urb_write_flags,
+                 inst->mlen,
+                 0 /* response len */,
+                 inst->offset /* urb destination offset */,
+                 BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_gs_urb_write(struct brw_codegen *p, vec4_instruction *inst)
+{
+   /* The source operand is the message register at inst->base_mrf; the
+    * destination is null and no response is requested.
+    */
+   brw_urb_WRITE(p,
+                 brw_null_reg() /* dest */,
+                 inst->base_mrf /* starting mrf reg nr */,
+                 brw_message_reg(inst->base_mrf) /* src */,
+                 inst->urb_write_flags, inst->mlen,
+                 0 /* response len */, inst->offset /* urb offset */,
+                 BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_gs_urb_write_allocate(struct brw_codegen *p, vec4_instruction *inst)
+{
+   const struct brw_reg payload = brw_message_reg(inst->base_mrf);
+   const struct brw_reg writeback = inst->src[0].as_brw_reg();
+
+   /* The temporary handed to us in src0 receives the one-register
+    * response of the allocating URB write.
+    */
+   brw_urb_WRITE(p, writeback, inst->base_mrf, payload,
+                 BRW_URB_WRITE_ALLOCATE_COMPLETE,
+                 inst->mlen,
+                 1 /* response len */,
+                 inst->offset /* urb destination offset */,
+                 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   /* Copy DWord 0 of that response (the new URB handle) into dst.0. */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(inst->dst.as_brw_reg(), 0),
+           get_element_ud(writeback, 0));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
+{
+   /* URB write with EOT OR'd into the instruction's own write flags; no
+    * response is requested and the destination offset is zero.
+    */
+   const struct brw_reg payload = brw_message_reg(inst->base_mrf);
+
+   brw_urb_WRITE(p, brw_null_reg(), inst->base_mrf, payload,
+                 BRW_URB_WRITE_EOT | inst->urb_write_flags,
+                 inst->mlen,
+                 0 /* response len */, 0 /* urb destination offset */,
+                 BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_gs_set_write_offset(struct brw_codegen *p,
+                             struct brw_reg dst,
+                             struct brw_reg src0,
+                             struct brw_reg src1)
+{
+   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
+    * Header: M0.3):
+    *
+    *     Slot 0 Offset. This field, after adding to the Global Offset field
+    *     in the message descriptor, specifies the offset (in 256-bit units)
+    *     from the start of the URB entry, as referenced by URB Handle 0, at
+    *     which the data will be accessed.
+    *
+    * Similar text describes DWORD M0.4, which is slot 1 offset.
+    *
+    * So DWORDs 0 and 4 of src0 (the x components of the register for
+    * geometry shader invocations 0 and 1) get multiplied by the immediate
+    * in src1, and the products land in DWORDs 3 and 4 of dst:
+    *
+    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1<...>UW   { Align1 WE_all }
+    *
+    * An immediate src0 is folded with src1 here and emitted as a MOV.
+    */
+   assert(p->devinfo->gen >= 7 &&
+          src1.file == BRW_IMMEDIATE_VALUE &&
+          src1.type == BRW_REGISTER_TYPE_UD &&
+          src1.ud <= USHRT_MAX);
+   const struct brw_reg offsets = suboffset(stride(dst, 2, 2, 1), 3);
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   if (src0.file != BRW_IMMEDIATE_VALUE) {
+      brw_MUL(p, offsets, stride(src0, 8, 2, 4),
+              retype(src1, BRW_REGISTER_TYPE_UW));
+   } else {
+      brw_MOV(p, offsets, brw_imm_ud(src0.ud * src1.ud));
+   }
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_vertex_count(struct brw_codegen *p,
+                             struct brw_reg dst,
+                             struct brw_reg src)
+{
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   if (p->devinfo->gen < 8) {
+      /* Viewing src and dst as 8 DWORDs each, the goal is to take DWORDs 0
+       * and 4 of src, truncate both to WORDs, and pack the pair into
+       * DWORD 2 of dst.
+       *
+       * That is simpler to express with both registers viewed as 16 WORDs:
+       * read WORDs 0 and 8 of src and write them to WORDs 4 and 5 of
+       * dst.
+       *
+       * The EU instruction for it is:
+       *
+       *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
+       */
+      const struct brw_reg dst_uw = retype(dst, BRW_REGISTER_TYPE_UW);
+      const struct brw_reg src_uw = retype(src, BRW_REGISTER_TYPE_UW);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_MOV(p, suboffset(stride(dst_uw, 2, 2, 1), 4),
+              stride(src_uw, 8, 1, 0));
+   } else {
+      /* Gen8+: the vertex count goes in the second MRF for the EOT write. */
+      brw_MOV(p, retype(brw_message_reg(dst.nr + 1), BRW_REGISTER_TYPE_UD),
+              src);
+   }
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_svb_write(struct brw_codegen *p,
+                      struct brw_vue_prog_data *prog_data,
+                      vec4_instruction *inst,
+                      struct brw_reg dst,
+                      struct brw_reg src0,
+                      struct brw_reg src1)
+{
+   const bool commit = inst->sol_final_write;
+   const unsigned bti = BRW_GEN6_SOL_BINDING_START + inst->sol_binding;
+
+   /* Stage the vertex data in M0.x with a four-channel move. */
+   brw_push_insn_state(p);
+   brw_set_default_exec_size(p, BRW_EXECUTE_4);
+   brw_MOV(p, stride(dst, 4, 4, 1),
+           stride(retype(src0, BRW_REGISTER_TYPE_UD), 4, 4, 1));
+   brw_pop_insn_state(p);
+
+   brw_push_insn_state(p);
+   /* Send the SVB write.  Only the final write asks for a commit, and only
+    * then is src1 used as the destination of the message.
+    */
+   brw_svb_write(p, commit ? src1 : brw_null_reg(),
+                 1 /* msg_reg_nr */,
+                 dst /* src0 == previous dst */,
+                 bti /* binding_table_index */,
+                 commit /* send_commit_msg */);
+
+   /* Then wait for the write commit to land so that it is safe to move on
+    * to other things.
+    *
+    * From the Sandybridge PRM, Volume 4, Part 1, Section 3.3:
+    *
+    *   The write commit does not modify the destination register, but
+    *   merely clears the dependency associated with the destination
+    *   register. Thus, a simple “mov” instruction using the register as a
+    *   source is sufficient to wait for the write commit to occur.
+    */
+   if (commit)
+      brw_MOV(p, src1, src1);
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_svb_set_destination_index(struct brw_codegen *p,
+                                      vec4_instruction *inst,
+                                      struct brw_reg dst,
+                                      struct brw_reg src)
+{
+   /* dst.5 <- src.<sol_vertex>, as an align1 move with masking disabled. */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(dst, 5), get_element_ud(src, inst->sol_vertex));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_dword_2(struct brw_codegen *p,
+                        struct brw_reg dst,
+                        struct brw_reg src)
+{
+   brw_push_insn_state(p);  /* scalar copy: dst element 2 <- src element 0 */
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, suboffset(vec1(dst), 2), suboffset(vec1(src), 0));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_prepare_channel_masks(struct brw_codegen *p,
+                                  struct brw_reg dst)
+{
+   /* Only DWORD 4 (the x component belonging to the second geometry
+    * shader invocation) is touched: it is shifted left by 4 bits in place.
+    * The instruction generated is:
+    *
+    *     shl(1) dst.4<1>UD dst.4<0,1,0>UD 4UD { align1 WE_all }
+    */
+   const struct brw_reg dword4 = suboffset(vec1(dst), 4);
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_SHL(p, dword4, dword4, brw_imm_ud(4));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_channel_masks(struct brw_codegen *p,
+                              struct brw_reg dst,
+                              struct brw_reg src)
+{
+   /* From p21 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
+    * Header: M0.5):
+    *
+    *     15 Vertex 1 DATA [3] / Vertex 0 DATA[7] Channel Mask
+    *
+    *        When Swizzle Control = URB_INTERLEAVED this bit controls Vertex 1
+    *        DATA[3], when Swizzle Control = URB_NOSWIZZLE this bit controls
+    *        Vertex 0 DATA[7].  This bit is ANDed with the corresponding
+    *        channel enable to determine the final channel enable.  For the
+    *        URB_READ_OWORD & URB_READ_HWORD messages, when final channel
+    *        enable is 1 it indicates that Vertex 1 DATA [3] will be included
+    *        in the writeback message.  For the URB_WRITE_OWORD &
+    *        URB_WRITE_HWORD messages, when final channel enable is 1 it
+    *        indicates that Vertex 1 DATA [3] will be written to the surface.
+    *
+    *        0: Vertex 1 DATA [3] / Vertex 0 DATA[7] channel not included
+    *        1: Vertex DATA [3] / Vertex 0 DATA[7] channel included
+    *
+    *     14 Vertex 1 DATA [2] Channel Mask
+    *     13 Vertex 1 DATA [1] Channel Mask
+    *     12 Vertex 1 DATA [0] Channel Mask
+    *     11 Vertex 0 DATA [3] Channel Mask
+    *     10 Vertex 0 DATA [2] Channel Mask
+    *      9 Vertex 0 DATA [1] Channel Mask
+    *      8 Vertex 0 DATA [0] Channel Mask
+    *
+    * (That section of the PRM is agnostic to the particular type of shader
+    * being executed, so "Vertex 0" and "Vertex 1" refer to geometry shader
+    * invocations 0 and 1, respectively).  The enable flags for geometry
+    * shader invocation 0 sit in bits 3:0 of DWORD 0 and those for geometry
+    * shader invocation 1 in bits 7:0 of DWORD 4, so all that is needed is
+    * to OR the two together and store the result in bits 15:8 of
+    * DWORD 5.
+    *
+    * That is simpler to express with src and dst viewed as 32 bytes each:
+    * pick up the contents of bytes 0 and 16 from src, OR them together,
+    * and store the result in byte 21 of dst.  The byte-typed views are
+    * created below for exactly that purpose.
+    *
+    * The EU instruction for it is:
+    *
+    *     or(1) dst.21<1>UB src<0,1,0>UB src.16<0,1,0>UB { align1 WE_all }
+    *
+    * Note: this relies on the source register having zeros in (a) bits 7:4 of
+    * DWORD 0 and (b) bits 3:0 of DWORD 4.  (b) holds because the source
+    * register was prepared by GS_OPCODE_PREPARE_CHANNEL_MASKS (which shifts
+    * DWORD 4 left by 4 bits), and (a) holds because prior to the execution
+    * of GS_OPCODE_PREPARE_CHANNEL_MASKS, DWORDs 0 and 4 need to contain
+    * valid channel mask values (which are in the range 0x0-0xf).
+    */
+   const struct brw_reg out = retype(dst, BRW_REGISTER_TYPE_UB);
+   const struct brw_reg in = retype(src, BRW_REGISTER_TYPE_UB);
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_OR(p, suboffset(vec1(out), 21), vec1(in), suboffset(vec1(in), 16));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_get_instance_id(struct brw_codegen *p,
+                            struct brw_reg dst)
+{
+   /* Shift R0.0 and R0.1 right by GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT,
+    * placing the results at dst.0 and dst.4.  The instruction is:
+    *
+    *     shr(8) dst<1> R0<1,4,0> GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT { align1 WE_normal 1Q }
+    */
+   const struct brw_reg r0_ud =
+      retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_SHR(p, retype(dst, BRW_REGISTER_TYPE_UD), stride(r0_ud, 1, 4, 0),
+           brw_imm_ud(GEN7_GS_PAYLOAD_INSTANCE_ID_SHIFT));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_ff_sync_set_primitives(struct brw_codegen *p,
+                                   struct brw_reg dst,
+                                   struct brw_reg src0,
+                                   struct brw_reg src1,
+                                   struct brw_reg src2)
+{
+   /* Pack two 16-bit values into dst.0; src2.0 is clobbered as a temp. */
+   const struct brw_reg out = suboffset(vec1(dst), 0);
+   const struct brw_reg tmp = suboffset(vec1(src2), 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   /* dst.0[31:16] <- low 16 bits of src0 */
+   brw_AND(p, out, suboffset(vec1(src0), 0), brw_imm_ud(0xffffu));
+   brw_SHL(p, out, out, brw_imm_ud(16));
+   /* dst.0[15:0] <- low 16 bits of src1, using src2 as scratch */
+   brw_AND(p, tmp, suboffset(vec1(src1), 0), brw_imm_ud(0xffffu));
+   brw_OR(p, out, out, tmp);
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_ff_sync(struct brw_codegen *p,
+                    vec4_instruction *inst,
+                    struct brw_reg dst,
+                    struct brw_reg src0,
+                    struct brw_reg src1)
+{
+   /* An implied MRF register serves two purposes for this opcode:
+    *  - it is the header of the ff_sync message, and is therefore expected
+    *    to have been initialized to r0 before we get here;
+    *  - it is where the allocated URB handle is written afterwards.
+    */
+   const struct brw_reg msg_header =
+      retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD);
+   const bool src1_in_register = src1.file != BRW_IMMEDIATE_VALUE;
+
+   /* Header dword 0 takes the SO vertices to write and dword 1 the number
+    * of primitives written.
+    */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, get_element_ud(msg_header, 0), get_element_ud(src1, 0));
+   brw_MOV(p, get_element_ud(msg_header, 1), get_element_ud(src0, 0));
+   brw_pop_insn_state(p);
+
+   /* Allocate URB handle in dst */
+   brw_ff_sync(p, dst, 0, msg_header,
+               1 /* allocate */,
+               1 /* response length */,
+               0 /* eot */);
+
+   /* Mirror the freshly allocated handle into header.0. */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(msg_header, 0), get_element_ud(dst, 0));
+
+   /* src1 is not an immediate when we use transform feedback; in that case
+    * a four-wide move copies from dst's GRF at subregister 1 into src1's GRF.
+    */
+   if (src1_in_register) {
+      brw_set_default_exec_size(p, BRW_EXECUTE_4);
+      brw_MOV(p, brw_vec4_grf(src1.nr, 0), brw_vec4_grf(dst.nr, 1));
+   }
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_gs_set_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   /* On gen6 the PrimitiveID arrives in R0.1 of the thread payload. */
+   const struct brw_reg prim_id = get_element_ud(brw_vec8_grf(0, 0), 1);
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, get_element_ud(dst, 0), prim_id);
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_get_instance_id(struct brw_codegen *p, struct brw_reg dst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool ivb_like = devinfo->is_ivybridge || devinfo->is_baytrail;
+   const int field_lsb = ivb_like ? 16 : 17;
+   const int field_mask = ivb_like ? INTEL_MASK(22, 16) : INTEL_MASK(23, 17);
+
+   /* The payload delivers "Instance Count" in r0.2: bits 23:17, or bits
+    * 22:16 on Ivybridge/Baytrail.
+    *
+    * We run in SIMD4x2 mode and therefore need only half as many threads,
+    * handing out (2i + 1, 2i) as the thread counts.  Shifting right by one
+    * bit less than the field position gives the multiplication by two.
+    */
+   const struct brw_reg r0_2 = get_element_ud(brw_vec8_grf(0, 0), 2);
+   const struct brw_reg lower = get_element_ud(dst, 0);
+   const struct brw_reg upper = get_element_ud(dst, 4);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   brw_AND(p, lower, r0_2, brw_imm_ud(field_mask));
+   brw_SHR(p, lower, lower, brw_imm_ud(field_lsb - 1));
+   brw_ADD(p, upper, lower, brw_imm_ud(1));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tcs_urb_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg urb_header)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool ends_thread = (inst->urb_write_flags & BRW_URB_WRITE_EOT) != 0;
+
+   /* Headered OWORD URB write of inst->mlen registers with no response. */
+   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, brw_null_reg());
+   brw_set_src0(p, insn, urb_header);
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB, inst->mlen, 0 /* rlen */,
+                              true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_WRITE_OWORD);
+   brw_inst_set_urb_global_offset(devinfo, insn, inst->offset);
+
+   if (!ends_thread) {
+      brw_inst_set_urb_per_slot_offset(devinfo, insn, 1);
+      brw_inst_set_urb_swizzle_control(devinfo, insn, BRW_URB_SWIZZLE_INTERLEAVE);
+   } else {
+      brw_inst_set_eot(devinfo, insn, 1);
+   }
+   /* what happens to swizzles? */
+}
+
+
+static void
+generate_tcs_input_urb_offsets(struct brw_codegen *p,
+                               struct brw_reg dst,
+                               struct brw_reg vertex,
+                               struct brw_reg offset)
+{
+   /* Builds the header of a URB read/write message for HS/DS operation
+    * from a vertex index and an offset relative to the start of that
+    * vertex.
+    *
+    * Note that a0.0 gets clobbered whenever `vertex` is not an immediate.
+    */
+   assert(vertex.file == BRW_IMMEDIATE_VALUE || vertex.file == BRW_GENERAL_REGISTER_FILE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD || vertex.type == BRW_REGISTER_TYPE_D);
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   /* m0.5 bits 8-15 are channel enables */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+   /* m0.0-0.1: URB handles */
+   if (vertex.file != BRW_IMMEDIATE_VALUE) {
+      /* Indirect addressing.  ICP handles are DWords (one channel of a
+       * register each) beginning at g1.0.
+       *
+       * Adding 8 to the vertex index skips the 8 channels of g0, so the
+       * sum is the DWord offset of the ICP handle.  Indirect addressing is
+       * byte based, hence the shift left by 2 (multiply by 4).
+       */
+      const struct brw_reg addr = brw_address_reg(0);
+
+      /* half 0 (bottom): m0.0 = g[1.0 + vertex.0]UD
+       * half 1 (top):    m0.1 = g[1.0 + vertex.4]UD
+       */
+      for (unsigned half = 0; half < 2; half++) {
+         const struct brw_reg index =
+            retype(get_element_ud(vertex, 4 * half), BRW_REGISTER_TYPE_UW);
+
+         brw_ADD(p, addr, index, brw_imm_uw(0x8));
+         brw_SHL(p, addr, addr, brw_imm_uw(2));
+         brw_MOV(p, get_element_ud(dst, half),
+                 deref_1ud(brw_indirect(0, 0), 0));
+      }
+   } else {
+      /* The handle's location is known statically: 8 handles per GRF,
+       * starting at g1.
+       */
+      const uint32_t vertex_index = vertex.ud;
+      const struct brw_reg handle =
+         brw_vec1_grf(1 + (vertex_index >> 3), vertex_index & 7);
+
+      brw_MOV(p, vec2(get_element_ud(dst, 0)),
+              retype(handle, BRW_REGISTER_TYPE_UD));
+   }
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+
+static void
+generate_tcs_output_urb_offsets(struct brw_codegen *p,
+                                struct brw_reg dst,
+                                struct brw_reg write_mask,
+                                struct brw_reg offset)
+{
+   /* Builds the header of a URB read/write message for HS/DS operation
+    * that targets the patch URB entry. */
+   assert(dst.file == BRW_GENERAL_REGISTER_FILE || dst.file == BRW_MESSAGE_REGISTER_FILE);
+   assert(write_mask.file == BRW_IMMEDIATE_VALUE);
+   assert(write_mask.type == BRW_REGISTER_TYPE_UD);
+
+   /* m0.5 bits 15:12 and 11:8 are channel enables (same mask in both) */
+   const unsigned wmask = write_mask.ud;
+   const unsigned channel_enables = (wmask << 8) | (wmask << 12);
+
+   /* HS patch URB handle is delivered in r0.0 */
+   const struct brw_reg patch_handle =
+      retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   brw_MOV(p, dst, brw_imm_ud(0));
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(channel_enables));
+
+   /* m0.0-0.1: URB handles */
+   brw_MOV(p, vec2(get_element_ud(dst, 0)), patch_handle);
+
+   /* m0.3-0.4: 128bit-granular offsets into the URB from the handles */
+   if (offset.file != ARF)
+      brw_MOV(p, vec2(get_element_ud(dst, 3)), stride(offset, 4, 1, 0));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tes_create_input_read_header(struct brw_codegen *p,
+                                      struct brw_reg dst)
+{
+   const struct brw_reg patch_handle =
+      retype(brw_vec1_grf(1, 3), BRW_REGISTER_TYPE_UD);
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   /* Zero the whole register first. */
+   brw_MOV(p, dst, brw_imm_ud(0));
+
+   /* m0.5 bits 15:8: turn on every channel enable. */
+   brw_MOV(p, get_element_ud(dst, 5), brw_imm_ud(0xff00));
+
+   /* Copy the patch URB handle from g1.3 into both m0.0 and m0.1, masking
+    * off the "Reserved" bits for safety since they are not marked MBZ. */
+   brw_AND(p, vec2(get_element_ud(dst, 0)), patch_handle, brw_imm_ud(0x1fff));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_tes_add_indirect_urb_offset(struct brw_codegen *p,
+                                     struct brw_reg dst,
+                                     struct brw_reg header,
+                                     struct brw_reg offset)
+{
+   /* dst = header, then m0.3-0.4 = 128-bit-granular URB offsets */
+   const struct brw_reg offsets_dst = vec2(get_element_ud(dst, 3));
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, header);
+   brw_MOV(p, offsets_dst, stride(offset, 4, 1, 0));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_vec4_urb_read(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg header)
+{
+   assert(header.file == BRW_GENERAL_REGISTER_FILE);
+   assert(header.type == BRW_REGISTER_TYPE_UD);
+
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* One-register headered URB message; one-register reply lands in dst. */
+   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, dst);
+   brw_set_src0(p, insn, header);
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB, 1 /* mlen */,
+                              1 /* rlen */, true /* header */, false /* eot */);
+
+   /* Interleaved OWORD read using per-slot offsets, at inst->offset. */
+   brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_global_offset(devinfo, insn, inst->offset);
+   brw_inst_set_urb_per_slot_offset(devinfo, insn, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, insn, BRW_URB_SWIZZLE_INTERLEAVE);
+}
+
+static void
+generate_tcs_release_input(struct brw_codegen *p,
+                           struct brw_reg header,
+                           struct brw_reg vertex,
+                           struct brw_reg is_unpaired)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   assert(vertex.file == BRW_IMMEDIATE_VALUE);
+   assert(vertex.type == BRW_REGISTER_TYPE_UD);
+
+   /* The two handles to release: 8 handles per GRF, counted from g1.0. */
+   const struct brw_reg handles =
+      retype(brw_vec2_grf(1 + (vertex.ud >> 3), vertex.ud & 7), BRW_REGISTER_TYPE_UD);
+   const unsigned swizzle = is_unpaired.ud ? BRW_URB_SWIZZLE_NONE :
+                                             BRW_URB_SWIZZLE_INTERLEAVE;
+
+   /* Header: all zero except for m0.0-0.1, the URB handles. */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, header, brw_imm_ud(0));
+   brw_MOV(p, vec2(get_element_ud(header, 0)), handles);
+   brw_pop_insn_state(p);
+
+   /* OWORD read with the "complete" bit set and no response. */
+   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, brw_null_reg());
+   brw_set_src0(p, insn, header);
+   brw_set_message_descriptor(p, insn, BRW_SFID_URB, 1 /* mlen */,
+                              0 /* rlen */, true /* header */, false /* eot */);
+   brw_inst_set_urb_opcode(devinfo, insn, BRW_URB_OPCODE_READ_OWORD);
+   brw_inst_set_urb_complete(devinfo, insn, 1);
+   brw_inst_set_urb_swizzle_control(devinfo, insn, swizzle);
+}
+
+static void
+generate_tcs_thread_end(struct brw_codegen *p, vec4_instruction *inst)
+{
+   const struct brw_reg header = brw_message_reg(inst->base_mrf);
+   const struct brw_reg payload = brw_message_reg(inst->base_mrf + 1);
+   const struct brw_reg r0_0 = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD);
+
+   /* Header: zero everywhere except dword 0 (copied from r0.0) and dword 5
+    * (WRITEMASK_X << 8); the MRF following the header is zeroed as well.
+    */
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, header, brw_imm_ud(0));
+   brw_MOV(p, get_element_ud(header, 5), brw_imm_ud(WRITEMASK_X << 8));
+   brw_MOV(p, get_element_ud(header, 0), r0_0);
+   brw_MOV(p, payload, brw_imm_ud(0u));
+   brw_pop_insn_state(p);
+
+   /* EOT OWORD write with channel masks: no response, URB offset 0. */
+   brw_urb_WRITE(p, brw_null_reg() /* dest */,
+                 inst->base_mrf /* starting mrf reg nr */, header,
+                 BRW_URB_WRITE_EOT | BRW_URB_WRITE_OWORD |
+                 BRW_URB_WRITE_USE_CHANNEL_MASKS,
+                 inst->mlen, 0 /* response len */, 0 /* urb dest offset */, 0);
+}
+
+static void
+generate_tes_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{ /* Emits dst = g1.7 (source typed D) as a single align1 MOV. */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_D));
+   brw_pop_insn_state(p); /* restores the access mode changed above */
+}
+
+static void
+generate_tcs_get_primitive_id(struct brw_codegen *p, struct brw_reg dst)
+{ /* Emits dst = r0.1 (source typed UD) as a single align1 MOV. */
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, dst, retype(brw_vec1_grf(0, 1), BRW_REGISTER_TYPE_UD));
+   brw_pop_insn_state(p); /* restores the access mode changed above */
+}
+
+static void
+generate_tcs_create_barrier_header(struct brw_codegen *p,
+                                   struct brw_vue_prog_data *prog_data,
+                                   struct brw_reg dst)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const unsigned num_instances =
+      ((struct brw_tcs_prog_data *) prog_data)->instances;
+   const struct brw_reg hdr_dw2 = get_element_ud(dst, 2);
+   const struct brw_reg r0_2 = retype(brw_vec1_grf(0, 2), BRW_REGISTER_TYPE_UD);
+
+   /* "Barrier ID" lives in r0.2 bits 15:12 on Gen7 (Ivybridge/Baytrail)
+    * and bits 16:13 on Gen7.5+; it has to end up in bits 27:24 of m0.2.
+    */
+   unsigned id_mask = INTEL_MASK(16, 13), id_shift = 11;
+   if (devinfo->is_ivybridge || devinfo->is_baytrail) {
+      id_mask = INTEL_MASK(15, 12);
+      id_shift = 12;
+   }
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_MOV(p, retype(dst, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u));
+   brw_AND(p, hdr_dw2, r0_2, brw_imm_ud(id_mask));
+   brw_SHL(p, hdr_dw2, hdr_dw2, brw_imm_ud(id_shift));
+   /* Set the Barrier Count and the enable bit */
+   brw_OR(p, hdr_dw2, hdr_dw2, brw_imm_ud(num_instances << 9 | (1 << 15)));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_oword_dual_block_offsets(struct brw_codegen *p,
+                                  struct brw_reg m1,
+                                  struct brw_reg index)
+{
+   /* Amount added to the index to form the second block offset:
+    * 1 on gen6+, 16 before that.
+    */
+   const int second_vertex_offset = p->devinfo->gen >= 6 ? 1 : 16;
+
+   /* M1 is the message payload.  Only the block offsets in M1.0 and M1.4
+    * are used; the rest of the register is ignored.
+    */
+   const struct brw_reg payload = retype(m1, BRW_REGISTER_TYPE_D);
+   const struct brw_reg first_dst = suboffset(vec1(payload), 0);
+   const struct brw_reg second_dst = suboffset(vec1(payload), 4);
+   const struct brw_reg first_src = suboffset(vec1(index), 0);
+   struct brw_reg second_src = suboffset(vec1(index), 4);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+   /* First block offset: element 0 of the index, unchanged. */
+   brw_MOV(p, first_dst, first_src);
+
+   if (index.file != BRW_IMMEDIATE_VALUE) {
+      brw_ADD(p, second_dst, second_src, brw_imm_d(second_vertex_offset));
+   } else {
+      /* For an immediate index the addition is done here, on the
+       * immediate itself, so a MOV is emitted instead of an ADD. */
+      second_src.ud += second_vertex_offset;
+      brw_MOV(p, second_dst, second_src);
+   }
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_unpack_flags(struct brw_codegen *p,
+                      struct brw_reg dst)
+{
+   /* dst.0 = f0.0 & 0xf and dst.4 = (f0.0 & 0xf0) >> 4 */
+   const struct brw_reg f0 = brw_flag_reg(0, 0);
+   const struct brw_reg low_half = suboffset(vec1(dst), 0);
+   const struct brw_reg high_half = suboffset(vec1(dst), 4);
+
+   brw_push_insn_state(p);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_AND(p, low_half, f0, brw_imm_ud(0x0f));
+   brw_AND(p, high_half, f0, brw_imm_ud(0xf0));
+   brw_SHR(p, high_half, high_half, brw_imm_ud(4));
+
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_scratch_read(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst,
+                      struct brw_reg index)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* Message type of the OWord dual block read for this generation. */
+   const uint32_t msg_type =
+      devinfo->gen >= 6 ? GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ :
+      (devinfo->gen == 5 || devinfo->is_g4x) ?
+         G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ :
+         BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+
+   /* Data cache on gen7+, render cache on everything older. */
+   unsigned target_cache;
+   if (devinfo->gen >= 7)
+      target_cache = GEN7_SFID_DATAPORT_DATA_CACHE;
+   else if (devinfo->gen >= 6)
+      target_cache = GEN6_SFID_DATAPORT_RENDER_CACHE;
+   else
+      target_cache = BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+
+   /* The header starts out as r0 and is passed through
+    * gen6_resolve_implied_move(); the MRF after it gets the block offsets. */
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
+                                     index);
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, dst);
+   brw_set_src0(p, insn, header);
+   if (devinfo->gen < 6)
+      brw_inst_set_cond_modifier(devinfo, insn, inst->base_mrf);
+   brw_set_dp_read_message(p, insn, brw_scratch_surface_idx(p),
+                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                           msg_type, target_cache, 2 /* mlen */,
+                           true /* header_present */, 1 /* rlen */);
+}
+
+static void
+generate_scratch_write(struct brw_codegen *p,
+                       vec4_instruction *inst,
+                       struct brw_reg dst,
+                       struct brw_reg src,
+                       struct brw_reg index)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+
+   /* Pre-gen6, write commits are needed to order reads and writes within
+    * a thread.  From gen6 on that ordering is guaranteed and write commits
+    * only matter for inter-thread synchronization.
+    *
+    * In the pre-gen6 case the visitor set up our destination register to
+    * be g0, so the next read ends up reading g0 and blocking on the write
+    * commit.  For write-after-read we rely on the value of the previous
+    * read having been used (and thus completed) before our write executes,
+    * which instruction scheduling must be careful not to violate.
+    */
+   const bool write_commit = devinfo->gen < 6;
+
+   /* Pick the dual block write message and the target cache per gen. */
+   uint32_t msg_type;
+   unsigned target_cache;
+   if (devinfo->gen >= 7) {
+      msg_type = GEN7_DATAPORT_DC_OWORD_DUAL_BLOCK_WRITE;
+      target_cache = GEN7_SFID_DATAPORT_DATA_CACHE;
+   } else if (devinfo->gen == 6) {
+      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
+      target_cache = GEN6_SFID_DATAPORT_RENDER_CACHE;
+   } else {
+      msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_DUAL_BLOCK_WRITE;
+      target_cache = BRW_DATAPORT_READ_TARGET_RENDER_CACHE;
+   }
+
+   /* Build the message with predication off: when the instruction is
+    * predicated, only the send itself is predicated, never the header
+    * and payload setup.
+    */
+   brw_set_default_predicate_control(p, false);
+
+   /* The header starts out as r0.  gen6_resolve_implied_move() receives a
+    * pointer to it together with the base MRF, so it may replace it.
+    */
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+   /* base_mrf + 1: block offsets, base_mrf + 2: the data being written. */
+   generate_oword_dual_block_offsets(p, brw_message_reg(inst->base_mrf + 1),
+                                     index);
+   brw_MOV(p,
+           retype(brw_message_reg(inst->base_mrf + 2), BRW_REGISTER_TYPE_D),
+           retype(src, BRW_REGISTER_TYPE_D));
+
+   /* Setup is done; put the instruction's own predicate back for the send. */
+   brw_set_default_predicate_control(p, inst->predicate);
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, dst);
+   brw_set_src0(p, insn, header);
+   /* Pre-gen6, the base MRF is stored in the send's cond-modifier field. */
+   if (devinfo->gen < 6)
+      brw_inst_set_cond_modifier(p->devinfo, insn, inst->base_mrf);
+   brw_set_dp_write_message(p, insn,
+                            brw_scratch_surface_idx(p),
+                            BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                            msg_type,
+                            target_cache,
+                            3, /* mlen */
+                            true, /* header present */
+                            false, /* not a render target write */
+                            write_commit, /* rlen: 1 only with a write commit */
+                            false, /* eot */
+                            write_commit);
+}
+
+static void
+generate_pull_constant_load(struct brw_codegen *p,
+                            struct brw_vue_prog_data *prog_data,
+                            vec4_instruction *inst,
+                            struct brw_reg dst,
+                            struct brw_reg index,
+                            struct brw_reg offset)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const bool gen6_plus = devinfo->gen >= 6;
+
+   assert(index.file == BRW_IMMEDIATE_VALUE &&
+          index.type == BRW_REGISTER_TYPE_UD);
+   const uint32_t surf_index = index.ud;
+
+   /* Dual block OWord read; sampler cache on gen6+, data cache before. */
+   uint32_t msg_type;
+   unsigned target_cache;
+   if (gen6_plus) {
+      msg_type = GEN6_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+      target_cache = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
+   } else {
+      msg_type = (devinfo->gen == 5 || devinfo->is_g4x) ?
+         G45_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ :
+         BRW_DATAPORT_READ_MESSAGE_OWORD_DUAL_BLOCK_READ;
+      target_cache = BRW_DATAPORT_READ_TARGET_DATA_CACHE;
+   }
+
+   struct brw_reg header = brw_vec8_grf(0, 0);
+   gen6_resolve_implied_move(p, &header, inst->base_mrf);
+
+   /* The MRF after the header receives the offset.  On gen6+ it is first
+    * shifted right by 4 (divided by 16), either here for an immediate or
+    * with a SHR instruction; older gens get it unmodified.
+    */
+   const struct brw_reg m1 =
+      retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_D);
+   if (!gen6_plus)
+      brw_MOV(p, m1, offset);
+   else if (offset.file == BRW_IMMEDIATE_VALUE)
+      brw_MOV(p, m1, brw_imm_d(offset.ud >> 4));
+   else
+      brw_SHR(p, m1, offset, brw_imm_d(4));
+
+   /* Each of the 8 channel enables is considered for whether each
+    * dword is written.
+    */
+   brw_inst *insn = brw_next_insn(p, BRW_OPCODE_SEND);
+   brw_set_dest(p, insn, dst);
+   brw_set_src0(p, insn, header);
+   /* Pre-gen6, the base MRF is stored in the send's cond-modifier field. */
+   if (!gen6_plus)
+      brw_inst_set_cond_modifier(p->devinfo, insn, inst->base_mrf);
+   brw_set_dp_read_message(p, insn,
+                           surf_index,
+                           BRW_DATAPORT_OWORD_DUAL_BLOCK_1OWORD,
+                           msg_type,
+                           target_cache,
+                           2, /* mlen */
+                           true, /* header_present */
+                           1 /* rlen */);
+}
+
+static void
+generate_get_buffer_size(struct brw_codegen *p,
+                         struct brw_vue_prog_data *prog_data,
+                         vec4_instruction *inst,
+                         struct brw_reg dst,
+                         struct brw_reg src,
+                         struct brw_reg surf_index)
+{
+   assert(p->devinfo->gen >= 7);
+   assert(surf_index.type == BRW_REGISTER_TYPE_UD &&
+          surf_index.file == BRW_IMMEDIATE_VALUE);
+
+   const uint32_t surface = surf_index.ud;
+   const bool has_header = inst->header_size > 0;
+
+   /* SIMD4x2 RESINFO sampler message on `surface` with a one-register
+    * response in SINT32 return format.
+    */
+   brw_SAMPLE(p, dst, inst->base_mrf, src, surface,
+              0,
+              GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO, 1 /* response length */,
+              inst->mlen, has_header, BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+              BRW_SAMPLER_RETURN_FORMAT_SINT32);
+
+   /* Record the surface as used in the program data. */
+   brw_mark_surface_used(&prog_data->base, surface);
+}
+
+static void
+generate_pull_constant_load_gen7(struct brw_codegen *p,
+                                 struct brw_vue_prog_data *prog_data,
+                                 vec4_instruction *inst,
+                                 struct brw_reg dst,
+                                 struct brw_reg surf_index,
+                                 struct brw_reg offset)
+{
+   assert(surf_index.type == BRW_REGISTER_TYPE_UD);
+
+   const bool has_header = inst->header_size != 0;
+
+   if (surf_index.file == BRW_IMMEDIATE_VALUE) {
+      /* Surface index known at compile time: a plain SEND whose sampler
+       * message is given the index directly.
+       */
+      brw_inst *ld = brw_next_insn(p, BRW_OPCODE_SEND);
+      brw_set_dest(p, ld, dst);
+      brw_set_src0(p, ld, offset);
+      brw_set_sampler_message(p, ld,
+                              surf_index.ud,
+                              0, /* LD message ignores sampler unit */
+                              GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
+                              1, /* rlen */
+                              inst->mlen,
+                              has_header,
+                              BRW_SAMPLER_SIMD_MODE_SIMD4X2,
+                              0);
+
+      brw_mark_surface_used(&prog_data->base, surf_index.ud);
+      return;
+   }
+
+   /* Surface index lives in a register: go through a0.0. */
+   const struct brw_reg addr =
+      vec1(retype(brw_address_reg(0), BRW_REGISTER_TYPE_UD));
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+   /* a0.0 = surf_index & 0xff */
+   brw_inst *and_insn = brw_next_insn(p, BRW_OPCODE_AND);
+   brw_inst_set_exec_size(p->devinfo, and_insn, BRW_EXECUTE_1);
+   brw_set_dest(p, and_insn, addr);
+   brw_set_src0(p, and_insn, vec1(retype(surf_index, BRW_REGISTER_TYPE_UD)));
+   brw_set_src1(p, and_insn, brw_imm_ud(0x0ff));
+
+   brw_pop_insn_state(p);
+
+   /* dst = send(offset, a0.0 | <descriptor>) */
+   brw_inst *send =
+      brw_send_indirect_message(p, BRW_SFID_SAMPLER, dst, offset, addr);
+   brw_set_sampler_message(p, send, 0 /* surface */, 0 /* sampler */,
+                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD, 1 /* rlen */,
+                           inst->mlen, has_header,
+                           BRW_SAMPLER_SIMD_MODE_SIMD4X2, 0);
+}
+
+static void
+generate_set_simd4x2_header_gen9(struct brw_codegen *p,
+                                 vec4_instruction *inst,
+                                 struct brw_reg dst)
+{
+   const struct brw_reg r0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD);
+   const uint32_t simd_mode = GEN9_SAMPLER_SIMD_MODE_EXTENSION_SIMD4X2;
+
+   brw_push_insn_state(p);
+   brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+   /* Whole-register copy of r0 first, then (in align1) dword 2 = mode. */
+   brw_set_default_exec_size(p, BRW_EXECUTE_8);
+   brw_MOV(p, vec8(dst), r0);
+   brw_set_default_access_mode(p, BRW_ALIGN_1);
+   brw_MOV(p, get_element_ud(dst, 2), brw_imm_ud(simd_mode));
+   brw_pop_insn_state(p);
+}
+
+static void
+generate_mov_indirect(struct brw_codegen *p,
+                      vec4_instruction *inst,
+                      struct brw_reg dst, struct brw_reg reg,
+                      struct brw_reg indirect, struct brw_reg length)
+{ /* dst = reg displaced by `indirect` bytes; inst and length are unused. */
+   assert(indirect.type == BRW_REGISTER_TYPE_UD);
+   assert(p->devinfo->gen >= 6);
+   /* Byte offset of reg: nr counts registers, subnr counts half registers. */
+   unsigned imm_byte_offset = reg.nr * REG_SIZE + reg.subnr * (REG_SIZE / 2);
+
+   /* This instruction acts in align1 mode, so only a full writemask is OK */
+   assert(dst.writemask == WRITEMASK_XYZW);
+
+   if (indirect.file == BRW_IMMEDIATE_VALUE) {
+      imm_byte_offset += indirect.ud;
+      /* Constant case: split the offset back into nr, subnr and swizzle. */
+      reg.nr = imm_byte_offset / REG_SIZE;
+      reg.subnr = (imm_byte_offset / (REG_SIZE / 2)) % 2;
+      unsigned shift = (imm_byte_offset / 4) % 4;
+      reg.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+
+      brw_MOV(p, dst, reg);
+   } else {
+      brw_push_insn_state(p);
+      brw_set_default_access_mode(p, BRW_ALIGN_1);
+      brw_set_default_mask_control(p, BRW_MASK_DISABLE);
+
+      struct brw_reg addr = vec8(brw_address_reg(0));
+
+      /* We need to move the indirect value into the address register.  In
+       * order to make things make some sense, we want to respect at least the
+       * X component of the swizzle.  In order to do that, we need to convert
+       * the subnr (probably 0) to an align1 subnr and add in the swizzle.
+       */
+      assert(brw_is_single_value_swizzle(indirect.swizzle));
+      indirect.subnr = (indirect.subnr * 4 + BRW_GET_SWZ(indirect.swizzle, 0));
+
+      /* We then use a region of <8,4,0>:uw to pick off the first 2 bytes of
+       * the indirect and splat it out to all four channels of the given half
+       * of a0.
+       */
+      indirect.subnr *= 2;
+      indirect = stride(retype(indirect, BRW_REGISTER_TYPE_UW), 8, 4, 0);
+      brw_ADD(p, addr, indirect, brw_imm_uw(imm_byte_offset));
+
+      /* Add per-channel swizzle * 4, packed 4 bits apart for brw_imm_uv */
+      if (reg.swizzle != BRW_SWIZZLE_XXXX) {
+         uint32_t uv_swiz = BRW_GET_SWZ(reg.swizzle, 0) << 2 |
+                            BRW_GET_SWZ(reg.swizzle, 1) << 6 |
+                            BRW_GET_SWZ(reg.swizzle, 2) << 10 |
+                            BRW_GET_SWZ(reg.swizzle, 3) << 14;
+         uv_swiz |= uv_swiz << 16; /* repeat the low 16 bits in the high 16 */
+
+         brw_ADD(p, addr, addr, brw_imm_uv(uv_swiz));
+      }
+      /* Read the source through a0 using a VxH indirect region. */
+      brw_MOV(p, dst, retype(brw_VxH_indirect(0, 0), reg.type));
+
+      brw_pop_insn_state(p);
+   }
+}
+
+static void
+generate_code(struct brw_codegen *p,
+              const struct brw_compiler *compiler,
+              void *log_data,
+              const nir_shader *nir,
+              struct brw_vue_prog_data *prog_data,
+              const struct cfg_t *cfg)
+{
+   const struct gen_device_info *devinfo = p->devinfo;
+   const char *stage_abbrev = _mesa_shader_stage_to_abbrev(nir->stage);
+   bool debug_flag = INTEL_DEBUG &
+      intel_debug_flag_for_shader_stage(nir->stage);
+   struct annotation_info annotation;
+   memset(&annotation, 0, sizeof(annotation));
+   int spill_count = 0, fill_count = 0;
+   int loop_count = 0;
+
+   foreach_block_and_inst (block, vec4_instruction, inst, cfg) {
+      struct brw_reg src[3], dst;
+
+      if (unlikely(debug_flag))
+         annotate(p->devinfo, &annotation, cfg, inst, p->next_insn_offset);
+
+      for (unsigned int i = 0; i < 3; i++) {
+         src[i] = inst->src[i].as_brw_reg();
+      }
+      dst = inst->dst.as_brw_reg();
+
+      brw_set_default_predicate_control(p, inst->predicate);
+      brw_set_default_predicate_inverse(p, inst->predicate_inverse);
+      brw_set_default_flag_reg(p, 0, inst->flag_subreg);
+      brw_set_default_saturate(p, inst->saturate);
+      brw_set_default_mask_control(p, inst->force_writemask_all);
+      brw_set_default_acc_write_control(p, inst->writes_accumulator);
+      brw_set_default_exec_size(p, cvt(inst->exec_size) - 1);
+
+      assert(inst->group % inst->exec_size == 0);
+      assert(inst->group % 8 == 0 ||
+             inst->dst.type == BRW_REGISTER_TYPE_DF ||
+             inst->src[0].type == BRW_REGISTER_TYPE_DF ||
+             inst->src[1].type == BRW_REGISTER_TYPE_DF ||
+             inst->src[2].type == BRW_REGISTER_TYPE_DF);
+      if (!inst->force_writemask_all)
+         brw_set_default_group(p, inst->group);
+
+      assert(inst->base_mrf + inst->mlen <= BRW_MAX_MRF(devinfo->gen));
+      assert(inst->mlen <= BRW_MAX_MSG_LENGTH);
+
+      unsigned pre_emit_nr_insn = p->nr_insn;
+
+      switch (inst->opcode) {
+      case VEC4_OPCODE_UNPACK_UNIFORM:
+      case BRW_OPCODE_MOV:
+         brw_MOV(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_ADD:
+         brw_ADD(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MUL:
+         brw_MUL(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MACH:
+         brw_MACH(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_MAD:
+         assert(devinfo->gen >= 6);
+         brw_MAD(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_FRC:
+         brw_FRC(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDD:
+         brw_RNDD(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDE:
+         brw_RNDE(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_RNDZ:
+         brw_RNDZ(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_AND:
+         brw_AND(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_OR:
+         brw_OR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_XOR:
+         brw_XOR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_NOT:
+         brw_NOT(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_ASR:
+         brw_ASR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SHR:
+         brw_SHR(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SHL:
+         brw_SHL(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_CMP:
+         brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SEL:
+         brw_SEL(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_DPH:
+         brw_DPH(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_DP4:
+         brw_DP4(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_DP3:
+         brw_DP3(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_DP2:
+         brw_DP2(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_F32TO16:
+         assert(devinfo->gen >= 7);
+         brw_F32TO16(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_F16TO32:
+         assert(devinfo->gen >= 7);
+         brw_F16TO32(p, dst, src[0]);
+         break;
+
+      case BRW_OPCODE_LRP:
+         assert(devinfo->gen >= 6);
+         brw_LRP(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_BFREV:
+         assert(devinfo->gen >= 7);
+         /* BFREV only supports UD type for src and dst. */
+         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
+                   retype(src[0], BRW_REGISTER_TYPE_UD));
+         break;
+      case BRW_OPCODE_FBH:
+         assert(devinfo->gen >= 7);
+         /* FBH only supports UD type for dst. */
+         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_FBL:
+         assert(devinfo->gen >= 7);
+         /* FBL only supports UD type for dst. */
+         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_LZD:
+         brw_LZD(p, dst, src[0]);
+         break;
+      case BRW_OPCODE_CBIT:
+         assert(devinfo->gen >= 7);
+         /* CBIT only supports UD type for dst. */
+         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
+         break;
+      case BRW_OPCODE_ADDC:
+         assert(devinfo->gen >= 7);
+         brw_ADDC(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_SUBB:
+         assert(devinfo->gen >= 7);
+         brw_SUBB(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_MAC:
+         brw_MAC(p, dst, src[0], src[1]);
+         break;
+
+      case BRW_OPCODE_BFE:
+         assert(devinfo->gen >= 7);
+         brw_BFE(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_BFI1:
+         assert(devinfo->gen >= 7);
+         brw_BFI1(p, dst, src[0], src[1]);
+         break;
+      case BRW_OPCODE_BFI2:
+         assert(devinfo->gen >= 7);
+         brw_BFI2(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_IF:
+         if (!inst->src[0].is_null()) {
+            /* The instruction has an embedded compare (only allowed on gen6) */
+            assert(devinfo->gen == 6);
+            gen6_IF(p, inst->conditional_mod, src[0], src[1]);
+         } else {
+            brw_inst *if_inst = brw_IF(p, BRW_EXECUTE_8);
+            brw_inst_set_pred_control(p->devinfo, if_inst, inst->predicate);
+         }
+         break;
+
+      case BRW_OPCODE_ELSE:
+         brw_ELSE(p);
+         break;
+      case BRW_OPCODE_ENDIF:
+         brw_ENDIF(p);
+         break;
+
+      case BRW_OPCODE_DO:
+         brw_DO(p, BRW_EXECUTE_8);
+         break;
+
+      case BRW_OPCODE_BREAK:
+         brw_BREAK(p);
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case BRW_OPCODE_CONTINUE:
+         brw_CONT(p);
+         brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+
+      case BRW_OPCODE_WHILE:
+         brw_WHILE(p);
+         loop_count++;
+         break;
+
+      case SHADER_OPCODE_RCP:
+      case SHADER_OPCODE_RSQ:
+      case SHADER_OPCODE_SQRT:
+      case SHADER_OPCODE_EXP2:
+      case SHADER_OPCODE_LOG2:
+      case SHADER_OPCODE_SIN:
+      case SHADER_OPCODE_COS:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (devinfo->gen >= 7) {
+            gen6_math(p, dst, brw_math_function(inst->opcode), src[0],
+                      brw_null_reg());
+         } else if (devinfo->gen == 6) {
+            generate_math_gen6(p, inst, dst, src[0], brw_null_reg());
+         } else {
+            generate_math1_gen4(p, inst, dst, src[0]);
+         }
+         break;
+
+      case SHADER_OPCODE_POW:
+      case SHADER_OPCODE_INT_QUOTIENT:
+      case SHADER_OPCODE_INT_REMAINDER:
+         assert(inst->conditional_mod == BRW_CONDITIONAL_NONE);
+         if (devinfo->gen >= 7) {
+            gen6_math(p, dst, brw_math_function(inst->opcode), src[0], src[1]);
+         } else if (devinfo->gen == 6) {
+            generate_math_gen6(p, inst, dst, src[0], src[1]);
+         } else {
+            generate_math2_gen4(p, inst, dst, src[0], src[1]);
+         }
+         break;
+
+      case SHADER_OPCODE_TEX:
+      case SHADER_OPCODE_TXD:
+      case SHADER_OPCODE_TXF:
+      case SHADER_OPCODE_TXF_CMS:
+      case SHADER_OPCODE_TXF_CMS_W:
+      case SHADER_OPCODE_TXF_MCS:
+      case SHADER_OPCODE_TXL:
+      case SHADER_OPCODE_TXS:
+      case SHADER_OPCODE_TG4:
+      case SHADER_OPCODE_TG4_OFFSET:
+      case SHADER_OPCODE_SAMPLEINFO:
+         generate_tex(p, prog_data, nir->stage,
+                      inst, dst, src[0], src[1], src[2]);
+         break;
+
+      case VS_OPCODE_URB_WRITE:
+         generate_vs_urb_write(p, inst);
+         break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_READ:
+         generate_scratch_read(p, inst, dst, src[0]);
+         fill_count++;
+         break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+         generate_scratch_write(p, inst, dst, src[0], src[1]);
+         spill_count++;
+         break;
+
+      case VS_OPCODE_PULL_CONSTANT_LOAD:
+         generate_pull_constant_load(p, prog_data, inst, dst, src[0], src[1]);
+         break;
+
+      case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
+         generate_pull_constant_load_gen7(p, prog_data, inst, dst, src[0], src[1]);
+         break;
+
+      case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
+         generate_set_simd4x2_header_gen9(p, inst, dst);
+         break;
+
+
+      case VS_OPCODE_GET_BUFFER_SIZE:
+         generate_get_buffer_size(p, prog_data, inst, dst, src[0], src[1]);
+         break;
+
+      case GS_OPCODE_URB_WRITE:
+         generate_gs_urb_write(p, inst);
+         break;
+
+      case GS_OPCODE_URB_WRITE_ALLOCATE:
+         generate_gs_urb_write_allocate(p, inst);
+         break;
+
+      case GS_OPCODE_SVB_WRITE:
+         generate_gs_svb_write(p, prog_data, inst, dst, src[0], src[1]);
+         break;
+
+      case GS_OPCODE_SVB_SET_DST_INDEX:
+         generate_gs_svb_set_destination_index(p, inst, dst, src[0]);
+         break;
+
+      case GS_OPCODE_THREAD_END:
+         generate_gs_thread_end(p, inst);
+         break;
+
+      case GS_OPCODE_SET_WRITE_OFFSET:
+         generate_gs_set_write_offset(p, dst, src[0], src[1]);
+         break;
+
+      case GS_OPCODE_SET_VERTEX_COUNT:
+         generate_gs_set_vertex_count(p, dst, src[0]);
+         break;
+
+      case GS_OPCODE_FF_SYNC:
+         generate_gs_ff_sync(p, inst, dst, src[0], src[1]);
+         break;
+
+      case GS_OPCODE_FF_SYNC_SET_PRIMITIVES:
+         generate_gs_ff_sync_set_primitives(p, dst, src[0], src[1], src[2]);
+         break;
+
+      case GS_OPCODE_SET_PRIMITIVE_ID:
+         generate_gs_set_primitive_id(p, dst);
+         break;
+
+      case GS_OPCODE_SET_DWORD_2:
+         generate_gs_set_dword_2(p, dst, src[0]);
+         break;
+
+      case GS_OPCODE_PREPARE_CHANNEL_MASKS:
+         generate_gs_prepare_channel_masks(p, dst);
+         break;
+
+      case GS_OPCODE_SET_CHANNEL_MASKS:
+         generate_gs_set_channel_masks(p, dst, src[0]);
+         break;
+
+      case GS_OPCODE_GET_INSTANCE_ID:
+         generate_gs_get_instance_id(p, dst);
+         break;
+
+      case SHADER_OPCODE_SHADER_TIME_ADD:
+         brw_shader_time_add(p, src[0],
+                             prog_data->base.binding_table.shader_time_start);
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.shader_time_start);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_ATOMIC:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
+                            !inst->dst.is_null());
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_READ:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_surface_read(p, dst, src[0], src[1], inst->mlen,
+                                  src[2].ud);
+         break;
+
+      case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_untyped_surface_write(p, src[0], src[1], inst->mlen,
+                                   src[2].ud);
+         break;
+
+      case SHADER_OPCODE_TYPED_ATOMIC:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_atomic(p, dst, src[0], src[1], src[2].ud, inst->mlen,
+                          !inst->dst.is_null());
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_READ:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_surface_read(p, dst, src[0], src[1], inst->mlen,
+                                src[2].ud);
+         break;
+
+      case SHADER_OPCODE_TYPED_SURFACE_WRITE:
+         assert(src[2].file == BRW_IMMEDIATE_VALUE);
+         brw_typed_surface_write(p, src[0], src[1], inst->mlen,
+                                 src[2].ud);
+         break;
+
+      case SHADER_OPCODE_MEMORY_FENCE:
+         brw_memory_fence(p, dst);
+         break;
+
+      case SHADER_OPCODE_FIND_LIVE_CHANNEL: {
+         const struct brw_reg mask =
+            brw_stage_has_packed_dispatch(devinfo, nir->stage,
+                                          &prog_data->base) ? brw_imm_ud(~0u) :
+            brw_dmask_reg();
+         brw_find_live_channel(p, dst, mask);
+         break;
+      }
+
+      case SHADER_OPCODE_BROADCAST:
+         assert(inst->force_writemask_all);
+         brw_broadcast(p, dst, src[0], src[1]);
+         break;
+
+      case VS_OPCODE_UNPACK_FLAGS_SIMD4X2:
+         generate_unpack_flags(p, dst);
+         break;
+
+      case VEC4_OPCODE_MOV_BYTES: {
+         /* Moves the low byte from each channel, using an Align1 access mode
+          * and a <4,1,0> source region.
+          */
+         assert(src[0].type == BRW_REGISTER_TYPE_UB ||
+                src[0].type == BRW_REGISTER_TYPE_B);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_1;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+         brw_MOV(p, dst, src[0]);
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_FROM_DOUBLE: {
+         assert(type_sz(src[0].type) == 8);
+         assert(type_sz(dst.type) == 4);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+         dst.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, dst, src[0]);
+
+         struct brw_reg dst_as_src = dst;
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+         dst.width = BRW_WIDTH_8;
+         brw_MOV(p, dst, dst_as_src);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_TO_DOUBLE: {
+         assert(type_sz(src[0].type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         struct brw_reg tmp = retype(dst, src[0].type);
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, tmp, src[0]);
+
+         tmp.vstride = BRW_VERTICAL_STRIDE_8;
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         brw_MOV(p, dst, tmp);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_PICK_LOW_32BIT:
+      case VEC4_OPCODE_PICK_HIGH_32BIT: {
+         /* Stores the low/high 32-bit of each 64-bit element in src[0] into
+          * dst using ALIGN1 mode and a <8,4,2>:UD region on the source.
+          */
+         assert(type_sz(src[0].type) == 8);
+         assert(type_sz(dst.type) == 4);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst = retype(dst, BRW_REGISTER_TYPE_UD);
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+
+         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+         if (inst->opcode == VEC4_OPCODE_PICK_HIGH_32BIT)
+            src[0] = suboffset(src[0], 1);
+         src[0].vstride = BRW_VERTICAL_STRIDE_8;
+         src[0].width = BRW_WIDTH_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_2;
+         brw_MOV(p, dst, src[0]);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_SET_LOW_32BIT:
+      case VEC4_OPCODE_SET_HIGH_32BIT: {
+         /* Reads consecutive 32-bit elements from src[0] and writes
+          * them to the low/high 32-bit of each 64-bit element in dst.
+          */
+         assert(type_sz(src[0].type) == 4);
+         assert(type_sz(dst.type) == 8);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst = retype(dst, BRW_REGISTER_TYPE_UD);
+         if (inst->opcode == VEC4_OPCODE_SET_HIGH_32BIT)
+            dst = suboffset(dst, 1);
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+
+         src[0] = retype(src[0], BRW_REGISTER_TYPE_UD);
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         brw_MOV(p, dst, src[0]);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_PACK_BYTES: {
+         /* Is effectively:
+          *
+          *   mov(8) dst<16,4,1>:UB src<4,1,0>:UB
+          *
+          * but destinations' only regioning is horizontal stride, so instead we
+          * have to use two instructions:
+          *
+          *   mov(4) dst<1>:UB     src<4,1,0>:UB
+          *   mov(4) dst.16<1>:UB  src.16<4,1,0>:UB
+          *
+          * where they pack the four bytes from the low and high four DW.
+          */
+         assert(_mesa_is_pow_two(dst.writemask) &&
+                dst.writemask != 0);
+         unsigned offset = __builtin_ctz(dst.writemask);
+
+         dst.type = BRW_REGISTER_TYPE_UB;
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         src[0].type = BRW_REGISTER_TYPE_UB;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_1;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_0;
+         dst.subnr = offset * 4;
+         struct brw_inst *insn = brw_MOV(p, dst, src[0]);
+         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
+         brw_inst_set_no_dd_clear(p->devinfo, insn, true);
+         brw_inst_set_no_dd_check(p->devinfo, insn, inst->no_dd_check);
+
+         src[0].subnr = 16;
+         dst.subnr = 16 + offset * 4;
+         insn = brw_MOV(p, dst, src[0]);
+         brw_inst_set_exec_size(p->devinfo, insn, BRW_EXECUTE_4);
+         brw_inst_set_no_dd_clear(p->devinfo, insn, inst->no_dd_clear);
+         brw_inst_set_no_dd_check(p->devinfo, insn, true);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case TCS_OPCODE_URB_WRITE:
+         generate_tcs_urb_write(p, inst, src[0]);
+         break;
+
+      case VEC4_OPCODE_URB_READ:
+         generate_vec4_urb_read(p, inst, dst, src[0]);
+         break;
+
+      case TCS_OPCODE_SET_INPUT_URB_OFFSETS:
+         generate_tcs_input_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_SET_OUTPUT_URB_OFFSETS:
+         generate_tcs_output_urb_offsets(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_GET_INSTANCE_ID:
+         generate_tcs_get_instance_id(p, dst);
+         break;
+
+      case TCS_OPCODE_GET_PRIMITIVE_ID:
+         generate_tcs_get_primitive_id(p, dst);
+         break;
+
+      case TCS_OPCODE_CREATE_BARRIER_HEADER:
+         generate_tcs_create_barrier_header(p, prog_data, dst);
+         break;
+
+      case TES_OPCODE_CREATE_INPUT_READ_HEADER:
+         generate_tes_create_input_read_header(p, dst);
+         break;
+
+      case TES_OPCODE_ADD_INDIRECT_URB_OFFSET:
+         generate_tes_add_indirect_urb_offset(p, dst, src[0], src[1]);
+         break;
+
+      case TES_OPCODE_GET_PRIMITIVE_ID:
+         generate_tes_get_primitive_id(p, dst);
+         break;
+
+      case TCS_OPCODE_SRC0_010_IS_ZERO:
+         /* If src_reg had stride like fs_reg, we wouldn't need this. */
+         brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
+         break;
+
+      case TCS_OPCODE_RELEASE_INPUT:
+         generate_tcs_release_input(p, dst, src[0], src[1]);
+         break;
+
+      case TCS_OPCODE_THREAD_END:
+         generate_tcs_thread_end(p, inst);
+         break;
+
+      case SHADER_OPCODE_BARRIER:
+         brw_barrier(p, src[0]);
+         brw_WAIT(p);
+         break;
+
+      case SHADER_OPCODE_MOV_INDIRECT:
+         generate_mov_indirect(p, inst, dst, src[0], src[1], src[2]);
+         break;
+
+      case BRW_OPCODE_DIM:
+         assert(devinfo->is_haswell);
+         assert(src[0].type == BRW_REGISTER_TYPE_DF);
+         assert(dst.type == BRW_REGISTER_TYPE_DF);
+         brw_DIM(p, dst, retype(src[0], BRW_REGISTER_TYPE_F));
+         break;
+
+      default:
+         unreachable("Unsupported opcode");
+      }
+
+      if (inst->opcode == VEC4_OPCODE_PACK_BYTES) {
+         /* Handled dependency hints in the generator. */
+
+         assert(!inst->conditional_mod);
+      } else if (inst->no_dd_clear || inst->no_dd_check || inst->conditional_mod) {
+         assert(p->nr_insn == pre_emit_nr_insn + 1 ||
+                !"conditional_mod, no_dd_check, or no_dd_clear set for IR "
+                 "emitting more than 1 instruction");
+
+         brw_inst *last = &p->store[pre_emit_nr_insn];
+
+         if (inst->conditional_mod)
+            brw_inst_set_cond_modifier(p->devinfo, last, inst->conditional_mod);
+         brw_inst_set_no_dd_clear(p->devinfo, last, inst->no_dd_clear);
+         brw_inst_set_no_dd_check(p->devinfo, last, inst->no_dd_check);
+      }
+   }
+
+   brw_set_uip_jip(p, 0);
+   annotation_finalize(&annotation, p->next_insn_offset);
+
+#ifndef NDEBUG
+   bool validated = brw_validate_instructions(p, 0, &annotation);
+#else
+   if (unlikely(debug_flag))
+      brw_validate_instructions(p, 0, &annotation);
+#endif
+
+   int before_size = p->next_insn_offset;
+   brw_compact_instructions(p, 0, annotation.ann_count, annotation.ann);
+   int after_size = p->next_insn_offset;
+
+   if (unlikely(debug_flag)) {
+      fprintf(stderr, "Native code for %s %s shader %s:\n",
+              nir->info->label ? nir->info->label : "unnamed",
+              _mesa_shader_stage_to_string(nir->stage), nir->info->name);
+
+      fprintf(stderr, "%s vec4 shader: %d instructions. %d loops. %u cycles. %d:%d "
+                      "spills:fills. Compacted %d to %d bytes (%.0f%%)\n",
+              stage_abbrev, before_size / 16, loop_count, cfg->cycle_count,
+              spill_count, fill_count, before_size, after_size,
+              100.0f * (before_size - after_size) / before_size);
+
+      dump_assembly(p->store, annotation.ann_count, annotation.ann,
+                    p->devinfo);
+      ralloc_free(annotation.mem_ctx);
+   }
+   assert(validated);
+
+   compiler->shader_debug_log(log_data,
+                              "%s vec4 shader: %d inst, %d loops, %u cycles, "
+                              "%d:%d spills:fills, compacted %d to %d bytes.",
+                              stage_abbrev, before_size / 16,
+                              loop_count, cfg->cycle_count, spill_count,
+                              fill_count, before_size, after_size);
+
+}
+
+extern "C" const unsigned *
+brw_vec4_generate_assembly(const struct brw_compiler *compiler,
+                           void *log_data,
+                           void *mem_ctx,
+                           const nir_shader *nir,
+                           struct brw_vue_prog_data *prog_data,
+                           const struct cfg_t *cfg,
+                           unsigned *out_assembly_size)
+{
+   /* vec4 code is emitted in Align16 mode unless an opcode overrides it. */
+   struct brw_codegen *codegen = rzalloc(mem_ctx, struct brw_codegen);
+   brw_init_codegen(compiler->devinfo, codegen, mem_ctx);
+   brw_set_default_access_mode(codegen, BRW_ALIGN_16);
+   generate_code(codegen, compiler, log_data, nir, prog_data, cfg);
+
+   return brw_get_program(codegen, out_assembly_size);
+}
diff --git a/src/intel/compiler/brw_vec4_gs_nir.cpp b/src/intel/compiler/brw_vec4_gs_nir.cpp
new file mode 100644
index 00000000000..ed8c03b0594
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_gs_nir.cpp
@@ -0,0 +1,145 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4_gs_visitor.h"
+
+namespace brw {
+
+void
+vec4_gs_visitor::nir_setup_inputs()
+{ /* No setup here; load_per_vertex_input reads ATTR registers directly. */
+}
+
+void
+vec4_gs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_primitive_id:
+      /* Nothing to reserve: the value is read directly from g1. */
+      return;
+
+   case nir_intrinsic_load_invocation_id: {
+      /* Set up the register only the first time it is asked for. */
+      dst_reg &slot = this->nir_system_values[SYSTEM_VALUE_INVOCATION_ID];
+      if (slot.file == BAD_FILE)
+         slot = *this->make_reg_for_system_value(SYSTEM_VALUE_INVOCATION_ID);
+      return;
+   }
+
+   default:
+      /* Anything else is left to the generic vec4 visitor. */
+      vec4_visitor::nir_setup_system_value_intrinsic(instr);
+   }
+}
+
+void
+vec4_gs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   const nir_intrinsic_op op = instr->intrinsic;
+
+   switch (op) {
+   case nir_intrinsic_load_per_vertex_input: {
+      /* Our vertex index is guaranteed to be constant thanks to the
+       * EmitNoIndirectInput flag.  Indirects are not handled yet.
+       */
+      const nir_const_value *vertex = nir_src_as_const_value(instr->src[0]);
+      const nir_const_value *offset = nir_src_as_const_value(instr->src[1]);
+      const unsigned component = nir_intrinsic_component(instr);
+
+      /* ATTR index: a block of slots per vertex, then base plus offset. */
+      const unsigned attr = BRW_VARYING_SLOT_COUNT * vertex->u32[0] +
+                            instr->const_index[0] + offset->u32[0];
+
+      if (nir_dest_bit_size(instr->dest) == 64) {
+         /* 64-bit data goes through shuffle_64bit_data() into a temporary. */
+         src_reg raw = src_reg(ATTR, attr, glsl_type::dvec4_type);
+         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(shuffled, raw, false);
+
+         src_reg value = src_reg(shuffled);
+         value.swizzle = BRW_SWZ_COMP_INPUT(component / 2);
+
+         /* Write to dst reg taking into account original writemask */
+         dst_reg dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
+         dest.writemask = brw_writemask_for_size(instr->num_components);
+         emit(MOV(dest, value));
+      } else {
+         /* The real type is unknown at this point, so make up an ivec. */
+         src_reg value = src_reg(ATTR, attr,
+                                 glsl_type::ivec(instr->num_components));
+
+         /* gl_PointSize is passed in the .w component of the VUE header */
+         if (instr->const_index[0] == VARYING_SLOT_PSIZ)
+            value.swizzle = BRW_SWIZZLE_WWWW;
+         else
+            value.swizzle = BRW_SWZ_COMP_INPUT(component);
+
+         dst_reg dest = get_nir_dest(instr->dest, value.type);
+         dest.writemask = brw_writemask_for_size(instr->num_components);
+         emit(MOV(dest, value));
+      }
+      break;
+   }
+
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should have produced per_vertex intrinsics");
+
+   case nir_intrinsic_emit_vertex_with_counter:
+   case nir_intrinsic_end_primitive_with_counter:
+   case nir_intrinsic_set_vertex_count:
+      /* All three intrinsics pass the vertex count in src[0]. */
+      this->vertex_count =
+         retype(get_nir_src(instr->src[0], 1), BRW_REGISTER_TYPE_UD);
+
+      if (op == nir_intrinsic_emit_vertex_with_counter) {
+         /* const_index[0] is the stream ID. */
+         gs_emit_vertex(instr->const_index[0]);
+      } else if (op == nir_intrinsic_end_primitive_with_counter) {
+         gs_end_primitive();
+      }
+      break;
+
+   case nir_intrinsic_load_primitive_id: {
+      assert(gs_prog_data->include_primitive_id);
+      /* The primitive ID is read directly from g1. */
+      dst_reg dest = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      emit(MOV(dest, retype(brw_vec4_grf(1, 0), BRW_REGISTER_TYPE_D)));
+      break;
+   }
+
+   case nir_intrinsic_load_invocation_id: {
+      /* Set up earlier by nir_setup_system_value_intrinsic(). */
+      src_reg invocation_id =
+         src_reg(nir_system_values[SYSTEM_VALUE_INVOCATION_ID]);
+      assert(invocation_id.file != BAD_FILE);
+      dst_reg dest = get_nir_dest(instr->dest, invocation_id.type);
+      emit(MOV(dest, invocation_id));
+      break;
+   }
+
+   default:
+      /* Everything else is handled by the generic vec4 visitor. */
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+}
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.cpp b/src/intel/compiler/brw_vec4_gs_visitor.cpp
new file mode 100644
index 00000000000..4a8b5be30e1
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_gs_visitor.cpp
@@ -0,0 +1,933 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_gs_visitor.cpp
+ *
+ * Geometry-shader-specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_vec4_gs_visitor.h"
+#include "gen6_gs_visitor.h"
+#include "brw_fs.h"
+#include "brw_nir.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+vec4_gs_visitor::vec4_gs_visitor(const struct brw_compiler *compiler,
+                                 void *log_data,
+                                 struct brw_gs_compile *c,
+                                 struct brw_gs_prog_data *prog_data,
+                                 const nir_shader *shader,
+                                 void *mem_ctx,
+                                 bool no_spills,
+                                 int shader_time_index)
+   : vec4_visitor(compiler, log_data, &c->key.tex,
+                  &prog_data->base, shader,  mem_ctx,
+                  no_spills, shader_time_index),
+     c(c),                     /* GS compile state, kept by pointer */
+     gs_prog_data(prog_data)   /* full GS prog_data; base gets ->base */
+{
+}
+
+
+dst_reg *
+vec4_gs_visitor::make_reg_for_system_value(int location)
+{
+   dst_reg *reg = new(mem_ctx) dst_reg(this, glsl_type::int_type);
+   /* Emit code to initialize the register; only gl_InvocationID is handled. */
+   switch (location) {
+   case SYSTEM_VALUE_INVOCATION_ID:
+      this->current_annotation = "initialize gl_InvocationID";
+      if (gs_prog_data->invocations > 1)
+         emit(GS_OPCODE_GET_INSTANCE_ID, *reg);
+      else
+         emit(MOV(*reg, brw_imm_ud(0)));  /* single invocation => ID 0 */
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   return reg;
+}
+
+
+int
+vec4_gs_visitor::setup_varying_inputs(int payload_reg, int *attribute_map,
+                                      int attributes_per_reg)
+{
+   /* For geometry shaders there are N copies of the input attributes, where N
+    * is the number of input vertices.  attribute_map[BRW_VARYING_SLOT_COUNT *
+    * i + j] represents attribute j for vertex i.
+    *
+    * Note that GS inputs are read from the VUE 256 bits (2 vec4's) at a time,
+    * so the total number of input slots that will be delivered to the GS (and
+    * thus the stride of the input arrays) is urb_read_length * 2.
+    */
+   const unsigned num_input_vertices = nir->info->gs.vertices_in;
+   assert(num_input_vertices <= MAX_GS_INPUT_VERTICES);
+   unsigned input_array_stride = prog_data->urb_read_length * 2;
+   /* Record, per vertex, where each VUE slot's varying lands in the payload. */
+   for (int slot = 0; slot < c->input_vue_map.num_slots; slot++) {
+      int varying = c->input_vue_map.slot_to_varying[slot];
+      for (unsigned vertex = 0; vertex < num_input_vertices; vertex++) {
+         attribute_map[BRW_VARYING_SLOT_COUNT * vertex + varying] =
+            attributes_per_reg * payload_reg + input_array_stride * vertex +
+            slot;
+      }
+   }
+   /* Round the slots used by all vertices up to whole payload registers. */
+   int regs_used = ALIGN(input_array_stride * num_input_vertices,
+                         attributes_per_reg) / attributes_per_reg;
+   return payload_reg + regs_used;   /* first register after the inputs */
+}
+
+
+void
+vec4_gs_visitor::setup_payload()
+{
+   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
+
+   /* In dual instanced or single mode (i.e. anything but dual object) the
+    * attributes are interleaved, so one register contains two attribute slots.
+    */
+   int attributes_per_reg =
+      prog_data->dispatch_mode == DISPATCH_MODE_4X2_DUAL_OBJECT ? 1 : 2;
+
+   /* If a geometry shader tries to read from an input that wasn't written by
+    * the vertex shader, that produces undefined results, but it shouldn't
+    * crash anything.  So initialize attribute_map to zeros--that ensures that
+    * these undefined results are read from r0.
+    */
+   memset(attribute_map, 0, sizeof(attribute_map));
+
+   int reg = 0;
+
+   /* The payload always contains important data in r0, which contains
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.
+    */
+   reg++;
+
+   /* If the shader uses gl_PrimitiveIDIn, that goes in r1. */
+   if (gs_prog_data->include_primitive_id)
+      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg++;
+
+   reg = setup_uniforms(reg);
+
+   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
+
+   lower_attributes_to_hw_regs(attribute_map, attributes_per_reg > 1);
+   /* Remember the first register that is not part of the payload. */
+   this->first_non_payload_grf = reg;
+}
+
+
+void
+vec4_gs_visitor::emit_prolog()
+{
+   /* In vertex shaders, r0.2 is guaranteed to be initialized to zero.  In
+    * geometry shaders, it isn't (it contains a bunch of information we don't
+    * need, like the input primitive type).  We need r0.2 to be zero in order
+    * to build scratch read/write messages correctly (otherwise this value
+    * will be interpreted as a global offset, causing us to do our scratch
+    * reads/writes to garbage memory).  So just set it to zero at the top of
+    * the shader.
+    */
+   this->current_annotation = "clear r0.2";
+   dst_reg r0(retype(brw_vec4_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   vec4_instruction *inst = emit(GS_OPCODE_SET_DWORD_2, r0, brw_imm_ud(0u));
+   inst->force_writemask_all = true;
+
+   /* Create a virtual register to hold the vertex count */
+   this->vertex_count = src_reg(this, glsl_type::uint_type);
+
+   /* Initialize the vertex_count register to 0 */
+   this->current_annotation = "initialize vertex_count";
+   inst = emit(MOV(dst_reg(this->vertex_count), brw_imm_ud(0u)));
+   inst->force_writemask_all = true;
+
+   if (c->control_data_header_size_bits > 0) {
+      /* Create a virtual register to hold the current set of control data
+       * bits.  It is only allocated when the shader has control data at all.
+       */
+      this->control_data_bits = src_reg(this, glsl_type::uint_type);
+
+      /* If we're outputting more than 32 control data bits, then EmitVertex()
+       * will set control_data_bits to 0 after emitting the first vertex.
+       * Otherwise, we need to initialize it to 0 here.
+       */
+      if (c->control_data_header_size_bits <= 32) {
+         this->current_annotation = "initialize control data bits";
+         inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
+         inst->force_writemask_all = true;
+      }
+   }
+   /* The prolog is complete; clear the annotation set above. */
+   this->current_annotation = NULL;
+}
+
+void
+vec4_gs_visitor::emit_thread_end()
+{
+   if (c->control_data_header_size_bits > 0) {
+      /* During shader execution, we only ever call emit_control_data_bits()
+       * just prior to outputting a vertex.  Therefore, the control data bits
+       * corresponding to the most recently output vertex still need to be
+       * emitted.
+       */
+      current_annotation = "thread end: emit control data bits";
+      emit_control_data_bits();
+   }
+
+   /* MRF 0 is reserved for the debugger, so start with message header
+    * in MRF 1.
+    */
+   int base_mrf = 1;
+
+   bool static_vertex_count = gs_prog_data->static_vertex_count != -1;
+
+   /* If the previous instruction was a URB write, we don't need to issue
+    * a second one - we can just set the EOT bit on the previous write.
+    *
+    * Only do this on Gen8+ with a static vertex count (and no shader-time
+    * debugging).  Otherwise the vertex count also has to be written out
+    * below, and combining the two may not be straightforward.
+    */
+   vec4_instruction *last = (vec4_instruction *) instructions.get_tail();
+   if (last && last->opcode == GS_OPCODE_URB_WRITE &&
+       !(INTEL_DEBUG & DEBUG_SHADER_TIME) &&
+       devinfo->gen >= 8 && static_vertex_count) {
+      last->urb_write_flags = BRW_URB_WRITE_EOT | last->urb_write_flags;
+      return;
+   }
+
+   current_annotation = "thread end";
+   dst_reg mrf_reg(MRF, base_mrf);
+   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+   inst->force_writemask_all = true;
+   if (devinfo->gen < 8 || !static_vertex_count)
+      emit(GS_OPCODE_SET_VERTEX_COUNT, mrf_reg, this->vertex_count);
+   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
+      emit_shader_time_end();
+   inst = emit(GS_OPCODE_THREAD_END);
+   inst->base_mrf = base_mrf;
+   inst->mlen = devinfo->gen >= 8 && !static_vertex_count ? 2 : 1;
+}
+
+
+void
+vec4_gs_visitor::emit_urb_write_header(int mrf)
+{
+   /* The SEND instruction that writes the vertex data to the VUE will use
+    * per_slot_offset=true, which means that DWORDs 3 and 4 of the message
+    * header specify an offset (in multiples of 256 bits) into the URB entry
+    * at which the write should take place.
+    *
+    * So build the header in MRF `mrf`: start from a copy of r0, then set the
+    * offsets from vertex_count and the per-vertex output size in hwords.
+    */
+   dst_reg mrf_reg(MRF, mrf);
+   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   this->current_annotation = "URB write header";
+   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+   inst->force_writemask_all = true;
+   emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, this->vertex_count,
+        brw_imm_ud(gs_prog_data->output_vertex_size_hwords));
+}
+
+
+vec4_instruction *
+vec4_gs_visitor::emit_urb_write_opcode(bool complete)
+{
+   /* We don't care whether the vertex is complete, because in general
+    * geometry shaders output multiple vertices, and we don't terminate the
+    * thread until all vertices are complete.
+    */
+   (void) complete;
+   /* Start writing vertex data just past the control data header. */
+   vec4_instruction *inst = emit(GS_OPCODE_URB_WRITE);
+   inst->offset = gs_prog_data->control_data_header_size_hwords;
+
+   /* We need to increment Global Offset by 1 to make room for Broadwell's
+    * extra "Vertex Count" payload at the beginning of the URB entry.
+    */
+   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
+      inst->offset++;
+   /* The per-vertex position comes from the offsets in the message header. */
+   inst->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+   return inst;
+}
+
+
+/**
+ * Write out a batch of 32 control data bits from the control_data_bits
+ * register to the URB.
+ *
+ * The current value of the vertex_count register determines which DWORD in
+ * the URB receives the control data bits.  The control_data_bits register is
+ * assumed to contain the correct data for the vertex that was most recently
+ * output, and all previous vertices that share the same DWORD.
+ *
+ * This function takes care of ensuring that if no vertices have been output
+ * yet, no control bits are emitted.
+ */
+void
+vec4_gs_visitor::emit_control_data_bits()
+{
+   assert(c->control_data_bits_per_vertex != 0);
+
+   /* Since the URB_WRITE_OWORD message operates with 128-bit (vec4 sized)
+    * granularity, we need to use two tricks to ensure that the batch of 32
+    * control data bits is written to the appropriate DWORD in the URB.  To
+    * select which vec4 we are writing to, we use the "slot {0,1} offset"
+    * fields of the message header.  To select which DWORD in the vec4 we are
+    * writing to, we use the channel mask fields of the message header.  To
+    * avoid penalizing geometry shaders that emit a small number of vertices
+    * with extra bookkeeping, we only do each of these tricks when
+    * c->prog_data.control_data_header_size_bits is large enough to make it
+    * necessary.
+    *
+    * Note: this means that if we're outputting just a single DWORD of control
+    * data bits, we'll actually replicate it four times since we won't do any
+    * channel masking.  But that's not a problem since in this case the
+    * hardware only pays attention to the first DWORD.
+    */
+   enum brw_urb_write_flags urb_write_flags = BRW_URB_WRITE_OWORD;
+   if (c->control_data_header_size_bits > 32)
+      urb_write_flags = urb_write_flags | BRW_URB_WRITE_USE_CHANNEL_MASKS;
+   if (c->control_data_header_size_bits > 128)
+      urb_write_flags = urb_write_flags | BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+   /* If we are using either channel masks or a per-slot offset, we need to
+    * figure out which DWORD we are trying to write to, using the formula:
+    *
+    *     dword_index = (vertex_count - 1) * bits_per_vertex / 32
+    *
+    * Since bits_per_vertex is a power of two known at compile time, this is:
+    *
+    *     dword_index = (vertex_count - 1) >> (5 - log2(bits_per_vertex))
+    *
+    * util_last_bit() is presumably 1-based (log2 + 1), hence "6 -"; confirm.
+    */
+   src_reg dword_index(this, glsl_type::uint_type);
+   if (urb_write_flags) { /* NOTE(review): always true if OWORD != 0? */
+      src_reg prev_count(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(prev_count), this->vertex_count,
+               brw_imm_ud(0xffffffffu)));
+      unsigned log2_bits_per_vertex =
+         util_last_bit(c->control_data_bits_per_vertex);
+      emit(SHR(dst_reg(dword_index), prev_count,
+               brw_imm_ud(6 - log2_bits_per_vertex)));
+   }
+
+   /* Start building the URB write message.  The first MRF gets a copy of
+    * R0.
+    */
+   int base_mrf = 1;
+   dst_reg mrf_reg(MRF, base_mrf);
+   src_reg r0(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
+   vec4_instruction *inst = emit(MOV(mrf_reg, r0));
+   inst->force_writemask_all = true;
+
+   if (urb_write_flags & BRW_URB_WRITE_PER_SLOT_OFFSET) {
+      /* Set the per-slot offset to dword_index / 4, so that we'll write to
+       * the appropriate OWORD within the control data header.
+       */
+      src_reg per_slot_offset(this, glsl_type::uint_type);
+      emit(SHR(dst_reg(per_slot_offset), dword_index, brw_imm_ud(2u)));
+      emit(GS_OPCODE_SET_WRITE_OFFSET, mrf_reg, per_slot_offset,
+           brw_imm_ud(1u));
+   }
+
+   if (urb_write_flags & BRW_URB_WRITE_USE_CHANNEL_MASKS) {
+      /* Set the channel masks to 1 << (dword_index % 4), so that we'll
+       * write to the appropriate DWORD within the OWORD.  We need to do
+       * this computation with force_writemask_all, otherwise garbage data
+       * from invocation 0 might clobber the mask for invocation 1 when
+       * GS_OPCODE_PREPARE_CHANNEL_MASKS tries to OR the two masks
+       * together.
+       */
+      src_reg channel(this, glsl_type::uint_type);
+      inst = emit(AND(dst_reg(channel), dword_index, brw_imm_ud(3u)));
+      inst->force_writemask_all = true;
+      src_reg one(this, glsl_type::uint_type);
+      inst = emit(MOV(dst_reg(one), brw_imm_ud(1u)));
+      inst->force_writemask_all = true;
+      src_reg channel_mask(this, glsl_type::uint_type);
+      inst = emit(SHL(dst_reg(channel_mask), one, channel));
+      inst->force_writemask_all = true;
+      emit(GS_OPCODE_PREPARE_CHANNEL_MASKS, dst_reg(channel_mask),
+                                            channel_mask);
+      emit(GS_OPCODE_SET_CHANNEL_MASKS, mrf_reg, channel_mask);
+   }
+
+   /* Store the control data bits in the message payload and send it. */
+   dst_reg mrf_reg2(MRF, base_mrf + 1);
+   inst = emit(MOV(mrf_reg2, this->control_data_bits));
+   inst->force_writemask_all = true;
+   inst = emit(GS_OPCODE_URB_WRITE);
+   inst->urb_write_flags = urb_write_flags;
+   /* We need to increment Global Offset by 256-bits to make room for
+    * Broadwell's extra "Vertex Count" payload at the beginning of the
+    * URB entry.  Since this is an OWord message, Global Offset is counted
+    * in 128-bit units, so we must set it to 2.
+    */
+   if (devinfo->gen >= 8 && gs_prog_data->static_vertex_count == -1)
+      inst->offset = 2;
+   inst->base_mrf = base_mrf;
+   inst->mlen = 2;
+}
+
+void
+vec4_gs_visitor::set_stream_control_data_bits(unsigned stream_id)
+{
+   /* control_data_bits |= stream_id << ((2 * (vertex_count - 1)) % 32) */
+
+   /* Note: we are calling this *before* increasing vertex_count, so
+    * this->vertex_count == vertex_count - 1 in the formula above.
+    */
+
+   /* Stream mode uses 2 bits per vertex */
+   assert(c->control_data_bits_per_vertex == 2);
+
+   /* Must be a valid stream; stream_id is unsigned, so >= 0 is implied. */
+   assert(stream_id < MAX_VERTEX_STREAMS);
+
+   /* Control data bits are initialized to 0 so we don't have to set any
+    * bits when sending vertices to stream 0.
+    */
+   if (stream_id == 0)
+      return;
+
+   /* reg::sid = stream_id */
+   src_reg sid(this, glsl_type::uint_type);
+   emit(MOV(dst_reg(sid), brw_imm_ud(stream_id)));
+
+   /* reg:shift_count = 2 * (vertex_count - 1) */
+   src_reg shift_count(this, glsl_type::uint_type);
+   emit(SHL(dst_reg(shift_count), this->vertex_count, brw_imm_ud(1u)));
+
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, stream_id << 2 * (vertex_count - 1) is equivalent to
+    * stream_id << ((2 * (vertex_count - 1)) % 32).
+    */
+   src_reg mask(this, glsl_type::uint_type);
+   emit(SHL(dst_reg(mask), sid, shift_count));
+   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
+}
+
+void
+vec4_gs_visitor::gs_emit_vertex(int stream_id)
+{
+   this->current_annotation = "emit vertex: safety check";
+
+   /* Haswell and later hardware ignores the "Render Stream Select" bits
+    * from the 3DSTATE_STREAMOUT packet when the SOL stage is disabled,
+    * and instead sends all primitives down the pipeline for rasterization.
+    * If the SOL stage is enabled, "Render Stream Select" is honored and
+    * primitives bound to non-zero streams are discarded after stream output.
+    *
+    * Since the only purpose of primitives sent to non-zero streams is to
+    * be recorded by transform feedback, we can simply discard all geometry
+    * bound to these streams when transform feedback is disabled.
+    */
+   if (stream_id > 0 && !nir->info->has_transform_feedback_varyings)
+      return;
+
+   /* If we're outputting 32 control data bits or less, then we can wait
+    * until the shader is over to output them all.  Otherwise we need to
+    * output them as we go.  Now is the time to do it, since we're about to
+    * output the vertex_count'th vertex, so it's guaranteed that the
+    * control data bits associated with the (vertex_count - 1)th vertex are
+    * correct.
+    */
+   if (c->control_data_header_size_bits > 32) {
+      this->current_annotation = "emit vertex: emit control data bits";
+      /* Only emit control data bits if we've finished accumulating a batch
+       * of 32 bits.  This is the case when:
+       *
+       *     (vertex_count * bits_per_vertex) % 32 == 0
+       *
+       * (in other words, when the last 5 bits of vertex_count *
+       * bits_per_vertex are 0).  Assuming bits_per_vertex == 2^n for some
+       * integer n (which is always the case, since bits_per_vertex is
+       * always 1 or 2), this is equivalent to requiring that the last 5-n
+       * bits of vertex_count are 0:
+       *
+       *     vertex_count & (2^(5-n) - 1) == 0
+       *
+       * 2^(5-n) == 2^5 / 2^n == 32 / bits_per_vertex, so this is
+       * equivalent to:
+       *
+       *     vertex_count & (32 / bits_per_vertex - 1) == 0
+       */
+      vec4_instruction *inst =
+         emit(AND(dst_null_ud(), this->vertex_count,
+                  brw_imm_ud(32 / c->control_data_bits_per_vertex - 1)));
+      inst->conditional_mod = BRW_CONDITIONAL_Z;
+
+      emit(IF(BRW_PREDICATE_NORMAL));
+      {
+         /* If vertex_count is 0, then no control data bits have been
+          * accumulated yet, so we skip emitting them.
+          */
+         emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u),
+                  BRW_CONDITIONAL_NEQ));
+         emit(IF(BRW_PREDICATE_NORMAL));
+         emit_control_data_bits();
+         emit(BRW_OPCODE_ENDIF);
+
+         /* Reset control_data_bits to 0 so we can start accumulating a new
+          * batch.
+          *
+          * Note: in the case where vertex_count == 0, this neutralizes the
+          * effect of any call to EndPrimitive() that the shader may have
+          * made before outputting its first vertex.
+          */
+         inst = emit(MOV(dst_reg(this->control_data_bits), brw_imm_ud(0u)));
+         inst->force_writemask_all = true;
+      }
+      emit(BRW_OPCODE_ENDIF);
+   }
+
+   this->current_annotation = "emit vertex: vertex data";
+   emit_vertex();
+
+   /* In stream mode we have to set control data bits for all vertices
+    * unless we have disabled control data bits completely (which we
+    * do for GL_POINTS outputs that don't use streams).
+    */
+   if (c->control_data_header_size_bits > 0 &&
+       gs_prog_data->control_data_format ==
+          GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID) {
+       this->current_annotation = "emit vertex: Stream control data bits";
+       set_stream_control_data_bits(stream_id);
+   }
+
+   this->current_annotation = NULL;
+}
+
+void
+vec4_gs_visitor::gs_end_primitive()
+{
+   /* We can only do EndPrimitive() functionality when the control data
+    * consists of cut bits.  Fortunately, the only time it isn't is when the
+    * output type is points, in which case EndPrimitive() is a no-op.
+    */
+   if (gs_prog_data->control_data_format !=
+       GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT) {
+      return;
+   }
+
+   if (c->control_data_header_size_bits == 0)
+      return;
+
+   /* Cut bits use one bit per vertex. */
+   assert(c->control_data_bits_per_vertex == 1);
+
+   /* Cut bit n should be set to 1 if EndPrimitive() was called after emitting
+    * vertex n, 0 otherwise.  So all we need to do here is mark bit
+    * (vertex_count - 1) % 32 in the control_data_bits register to indicate
+    * that EndPrimitive() was called after emitting vertex (vertex_count - 1);
+    * vec4_gs_visitor::emit_control_data_bits() will take care of the rest.
+    *
+    * Note that if EndPrimitive() is called before emitting any vertices, this
+    * will cause us to set bit 31 of the control_data_bits register to 1.
+    * That's fine because:
+    *
+    * - If max_vertices < 32, then vertex number 31 (zero-based) will never be
+    *   output, so the hardware will ignore cut bit 31.
+    *
+    * - If max_vertices == 32, then vertex number 31 is guaranteed to be the
+    *   last vertex, so setting cut bit 31 has no effect (since the primitive
+    *   is automatically ended when the GS terminates).
+    *
+    * - If max_vertices > 32, then gs_emit_vertex() will reset the
+    *   control_data_bits register to 0 when the first vertex is emitted.
+    */
+
+   /* control_data_bits |= 1 << ((vertex_count - 1) % 32) */
+   src_reg one(this, glsl_type::uint_type);
+   emit(MOV(dst_reg(one), brw_imm_ud(1u)));
+   src_reg prev_count(this, glsl_type::uint_type);
+   emit(ADD(dst_reg(prev_count), this->vertex_count, brw_imm_ud(0xffffffffu)));
+   src_reg mask(this, glsl_type::uint_type);
+   /* Note: we're relying on the fact that the GEN SHL instruction only pays
+    * attention to the lower 5 bits of its second source argument, so on this
+    * architecture, 1 << (vertex_count - 1) is equivalent to 1 <<
+    * ((vertex_count - 1) % 32).
+    */
+   emit(SHL(dst_reg(mask), one, prev_count));
+   emit(OR(dst_reg(this->control_data_bits), this->control_data_bits, mask));
+}
+
+static const GLuint gl_prim_to_hw_prim[GL_TRIANGLE_STRIP_ADJACENCY+1] = {
+   [GL_POINTS] =_3DPRIM_POINTLIST,
+   [GL_LINES] = _3DPRIM_LINELIST,
+   [GL_LINE_LOOP] = _3DPRIM_LINELOOP,
+   [GL_LINE_STRIP] = _3DPRIM_LINESTRIP,
+   [GL_TRIANGLES] = _3DPRIM_TRILIST,
+   [GL_TRIANGLE_STRIP] = _3DPRIM_TRISTRIP,
+   [GL_TRIANGLE_FAN] = _3DPRIM_TRIFAN,
+   [GL_QUADS] = _3DPRIM_QUADLIST,
+   [GL_QUAD_STRIP] = _3DPRIM_QUADSTRIP,
+   [GL_POLYGON] = _3DPRIM_POLYGON,
+   [GL_LINES_ADJACENCY] = _3DPRIM_LINELIST_ADJ,
+   [GL_LINE_STRIP_ADJACENCY] = _3DPRIM_LINESTRIP_ADJ,
+   [GL_TRIANGLES_ADJACENCY] = _3DPRIM_TRILIST_ADJ,
+   [GL_TRIANGLE_STRIP_ADJACENCY] = _3DPRIM_TRISTRIP_ADJ,
+};  /* indexed by GL primitive enum; yields the matching _3DPRIM_* value */
+
+extern "C" const unsigned *
+brw_compile_gs(const struct brw_compiler *compiler, void *log_data,
+               void *mem_ctx,
+               const struct brw_gs_prog_key *key,
+               struct brw_gs_prog_data *prog_data,
+               const nir_shader *src_shader,
+               struct gl_program *prog,
+               int shader_time_index,
+               unsigned *final_assembly_size,
+               char **error_str)
+{
+   struct brw_gs_compile c;
+   memset(&c, 0, sizeof(c));
+   c.key = *key;
+
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_GEOMETRY];
+   nir_shader *shader = nir_shader_clone(mem_ctx, src_shader);
+
+   /* The GLSL linker will have already matched up GS inputs and the outputs
+    * of prior stages.  The driver does extend VS outputs in some cases, but
+    * only for legacy OpenGL or Gen4-5 hardware, neither of which offer
+    * geometry shader support.  So we can safely ignore that.
+    *
+    * For SSO pipelines, we use a fixed VUE map layout based on variable
+    * locations, so we can rely on rendezvous-by-location making this work.
+    */
+   GLbitfield64 inputs_read = shader->info->inputs_read;
+   brw_compute_vue_map(compiler->devinfo,
+                       &c.input_vue_map, inputs_read,
+                       shader->info->separate_shader);
+
+   shader = brw_nir_apply_sampler_key(shader, compiler, &key->tex, is_scalar);
+   brw_nir_lower_vue_inputs(shader, is_scalar, &c.input_vue_map);
+   brw_nir_lower_vue_outputs(shader, is_scalar);
+   shader = brw_postprocess_nir(shader, compiler, is_scalar);
+
+   prog_data->base.clip_distance_mask =
+      ((1 << shader->info->clip_distance_array_size) - 1);
+   prog_data->base.cull_distance_mask =
+      ((1 << shader->info->cull_distance_array_size) - 1) <<
+      shader->info->clip_distance_array_size;
+
+   prog_data->include_primitive_id =
+      (shader->info->system_values_read & (1 << SYSTEM_VALUE_PRIMITIVE_ID)) != 0;
+
+   prog_data->invocations = shader->info->gs.invocations;
+
+   if (compiler->devinfo->gen >= 8)
+      prog_data->static_vertex_count = nir_gs_count_vertices(shader);
+
+   if (compiler->devinfo->gen >= 7) {
+      if (shader->info->gs.output_primitive == GL_POINTS) {
+         /* When the output type is points, the geometry shader may output data
+          * to multiple streams, and EndPrimitive() has no effect.  So we
+          * configure the hardware to interpret the control data as stream ID.
+          */
+         prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_SID;
+
+         /* We only have to emit control bits if we are using streams */
+         if (prog && prog->info.gs.uses_streams)
+            c.control_data_bits_per_vertex = 2;
+         else
+            c.control_data_bits_per_vertex = 0;
+      } else {
+         /* When the output type is triangle_strip or line_strip, EndPrimitive()
+          * may be used to terminate the current strip and start a new one
+          * (similar to primitive restart), and outputting data to multiple
+          * streams is not supported.  So we configure the hardware to interpret
+          * the control data as EndPrimitive information (a.k.a. "cut bits").
+          */
+         prog_data->control_data_format = GEN7_GS_CONTROL_DATA_FORMAT_GSCTL_CUT;
+
+         /* We only need to output control data if the shader actually calls
+          * EndPrimitive().
+          */
+         c.control_data_bits_per_vertex =
+            shader->info->gs.uses_end_primitive ? 1 : 0;
+      }
+   } else {
+      /* There are no control data bits in gen6. */
+      c.control_data_bits_per_vertex = 0;
+   }
+   c.control_data_header_size_bits =
+      shader->info->gs.vertices_out * c.control_data_bits_per_vertex;
+
+   /* 1 HWORD = 32 bytes = 256 bits */
+   prog_data->control_data_header_size_hwords =
+      ALIGN(c.control_data_header_size_bits, 256) / 256;
+
+   /* Compute the output vertex size.
+    *
+    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 STATE_GS - Output Vertex
+    * Size (p168):
+    *
+    *     [0,62] indicating [1,63] 16B units
+    *
+    *     Specifies the size of each vertex stored in the GS output entry
+    *     (following any Control Header data) as a number of 128-bit units
+    *     (minus one).
+    *
+    *     Programming Restrictions: The vertex size must be programmed as a
+    *     multiple of 32B units with the following exception: Rendering is
+    *     disabled (as per SOL stage state) and the vertex size output by the
+    *     GS thread is 16B.
+    *
+    *     If rendering is enabled (as per SOL state) the vertex size must be
+    *     programmed as a multiple of 32B units. In other words, the only time
+    *     software can program a vertex size with an odd number of 16B units
+    *     is when rendering is disabled.
+    *
+    * Note: B=bytes in the above text.
+    *
+    * It doesn't seem worth the extra trouble to optimize the case where the
+    * vertex size is 16B (especially since this would require special-casing
+    * the GEN assembly that writes to the URB).  So we just set the vertex
+    * size to a multiple of 32B (2 vec4's) in all cases.
+    *
+    * The maximum output vertex size is 62*16 = 992 bytes (31 hwords).  We
+    * budget that as follows:
+    *
+    *   512 bytes for varyings (a varying component is 4 bytes and
+    *             gl_MaxGeometryOutputComponents = 128)
+    *    16 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *             bytes)
+    *    16 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *             even if it's not used)
+    *    32 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *             whenever clip planes are enabled, even if the shader doesn't
+    *             write to gl_ClipDistance)
+    *    16 bytes overhead since the VUE size must be a multiple of 32 bytes
+    *             (see below)--this causes up to 1 VUE slot to be wasted
+    *   400 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot (12 bytes)
+    * per interpolation type, so this is plenty.
+    *
+    */
+   unsigned output_vertex_size_bytes = prog_data->base.vue_map.num_slots * 16;
+   assert(compiler->devinfo->gen == 6 ||
+          output_vertex_size_bytes <= GEN7_MAX_GS_OUTPUT_VERTEX_SIZE_BYTES);
+   prog_data->output_vertex_size_hwords =
+      ALIGN(output_vertex_size_bytes, 32) / 32;
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     64 bytes for the control data header (cut indices or StreamID bits)
+    *   4096 bytes for varyings (a varying component is 4 bytes and
+    *              gl_MaxGeometryTotalOutputComponents = 1024)
+    *   4096 bytes overhead for VARYING_SLOT_PSIZ (each varying slot is 16
+    *              bytes/vertex and gl_MaxGeometryOutputVertices is 256)
+    *   4096 bytes overhead for gl_Position (we allocate it a slot in the VUE
+    *              even if it's not used)
+    *   8192 bytes overhead for gl_ClipDistance (we allocate it 2 VUE slots
+    *              whenever clip planes are enabled, even if the shader doesn't
+    *              write to gl_ClipDistance)
+    *   4096 bytes overhead since the VUE size must be a multiple of 32
+    *              bytes (see above)--this causes up to 1 VUE slot to be wasted
+    *   8128 bytes available for varying packing overhead
+    *
+    * Worst-case varying packing overhead is 3/4 of a varying slot per
+    * interpolation type, which works out to 3072 bytes, so this would allow
+    * us to accommodate 2 interpolation types without any danger of running
+    * out of URB space.
+    *
+    * In practice, the risk of running out of URB space is very small, since
+    * the above figures are all worst-case, and most of them scale with the
+    * number of output vertices.  So we'll just calculate the amount of space
+    * we need, and if it's too large, fail to compile.
+    *
+    * The above is for gen7+ where we have a single URB entry that will hold
+    * all the output. In gen6, we will have to allocate URB entries for every
+    * vertex we emit, so our URB entries only need to be large enough to hold
+    * a single vertex. Also, gen6 does not have a control data header.
+    */
+   unsigned output_size_bytes;
+   if (compiler->devinfo->gen >= 7) {
+      output_size_bytes =
+         prog_data->output_vertex_size_hwords * 32 * shader->info->gs.vertices_out;
+      output_size_bytes += 32 * prog_data->control_data_header_size_hwords;
+   } else {
+      output_size_bytes = prog_data->output_vertex_size_hwords * 32;
+   }
+
+   /* Broadwell stores "Vertex Count" as a full 8 DWord (32 byte) URB output,
+    * which comes before the control header.
+    */
+   if (compiler->devinfo->gen >= 8)
+      output_size_bytes += 32;
+
+   /* Shaders can technically set max_vertices = 0, at which point we
+    * may have a URB size of 0 bytes.  Nothing good can come from that,
+    * so enforce a minimum size.
+    */
+   if (output_size_bytes == 0)
+      output_size_bytes = 1;
+
+   unsigned max_output_size_bytes = GEN7_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (compiler->devinfo->gen == 6)
+      max_output_size_bytes = GEN6_MAX_GS_URB_ENTRY_SIZE_BYTES;
+   if (output_size_bytes > max_output_size_bytes)
+      return NULL;
+
+
+   /* URB entry sizes are stored as a multiple of 64 bytes in gen7+ and
+    * a multiple of 128 bytes in gen6.
+    */
+   if (compiler->devinfo->gen >= 7)
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+   else
+      prog_data->base.urb_entry_size = ALIGN(output_size_bytes, 128) / 128;
+
+   assert(shader->info->gs.output_primitive < ARRAY_SIZE(gl_prim_to_hw_prim));
+   prog_data->output_topology =
+      gl_prim_to_hw_prim[shader->info->gs.output_primitive];
+
+   prog_data->vertices_in = shader->info->gs.vertices_in;
+
+   /* GS inputs are read from the VUE 256 bits (2 vec4's) at a time, so we
+    * need to program a URB read length of ceiling(num_slots / 2).
+    */
+   prog_data->base.urb_read_length = (c.input_vue_map.num_slots + 1) / 2;
+
+   /* Now that prog_data setup is done, we are ready to actually compile the
+    * program.
+    */
+   if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+      fprintf(stderr, "GS Input ");
+      brw_print_vue_map(stderr, &c.input_vue_map);
+      fprintf(stderr, "GS Output ");
+      brw_print_vue_map(stderr, &prog_data->base.vue_map);
+   }
+
+   if (is_scalar) {
+      fs_visitor v(compiler, log_data, mem_ctx, &c, prog_data, shader,
+                   shader_time_index);
+      if (v.run_gs()) {
+         prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+         prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+
+         fs_generator g(compiler, log_data, mem_ctx, &c.key,
+                        &prog_data->base.base, v.promoted_constants,
+                        false, MESA_SHADER_GEOMETRY);
+         if (unlikely(INTEL_DEBUG & DEBUG_GS)) {
+            const char *label =
+               shader->info->label ? shader->info->label : "unnamed";
+            char *name = ralloc_asprintf(mem_ctx, "%s geometry shader %s",
+                                         label, shader->info->name);
+            g.enable_debug(name);
+         }
+         g.generate_code(v.cfg, 8);
+         return g.get_assembly(final_assembly_size);
+      }
+   }
+
+   if (compiler->devinfo->gen >= 7) {
+      /* Compile the geometry shader in DUAL_OBJECT dispatch mode, if we can do
+       * so without spilling. If the GS invocations count > 1, then we can't use
+       * dual object mode.
+       */
+      if (prog_data->invocations <= 1 &&
+          likely(!(INTEL_DEBUG & DEBUG_NO_DUAL_OBJECT_GS))) {
+         prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+
+         vec4_gs_visitor v(compiler, log_data, &c, prog_data, shader,
+                           mem_ctx, true /* no_spills */, shader_time_index);
+         if (v.run()) {
+            return brw_vec4_generate_assembly(compiler, log_data, mem_ctx,
+                                              shader, &prog_data->base, v.cfg,
+                                              final_assembly_size);
+         }
+      }
+   }
+
+   /* Either we failed to compile in DUAL_OBJECT mode (probably because it
+    * would have required spilling) or DUAL_OBJECT mode is disabled.  So fall
+    * back to DUAL_INSTANCED or SINGLE mode, which consumes fewer registers.
+    *
+    * FIXME: Single dispatch mode requires that the driver can handle
+    * interleaving of input registers, but this is already supported (dual
+    * instance mode has the same requirement). However, to take full advantage
+    * of single dispatch mode to reduce register pressure we would also need to
+    * do interleaved outputs, but currently, the vec4 visitor and generator
+    * classes do not support this, so at the moment register pressure in
+    * single and dual instance modes is the same.
+    *
+    * From the Ivy Bridge PRM, Vol2 Part1 7.2.1.1 "3DSTATE_GS"
+    * "If InstanceCount>1, DUAL_OBJECT mode is invalid. Software will likely
+    * want to use DUAL_INSTANCE mode for higher performance, but SINGLE mode
+    * is also supported. When InstanceCount=1 (one instance per object) software
+    * can decide which dispatch mode to use. DUAL_OBJECT mode would likely be
+    * the best choice for performance, followed by SINGLE mode."
+    *
+    * So SINGLE mode is more performant when invocations == 1 and DUAL_INSTANCE
+    * mode is more performant when invocations > 1. Gen6 only supports
+    * SINGLE mode.
+    */
+   if (prog_data->invocations <= 1 || compiler->devinfo->gen < 7)
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X1_SINGLE;
+   else
+      prog_data->base.dispatch_mode = DISPATCH_MODE_4X2_DUAL_INSTANCE;
+
+   vec4_gs_visitor *gs = NULL;
+   const unsigned *ret = NULL;
+
+   if (compiler->devinfo->gen >= 7)
+      gs = new vec4_gs_visitor(compiler, log_data, &c, prog_data,
+                               shader, mem_ctx, false /* no_spills */,
+                               shader_time_index);
+   else
+      gs = new gen6_gs_visitor(compiler, log_data, &c, prog_data, prog,
+                               shader, mem_ctx, false /* no_spills */,
+                               shader_time_index);
+
+   if (!gs->run()) {
+      if (error_str)
+         *error_str = ralloc_strdup(mem_ctx, gs->fail_msg);
+   } else {
+      ret = brw_vec4_generate_assembly(compiler, log_data, mem_ctx, shader,
+                                       &prog_data->base, gs->cfg,
+                                       final_assembly_size);
+   }
+
+   delete gs;
+   return ret;
+}
+
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_gs_visitor.h b/src/intel/compiler/brw_vec4_gs_visitor.h
new file mode 100644
index 00000000000..09221f928d1
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_gs_visitor.h
@@ -0,0 +1,81 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_gs_visitor.h
+ *
+ * Geometry-shader-specific code derived from the vec4_visitor class.
+ */
+
+#ifndef BRW_VEC4_GS_VISITOR_H
+#define BRW_VEC4_GS_VISITOR_H
+
+#include "brw_vec4.h"
+
+#define MAX_GS_INPUT_VERTICES 6
+
+#ifdef __cplusplus
+namespace brw {
+
+class vec4_gs_visitor : public vec4_visitor /* GS-specific vec4 visitor */
+{
+public:
+   vec4_gs_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
+                   struct brw_gs_compile *c,
+                   struct brw_gs_prog_data *prog_data,
+                   const nir_shader *shader,
+                   void *mem_ctx,
+                   bool no_spills, /* true for the DUAL_OBJECT attempt */
+                   int shader_time_index);
+
+   virtual void nir_setup_inputs();
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+
+protected: /* virtual hooks; presumably vec4_visitor overrides -- confirm */
+   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+   virtual void emit_urb_write_header(int mrf);
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+protected: /* non-virtual helpers and per-compile state */
+   int setup_varying_inputs(int payload_reg, int *attribute_map,
+                            int attributes_per_reg);
+   void emit_control_data_bits();
+   void set_stream_control_data_bits(unsigned stream_id);
+
+   src_reg vertex_count;
+   src_reg control_data_bits;
+   const struct brw_gs_compile * const c; /* presumably the ctor's c */
+   struct brw_gs_prog_data * const gs_prog_data; /* presumably prog_data */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_GS_VISITOR_H */
diff --git a/src/intel/compiler/brw_vec4_live_variables.cpp b/src/intel/compiler/brw_vec4_live_variables.cpp
new file mode 100644
index 00000000000..73f658cd8fa
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_live_variables.cpp
@@ -0,0 +1,343 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "brw_cfg.h"
+#include "brw_vec4_live_variables.h"
+
+using namespace brw;
+
+/** @file brw_vec4_live_variables.cpp
+ *
+ * Support for computing at the basic block level which variables
+ * (virtual GRFs in our case) are live at entry and exit.
+ *
+ * See Muchnick's Advanced Compiler Design and Implementation, section
+ * 14.1 (p444).
+ */
+
+/**
+ * Sets up the use[] and def[] arrays (plus flag_use[] and flag_def[]).
+ *
+ * The basic-block-level live variable analysis needs to know which
+ * variables get used before they're completely defined, and which
+ * variables are completely defined before they're used.
+ *
+ * We independently track each channel of a vec4.  This is because we need to
+ * be able to recognize a sequence like:
+ *
+ * ...
+ * DP4 tmp.x a b;
+ * DP4 tmp.y c d;
+ * MUL result.xy tmp.xy e.xy
+ * ...
+ *
+ * as having tmp live only across that sequence (assuming it's used nowhere
+ * else), because it's a common pattern.  A more conservative approach that
+ * doesn't get tmp marked as def'ed in this block will tend to result in
+ * spilling.
+ */
+void
+vec4_live_variables::setup_def_use()
+{
+   int ip = 0; /* running instruction index; only the asserts read it */
+
+   foreach_block (block, cfg) {
+      assert(ip == block->start_ip);
+      if (block->num > 0)
+	 assert(cfg->blocks[block->num - 1]->end_ip == ip - 1);
+
+      foreach_inst_in_block(vec4_instruction, inst, block) {
+         struct block_data *bd = &block_data[block->num];
+
+	 /* Set use[] for this instruction (reads not already in def[]) */
+	 for (unsigned int i = 0; i < 3; i++) {
+	    if (inst->src[i].file == VGRF) {
+               for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(i), 16); j++) {
+                  for (int c = 0; c < 4; c++) {
+                     const unsigned v = var_from_reg(alloc, inst->src[i], c, j);
+                     if (!BITSET_TEST(bd->def, v))
+                        BITSET_SET(bd->use, v);
+                  }
+               }
+	    }
+	 }
+         for (unsigned c = 0; c < 4; c++) { /* same rule, flag channels */
+            if (inst->reads_flag(c) &&
+                !BITSET_TEST(bd->flag_def, c)) {
+               BITSET_SET(bd->flag_use, c);
+            }
+         }
+
+	 /* Check for unconditional writes to whole registers. These
+	  * are the things that screen off preceding definitions of a
+	  * variable, and thus qualify for being in def[].
+	  */
+	 if (inst->dst.file == VGRF &&
+	     (!inst->predicate || inst->opcode == BRW_OPCODE_SEL)) {
+            for (unsigned i = 0; i < DIV_ROUND_UP(inst->size_written, 16); i++) {
+               for (int c = 0; c < 4; c++) {
+                  if (inst->dst.writemask & (1 << c)) {
+                     const unsigned v = var_from_reg(alloc, inst->dst, c, i);
+                     if (!BITSET_TEST(bd->use, v)) /* not read earlier here */
+                        BITSET_SET(bd->def, v);
+                  }
+               }
+            }
+         }
+         if (inst->writes_flag()) { /* flag defs follow dst.writemask */
+            for (unsigned c = 0; c < 4; c++) {
+               if ((inst->dst.writemask & (1 << c)) &&
+                   !BITSET_TEST(bd->flag_use, c)) {
+                  BITSET_SET(bd->flag_def, c);
+               }
+            }
+         }
+
+	 ip++;
+      }
+   }
+}
+
+/**
+ * Iterates the liveness equations until nothing changes.  Bits are only
+ * ever added to liveout and livein, so the iteration must terminate; it
+ * stops after the first complete pass over the blocks that adds no bit.
+ * The flag channels are propagated alongside the virtual GRF variables.
+ */
+void
+vec4_live_variables::compute_live_variables()
+{
+   bool progress;
+
+   do {
+      progress = false;
+
+      foreach_block_reverse (block, cfg) {
+         struct block_data *const cur = &block_data[block->num];
+
+         /* liveout(B) |= livein(S) for each child S of B. */
+         foreach_list_typed(bblock_link, succ_link, link, &block->children) {
+            const struct block_data *const succ =
+               &block_data[succ_link->block->num];
+
+            for (int w = 0; w < bitset_words; w++) {
+               const BITSET_WORD added = succ->livein[w] & ~cur->liveout[w];
+               if (added != 0) {
+                  cur->liveout[w] |= added;
+                  progress = true;
+               }
+            }
+
+            const BITSET_WORD flag_added =
+               succ->flag_livein[0] & ~cur->flag_liveout[0];
+            if (flag_added != 0) {
+               cur->flag_liveout[0] |= flag_added;
+               progress = true;
+            }
+         }
+
+         /* livein(B) |= use(B) | (liveout(B) & ~def(B)). */
+         for (int w = 0; w < bitset_words; w++) {
+            const BITSET_WORD wanted =
+               cur->use[w] | (cur->liveout[w] & ~cur->def[w]);
+            if ((wanted & ~cur->livein[w]) != 0) {
+               cur->livein[w] |= wanted;
+               progress = true;
+            }
+         }
+
+         const BITSET_WORD flag_wanted =
+            cur->flag_use[0] | (cur->flag_liveout[0] & ~cur->flag_def[0]);
+         if ((flag_wanted & ~cur->flag_livein[0]) != 0) {
+            cur->flag_livein[0] |= flag_wanted;
+            progress = true;
+         }
+      }
+   } while (progress);
+}
+
+vec4_live_variables::vec4_live_variables(const simple_allocator &alloc,
+                                         cfg_t *cfg)
+   : alloc(alloc), cfg(cfg)
+{
+   /* Private ralloc context for all arrays below; the destructor frees it. */
+   mem_ctx = ralloc_context(NULL);
+
+   num_vars = alloc.total_size * 8;
+   bitset_words = BITSET_WORDS(num_vars);
+
+   block_data = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
+   for (int b = 0; b < cfg->num_blocks; b++) {
+      struct block_data &bd = block_data[b];
+
+      bd.def = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd.use = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd.livein = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd.liveout = rzalloc_array(mem_ctx, BITSET_WORD, bitset_words);
+      bd.flag_def[0] = bd.flag_use[0] = 0;
+      bd.flag_livein[0] = bd.flag_liveout[0] = 0;
+   }
+
+   setup_def_use();
+   compute_live_variables();
+}
+
+vec4_live_variables::~vec4_live_variables()
+{
+   ralloc_free(mem_ctx); /* releases block_data and every per-block bitset */
+}
+
+#define MAX_INSTRUCTION (1 << 30)
+
+/**
+ * Computes a conservative start/end of the live intervals for each virtual GRF.
+ *
+ * We could expose per-channel live intervals to the consumer based on the
+ * information we computed in vec4_live_variables, except that our only
+ * current user is virtual_grf_interferes().  So we instead union the
+ * per-channel ranges into a per-vgrf range for virtual_grf_start[] and
+ * virtual_grf_end[].
+ *
+ * We could potentially have virtual_grf_interferes() do the test per-channel,
+ * which would let some interesting register allocation occur (particularly on
+ * code-generated GLSL sequences from the Cg compiler which does register
+ * allocation at the GLSL level and thus reuses components of the variable
+ * with distinct lifetimes).  But right now the complexity of doing so doesn't
+ * seem worth it, since having virtual_grf_interferes() be cheap is important
+ * for register allocation performance.
+ */
+void
+vec4_visitor::calculate_live_intervals()
+{
+   if (this->live_intervals)
+      return;
+
+   const unsigned num_vars = this->alloc.total_size * 8;
+
+   /* The replacement arrays are allocated before the old ones are freed. */
+   int *const first_ip = ralloc_array(mem_ctx, int, num_vars);
+   int *const last_ip = ralloc_array(mem_ctx, int, num_vars);
+   ralloc_free(this->virtual_grf_start);
+   ralloc_free(this->virtual_grf_end);
+   this->virtual_grf_start = first_ip;
+   this->virtual_grf_end = last_ip;
+
+   for (unsigned v = 0; v < num_vars; v++) {
+      first_ip[v] = MAX_INSTRUCTION;
+      last_ip[v] = -1;
+   }
+
+   /* Pass 1: straight-line intervals with no knowledge of control flow.
+    * Every variable an instruction touches is extended to cover its ip.
+    */
+   int ip = 0;
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (unsigned s = 0; s < 3; s++) {
+         if (inst->src[s].file != VGRF)
+            continue;
+         for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_read(s), 16); j++) {
+            for (int c = 0; c < 4; c++) {
+               const unsigned v = var_from_reg(alloc, inst->src[s], c, j);
+               first_ip[v] = MIN2(first_ip[v], ip);
+               last_ip[v] = ip;
+            }
+         }
+      }
+
+      if (inst->dst.file == VGRF) {
+         for (unsigned j = 0; j < DIV_ROUND_UP(inst->size_written, 16); j++) {
+            for (int c = 0; c < 4; c++) {
+               if (!(inst->dst.writemask & (1 << c)))
+                  continue;
+               const unsigned v = var_from_reg(alloc, inst->dst, c, j);
+               first_ip[v] = MIN2(first_ip[v], ip);
+               last_ip[v] = ip;
+            }
+         }
+      }
+
+      ip++;
+   }
+
+   /* Pass 2: extend the intervals with the control-flow-aware liveness sets.
+    * A variable live into (out of) a block must cover its start (end) ip.
+    */
+   this->live_intervals = new(mem_ctx) vec4_live_variables(alloc, cfg);
+
+   foreach_block (block, cfg) {
+      const struct block_data *bd = &live_intervals->block_data[block->num];
+
+      for (int v = 0; v < live_intervals->num_vars; v++) {
+         if (BITSET_TEST(bd->livein, v)) {
+            first_ip[v] = MIN2(first_ip[v], block->start_ip);
+            last_ip[v] = MAX2(last_ip[v], block->start_ip);
+         }
+         if (BITSET_TEST(bd->liveout, v)) {
+            first_ip[v] = MIN2(first_ip[v], block->end_ip);
+            last_ip[v] = MAX2(last_ip[v], block->end_ip);
+         }
+      }
+   }
+}
+
+void
+vec4_visitor::invalidate_live_intervals()
+{
+   ralloc_free(live_intervals); /* drop the cached liveness analysis */
+   live_intervals = NULL; /* makes calculate_live_intervals() recompute */
+}
+
+int
+vec4_visitor::var_range_start(unsigned v, unsigned n) const
+{
+   int earliest = INT_MAX; /* result for an empty range (n == 0) */
+   for (unsigned off = 0; off != n; ++off) {
+      if (virtual_grf_start[v + off] < earliest)
+         earliest = virtual_grf_start[v + off];
+   }
+   return earliest;
+}
+
+int
+vec4_visitor::var_range_end(unsigned v, unsigned n) const
+{
+   int latest = INT_MIN; /* result for an empty range (n == 0) */
+   for (unsigned off = 0; off != n; ++off) {
+      if (virtual_grf_end[v + off] > latest)
+         latest = virtual_grf_end[v + off];
+   }
+   return latest;
+}
+
+bool
+vec4_visitor::virtual_grf_interferes(int a, int b)
+{
+   const unsigned va = 8 * alloc.offsets[a], na = 8 * alloc.sizes[a];
+   const unsigned vb = 8 * alloc.offsets[b], nb = 8 * alloc.sizes[b];
+   return var_range_end(va, na) > var_range_start(vb, nb) &&
+          var_range_end(vb, nb) > var_range_start(va, na);
+}
diff --git a/src/intel/compiler/brw_vec4_live_variables.h b/src/intel/compiler/brw_vec4_live_variables.h
new file mode 100644
index 00000000000..8807c453743
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_live_variables.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <eric@anholt.net>
+ *
+ */
+
+#include "util/bitset.h"
+#include "brw_vec4.h"
+
+namespace brw {
+
+struct block_data {
+   /**
+    * Which variables are defined before being used in the block.
+    *
+    * Note that for our purposes, "defined" means unconditionally, completely
+    * defined.
+    */
+   BITSET_WORD *def;
+
+   /**
+    * Which variables are used before being defined in the block.
+    */
+   BITSET_WORD *use;
+
+   /** Which variables are live at the entry point of the block. */
+   BITSET_WORD *livein;
+
+   /** Which variables are live at the exit point of the block. */
+   BITSET_WORD *liveout;
+
+   BITSET_WORD flag_def[1];     /* as def, for the flag channels */
+   BITSET_WORD flag_use[1];     /* as use, for the flag channels */
+   BITSET_WORD flag_livein[1];  /* as livein, for the flag channels */
+   BITSET_WORD flag_liveout[1]; /* as liveout, for the flag channels */
+};
+
+class vec4_live_variables {
+public:
+   DECLARE_RALLOC_CXX_OPERATORS(vec4_live_variables)
+   /* Construction runs the analysis: def/use setup, then the fixed point. */
+   vec4_live_variables(const simple_allocator &alloc, cfg_t *cfg);
+   ~vec4_live_variables();
+
+   int num_vars;     /* alloc.total_size * 8, set by the constructor */
+   int bitset_words; /* BITSET_WORDS(num_vars): words per per-block set */
+
+   /** Per-basic-block information on live variables */
+   struct block_data *block_data;
+
+protected:
+   void setup_def_use();
+   void compute_live_variables();
+
+   const simple_allocator &alloc;
+   cfg_t *cfg;
+   void *mem_ctx;    /* ralloc context owning block_data; freed by the dtor */
+};
+
+/* Returns the variable index for the k-th dword of the c-th component of
+ * register reg.  For a source, c is first looked up in reg.swizzle.
+ */
+inline unsigned
+var_from_reg(const simple_allocator &alloc, const src_reg &reg,
+             unsigned c = 0, unsigned k = 0)
+{
+   assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
+   const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
+   const unsigned first = 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE);
+   const unsigned chan = BRW_GET_SWZ(reg.swizzle, c) + k / csize * 4;
+   const unsigned result = first + chan * csize + k % csize;
+   /* Do not exceed the limit for this register */
+   assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
+   return result;
+}
+
+inline unsigned
+var_from_reg(const simple_allocator &alloc, const dst_reg &reg,
+             unsigned c = 0, unsigned k = 0)
+{
+   assert(reg.file == VGRF && reg.nr < alloc.count && c < 4);
+   const unsigned csize = DIV_ROUND_UP(type_sz(reg.type), 4);
+   const unsigned first = 8 * (alloc.offsets[reg.nr] + reg.offset / REG_SIZE);
+   const unsigned chan = c + k / csize * 4; /* c used as is, no swizzle */
+   const unsigned result = first + chan * csize + k % csize;
+   /* Do not exceed the limit for this register */
+   assert(result < 8 * (alloc.offsets[reg.nr] + alloc.sizes[reg.nr]));
+   return result;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_nir.cpp b/src/intel/compiler/brw_vec4_nir.cpp
new file mode 100644
index 00000000000..4e88b795049
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_nir.cpp
@@ -0,0 +1,2407 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4.h"
+#include "brw_vec4_builder.h"
+#include "brw_vec4_surface_builder.h"
+
+using namespace brw;
+using namespace brw::surface_access;
+
+namespace brw {
+
+void
+vec4_visitor::emit_nir_code()
+{
+   /* Uniform setup only runs when the shader reports any uniforms. */
+   if (nir->num_uniforms > 0)
+      nir_setup_uniforms();
+   nir_setup_system_values();
+
+   /* Emit every function in the shader; each is asserted to be "main". */
+   nir_foreach_function(func, nir) {
+      assert(strcmp(func->name, "main") == 0);
+      assert(func->impl);
+      nir_emit_impl(func->impl);
+   }
+}
+
+/* Make sure the system value read by instr has a register assigned.
+ *
+ * Intrinsics other than the system value loads listed below are ignored.
+ * A slot of nir_system_values is only filled while its file is BAD_FILE,
+ * so a value that is read several times keeps its first register.
+ */
+void
+vec4_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   gl_system_value sv;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id().");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+      sv = SYSTEM_VALUE_VERTEX_ID_ZERO_BASE;
+      break;
+
+   case nir_intrinsic_load_base_vertex:
+      sv = SYSTEM_VALUE_BASE_VERTEX;
+      break;
+
+   case nir_intrinsic_load_instance_id:
+      sv = SYSTEM_VALUE_INSTANCE_ID;
+      break;
+
+   case nir_intrinsic_load_base_instance:
+      sv = SYSTEM_VALUE_BASE_INSTANCE;
+      break;
+
+   case nir_intrinsic_load_draw_id:
+      sv = SYSTEM_VALUE_DRAW_ID;
+      break;
+
+   default:
+      return;
+   }
+
+   dst_reg *reg = &nir_system_values[sv];
+   if (reg->file == BAD_FILE)
+      *reg = *make_reg_for_system_value(sv);
+}
+
+/* Passes every intrinsic of block to v->nir_setup_system_value_intrinsic()
+ * and skips all other instructions.  Always returns true.
+ */
+static bool
+setup_system_values_block(nir_block *block, vec4_visitor *v)
+{
+   nir_foreach_instr(instr, block) {
+      if (instr->type == nir_instr_type_intrinsic)
+         v->nir_setup_system_value_intrinsic(nir_instr_as_intrinsic(instr));
+   }
+
+   return true;
+}
+
+void
+vec4_visitor::nir_setup_system_values()
+{
+   /* Start with one default-constructed register per system value. */
+   nir_system_values = ralloc_array(mem_ctx, dst_reg, SYSTEM_VALUE_MAX);
+   for (unsigned sv = 0; sv < SYSTEM_VALUE_MAX; ++sv)
+      nir_system_values[sv] = dst_reg();
+
+   /* Then scan every block of the shader's functions for intrinsics. */
+   nir_foreach_function(func, nir) {
+      assert(strcmp(func->name, "main") == 0);
+      assert(func->impl);
+      nir_foreach_block(blk, func->impl)
+         setup_system_values_block(blk, this);
+   }
+}
+
+void
+vec4_visitor::nir_setup_uniforms()
+{
+   uniforms = nir->num_uniforms / 16; /* NOTE(review): bytes -> vec4s? confirm */
+}
+
+void
+vec4_visitor::nir_emit_impl(nir_function_impl *impl)
+{
+   nir_locals = ralloc_array(mem_ctx, dst_reg, impl->reg_alloc);
+   for (unsigned idx = 0; idx < impl->reg_alloc; ++idx)
+      nir_locals[idx] = dst_reg();
+
+   /* Back each declared register with VGRF space: one unit per array
+    * element per 32 bits of bit_size; 64-bit registers are typed DF. */
+   foreach_list_typed(nir_register, reg, node, &impl->registers) {
+      const unsigned elems = reg->num_array_elems ? reg->num_array_elems : 1;
+      const unsigned size = elems * DIV_ROUND_UP(reg->bit_size, 32);
+      dst_reg &local = nir_locals[reg->index];
+      local = dst_reg(VGRF, alloc.allocate(size));
+      if (reg->bit_size == 64)
+         local.type = BRW_REGISTER_TYPE_DF;
+   }
+
+   /* SSA slots are filled in as their defining instructions are emitted. */
+   nir_ssa_values = ralloc_array(mem_ctx, dst_reg, impl->ssa_alloc);
+   nir_emit_cf_list(&impl->body);
+}
+
+void
+vec4_visitor::nir_emit_cf_list(exec_list *list)
+{
+   exec_list_validate(list);
+   /* Dispatch each control-flow node to the emitter for its kind. */
+   foreach_list_typed(nir_cf_node, cf, node, list) {
+      switch (cf->type) {
+      case nir_cf_node_block:
+         nir_emit_block(nir_cf_node_as_block(cf));
+         break;
+
+      case nir_cf_node_if:
+         nir_emit_if(nir_cf_node_as_if(cf));
+         break;
+
+      case nir_cf_node_loop:
+         nir_emit_loop(nir_cf_node_as_loop(cf));
+         break;
+      default:
+         unreachable("Invalid CFG node block");
+      }
+   }
+}
+
+void
+vec4_visitor::nir_emit_if(nir_if *if_stmt)
+{
+   /* Put the condition in f0: a MOV to the null register with an NZ
+    * conditional modifier.
+    */
+   src_reg cond = get_nir_src(if_stmt->condition, BRW_REGISTER_TYPE_D, 1);
+   vec4_instruction *flag_mov = emit(MOV(dst_null_d(), cond));
+   flag_mov->conditional_mod = BRW_CONDITIONAL_NZ;
+
+   /* The condition is read as a single component, so predicating on the
+    * replicated X channel is enough.
+    */
+   emit(IF(BRW_PREDICATE_ALIGN16_REPLICATE_X));
+   nir_emit_cf_list(&if_stmt->then_list);
+
+   /* note: if the else is empty, dead CF elimination will remove it */
+   emit(BRW_OPCODE_ELSE);
+   nir_emit_cf_list(&if_stmt->else_list);
+   emit(BRW_OPCODE_ENDIF);
+}
+
+void
+vec4_visitor::nir_emit_loop(nir_loop *loop)
+{
+   /* Bracket the translated loop body with a DO ... WHILE pair. */
+   emit(BRW_OPCODE_DO);
+
+   nir_emit_cf_list(&loop->body);
+   emit(BRW_OPCODE_WHILE);
+}
+
+void
+vec4_visitor::nir_emit_block(nir_block *block)
+{
+   /* Emit each instruction of the block in turn. */
+   nir_foreach_instr(cur, block)
+      nir_emit_instr(cur);
+}
+
+void
+vec4_visitor::nir_emit_instr(nir_instr *instr)
+{
+   /* Record the instruction being translated before dispatching on type. */
+   base_ir = instr;
+   switch (instr->type) {
+   case nir_instr_type_alu:
+      nir_emit_alu(nir_instr_as_alu(instr));
+      break;
+
+   case nir_instr_type_intrinsic:
+      nir_emit_intrinsic(nir_instr_as_intrinsic(instr));
+      break;
+
+   case nir_instr_type_load_const:
+      nir_emit_load_const(nir_instr_as_load_const(instr));
+      break;
+
+   case nir_instr_type_ssa_undef:
+      nir_emit_undef(nir_instr_as_ssa_undef(instr));
+      break;
+
+   case nir_instr_type_tex:
+      nir_emit_texture(nir_instr_as_tex(instr));
+      break;
+
+   case nir_instr_type_jump:
+      nir_emit_jump(nir_instr_as_jump(instr));
+      break;
+
+   default:
+      fprintf(stderr, "VS instruction not yet implemented by NIR->vec4\n");
+      break;
+   }
+}
+
+/* Build the dst_reg for a NIR register access: the register's local,
+ * typed DF when 64-bit, advanced by base_offset, plus a reladdr when an
+ * indirect source is given.
+ */
+static dst_reg
+dst_reg_for_nir_reg(vec4_visitor *v, nir_register *nir_reg,
+                    unsigned base_offset, nir_src *indirect)
+{
+   dst_reg local = v->nir_locals[nir_reg->index];
+   if (nir_reg->bit_size == 64)
+      local.type = BRW_REGISTER_TYPE_DF;
+   dst_reg result = offset(local, 8, base_offset);
+   if (indirect != NULL) {
+      const src_reg addr = v->get_nir_src(*indirect, BRW_REGISTER_TYPE_D, 1);
+      result.reladdr = new(v->mem_ctx) src_reg(addr);
+   }
+   return result;
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(const nir_dest &dest)
+{
+   if (!dest.is_ssa) {
+      return dst_reg_for_nir_reg(this, dest.reg.reg, dest.reg.base_offset,
+                                 dest.reg.indirect);
+   }
+   /* SSA: allocate a fresh VGRF (two units and type DF for 64-bit) and
+    * record it in nir_ssa_values under the def's index. */
+   dst_reg fresh(VGRF, alloc.allocate(DIV_ROUND_UP(dest.ssa.bit_size, 32)));
+   if (dest.ssa.bit_size == 64)
+      fresh.type = BRW_REGISTER_TYPE_DF;
+   nir_ssa_values[dest.ssa.index] = fresh;
+   return fresh;
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(const nir_dest &dest, enum brw_reg_type type)
+{
+   return retype(get_nir_dest(dest), type); /* untyped overload, retyped */
+}
+
+dst_reg
+vec4_visitor::get_nir_dest(const nir_dest &dest, nir_alu_type type)
+{
+   return retype(get_nir_dest(dest), brw_type_for_nir_type(devinfo, type));
+}
+
+src_reg
+vec4_visitor::get_nir_src(const nir_src &src, enum brw_reg_type type,
+                          unsigned num_components)
+{
+   /* Locate the backing register: SSA values come from nir_ssa_values,
+    * everything else is resolved through dst_reg_for_nir_reg().
+    */
+   dst_reg backing;
+   if (!src.is_ssa) {
+      backing = dst_reg_for_nir_reg(this, src.reg.reg, src.reg.base_offset,
+                                    src.reg.indirect);
+   } else {
+      assert(src.ssa != NULL);
+      backing = nir_ssa_values[src.ssa->index];
+   }
+
+   /* Retype, convert to a source and apply the swizzle for the size. */
+   src_reg result = src_reg(retype(backing, type));
+   result.swizzle = brw_swizzle_for_size(num_components);
+   return result;
+}
+
+src_reg
+vec4_visitor::get_nir_src(const nir_src &src, nir_alu_type type,
+                          unsigned num_components)
+{
+   const enum brw_reg_type hw_type = brw_type_for_nir_type(devinfo, type);
+   return get_nir_src(src, hw_type, num_components);
+}
+
+src_reg
+vec4_visitor::get_nir_src(const nir_src &src, unsigned num_components)
+{
+   /* No type given: read the source as nir_type_int32 (signed int). */
+   return get_nir_src(src, nir_type_int32, num_components);
+}
+
+src_reg
+vec4_visitor::get_indirect_offset(nir_intrinsic_instr *instr)
+{
+   nir_src *io_offset = nir_get_io_offset_src(instr);
+   nir_const_value *imm = nir_src_as_const_value(*io_offset);
+
+   /* A non-constant offset is a true indirect: return it as a UD scalar. */
+   if (!imm)
+      return get_nir_src(*io_offset, BRW_REGISTER_TYPE_UD, 1);
+
+   /* The only constant offset we should find is 0.  brw_nir.c's
+    * add_const_offset_to_base() will fold other constant offsets into
+    * instr->const_index[0], so a default src_reg is returned here.
+    */
+   assert(imm->u32[0] == 0);
+   return src_reg();
+}
+
+void
+vec4_visitor::nir_emit_load_const(nir_load_const_instr *instr)
+{
+   dst_reg reg;
+   /* 64-bit constants get two DF-typed VGRF units, others a single D one. */
+   if (instr->def.bit_size == 64) {
+      reg = dst_reg(VGRF, alloc.allocate(2));
+      reg.type = BRW_REGISTER_TYPE_DF;
+   } else {
+      reg = dst_reg(VGRF, alloc.allocate(1));
+      reg.type = BRW_REGISTER_TYPE_D;
+   }
+
+   unsigned remaining = brw_writemask_for_size(instr->def.num_components);
+
+   /* @FIXME: consider emitting vector operations to save some MOVs in
+    * cases where the components are representable in 8 bits.
+    * For now, we emit a MOV for each distinct value.
+    */
+   for (unsigned i = 0; i < instr->def.num_components; i++) {
+      unsigned writemask = 1 << i;
+      if ((remaining & writemask) == 0)
+         continue;
+      /* Merge components holding the same bits into one MOV.  Compare the
+       * raw bits: by value +0.0 equals -0.0, which would drop a sign.
+       */
+      for (unsigned j = i; j < instr->def.num_components; j++) {
+         if ((instr->def.bit_size == 32 &&
+              instr->value.u32[i] == instr->value.u32[j]) ||
+             (instr->def.bit_size == 64 &&
+              instr->value.u64[i] == instr->value.u64[j])) {
+            writemask |= 1 << j;
+         }
+      }
+
+      reg.writemask = writemask;
+      if (instr->def.bit_size == 64) {
+         emit(MOV(reg, setup_imm_df(instr->value.f64[i])));
+      } else {
+         emit(MOV(reg, brw_imm_d(instr->value.i32[i])));
+      }
+      remaining &= ~writemask;
+   }
+
+   /* Record the value with all of its components enabled. */
+   reg.writemask = brw_writemask_for_size(instr->def.num_components);
+
+   nir_ssa_values[instr->def.index] = reg;
+}
+
+void
+vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{
+   dst_reg dest;
+   src_reg src;
+
+   switch (instr->intrinsic) {
+
+   case nir_intrinsic_load_input: {
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+
+      /* We set EmitNoIndirectInput for VS */
+      assert(const_offset);
+
+      dest = get_nir_dest(instr->dest);
+      dest.writemask = brw_writemask_for_size(instr->num_components);
+
+      src = src_reg(ATTR, instr->const_index[0] + const_offset->u32[0],
+                    glsl_type::uvec4_type);
+      src = retype(src, dest.type);
+
+      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
+      if (is_64bit) {
+         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+         src.swizzle = BRW_SWIZZLE_XYZW;
+         shuffle_64bit_data(tmp, src, false);
+         emit(MOV(dest, src_reg(tmp)));
+      } else {
+         /* Swizzle source based on component layout qualifier */
+         src.swizzle = BRW_SWZ_COMP_INPUT(nir_intrinsic_component(instr));
+         emit(MOV(dest, src));
+      }
+      break;
+   }
+
+   case nir_intrinsic_store_output: {
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      assert(const_offset);
+
+      int varying = instr->const_index[0] + const_offset->u32[0];
+
+      bool is_64bit = nir_src_bit_size(instr->src[0]) == 64;
+      if (is_64bit) {
+         src_reg data;
+         src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_DF,
+                           instr->num_components);
+         data = src_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(dst_reg(data), src, true);
+         src = retype(data, BRW_REGISTER_TYPE_F);
+      } else {
+         src = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F,
+                           instr->num_components);
+      }
+
+      unsigned c = nir_intrinsic_component(instr);
+      output_reg[varying][c] = dst_reg(src);
+      output_num_components[varying][c] = instr->num_components;
+
+      unsigned num_components = instr->num_components;
+      if (is_64bit)
+         num_components *= 2;
+
+      output_reg[varying][c] = dst_reg(src);
+      output_num_components[varying][c] = MIN2(4, num_components);
+
+      if (is_64bit && num_components > 4) {
+         assert(num_components <= 8);
+         output_reg[varying + 1][c] = byte_offset(dst_reg(src), REG_SIZE);
+         output_num_components[varying + 1][c] = num_components - 4;
+      }
+      break;
+   }
+
+   case nir_intrinsic_get_buffer_size: {
+      nir_const_value *const_uniform_block = nir_src_as_const_value(instr->src[0]);
+      unsigned ssbo_index = const_uniform_block ? const_uniform_block->u32[0] : 0;
+
+      const unsigned index =
+         prog_data->base.binding_table.ssbo_start + ssbo_index;
+      dst_reg result_dst = get_nir_dest(instr->dest);
+      vec4_instruction *inst = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_GET_BUFFER_SIZE, result_dst);
+
+      inst->base_mrf = 2;
+      inst->mlen = 1; /* always at least one */
+      inst->src[1] = brw_imm_ud(index);
+
+      /* MRF for the first parameter */
+      src_reg lod = brw_imm_d(0);
+      int param_base = inst->base_mrf;
+      int writemask = WRITEMASK_X;
+      emit(MOV(dst_reg(MRF, param_base, glsl_type::int_type, writemask), lod));
+
+      emit(inst);
+
+      brw_mark_surface_used(&prog_data->base, index);
+      break;
+   }
+
+   case nir_intrinsic_store_ssbo: {
+      assert(devinfo->gen >= 7);
+
+      /* Block index */
+      src_reg surf_index;
+      nir_const_value *const_uniform_block =
+         nir_src_as_const_value(instr->src[1]);
+      if (const_uniform_block) {
+         unsigned index = prog_data->base.binding_table.ssbo_start +
+                          const_uniform_block->u32[0];
+         surf_index = brw_imm_ud(index);
+         brw_mark_surface_used(&prog_data->base, index);
+      } else {
+         surf_index = src_reg(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[1], 1),
+                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
+         surf_index = emit_uniformize(surf_index);
+
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.ssbo_start +
+                               nir->info->num_ssbos - 1);
+      }
+
+      /* Offset */
+      src_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[2]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
+      } else {
+         offset_reg = get_nir_src(instr->src[2], 1);
+      }
+
+      /* Value */
+      src_reg val_reg = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_F, 4);
+
+      /* Writemask */
+      unsigned write_mask = instr->const_index[0];
+
+      /* IvyBridge does not have a native SIMD4x2 untyped write message so untyped
+       * writes will use SIMD8 mode. In order to hide this and keep symmetry across
+       * typed and untyped messages and across hardware platforms, the
+       * current implementation of the untyped messages will transparently convert
+       * the SIMD4x2 payload into an equivalent SIMD8 payload by transposing it
+       * and enabling only channel X on the SEND instruction.
+       *
+       * The above, works well for full vector writes, but not for partial writes
+       * where we want to write some channels and not others, like when we have
+       * code such as v.xyw = vec3(1,2,4). Because the untyped write messages are
+       * quite restrictive with regards to the channel enables we can configure in
+       * the message descriptor (not all combinations are allowed) we cannot simply
+       * implement these scenarios with a single message while keeping the
+       * aforementioned symmetry in the implementation. For now we decided that
+       * it is better to keep the symmetry to reduce complexity, so in situations
+       * such as the one described we end up emitting two untyped write messages
+       * (one for xy and another for w).
+       *
+       * The code below packs consecutive channels into a single write message,
+       * detects gaps in the vector write and if needed, sends a second message
+       * with the remaining channels. If in the future we decide that we want to
+       * emit a single message at the expense of losing the symmetry in the
+       * implementation we can:
+       *
+       * 1) For IvyBridge: Only use the red channel of the untyped write SIMD8
+       *    message payload. In this mode we can write up to 8 offsets and dwords
+       *    to the red channel only (for the two vec4s in the SIMD4x2 execution)
+       *    and select which of the 8 channels carry data to write by setting the
+       *    appropriate writemask in the dst register of the SEND instruction.
+       *    It would require to write a new generator opcode specifically for
+       *    IvyBridge since we would need to prepare a SIMD8 payload that could
+       *    use any channel, not just X.
+       *
+       * 2) For Haswell+: Simply send a single write message but set the writemask
+       *    on the dst of the SEND instruction to select the channels we want to
+       *    write. It would require to modify the current messages to receive
+       *    and honor the writemask provided.
+       */
+      const vec4_builder bld = vec4_builder(this).at_end()
+                               .annotate(current_annotation, base_ir);
+
+      unsigned type_slots = nir_src_bit_size(instr->src[0]) / 32;
+      if (type_slots == 2) {
+         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(tmp, retype(val_reg, tmp.type), true);
+         val_reg = src_reg(retype(tmp, BRW_REGISTER_TYPE_F));
+      }
+
+      uint8_t swizzle[4] = { 0, 0, 0, 0};
+      int num_channels = 0;
+      unsigned skipped_channels = 0;
+      int num_components = instr->num_components;
+      for (int i = 0; i < num_components; i++) {
+         /* Read components Z/W of a dvec from the appropriate place. We will
+          * also have to adjust the swizzle (we do that with the '% 4' below)
+          */
+         if (i == 2 && type_slots == 2)
+            val_reg = byte_offset(val_reg, REG_SIZE);
+
+         /* Check if this channel needs to be written. If so, record the
+          * channel we need to take the data from in the swizzle array
+          */
+         int component_mask = 1 << i;
+         int write_test = write_mask & component_mask;
+         if (write_test) {
+            /* If we are writing doubles we have to write 2 channels worth of
+             * data (64 bits) for each double component.
+             */
+            swizzle[num_channels++] = (i * type_slots) % 4;
+            if (type_slots == 2)
+               swizzle[num_channels++] = (i * type_slots + 1) % 4;
+         }
+
+         /* If we don't have to write this channel it means we have a gap in the
+          * vector, so write the channels we accumulated until now, if any. Do
+          * the same if this was the last component in the vector, if we have
+          * enough channels for a full vec4 write or if we have processed
+          * components XY of a dvec (since components ZW are not in the same
+          * SIMD register)
+          */
+         if (!write_test || i == num_components - 1 || num_channels == 4 ||
+             (i == 1 && type_slots == 2)) {
+            if (num_channels > 0) {
+               /* We have channels to write, so update the offset we need to
+                * write at to skip the channels we skipped, if any.
+                */
+               if (skipped_channels > 0) {
+                  if (offset_reg.file == IMM) {
+                     offset_reg.ud += 4 * skipped_channels;
+                  } else {
+                     emit(ADD(dst_reg(offset_reg), offset_reg,
+                              brw_imm_ud(4 * skipped_channels)));
+                  }
+               }
+
+               /* Swizzle the data register so we take the data from the channels
+                * we need to write and send the write message. This will write
+                * num_channels consecutive dwords starting at offset.
+                */
+               val_reg.swizzle =
+                  BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+               emit_untyped_write(bld, surf_index, offset_reg, val_reg,
+                                  1 /* dims */, num_channels /* size */,
+                                  BRW_PREDICATE_NONE);
+
+               /* If we have to do a second write we will have to update the
+                * offset so that we jump over the channels we have just written
+                * now.
+                */
+               skipped_channels = num_channels;
+
+               /* Restart the count for the next write message */
+               num_channels = 0;
+            }
+
+            /* If we didn't write the channel, increase skipped count */
+            if (!write_test)
+               skipped_channels += type_slots;
+         }
+      }
+
+      break;
+   }
+
+   case nir_intrinsic_load_ssbo: {
+      assert(devinfo->gen >= 7);
+
+      nir_const_value *const_uniform_block =
+         nir_src_as_const_value(instr->src[0]);
+
+      src_reg surf_index;
+      if (const_uniform_block) {
+         unsigned index = prog_data->base.binding_table.ssbo_start +
+                          const_uniform_block->u32[0];
+         surf_index = brw_imm_ud(index);
+
+         brw_mark_surface_used(&prog_data->base, index);
+      } else {
+         surf_index = src_reg(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], 1),
+                  brw_imm_ud(prog_data->base.binding_table.ssbo_start)));
+         surf_index = emit_uniformize(surf_index);
+
+         /* Assume this may touch any SSBO. It would be nice to provide
+          * a tighter bound, but the array information is already lowered away.
+          */
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.ssbo_start +
+                               nir->info->num_ssbos - 1);
+      }
+
+      src_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(const_offset->u32[0]);
+      } else {
+         offset_reg = get_nir_src(instr->src[1], 1);
+      }
+
+      /* Read the vector */
+      const vec4_builder bld = vec4_builder(this).at_end()
+         .annotate(current_annotation, base_ir);
+
+      src_reg read_result;
+      dst_reg dest = get_nir_dest(instr->dest);
+      if (type_sz(dest.type) < 8) {
+         read_result = emit_untyped_read(bld, surf_index, offset_reg,
+                                         1 /* dims */, 4 /* size*/,
+                                         BRW_PREDICATE_NONE);
+      } else {
+         src_reg shuffled = src_reg(this, glsl_type::dvec4_type);
+
+         src_reg temp;
+         temp = emit_untyped_read(bld, surf_index, offset_reg,
+                                  1 /* dims */, 4 /* size*/,
+                                  BRW_PREDICATE_NONE);
+         emit(MOV(dst_reg(retype(shuffled, temp.type)), temp));
+
+         if (offset_reg.file == IMM)
+            offset_reg.ud += 16;
+         else
+            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16)));
+
+         temp = emit_untyped_read(bld, surf_index, offset_reg,
+                                  1 /* dims */, 4 /* size*/,
+                                  BRW_PREDICATE_NONE);
+         emit(MOV(dst_reg(retype(byte_offset(shuffled, REG_SIZE), temp.type)),
+                  temp));
+
+         read_result = src_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(dst_reg(read_result), shuffled, false);
+      }
+
+      read_result.type = dest.type;
+      read_result.swizzle = brw_swizzle_for_size(instr->num_components);
+      emit(MOV(dest, read_result));
+      break;
+   }
+
+   case nir_intrinsic_ssbo_atomic_add:
+      nir_emit_ssbo_atomic(BRW_AOP_ADD, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_imin:
+      nir_emit_ssbo_atomic(BRW_AOP_IMIN, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umin:
+      nir_emit_ssbo_atomic(BRW_AOP_UMIN, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_imax:
+      nir_emit_ssbo_atomic(BRW_AOP_IMAX, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_umax:
+      nir_emit_ssbo_atomic(BRW_AOP_UMAX, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_and:
+      nir_emit_ssbo_atomic(BRW_AOP_AND, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_or:
+      nir_emit_ssbo_atomic(BRW_AOP_OR, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_xor:
+      nir_emit_ssbo_atomic(BRW_AOP_XOR, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_exchange:
+      nir_emit_ssbo_atomic(BRW_AOP_MOV, instr);
+      break;
+   case nir_intrinsic_ssbo_atomic_comp_swap:
+      nir_emit_ssbo_atomic(BRW_AOP_CMPWR, instr);
+      break;
+
+   case nir_intrinsic_load_vertex_id:
+      unreachable("should be lowered by lower_vertex_id()");
+
+   case nir_intrinsic_load_vertex_id_zero_base:
+   case nir_intrinsic_load_base_vertex:
+   case nir_intrinsic_load_instance_id:
+   case nir_intrinsic_load_base_instance:
+   case nir_intrinsic_load_draw_id:
+   case nir_intrinsic_load_invocation_id: {
+      gl_system_value sv = nir_system_value_from_intrinsic(instr->intrinsic);
+      src_reg val = src_reg(nir_system_values[sv]);
+      assert(val.file != BAD_FILE);
+      dest = get_nir_dest(instr->dest, val.type);
+      emit(MOV(dest, val));
+      break;
+   }
+
+   case nir_intrinsic_load_uniform: {
+      /* Offsets are in bytes but they should always be multiples of 4 */
+      assert(nir_intrinsic_base(instr) % 4 == 0);
+
+      dest = get_nir_dest(instr->dest);
+
+      src = src_reg(dst_reg(UNIFORM, nir_intrinsic_base(instr) / 16));
+      src.type = dest.type;
+
+      /* Uniforms don't actually have to be vec4 aligned.  In the case that
+       * it isn't, we have to use a swizzle to shift things around.  They
+       * do still have the std140 alignment requirement that vec2's have to
+       * be vec2-aligned and vec3's and vec4's have to be vec4-aligned.
+       *
+       * The swizzle also works in the indirect case as the generator adds
+       * the swizzle to the offset for us.
+       */
+      unsigned shift = (nir_intrinsic_base(instr) % 16) / 4;
+      assert(shift + instr->num_components <= 4);
+
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[0]);
+      if (const_offset) {
+         /* Offsets are in bytes but they should always be multiples of 4 */
+         assert(const_offset->u32[0] % 4 == 0);
+
+         unsigned offset = const_offset->u32[0] + shift * 4;
+         src.offset = ROUND_DOWN_TO(offset, 16);
+         shift = (offset % 16) / 4;
+         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+
+         emit(MOV(dest, src));
+      } else {
+         src.swizzle += BRW_SWIZZLE4(shift, shift, shift, shift);
+
+         src_reg indirect = get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+         /* MOV_INDIRECT is going to stomp the whole thing anyway */
+         dest.writemask = WRITEMASK_XYZW;
+
+         emit(SHADER_OPCODE_MOV_INDIRECT, dest, src,
+              indirect, brw_imm_ud(instr->const_index[1]));
+      }
+      break;
+   }
+
+   case nir_intrinsic_atomic_counter_read:
+   case nir_intrinsic_atomic_counter_inc:
+   case nir_intrinsic_atomic_counter_dec: {
+      unsigned surf_index = prog_data->base.binding_table.abo_start +
+         (unsigned) instr->const_index[0];
+      const vec4_builder bld =
+         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+
+      /* Get some metadata from the atomic counter intrinsic. */
+      const nir_intrinsic_info *info = &nir_intrinsic_infos[instr->intrinsic];
+
+      /* Get the arguments of the atomic intrinsic. */
+      src_reg offset = get_nir_src(instr->src[0], nir_type_int32,
+                                   instr->num_components);
+      const src_reg surface = brw_imm_ud(surf_index);
+      const src_reg src0 = (info->num_srcs >= 2
+                           ? get_nir_src(instr->src[1]) : src_reg());
+      const src_reg src1 = (info->num_srcs >= 3
+                           ? get_nir_src(instr->src[2]) : src_reg());
+
+      src_reg tmp;
+
+      dest = get_nir_dest(instr->dest);
+
+      if (instr->intrinsic == nir_intrinsic_atomic_counter_read) {
+         tmp = emit_untyped_read(bld, surface, offset, 1, 1);
+      } else {
+         tmp = emit_untyped_atomic(bld, surface, offset,
+                                   src0, src1,
+                                   1, 1,
+                                   get_atomic_counter_op(instr->intrinsic));
+      }
+
+      bld.MOV(retype(dest, tmp.type), tmp);
+      brw_mark_surface_used(stage_prog_data, surf_index);
+      break;
+   }
+
+   case nir_intrinsic_load_ubo: {
+      nir_const_value *const_block_index = nir_src_as_const_value(instr->src[0]);
+      src_reg surf_index;
+
+      dest = get_nir_dest(instr->dest);
+
+      if (const_block_index) {
+         /* The block index is a constant, so just emit the binding table entry
+          * as an immediate.
+          */
+         const unsigned index = prog_data->base.binding_table.ubo_start +
+                                const_block_index->u32[0];
+         surf_index = brw_imm_ud(index);
+         brw_mark_surface_used(&prog_data->base, index);
+      } else {
+         /* The block index is not a constant. Evaluate the index expression
+          * per-channel and add the base UBO index; we have to select a value
+          * from any live channel.
+          */
+         surf_index = src_reg(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(surf_index), get_nir_src(instr->src[0], nir_type_int32,
+                                                   instr->num_components),
+                  brw_imm_ud(prog_data->base.binding_table.ubo_start)));
+         surf_index = emit_uniformize(surf_index);
+
+         /* Assume this may touch any UBO. It would be nice to provide
+          * a tighter bound, but the array information is already lowered away.
+          */
+         brw_mark_surface_used(&prog_data->base,
+                               prog_data->base.binding_table.ubo_start +
+                               nir->info->num_ubos - 1);
+      }
+
+      src_reg offset_reg;
+      nir_const_value *const_offset = nir_src_as_const_value(instr->src[1]);
+      if (const_offset) {
+         offset_reg = brw_imm_ud(const_offset->u32[0] & ~15);
+      } else {
+         offset_reg = get_nir_src(instr->src[1], nir_type_uint32, 1);
+      }
+
+      src_reg packed_consts;
+      if (nir_dest_bit_size(instr->dest) == 32) {
+         packed_consts = src_reg(this, glsl_type::vec4_type);
+         emit_pull_constant_load_reg(dst_reg(packed_consts),
+                                     surf_index,
+                                     offset_reg,
+                                     NULL, NULL /* before_block/inst */);
+      } else {
+         src_reg temp = src_reg(this, glsl_type::dvec4_type);
+         src_reg temp_float = retype(temp, BRW_REGISTER_TYPE_F);
+
+         emit_pull_constant_load_reg(dst_reg(temp_float),
+                                     surf_index, offset_reg, NULL, NULL);
+         if (offset_reg.file == IMM)
+            offset_reg.ud += 16;
+         else
+            emit(ADD(dst_reg(offset_reg), offset_reg, brw_imm_ud(16u)));
+         emit_pull_constant_load_reg(dst_reg(byte_offset(temp_float, REG_SIZE)),
+                                     surf_index, offset_reg, NULL, NULL);
+
+         packed_consts = src_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(dst_reg(packed_consts), temp, false);
+      }
+
+      packed_consts.swizzle = brw_swizzle_for_size(instr->num_components);
+      if (const_offset) {
+         unsigned type_size = type_sz(dest.type);
+         packed_consts.swizzle +=
+            BRW_SWIZZLE4(const_offset->u32[0] % 16 / type_size,
+                         const_offset->u32[0] % 16 / type_size,
+                         const_offset->u32[0] % 16 / type_size,
+                         const_offset->u32[0] % 16 / type_size);
+      }
+
+      emit(MOV(dest, retype(packed_consts, dest.type)));
+
+      break;
+   }
+
+   case nir_intrinsic_memory_barrier: {
+      const vec4_builder bld =
+         vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+      const dst_reg tmp = bld.vgrf(BRW_REGISTER_TYPE_UD, 2);
+      bld.emit(SHADER_OPCODE_MEMORY_FENCE, tmp)
+         ->size_written = 2 * REG_SIZE;
+      break;
+   }
+
+   case nir_intrinsic_shader_clock: {
+      /* We cannot do anything if there is an event, so ignore it for now */
+      const src_reg shader_clock = get_timestamp();
+      const enum brw_reg_type type = brw_type_for_base_type(glsl_type::uvec2_type);
+
+      dest = get_nir_dest(instr->dest, type);
+      emit(MOV(dest, shader_clock));
+      break;
+   }
+
+   default:
+      unreachable("Unknown intrinsic");
+   }
+}
+
+/* Emit the untyped surface atomic "op" for an SSBO atomic intrinsic whose
+ * sources are: SSBO index, offset, data and (BRW_AOP_CMPWR only) second data.
+ */
+void
+vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr)
+{
+   dst_reg dest;
+   if (nir_intrinsic_infos[instr->intrinsic].has_dest)
+      dest = get_nir_dest(instr->dest);
+
+   const unsigned ssbo_start = prog_data->base.binding_table.ssbo_start;
+   src_reg surface;
+   nir_const_value *const_surface = nir_src_as_const_value(instr->src[0]);
+   if (const_surface) {
+      const unsigned surf_index = ssbo_start + const_surface->u32[0];
+      surface = brw_imm_ud(surf_index);
+      brw_mark_surface_used(&prog_data->base, surf_index);
+   } else {
+      /* Non-constant index: add the SSBO base per channel, then select the
+       * value of a live channel, as nir_intrinsic_load_ubo does for UBOs.
+       */
+      surface = src_reg(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(surface), get_nir_src(instr->src[0]),
+               brw_imm_ud(ssbo_start)));
+      surface = emit_uniformize(surface);
+      /* Assume this may touch any SSBO, as for other non-constant accesses. */
+      brw_mark_surface_used(&prog_data->base,
+                            ssbo_start + nir->info->num_ssbos - 1);
+   }
+
+   src_reg offset = get_nir_src(instr->src[1], 1);
+   src_reg data1 = get_nir_src(instr->src[2], 1);
+   src_reg data2;
+   if (op == BRW_AOP_CMPWR)
+      data2 = get_nir_src(instr->src[3], 1);
+
+   /* Emit the actual atomic operation */
+   const vec4_builder bld =
+      vec4_builder(this).at_end().annotate(current_annotation, base_ir);
+   src_reg atomic_result =
+      emit_untyped_atomic(bld, surface, offset, data1, data2,
+                          1 /* dims */, 1 /* rsize */, op, BRW_PREDICATE_NONE);
+   dest.type = atomic_result.type;
+   bld.MOV(dest, atomic_result);
+}
+
+/* Pack the four selectors of a NIR swizzle into a single BRW_SWIZZLE4 value. */
+static unsigned brw_swizzle_for_nir_swizzle(uint8_t swizzle[4])
+{
+   return BRW_SWIZZLE4(swizzle[0], swizzle[1], swizzle[2], swizzle[3]);
+}
+
+/* Map a NIR comparison opcode, scalar or ball_*equal / bany_*nequal vector
+ * form, to the conditional modifier its CMP must use.  Any other opcode is
+ * a caller error and hits unreachable().
+ */
+static enum brw_conditional_mod
+brw_conditional_for_nir_comparison(nir_op op)
+{
+   switch (op) {
+   /* Less than. */
+   case nir_op_ult:
+   case nir_op_ilt:
+   case nir_op_flt:
+      return BRW_CONDITIONAL_L;
+
+   /* Greater than or equal. */
+   case nir_op_uge:
+   case nir_op_ige:
+   case nir_op_fge:
+      return BRW_CONDITIONAL_GE;
+
+   /* Equality tests and the ball_*equal vector forms. */
+   case nir_op_ieq: case nir_op_feq:
+   case nir_op_ball_iequal2: case nir_op_ball_fequal2:
+   case nir_op_ball_iequal3: case nir_op_ball_fequal3:
+   case nir_op_ball_iequal4: case nir_op_ball_fequal4:
+      return BRW_CONDITIONAL_Z;
+
+   /* Inequality tests and the bany_*nequal vector forms. */
+   case nir_op_ine: case nir_op_fne:
+   case nir_op_bany_inequal2: case nir_op_bany_fnequal2:
+   case nir_op_bany_inequal3: case nir_op_bany_fnequal3:
+   case nir_op_bany_inequal4: case nir_op_bany_fnequal4:
+      return BRW_CONDITIONAL_NZ;
+
+   default:
+      unreachable("not reached: bad operation for comparison");
+   }
+}
+
+/* Try to turn the comparison that feeds instr->src[0] into a predicate.
+ * When that source is an SSA value defined by a bany_*nequal or ball_*equal
+ * ALU instruction, a CMP into the null register is emitted, *predicate is
+ * set to the matching ANY4H/ALL4H mode and true is returned.  In every other
+ * case nothing is emitted, *predicate is left alone and false is returned.
+ */
+bool
+vec4_visitor::optimize_predicate(nir_alu_instr *instr,
+                                 enum brw_predicate *predicate)
+{
+   nir_src &cond = instr->src[0].src;
+   if (!cond.is_ssa || cond.ssa->parent_instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *cmp_instr = nir_instr_as_alu(cond.ssa->parent_instr);
+   const nir_op cmp_op = cmp_instr->op;
+   switch (cmp_op) {
+   case nir_op_bany_fnequal2: case nir_op_bany_inequal2:
+   case nir_op_bany_fnequal3: case nir_op_bany_inequal3:
+   case nir_op_bany_fnequal4: case nir_op_bany_inequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   case nir_op_ball_fequal2: case nir_op_ball_iequal2:
+   case nir_op_ball_fequal3: case nir_op_ball_iequal3:
+   case nir_op_ball_fequal4: case nir_op_ball_iequal4:
+      *predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   default:
+      return false;
+   }
+
+   /* Swizzle for the opcode's declared input size. */
+   const unsigned size_swizzle =
+      brw_swizzle_for_size(nir_op_infos[cmp_op].input_sizes[0]);
+
+   src_reg op[2];
+   assert(nir_op_infos[cmp_op].num_inputs == 2);
+   for (unsigned i = 0; i < 2; i++) {
+      nir_alu_src &alu_src = cmp_instr->src[i];
+      /* Opcode-table base type OR'ed with the source's bit size. */
+      const nir_alu_type type = (nir_alu_type)
+         (nir_op_infos[cmp_op].input_types[i] | nir_src_bit_size(alu_src.src));
+      op[i] = get_nir_src(alu_src.src, type, 4);
+      op[i].swizzle =
+         brw_compose_swizzle(size_swizzle,
+                             brw_swizzle_for_nir_swizzle(alu_src.swizzle));
+      op[i].abs = alu_src.abs;
+      op[i].negate = alu_src.negate;
+   }
+
+   emit(CMP(dst_null_d(), op[0], op[1],
+            brw_conditional_for_nir_comparison(cmp_op)));
+   return true;
+}
+
+/* Emit GLSL findMSB(src) into dst using the LZD instruction.  The result is
+ * -1 when no bit is found (LZD returned 32).
+ *
+ * \param bld        builder the instructions are emitted with
+ * \param dst        destination; it also holds the intermediate LZD count
+ * \param src        value to scan
+ * \param is_signed  true for the signed (int) variant of findMSB()
+ */
+static void
+emit_find_msb_using_lzd(const vec4_builder &bld,
+                        const dst_reg &dst,
+                        const src_reg &src,
+                        bool is_signed)
+{
+   src_reg lzd_src = src;
+
+   if (is_signed) {
+      /* For negative inputs findMSB() reports the most significant 0 bit.
+       * Running LZD on abs(src) gets negative powers of two wrong:
+       * -(1<<x) would end up as x where findMSB() must return x-1.  That
+       * covers 0x80000000 (31 instead of 30) and 0xffffffff (0 instead of
+       * the -1 that section 8.8, Integer Functions, of the GLSL 4.50 spec
+       * requires for zero and negative one).
+       *
+       * Using the logical-not of negative inputs instead of their negation
+       * gives the correct LZD for every negative value.  That conditional
+       * logical-not takes two instructions: src ^ (src >> 31), with an
+       * arithmetic shift.
+       */
+      lzd_src = src_reg(bld.vgrf(BRW_REGISTER_TYPE_D));
+
+      bld.ASR(dst_reg(lzd_src), src, brw_imm_d(31));
+      bld.XOR(dst_reg(lzd_src), lzd_src, src);
+   }
+
+   bld.LZD(retype(dst, BRW_REGISTER_TYPE_UD),
+           retype(lzd_src, BRW_REGISTER_TYPE_UD));
+
+   /* LZD counts from the MSB side, while GLSL's findMSB() wants the count
+    * from the LSB side, so the result is 31 - LZD, emitted as (-LZD) + 31.
+    * If no bits are set LZD returns 32, and 31 - 32 = -1 is exactly what
+    * findMSB() is supposed to return.
+    */
+   vec4_instruction *const add =
+      bld.ADD(dst, retype(src_reg(dst), BRW_REGISTER_TYPE_D), brw_imm_d(31));
+   add->src[0].negate = true;
+}
+
+/* Convert the double-precision src to single_type and write it to dst; the
+ * saturate flag is applied to the final MOV into dst.
+ */
+void
+vec4_visitor::emit_conversion_from_double(dst_reg dst, src_reg src,
+                                          bool saturate,
+                                          brw_reg_type single_type)
+{
+   /* BDW PRM vol 15 - workarounds:
+    * DF->f format conversion for Align16 has wrong emask calculation when
+    * source is immediate.  Convert such immediates at compile time instead.
+    */
+   const bool bdw_df_imm_to_f = devinfo->gen == 8 &&
+      single_type == BRW_REGISTER_TYPE_F && src.file == BRW_IMMEDIATE_VALUE;
+   if (bdw_df_imm_to_f) {
+      emit(MOV(dst, brw_imm_f(src.df)))->saturate = saturate;
+      return;
+   }
+
+   dst_reg df_tmp = dst_reg(this, glsl_type::dvec4_type);
+   emit(MOV(df_tmp, src));
+
+   dst_reg narrow = retype(dst_reg(this, glsl_type::dvec4_type), single_type);
+   emit(VEC4_OPCODE_FROM_DOUBLE, narrow, src_reg(df_tmp))
+      ->size_written = 2 * REG_SIZE;
+   emit(MOV(dst, src_reg(narrow)))->saturate = saturate;
+}
+
+/* Convert src, read as single_type, to double precision and write it to dst;
+ * the saturate flag is applied to the final MOV into dst. */
+void
+vec4_visitor::emit_conversion_to_double(dst_reg dst, src_reg src,
+                                        bool saturate, brw_reg_type single_type)
+{
+   dst_reg wide = dst_reg(src_reg(this, glsl_type::dvec4_type));
+   src_reg narrow = retype(src_reg(this, glsl_type::vec4_type), single_type);
+   emit(MOV(dst_reg(narrow), retype(src, single_type)));
+   emit(VEC4_OPCODE_TO_DOUBLE, wide, narrow);
+   emit(MOV(dst, src_reg(wide)))->saturate = saturate;
+}
+
+/* Return a source operand holding the double constant v: a DF immediate on
+ * gen8+, otherwise an XXXX-swizzled two-register VGRF filled with DIM on
+ * Haswell or with 32-bit MOVs on the other gen7 parts.
+ */
+src_reg
+vec4_visitor::setup_imm_df(double v)
+{
+   assert(devinfo->gen >= 7);
+
+   if (devinfo->gen >= 8)
+      return brw_imm_df(v);
+
+   /* No direct DF immediates before gen8: build the constant in a VGRF. */
+   const dst_reg vgrf = dst_reg(VGRF, alloc.allocate(2));
+
+   if (devinfo->is_haswell) {
+      /* gen7.5: the DIM instruction can set a 64-bit immediate value. */
+      emit(DIM(retype(vgrf, BRW_REGISTER_TYPE_DF), brw_imm_df(v)))
+         ->force_writemask_all = true;
+   } else {
+      union {
+         double d;
+         struct {
+            uint32_t i1;
+            uint32_t i2;
+         };
+      } di;
+      di.d = v;
+
+      /* Write the low 32 bits to the X:UD channel and the high 32 bits to
+       * Y:UD, once per register (offsets 0 and 1), since a DF VGRF takes
+       * two SIMD8 registers in SIMD4x2 execution.
+       */
+      const dst_reg tmp = retype(vgrf, BRW_REGISTER_TYPE_UD);
+      for (int n = 0; n < 2; n++) {
+         const dst_reg reg = offset(tmp, 8, n);
+         emit(MOV(writemask(reg, WRITEMASK_X), brw_imm_ud(di.i1)))
+            ->force_writemask_all = true;
+         emit(MOV(writemask(reg, WRITEMASK_Y), brw_imm_ud(di.i2)))
+            ->force_writemask_all = true;
+      }
+   }
+
+   /* XXXX so that any access only reads the constant data written above. */
+   return swizzle(src_reg(retype(vgrf, BRW_REGISTER_TYPE_DF)), BRW_SWIZZLE_XXXX);
+}
+
+void
+vec4_visitor::nir_emit_alu(nir_alu_instr *instr)
+{
+   vec4_instruction *inst;
+
+   nir_alu_type dst_type = (nir_alu_type) (nir_op_infos[instr->op].output_type |
+                                           nir_dest_bit_size(instr->dest.dest));
+   dst_reg dst = get_nir_dest(instr->dest.dest, dst_type);
+   dst.writemask = instr->dest.write_mask;
+
+   src_reg op[4];
+   for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) {
+      nir_alu_type src_type = (nir_alu_type)
+         (nir_op_infos[instr->op].input_types[i] |
+          nir_src_bit_size(instr->src[i].src));
+      op[i] = get_nir_src(instr->src[i].src, src_type, 4);
+      op[i].swizzle = brw_swizzle_for_nir_swizzle(instr->src[i].swizzle);
+      op[i].abs = instr->src[i].abs;
+      op[i].negate = instr->src[i].negate;
+   }
+
+   switch (instr->op) {
+   case nir_op_imov:
+   case nir_op_fmov:
+      inst = emit(MOV(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_vec2:
+   case nir_op_vec3:
+   case nir_op_vec4:
+      unreachable("not reached: should be handled by lower_vec_to_movs()");
+
+   case nir_op_i2f:
+   case nir_op_u2f:
+      inst = emit(MOV(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_f2i:
+   case nir_op_f2u:
+      inst = emit(MOV(dst, op[0]));
+      break;
+
+   case nir_op_d2f:
+      emit_conversion_from_double(dst, op[0], instr->dest.saturate,
+                                  BRW_REGISTER_TYPE_F);
+      break;
+
+   case nir_op_f2d:
+      emit_conversion_to_double(dst, op[0], instr->dest.saturate,
+                                BRW_REGISTER_TYPE_F);
+      break;
+
+   case nir_op_d2i:
+   case nir_op_d2u:
+      emit_conversion_from_double(dst, op[0], instr->dest.saturate,
+                                  instr->op == nir_op_d2i ? BRW_REGISTER_TYPE_D :
+                                                            BRW_REGISTER_TYPE_UD);
+      break;
+
+   case nir_op_i2d:
+   case nir_op_u2d:
+      emit_conversion_to_double(dst, op[0], instr->dest.saturate,
+                                instr->op == nir_op_i2d ? BRW_REGISTER_TYPE_D :
+                                                          BRW_REGISTER_TYPE_UD);
+      break;
+
+   case nir_op_iadd:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+   case nir_op_fadd:
+      inst = emit(ADD(dst, op[0], op[1]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fmul:
+      inst = emit(MUL(dst, op[0], op[1]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_imul: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      if (devinfo->gen < 8) {
+         nir_const_value *value0 = nir_src_as_const_value(instr->src[0].src);
+         nir_const_value *value1 = nir_src_as_const_value(instr->src[1].src);
+
+         /* For integer multiplication, the MUL uses the low 16 bits of one of
+          * the operands (src0 through SNB, src1 on IVB and later). The MACH
+          * accumulates in the contribution of the upper 16 bits of that
+          * operand. If we can determine that one of the args is in the low
+          * 16 bits, though, we can just emit a single MUL.
+          */
+         if (value0 && value0->u32[0] < (1 << 16)) {
+            if (devinfo->gen < 7)
+               emit(MUL(dst, op[0], op[1]));
+            else
+               emit(MUL(dst, op[1], op[0]));
+         } else if (value1 && value1->u32[0] < (1 << 16)) {
+            if (devinfo->gen < 7)
+               emit(MUL(dst, op[1], op[0]));
+            else
+               emit(MUL(dst, op[0], op[1]));
+         } else {
+            struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+            emit(MUL(acc, op[0], op[1]));
+            emit(MACH(dst_null_d(), op[0], op[1]));
+            emit(MOV(dst, src_reg(acc)));
+         }
+      } else {
+	 emit(MUL(dst, op[0], op[1]));
+      }
+      break;
+   }
+
+   case nir_op_imul_high:
+   case nir_op_umul_high: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      struct brw_reg acc = retype(brw_acc_reg(8), dst.type);
+
+      if (devinfo->gen >= 8)
+         emit(MUL(acc, op[0], retype(op[1], BRW_REGISTER_TYPE_UW)));
+      else
+         emit(MUL(acc, op[0], op[1]));
+
+      emit(MACH(dst, op[0], op[1]));
+      break;
+   }
+
+   case nir_op_frcp:
+      inst = emit_math(SHADER_OPCODE_RCP, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fexp2:
+      inst = emit_math(SHADER_OPCODE_EXP2, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_flog2:
+      inst = emit_math(SHADER_OPCODE_LOG2, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fsin:
+      inst = emit_math(SHADER_OPCODE_SIN, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fcos:
+      inst = emit_math(SHADER_OPCODE_COS, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_idiv:
+   case nir_op_udiv:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, op[0], op[1]);
+      break;
+
+   case nir_op_umod:
+   case nir_op_irem:
+      /* According to the sign table for INT DIV in the Ivy Bridge PRM, it
+       * appears that our hardware just does the right thing for signed
+       * remainder.
+       */
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+      break;
+
+   case nir_op_imod: {
+      /* Get a regular C-style remainder.  If a % b == 0, set the predicate. */
+      inst = emit_math(SHADER_OPCODE_INT_REMAINDER, dst, op[0], op[1]);
+
+      /* Math instructions don't support conditional mod */
+      inst = emit(MOV(dst_null_d(), src_reg(dst)));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      /* Now, we need to determine if signs of the sources are different.
+       * When we XOR the sources, the top bit is 0 if they are the same and 1
+       * if they are different.  We can then use a conditional modifier to
+       * turn that into a predicate.  This leads us to an XOR.l instruction.
+       *
+       * Technically, according to the PRM, you're not allowed to use .l on a
+       * XOR instruction.  However, emperical experiments and Curro's reading
+       * of the simulator source both indicate that it's safe.
+       */
+      src_reg tmp = src_reg(this, glsl_type::ivec4_type);
+      inst = emit(XOR(dst_reg(tmp), op[0], op[1]));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->conditional_mod = BRW_CONDITIONAL_L;
+
+      /* If the result of the initial remainder operation is non-zero and the
+       * two sources have different signs, add in a copy of op[1] to get the
+       * final integer modulus value.
+       */
+      inst = emit(ADD(dst, src_reg(dst), op[1]));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
+   case nir_op_ldexp:
+      unreachable("not reached: should be handled by ldexp_to_arith()");
+
+   case nir_op_fsqrt:
+      inst = emit_math(SHADER_OPCODE_SQRT, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_frsq:
+      inst = emit_math(SHADER_OPCODE_RSQ, dst, op[0]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fpow:
+      inst = emit_math(SHADER_OPCODE_POW, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_uadd_carry: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+      emit(ADDC(dst_null_ud(), op[0], op[1]));
+      emit(MOV(dst, src_reg(acc)));
+      break;
+   }
+
+   case nir_op_usub_borrow: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      struct brw_reg acc = retype(brw_acc_reg(8), BRW_REGISTER_TYPE_UD);
+
+      emit(SUBB(dst_null_ud(), op[0], op[1]));
+      emit(MOV(dst, src_reg(acc)));
+      break;
+   }
+
+   case nir_op_ftrunc:
+      inst = emit(RNDZ(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fceil: {
+      src_reg tmp = src_reg(this, glsl_type::float_type);
+      tmp.swizzle =
+         brw_swizzle_for_size(instr->src[0].src.is_ssa ?
+                              instr->src[0].src.ssa->num_components :
+                              instr->src[0].src.reg.reg->num_components);
+
+      op[0].negate = !op[0].negate;
+      emit(RNDD(dst_reg(tmp), op[0]));
+      tmp.negate = true;
+      inst = emit(MOV(dst, tmp));
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
+   case nir_op_ffloor:
+      inst = emit(RNDD(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_ffract:
+      inst = emit(FRC(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fround_even:
+      inst = emit(RNDE(dst, op[0]));
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fquantize2f16: {
+      /* See also vec4_visitor::emit_pack_half_2x16() */
+      src_reg tmp16 = src_reg(this, glsl_type::uvec4_type);
+      src_reg tmp32 = src_reg(this, glsl_type::vec4_type);
+      src_reg zero = src_reg(this, glsl_type::vec4_type);
+
+      /* Check for denormal */
+      src_reg abs_src0 = op[0];
+      abs_src0.abs = true;
+      emit(CMP(dst_null_f(), abs_src0, brw_imm_f(ldexpf(1.0, -14)),
+               BRW_CONDITIONAL_L));
+      /* Get the appropriately signed zero */
+      emit(AND(retype(dst_reg(zero), BRW_REGISTER_TYPE_UD),
+               retype(op[0], BRW_REGISTER_TYPE_UD),
+               brw_imm_ud(0x80000000)));
+      /* Do the actual F32 -> F16 -> F32 conversion */
+      emit(F32TO16(dst_reg(tmp16), op[0]));
+      emit(F16TO32(dst_reg(tmp32), tmp16));
+      /* Select that or zero based on normal status */
+      inst = emit(BRW_OPCODE_SEL, dst, zero, tmp32);
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      inst->saturate = instr->dest.saturate;
+      break;
+   }
+
+   case nir_op_imin:
+   case nir_op_umin:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+   case nir_op_fmin:
+      inst = emit_minmax(BRW_CONDITIONAL_L, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_imax:
+   case nir_op_umax:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+   case nir_op_fmax:
+      inst = emit_minmax(BRW_CONDITIONAL_GE, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fddx:
+   case nir_op_fddx_coarse:
+   case nir_op_fddx_fine:
+   case nir_op_fddy:
+   case nir_op_fddy_coarse:
+   case nir_op_fddy_fine:
+      unreachable("derivatives are not valid in vertex shaders");
+
+   case nir_op_ilt:
+   case nir_op_ult:
+   case nir_op_ige:
+   case nir_op_uge:
+   case nir_op_ieq:
+   case nir_op_ine:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      /* Fallthrough */
+   case nir_op_flt:
+   case nir_op_fge:
+   case nir_op_feq:
+   case nir_op_fne: {
+      enum brw_conditional_mod conditional_mod =
+         brw_conditional_for_nir_comparison(instr->op);
+
+      if (nir_src_bit_size(instr->src[0].src) < 64) {
+         emit(CMP(dst, op[0], op[1], conditional_mod));
+      } else {
+         /* Produce a 32-bit boolean result from the DF comparison by selecting
+          * only the low 32-bit in each DF produced. Do this in a temporary
+          * so we can then move from there to the result using align16 again
+          * to honor the original writemask.
+          */
+         dst_reg temp = dst_reg(this, glsl_type::dvec4_type);
+         emit(CMP(temp, op[0], op[1], conditional_mod));
+         dst_reg result = dst_reg(this, glsl_type::bvec4_type);
+         emit(VEC4_OPCODE_PICK_LOW_32BIT, result, src_reg(temp));
+         emit(MOV(dst, src_reg(result)));
+      }
+      break;
+   }
+
+   case nir_op_ball_iequal2:
+   case nir_op_ball_iequal3:
+   case nir_op_ball_iequal4:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      /* Fallthrough */
+   case nir_op_ball_fequal2:
+   case nir_op_ball_fequal3:
+   case nir_op_ball_fequal4: {
+      unsigned swiz =
+         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
+
+      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
+               brw_conditional_for_nir_comparison(instr->op)));
+      emit(MOV(dst, brw_imm_d(0)));
+      inst = emit(MOV(dst, brw_imm_d(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ALL4H;
+      break;
+   }
+
+   case nir_op_bany_inequal2:
+   case nir_op_bany_inequal3:
+   case nir_op_bany_inequal4:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      /* Fallthrough */
+   case nir_op_bany_fnequal2:
+   case nir_op_bany_fnequal3:
+   case nir_op_bany_fnequal4: {
+      unsigned swiz =
+         brw_swizzle_for_size(nir_op_infos[instr->op].input_sizes[0]);
+
+      emit(CMP(dst_null_d(), swizzle(op[0], swiz), swizzle(op[1], swiz),
+               brw_conditional_for_nir_comparison(instr->op)));
+
+      emit(MOV(dst, brw_imm_d(0)));
+      inst = emit(MOV(dst, brw_imm_d(~0)));
+      inst->predicate = BRW_PREDICATE_ALIGN16_ANY4H;
+      break;
+   }
+
+   case nir_op_inot:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+      }
+      emit(NOT(dst, op[0]));
+      break;
+
+   case nir_op_ixor:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      emit(XOR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ior:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      emit(OR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_iand:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      if (devinfo->gen >= 8) {
+         op[0] = resolve_source_modifiers(op[0]);
+         op[1] = resolve_source_modifiers(op[1]);
+      }
+      emit(AND(dst, op[0], op[1]));
+      break;
+
+   case nir_op_b2i:
+   case nir_op_b2f:
+      emit(MOV(dst, negate(op[0])));
+      break;
+
+   case nir_op_f2b:
+      emit(CMP(dst, op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
+      break;
+
+   case nir_op_d2b: {
+      /* We use a MOV with conditional_mod to check if the provided value is
+       * 0.0. We want this to flush denormalized numbers to zero, so we set a
+       * source modifier on the source operand to trigger this, as source
+       * modifiers don't affect the result of the testing against 0.0.
+       */
+      src_reg value = op[0];
+      value.abs = true;
+      vec4_instruction *inst = emit(MOV(dst_null_df(), value));
+      inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+      src_reg one = src_reg(this, glsl_type::ivec4_type);
+      emit(MOV(dst_reg(one), brw_imm_d(~0)));
+      inst = emit(BRW_OPCODE_SEL, dst, one, brw_imm_d(0));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+   }
+
+   case nir_op_i2b:
+      emit(CMP(dst, op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
+      break;
+
+   case nir_op_fnoise1_1:
+   case nir_op_fnoise1_2:
+   case nir_op_fnoise1_3:
+   case nir_op_fnoise1_4:
+   case nir_op_fnoise2_1:
+   case nir_op_fnoise2_2:
+   case nir_op_fnoise2_3:
+   case nir_op_fnoise2_4:
+   case nir_op_fnoise3_1:
+   case nir_op_fnoise3_2:
+   case nir_op_fnoise3_3:
+   case nir_op_fnoise3_4:
+   case nir_op_fnoise4_1:
+   case nir_op_fnoise4_2:
+   case nir_op_fnoise4_3:
+   case nir_op_fnoise4_4:
+      unreachable("not reached: should be handled by lower_noise");
+
+   case nir_op_unpack_half_2x16_split_x:
+   case nir_op_unpack_half_2x16_split_y:
+   case nir_op_pack_half_2x16_split:
+      unreachable("not reached: should not occur in vertex shader");
+
+   case nir_op_unpack_snorm_2x16:
+   case nir_op_unpack_unorm_2x16:
+   case nir_op_pack_snorm_2x16:
+   case nir_op_pack_unorm_2x16:
+      unreachable("not reached: should be handled by lower_packing_builtins");
+
+   case nir_op_pack_uvec4_to_uint:
+      unreachable("not reached");
+
+   case nir_op_pack_uvec2_to_uint: {
+      dst_reg tmp1 = dst_reg(this, glsl_type::uint_type);
+      tmp1.writemask = WRITEMASK_X;
+      op[0].swizzle = BRW_SWIZZLE_YYYY;
+      emit(SHL(tmp1, op[0], src_reg(brw_imm_ud(16u))));
+
+      dst_reg tmp2 = dst_reg(this, glsl_type::uint_type);
+      tmp2.writemask = WRITEMASK_X;
+      op[0].swizzle = BRW_SWIZZLE_XXXX;
+      emit(AND(tmp2, op[0], src_reg(brw_imm_ud(0xffffu))));
+
+      emit(OR(dst, src_reg(tmp1), src_reg(tmp2)));
+      break;
+   }
+
+   case nir_op_pack_64_2x32_split: {
+      dst_reg result = dst_reg(this, glsl_type::dvec4_type);
+      dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
+      emit(MOV(tmp, retype(op[0], BRW_REGISTER_TYPE_UD)));
+      emit(VEC4_OPCODE_SET_LOW_32BIT, result, src_reg(tmp));
+      emit(MOV(tmp, retype(op[1], BRW_REGISTER_TYPE_UD)));
+      emit(VEC4_OPCODE_SET_HIGH_32BIT, result, src_reg(tmp));
+      emit(MOV(dst, src_reg(result)));
+      break;
+   }
+
+   case nir_op_unpack_64_2x32_split_x:
+   case nir_op_unpack_64_2x32_split_y: {
+      enum opcode oper = (instr->op == nir_op_unpack_64_2x32_split_x) ?
+         VEC4_OPCODE_PICK_LOW_32BIT : VEC4_OPCODE_PICK_HIGH_32BIT;
+      dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+      emit(MOV(tmp, op[0]));
+      dst_reg tmp2 = dst_reg(this, glsl_type::uvec4_type);
+      emit(oper, tmp2, src_reg(tmp));
+      emit(MOV(dst, src_reg(tmp2)));
+      break;
+   }
+
+   case nir_op_unpack_half_2x16:
+      /* As NIR does not guarantee that we have a correct swizzle outside the
+       * boundaries of a vector, and the implementation of emit_unpack_half_2x16
+       * uses the source operand in an operation with WRITEMASK_Y while our
+       * source operand has only size 1, it accessed incorrect data producing
+       * regressions in Piglit. We repeat the swizzle of the first component on the
+       * rest of components to avoid regressions. In the vec4_visitor IR code path
+       * this is not needed because the operand has already the correct swizzle.
+       */
+      op[0].swizzle = brw_compose_swizzle(BRW_SWIZZLE_XXXX, op[0].swizzle);
+      emit_unpack_half_2x16(dst, op[0]);
+      break;
+
+   case nir_op_pack_half_2x16:
+      emit_pack_half_2x16(dst, op[0]);
+      break;
+
+   case nir_op_unpack_unorm_4x8:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_unpack_unorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_pack_unorm_4x8:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_pack_unorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_unpack_snorm_4x8:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_unpack_snorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_pack_snorm_4x8:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_pack_snorm_4x8(dst, op[0]);
+      break;
+
+   case nir_op_bitfield_reverse:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(BFREV(dst, op[0]));
+      break;
+
+   case nir_op_bit_count:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(CBIT(dst, op[0]));
+      break;
+
+   case nir_op_ufind_msb:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit_find_msb_using_lzd(vec4_builder(this).at_end(), dst, op[0], false);
+      break;
+
+   case nir_op_ifind_msb: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      vec4_builder bld = vec4_builder(this).at_end();
+      src_reg src(dst);
+
+      if (devinfo->gen < 7) {
+         emit_find_msb_using_lzd(bld, dst, op[0], true);
+      } else {
+         emit(FBH(retype(dst, BRW_REGISTER_TYPE_UD), op[0]));
+
+         /* FBH counts from the MSB side, while GLSL's findMSB() wants the
+          * count from the LSB side. If FBH didn't return an error
+          * (0xFFFFFFFF), then subtract the result from 31 to convert the MSB
+          * count into an LSB count.
+          */
+         bld.CMP(dst_null_d(), src, brw_imm_d(-1), BRW_CONDITIONAL_NZ);
+
+         inst = bld.ADD(dst, src, brw_imm_d(31));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         inst->src[0].negate = true;
+      }
+      break;
+   }
+
+   case nir_op_find_lsb: {
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      vec4_builder bld = vec4_builder(this).at_end();
+
+      if (devinfo->gen < 7) {
+         dst_reg temp = bld.vgrf(BRW_REGISTER_TYPE_D);
+
+         /* (x & -x) generates a value that consists of only the LSB of x.
+          * For all powers of 2, findMSB(y) == findLSB(y).
+          */
+         src_reg src = src_reg(retype(op[0], BRW_REGISTER_TYPE_D));
+         src_reg negated_src = src;
+
+         /* One must be negated, and the other must be non-negated.  It
+          * doesn't matter which is which.
+          */
+         negated_src.negate = true;
+         src.negate = false;
+
+         bld.AND(temp, src, negated_src);
+         emit_find_msb_using_lzd(bld, dst, src_reg(temp), false);
+      } else {
+         bld.FBL(dst, op[0]);
+      }
+      break;
+   }
+
+   case nir_op_ubitfield_extract:
+   case nir_op_ibitfield_extract:
+      unreachable("should have been lowered");
+   case nir_op_ubfe:
+   case nir_op_ibfe:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      emit(BFE(dst, op[2], op[1], op[0]));
+      break;
+
+   case nir_op_bfm:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(BFI1(dst, op[0], op[1]));
+      break;
+
+   case nir_op_bfi:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      op[0] = fix_3src_operand(op[0]);
+      op[1] = fix_3src_operand(op[1]);
+      op[2] = fix_3src_operand(op[2]);
+
+      emit(BFI2(dst, op[0], op[1], op[2]));
+      break;
+
+   case nir_op_bitfield_insert:
+      unreachable("not reached: should have been lowered");
+
+   case nir_op_fsign:
+      if (type_sz(op[0].type) < 8) {
+         /* AND(val, 0x80000000) gives the sign bit.
+          *
+          * Predicated OR ORs 1.0 (0x3f800000) with the sign bit if val is not
+          * zero.
+          */
+         emit(CMP(dst_null_f(), op[0], brw_imm_f(0.0f), BRW_CONDITIONAL_NZ));
+
+         op[0].type = BRW_REGISTER_TYPE_UD;
+         dst.type = BRW_REGISTER_TYPE_UD;
+         emit(AND(dst, op[0], brw_imm_ud(0x80000000u)));
+
+         inst = emit(OR(dst, src_reg(dst), brw_imm_ud(0x3f800000u)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         dst.type = BRW_REGISTER_TYPE_F;
+
+         if (instr->dest.saturate) {
+            inst = emit(MOV(dst, src_reg(dst)));
+            inst->saturate = true;
+         }
+      } else {
+         /* For doubles we do the same but we need to consider:
+          *
+          * - We use a MOV with conditional_mod instead of a CMP so that we can
+          *   skip loading a 0.0 immediate. We use a source modifier on the
+          *   source of the MOV so that we flush denormalized values to 0.
+          *   Since we want to compare against 0, this won't alter the result.
+          * - We need to extract the high 32-bit of each DF where the sign
+          *   is stored.
+          * - We need to produce a DF result.
+          */
+
+         /* Check for zero */
+         src_reg value = op[0];
+         value.abs = true;
+         inst = emit(MOV(dst_null_df(), value));
+         inst->conditional_mod = BRW_CONDITIONAL_NZ;
+
+         /* AND each high 32-bit channel with 0x80000000u */
+         dst_reg tmp = dst_reg(this, glsl_type::uvec4_type);
+         emit(VEC4_OPCODE_PICK_HIGH_32BIT, tmp, op[0]);
+         emit(AND(tmp, src_reg(tmp), brw_imm_ud(0x80000000u)));
+
+         /* Add 1.0 to each channel, predicated to skip the cases where the
+          * channel's value was 0
+          */
+         inst = emit(OR(tmp, src_reg(tmp), brw_imm_ud(0x3f800000u)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+
+         /* Now convert the result from float to double */
+         emit_conversion_to_double(dst, src_reg(tmp), instr->dest.saturate,
+                                   BRW_REGISTER_TYPE_F);
+      }
+      break;
+
+   case nir_op_isign:
+      /*  ASR(val, 31) -> negative val generates 0xffffffff (signed -1).
+       *               -> non-negative val generates 0x00000000.
+       *  Predicated OR sets 1 if val is positive.
+       */
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_G));
+      emit(ASR(dst, op[0], brw_imm_d(31)));
+      inst = emit(OR(dst, src_reg(dst), brw_imm_d(1)));
+      inst->predicate = BRW_PREDICATE_NORMAL;
+      break;
+
+   case nir_op_ishl:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(SHL(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ishr:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(ASR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ushr:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+      emit(SHR(dst, op[0], op[1]));
+      break;
+
+   case nir_op_ffma:
+      if (type_sz(dst.type) == 8) {
+         dst_reg mul_dst = dst_reg(this, glsl_type::dvec4_type);
+         emit(MUL(mul_dst, op[1], op[0]));
+         inst = emit(ADD(dst, src_reg(mul_dst), op[2]));
+         inst->saturate = instr->dest.saturate;
+      } else {
+         op[0] = fix_3src_operand(op[0]);
+         op[1] = fix_3src_operand(op[1]);
+         op[2] = fix_3src_operand(op[2]);
+
+         inst = emit(MAD(dst, op[2], op[1], op[0]));
+         inst->saturate = instr->dest.saturate;
+      }
+      break;
+
+   case nir_op_flrp:
+      inst = emit_lrp(dst, op[0], op[1], op[2]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_bcsel:
+      enum brw_predicate predicate;
+      if (!optimize_predicate(instr, &predicate)) {
+         emit(CMP(dst_null_d(), op[0], brw_imm_d(0), BRW_CONDITIONAL_NZ));
+         switch (dst.writemask) {
+         case WRITEMASK_X:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_X;
+            break;
+         case WRITEMASK_Y:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Y;
+            break;
+         case WRITEMASK_Z:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_Z;
+            break;
+         case WRITEMASK_W:
+            predicate = BRW_PREDICATE_ALIGN16_REPLICATE_W;
+            break;
+         default:
+            predicate = BRW_PREDICATE_NORMAL;
+            break;
+         }
+      }
+      inst = emit(BRW_OPCODE_SEL, dst, op[1], op[2]);
+      inst->predicate = predicate;
+      break;
+
+   case nir_op_fdot_replicated2:
+      inst = emit(BRW_OPCODE_DP2, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdot_replicated3:
+      inst = emit(BRW_OPCODE_DP3, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdot_replicated4:
+      inst = emit(BRW_OPCODE_DP4, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_fdph_replicated:
+      inst = emit(BRW_OPCODE_DPH, dst, op[0], op[1]);
+      inst->saturate = instr->dest.saturate;
+      break;
+
+   case nir_op_iabs:
+   case nir_op_ineg:
+      assert(nir_dest_bit_size(instr->dest.dest) < 64);
+   case nir_op_fabs:
+   case nir_op_fneg:
+   case nir_op_fsat:
+      unreachable("not reached: should be lowered by lower_source mods");
+
+   case nir_op_fdiv:
+      unreachable("not reached: should be lowered by DIV_TO_MUL_RCP in the compiler");
+
+   case nir_op_fmod:
+      unreachable("not reached: should be lowered by MOD_TO_FLOOR in the compiler");
+
+   case nir_op_fsub:
+   case nir_op_isub:
+      unreachable("not reached: should be handled by ir_sub_to_add_neg");
+
+   default:
+      unreachable("Unimplemented ALU operation");
+   }
+
+   /* If we need to do a boolean resolve, replace the result with -(x & 1)
+    * to sign extend the low bit to 0/~0
+    */
+   if (devinfo->gen <= 5 &&
+       (instr->instr.pass_flags & BRW_NIR_BOOLEAN_MASK) ==
+       BRW_NIR_BOOLEAN_NEEDS_RESOLVE) {
+      dst_reg masked = dst_reg(this, glsl_type::int_type);
+      masked.writemask = dst.writemask;
+      emit(AND(masked, src_reg(dst), brw_imm_d(1)));
+      src_reg masked_neg = src_reg(masked);
+      masked_neg.negate = true;
+      emit(MOV(retype(dst, BRW_REGISTER_TYPE_D), masked_neg));
+   }
+}
+
+/* Emit the control-flow opcode for a NIR jump: BREAK for nir_jump_break
+ * and CONTINUE for nir_jump_continue.  Every other jump type, including
+ * nir_jump_return, is treated as unreachable.
+ */
+void
+vec4_visitor::nir_emit_jump(nir_jump_instr *instr)
+{
+   if (instr->type == nir_jump_break) {
+      emit(BRW_OPCODE_BREAK);
+      return;
+   }
+   if (instr->type == nir_jump_continue) {
+      emit(BRW_OPCODE_CONTINUE);
+      return;
+   }
+
+   unreachable("unknown jump");
+}
+
+/* Translate a NIR texture opcode into the GLSL IR texture opcode of the
+ * same name.  Only the opcodes listed below are expected; anything else is
+ * treated as unreachable.
+ */
+enum ir_texture_opcode
+ir_texture_opcode_for_nir_texop(nir_texop texop)
+{
+   switch (texop) {
+   case nir_texop_lod:               return ir_lod;
+   case nir_texop_query_levels:      return ir_query_levels;
+   case nir_texop_texture_samples:   return ir_texture_samples;
+   case nir_texop_tex:               return ir_tex;
+   case nir_texop_tg4:               return ir_tg4;
+   case nir_texop_txb:               return ir_txb;
+   case nir_texop_txd:               return ir_txd;
+   case nir_texop_txf:               return ir_txf;
+   case nir_texop_txf_ms:            return ir_txf_ms;
+   case nir_texop_txl:               return ir_txl;
+   case nir_texop_txs:               return ir_txs;
+   case nir_texop_samples_identical: return ir_samples_identical;
+   default:
+      unreachable("unknown texture opcode");
+   }
+}
+/* GLSL type built from @alu_type's GLSL base type and @components. */
+const glsl_type *
+glsl_type_for_nir_alu_type(nir_alu_type alu_type, unsigned components)
+{
+   return glsl_type::get_instance(
+      brw_glsl_base_type_for_nir_type(alu_type), components, 1);
+}
+
+/* Translate a NIR texture instruction: collect its sources into registers
+ * or immediates, then hand everything over to emit_texture().
+ */
+void
+vec4_visitor::nir_emit_texture(nir_tex_instr *instr)
+{
+   unsigned texture = instr->texture_index;
+   unsigned sampler = instr->sampler_index;
+   src_reg texture_reg = brw_imm_ud(texture);
+   src_reg sampler_reg = brw_imm_ud(sampler);
+   src_reg coordinate;
+   const glsl_type *coord_type = NULL;
+   src_reg shadow_comparator;
+   src_reg offset_value;
+   src_reg lod, lod2;
+   src_reg sample_index;
+   src_reg mcs;
+
+   const glsl_type *dest_type =
+      glsl_type_for_nir_alu_type(instr->dest_type,
+                                 nir_tex_instr_dest_size(instr));
+   dst_reg dest = get_nir_dest(instr->dest, instr->dest_type);
+
+   /* The hardware requires a LOD for buffer textures */
+   if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF)
+      lod = brw_imm_d(0);
+
+   /* Load the texture operation sources */
+   uint32_t constant_offset = 0;
+   for (unsigned i = 0; i < instr->num_srcs; i++) {
+      switch (instr->src[i].src_type) {
+      case nir_tex_src_comparator:
+         shadow_comparator = get_nir_src(instr->src[i].src,
+                                         BRW_REGISTER_TYPE_F, 1);
+         break;
+
+      case nir_tex_src_coord: {
+         unsigned src_size = nir_tex_instr_src_size(instr, i);
+
+         /* Texel fetches take integer coordinates, the rest take floats. */
+         switch (instr->op) {
+         case nir_texop_txf:
+         case nir_texop_txf_ms:
+         case nir_texop_samples_identical:
+            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D,
+                                     src_size);
+            coord_type = glsl_type::ivec(src_size);
+            break;
+
+         default:
+            coordinate = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                                     src_size);
+            coord_type = glsl_type::vec(src_size);
+            break;
+         }
+         break;
+      }
+
+      case nir_tex_src_ddx:   /* the gradients travel in lod and lod2 */
+         lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                           nir_tex_instr_src_size(instr, i));
+         break;
+
+      case nir_tex_src_ddy:
+         lod2 = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F,
+                            nir_tex_instr_src_size(instr, i));
+         break;
+
+      case nir_tex_src_lod:
+         switch (instr->op) {
+         case nir_texop_txs:
+         case nir_texop_txf:
+            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+            break;
+
+         default:
+            lod = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_F, 1);
+            break;
+         }
+         break;
+
+      case nir_tex_src_ms_index:
+         sample_index = get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 1);
+         break;
+
+      case nir_tex_src_offset: {   /* register only if not packed below */
+         nir_const_value *const_offset =
+            nir_src_as_const_value(instr->src[i].src);
+         if (!const_offset ||
+             !brw_texture_offset(const_offset->i32,
+                                 nir_tex_instr_src_size(instr, i),
+                                 &constant_offset)) {
+            offset_value =
+               get_nir_src(instr->src[i].src, BRW_REGISTER_TYPE_D, 2);
+         }
+         break;
+      }
+
+      case nir_tex_src_texture_offset: {
+         /* The highest texture which may be used by this operation is
+          * the last element of the array. Mark it here, because the generator
+          * doesn't have enough information to determine the bound.
+          */
+         uint32_t array_size = instr->texture_array_size;
+         uint32_t max_used = texture + array_size - 1;
+         if (instr->op == nir_texop_tg4) {
+            max_used += prog_data->base.binding_table.gather_texture_start;
+         } else {
+            max_used += prog_data->base.binding_table.texture_start;
+         }
+
+         brw_mark_surface_used(&prog_data->base, max_used);
+
+         /* Emit code to evaluate the actual indexing expression */
+         src_reg src = get_nir_src(instr->src[i].src, 1);
+         src_reg temp(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(temp), src, brw_imm_ud(texture)));
+         texture_reg = emit_uniformize(temp);
+         break;
+      }
+
+      case nir_tex_src_sampler_offset: {
+         /* Emit code to evaluate the actual indexing expression */
+         src_reg src = get_nir_src(instr->src[i].src, 1);
+         src_reg temp(this, glsl_type::uint_type);
+         emit(ADD(dst_reg(temp), src, brw_imm_ud(sampler)));
+         sampler_reg = emit_uniformize(temp);
+         break;
+      }
+
+      case nir_tex_src_projector:
+         unreachable("Should be lowered by do_lower_texture_projection");
+      case nir_tex_src_bias:
+         unreachable("LOD bias is not valid for vertex shaders.\n");
+
+      default:
+         unreachable("unknown texture source");
+      }
+   }
+
+   if (instr->op == nir_texop_txf_ms ||
+       instr->op == nir_texop_samples_identical) {
+      assert(coord_type != NULL);
+      if (devinfo->gen >= 7 &&
+          key_tex->compressed_multisample_layout_mask & (1u << texture)) {
+         mcs = emit_mcs_fetch(coord_type, coordinate, texture_reg);
+      } else {
+         mcs = brw_imm_ud(0u);
+      }
+   }
+
+   /* Stuff the channel select bits in the top of the texture offset */
+   if (instr->op == nir_texop_tg4) {
+      if (instr->component == 1 &&
+          (key_tex->gather_channel_quirk_mask & (1u << texture))) {
+         /* gather4 sampler is broken for green channel on RG32F --
+          * we must ask for blue instead.
+          */
+         constant_offset |= 2 << 16;
+      } else {
+         constant_offset |= instr->component << 16;
+      }
+   }
+
+   ir_texture_opcode op = ir_texture_opcode_for_nir_texop(instr->op);
+   emit_texture(op, dest, dest_type, coordinate, instr->coord_components,
+                shadow_comparator,
+                lod, lod2, sample_index,
+                constant_offset, offset_value, mcs,
+                texture, texture_reg, sampler_reg);
+}
+
+void
+vec4_visitor::nir_emit_undef(nir_ssa_undef_instr *instr)
+{
+   const unsigned size = DIV_ROUND_UP(instr->def.bit_size, 32); /* per 32 bits */
+   nir_ssa_values[instr->def.index] = dst_reg(VGRF, alloc.allocate(size));
+}
+
+/* SIMD4x2 64bit data is stored in register space like this:
+ *
+ * r0.0:DF  x0 y0 z0 w0
+ * r1.0:DF  x1 y1 z1 w1
+ *
+ * When we need to write data such as this to memory using 32-bit write
+ * messages we need to shuffle it in this fashion:
+ *
+ * r0.0:DF  x0 y0 x1 y1 (to be written at base offset)
+ * r1.0:DF  z0 w0 z1 w1 (to be written at base offset + 16)
+ *
+ * We need to do the inverse operation when we read using 32-bit messages,
+ * which we can do by applying the same exact shuffling on the 64-bit data
+ * read, only that because the data for each vertex is positioned differently
+ * we need to apply different channel enables.
+ *
+ * This function takes 64bit data and shuffles it as explained above.
+ *
+ * The @for_write parameter is used to specify if the shuffling is being done
+ * for proper SIMD4x2 64-bit data that needs to be shuffled prior to a 32-bit
+ * write message (for_write = true), or instead we are doing the inverse
+ * operation and we have just read 64-bit data using 32-bit messages that we
+ * need to shuffle to create valid SIMD4x2 64-bit data (for_write = false).
+ *
+ * If @block and @ref are non-NULL, then the shuffling is done after @ref,
+ * otherwise the instructions are emitted normally at the end. The function
+ * returns the last instruction inserted.
+ *
+ * Notice that @src and @dst cannot be the same register.
+ */
+vec4_instruction *
+vec4_visitor::shuffle_64bit_data(dst_reg dst, src_reg src, bool for_write,
+                                 bblock_t *block, vec4_instruction *ref)
+{
+   /* Both operands must be 64-bit and their 2 * REG_SIZE regions disjoint. */
+   assert(type_sz(src.type) == 8);
+   assert(type_sz(dst.type) == 8);
+   assert(!regions_overlap(dst, 2 * REG_SIZE, src, 2 * REG_SIZE));
+   /* @block and @ref have to be given together or not at all. */
+   assert(!ref == !block);
+
+   const vec4_builder bld = ref ? vec4_builder(this).at(block, ref->next) :
+                                  vec4_builder(this).at_end();
+
+   /* Resolve a non-identity swizzle in src by copying it through a
+    * temporary, so that the shuffle below reads an XYZW-swizzled register.
+    */
+   if (src.swizzle != BRW_SWIZZLE_XYZW) {
+      const dst_reg unswizzled = dst_reg(this, glsl_type::dvec4_type);
+      bld.MOV(unswizzled, src);
+      src = src_reg(unswizzled);
+   }
+
+   /* Second-register views and the for_write-dependent group indices. */
+   const src_reg src_hi = byte_offset(src, REG_SIZE);
+   const dst_reg dst_hi = byte_offset(dst, REG_SIZE);
+   const unsigned lo_zw_group = for_write ? 1 : 0;
+   const unsigned hi_xy_group = for_write ? 0 : 1;
+
+   /* dst+0.XY = src+0.XY */
+   bld.group(4, 0).MOV(writemask(dst, WRITEMASK_XY), src);
+   /* dst+0.ZW = src+1.XY */
+   bld.group(4, lo_zw_group).MOV(writemask(dst, WRITEMASK_ZW),
+                                 swizzle(src_hi, BRW_SWIZZLE_XYXY));
+   /* dst+1.XY = src+0.ZW */
+   bld.group(4, hi_xy_group).MOV(writemask(dst_hi, WRITEMASK_XY),
+                                 swizzle(src, BRW_SWIZZLE_ZWZW));
+   /* dst+1.ZW = src+1.ZW; this last MOV is the instruction returned. */
+   return bld.group(4, 1).MOV(writemask(dst_hi, WRITEMASK_ZW), src_hi);
+}
+
+}
diff --git a/src/intel/compiler/brw_vec4_reg_allocate.cpp b/src/intel/compiler/brw_vec4_reg_allocate.cpp
new file mode 100644
index 00000000000..e3b46cc2f7f
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_reg_allocate.cpp
@@ -0,0 +1,558 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "util/register_allocate.h"
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+
+using namespace brw;
+
+namespace brw {
+
+static void
+assign(unsigned int *reg_hw_locations, backend_reg *reg)
+{
+   if (reg->file != VGRF)
+      return;                    /* only VGRF operands are remapped */
+   reg->nr = reg_hw_locations[reg->nr] + reg->offset / REG_SIZE;
+   reg->offset %= REG_SIZE;      /* whole registers were folded into nr */
+}
+
+/* Trivial allocator: every VGRF referenced by an instruction gets its own
+ * consecutive range of hardware registers, starting at
+ * first_non_payload_grf.  VGRF 0 is always given a range, used or not.
+ */
+bool
+vec4_visitor::reg_allocate_trivial()
+{
+   unsigned int hw_reg_mapping[this->alloc.count];
+   bool virtual_grf_used[this->alloc.count];
+
+   /* Find the VGRFs still referenced after the optimization passes. */
+   for (unsigned vgrf = 0; vgrf < this->alloc.count; vgrf++)
+      virtual_grf_used[vgrf] = false;
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF)
+         virtual_grf_used[inst->dst.nr] = true;
+
+      for (unsigned s = 0; s < 3; s++) {
+         if (inst->src[s].file == VGRF)
+            virtual_grf_used[inst->src[s].nr] = true;
+      }
+   }
+
+   /* Lay the used VGRFs out back to back. */
+   hw_reg_mapping[0] = this->first_non_payload_grf;
+   int next = hw_reg_mapping[0] + this->alloc.sizes[0];
+   for (unsigned vgrf = 1; vgrf < this->alloc.count; vgrf++) {
+      if (!virtual_grf_used[vgrf])
+         continue;
+      hw_reg_mapping[vgrf] = next;
+      next += this->alloc.sizes[vgrf];
+   }
+   prog_data->total_grf = next;
+
+   /* Operands are rewritten even when the limit check below fails. */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      assign(hw_reg_mapping, &inst->dst);
+      for (unsigned s = 0; s < 3; s++)
+         assign(hw_reg_mapping, &inst->src[s]);
+   }
+
+   if (prog_data->total_grf <= max_grf)
+      return true;
+
+   fail("Ran out of regs on trivial allocator (%d/%d)\n",
+        prog_data->total_grf, max_grf);
+   return false;
+}
+
+/* Build compiler->vec4_reg_set: MAX_VGRF_SIZE register classes, class i made
+ * of runs of i + 1 consecutive GRFs; ra_reg_to_grf records each run's start.
+ */
+extern "C" void
+brw_vec4_alloc_reg_set(struct brw_compiler *compiler)
+{
+   int base_reg_count =
+      compiler->devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
+
+   /* After running split_virtual_grfs(), almost all VGRFs will be of size 1.
+    * SEND-from-GRF sources cannot be split, so we also need classes for each
+    * potential message length.
+    */
+   const int class_count = MAX_VGRF_SIZE;
+   int class_sizes[MAX_VGRF_SIZE];
+
+   for (int i = 0; i < class_count; i++)
+      class_sizes[i] = i + 1;
+
+   /* Compute the total number of registers across all classes. */
+   int ra_reg_count = 0;
+   for (int i = 0; i < class_count; i++) {
+      ra_reg_count += base_reg_count - (class_sizes[i] - 1);
+   }
+
+   ralloc_free(compiler->vec4_reg_set.ra_reg_to_grf);
+   compiler->vec4_reg_set.ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
+   ralloc_free(compiler->vec4_reg_set.regs);
+   compiler->vec4_reg_set.regs = ra_alloc_reg_set(compiler, ra_reg_count, false);
+   if (compiler->devinfo->gen >= 6)
+      ra_set_allocate_round_robin(compiler->vec4_reg_set.regs);
+   ralloc_free(compiler->vec4_reg_set.classes);
+   compiler->vec4_reg_set.classes = ralloc_array(compiler, int, class_count);
+
+   /* Now, add the registers to their classes, and add the conflicts
+    * between them and the base GRF registers (and also each other).
+    */
+   int reg = 0;
+   unsigned *q_values[MAX_VGRF_SIZE];
+   for (int i = 0; i < class_count; i++) {
+      int class_reg_count = base_reg_count - (class_sizes[i] - 1);
+      compiler->vec4_reg_set.classes[i] = ra_alloc_reg_class(compiler->vec4_reg_set.regs);
+
+      q_values[i] = new unsigned[MAX_VGRF_SIZE];   /* freed at the end */
+
+      for (int j = 0; j < class_reg_count; j++) {
+	 ra_class_add_reg(compiler->vec4_reg_set.regs, compiler->vec4_reg_set.classes[i], reg);
+	 compiler->vec4_reg_set.ra_reg_to_grf[reg] = j;
+	 /* Class 0 came first, so ra reg b < base_reg_count is plain GRF b. */
+	 for (int base_reg = j;
+	      base_reg < j + class_sizes[i];
+	      base_reg++) {
+	    ra_add_reg_conflict(compiler->vec4_reg_set.regs, base_reg, reg);
+	 }
+	 reg++;
+      }
+
+      for (int j = 0; j < class_count; j++) {
+         /* Calculate the q values manually: the algorithm ra_set_finalize()
+          * would use has higher complexity, which affects the start-up time
+          * of some applications.  q(i, j) is just the maximum number of
+          * registers from class i a register from class j can conflict with.
+          */
+         q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
+      }
+   }
+   assert(reg == ra_reg_count);
+
+   for (int reg = 0; reg < base_reg_count; reg++)
+      ra_make_reg_conflicts_transitive(compiler->vec4_reg_set.regs, reg);
+
+   ra_set_finalize(compiler->vec4_reg_set.regs, q_values);
+
+   for (int i = 0; i < MAX_VGRF_SIZE; i++)
+      delete[] q_values[i];
+}
+
+/* Pin every payload node to its physical register and make it interfere
+ * with each of the first reg_node_count nodes of the graph.
+ */
+void
+vec4_visitor::setup_payload_interference(struct ra_graph *g,
+                                         int first_payload_node,
+                                         int reg_node_count)
+{
+   const int payload_regs = this->first_non_payload_grf;
+
+   for (int grf = 0; grf < payload_regs; grf++) {
+      const int payload_node = first_payload_node + grf;
+
+      /* Payload node grf lives in physical register grf; the alternative,
+       * one register class per physical register, would just be silly.
+       */
+      ra_set_node_reg(g, payload_node, grf);
+
+      /* For now the payload simply conflicts with every node. */
+      for (int node = 0; node < reg_node_count; node++)
+         ra_add_node_interference(g, payload_node, node);
+   }
+}
+
+/* One interference-graph allocation attempt.  Returns true once every VGRF
+ * has a hardware register, false after a spill or a call to fail().
+ */
+bool
+vec4_visitor::reg_allocate()
+{
+   unsigned int hw_reg_mapping[alloc.count];
+   int payload_reg_count = this->first_non_payload_grf;
+
+   /* Using the trivial allocator can be useful in debugging undefined
+    * register access as a result of broken optimization passes.
+    */
+   if (0)
+      return reg_allocate_trivial();
+
+   calculate_live_intervals();
+
+   int node_count = alloc.count;          /* one graph node per VGRF ... */
+   int first_payload_node = node_count;   /* ... then the payload nodes */
+   node_count += payload_reg_count;
+   struct ra_graph *g =
+      ra_alloc_interference_graph(compiler->vec4_reg_set.regs, node_count);
+
+   for (unsigned i = 0; i < alloc.count; i++) {
+      int size = this->alloc.sizes[i];
+      assert(size >= 1 && size <= MAX_VGRF_SIZE);
+      ra_set_node_class(g, i, compiler->vec4_reg_set.classes[size - 1]);
+
+      for (unsigned j = 0; j < i; j++) {
+	 if (virtual_grf_interferes(i, j)) {
+	    ra_add_node_interference(g, i, j);
+	 }
+      }
+   }
+
+   /* Certain instructions can't safely use the same register for their
+    * sources and destination.  Add interference.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF && inst->has_source_and_destination_hazard()) {
+         for (unsigned i = 0; i < 3; i++) {
+            if (inst->src[i].file == VGRF) {
+               ra_add_node_interference(g, inst->dst.nr, inst->src[i].nr);
+            }
+         }
+      }
+   }
+
+   setup_payload_interference(g, first_payload_node, node_count);
+
+   if (!ra_allocate(g)) {
+      /* Failed to allocate registers.  Spill a reg, and the caller will
+       * loop back into here to try again.
+       */
+      int reg = choose_spill_reg(g);
+      if (this->no_spills) {
+         fail("Failure to register allocate.  Reduce number of live "
+              "values to avoid this.");
+      } else if (reg == -1) {
+         fail("no register to spill\n");
+      } else {
+         spill_reg(reg);
+      }
+      ralloc_free(g);
+      return false;
+   }
+
+   /* Get the chosen virtual registers for each node, and map virtual regs
+    * in the register classes back down to real hardware reg numbers.
+    */
+   prog_data->total_grf = payload_reg_count;
+   for (unsigned i = 0; i < alloc.count; i++) {
+      int reg = ra_get_node_reg(g, i);
+      hw_reg_mapping[i] = compiler->vec4_reg_set.ra_reg_to_grf[reg];
+      prog_data->total_grf = MAX2(prog_data->total_grf,
+				  hw_reg_mapping[i] + alloc.sizes[i]);
+   }
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      assign(hw_reg_mapping, &inst->dst);
+      assign(hw_reg_mapping, &inst->src[0]);
+      assign(hw_reg_mapping, &inst->src[1]);
+      assign(hw_reg_mapping, &inst->src[2]);
+   }
+
+   ralloc_free(g);
+   return true;
+}
+
+/**
+ * When we decide to spill a register, instead of blindly spilling every use,
+ * save unspills when the spill register is used (read) in consecutive
+ * instructions. This can potentially save a bunch of unspills that would
+ * have very little impact in register allocation anyway.
+ *
+ * Notice that we need to account for this behavior when spilling a register
+ * and when evaluating spilling costs. This function is designed so it can
+ * be called from both places and avoid repeating the logic.
+ *
+ *  - When we call this function from spill_reg(), we pass in scratch_reg the
+ *    actual unspill/spill register that we want to reuse in the current
+ *    instruction.
+ *
+ *  - When we call this from evaluate_spill_costs(), we pass the register for
+ *    which we are evaluating spilling costs.
+ *
+ * In either case, we check if the previous instructions read scratch_reg until
+ * we find one that writes to it with a compatible mask or does not read/write
+ * scratch_reg at all.
+ */
+static bool
+can_use_scratch_for_source(const vec4_instruction *inst, unsigned i,
+                           unsigned scratch_reg)
+{
+   assert(inst->src[i].file == VGRF);
+
+   /* Set as soon as an earlier source of inst, or one of the instructions
+    * right before it, is found to read scratch_reg.
+    */
+   bool found_read = false;
+
+   for (unsigned s = 0; s < i; s++) {
+      if (inst->src[s].file == VGRF && inst->src[s].nr == scratch_reg)
+         found_read = true;
+   }
+
+   /* Walk backwards from inst until the head of the instruction list. */
+   for (vec4_instruction *prev = (vec4_instruction *) inst->prev;
+        !prev->is_head_sentinel();
+        prev = (vec4_instruction *) prev->prev) {
+
+      /* A write to scratch_reg settles the question: its value can be
+       * reused provided the write is unpredicated (or is a SEL) and
+       * covers every channel that source i reads.
+       */
+      if (prev->dst.file == VGRF && prev->dst.nr == scratch_reg) {
+         const bool unconditional =
+            !prev->predicate || prev->opcode == BRW_OPCODE_SEL;
+         const unsigned unwritten =
+            brw_mask_for_swizzle(inst->src[i].swizzle) &
+            ~prev->dst.writemask;
+         return unconditional && unwritten == 0;
+      }
+
+      /* Look through scratch reads/writes: they were generated by
+       * spilling other registers, which won't read/write scratch_reg, and
+       * must not stop us from reusing it here.
+       */
+      if (prev->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE ||
+          prev->opcode == SHADER_OPCODE_GEN4_SCRATCH_READ)
+         continue;
+
+      /* Neither a write nor spill traffic: does it read scratch_reg? */
+      bool reads_scratch_reg = false;
+      for (unsigned s = 0; s < 3 && !reads_scratch_reg; s++) {
+         reads_scratch_reg = prev->src[s].file == VGRF &&
+                             prev->src[s].nr == scratch_reg;
+      }
+
+      if (!reads_scratch_reg) {
+         /* The run of consecutive readers stops here.  If no reader was
+          * seen at all, scratch_reg has to be unspilled for this source
+          * and cannot be reused, hence false.  Otherwise the caller must
+          * be evaluate_spill_costs(): in the spill_reg() path such a run
+          * always starts with a write to scratch_reg, which the check at
+          * the top of the loop would have caught.  We have then found the
+          * point where scratch_reg would be unspilled, and because a full
+          * vec4 is always unspilled every channel is available: true.
+          */
+         return found_read;
+      }
+
+      /* prev reads scratch_reg; keep walking. */
+      found_read = true;
+   }
+
+   /* Reached the head of the list. */
+   return found_read;
+}
+
+static inline float
+spill_cost_for_type(enum brw_reg_type type)
+{
+   /* Relative cost of one spill/unspill. A 64-bit register needs 2 32-bit
+    * scratch messages plus the 64b/32b shuffling code. The return type must
+    * be float: returning an integer would truncate 2.25 down to 2.
+    */
+   return type_sz(type) == 8 ? 2.25f : 1.0f;
+}
+
+void
+vec4_visitor::evaluate_spill_costs(float *spill_costs, bool *no_spill)
+{
+   float loop_scale = 1.0; /* x10 after each DO, /10 after each WHILE */
+
+   unsigned *reg_type_size = (unsigned *)
+      ralloc_size(NULL, this->alloc.count * sizeof(unsigned));
+
+   for (unsigned i = 0; i < this->alloc.count; i++) {
+      spill_costs[i] = 0.0;
+      no_spill[i] = alloc.sizes[i] != 1 && alloc.sizes[i] != 2;
+      reg_type_size[i] = 0;
+   }
+
+   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
+    * spill/unspill we'll have to do, and guess that the insides of
+    * loops run 10 times.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (unsigned int i = 0; i < 3; i++) {
+         if (inst->src[i].file == VGRF && !no_spill[inst->src[i].nr]) {
+            /* We will only unspill src[i] if it wasn't unspilled for the
+             * previous instruction, in which case we'll just reuse the scratch
+             * reg for this instruction.
+             */
+            if (!can_use_scratch_for_source(inst, i, inst->src[i].nr)) {
+               spill_costs[inst->src[i].nr] +=
+                  loop_scale * spill_cost_for_type(inst->src[i].type);
+               if (inst->src[i].reladdr ||
+                   inst->src[i].offset >= REG_SIZE)
+                  no_spill[inst->src[i].nr] = true;
+
+               /* We don't support unspills of partial DF reads.
+                *
+                * Our 64-bit unspills are implemented with two 32-bit scratch
+                * messages, each one reading data for both SIMD4x2 threads that
+                * we need to shuffle into correct 64-bit data. Ensure that we
+                * are reading data for both threads.
+                */
+               if (type_sz(inst->src[i].type) == 8 && inst->exec_size != 8)
+                  no_spill[inst->src[i].nr] = true;
+            }
+
+            /* We can't spill registers that mix 32-bit and 64-bit access (that
+             * contain 64-bit data that is operated on via 32-bit instructions)
+             */
+            unsigned type_size = type_sz(inst->src[i].type);
+            if (reg_type_size[inst->src[i].nr] == 0)
+               reg_type_size[inst->src[i].nr] = type_size;
+            else if (reg_type_size[inst->src[i].nr] != type_size)
+               no_spill[inst->src[i].nr] = true;
+         }
+      }
+
+      if (inst->dst.file == VGRF && !no_spill[inst->dst.nr]) {
+         spill_costs[inst->dst.nr] +=
+            loop_scale * spill_cost_for_type(inst->dst.type);
+         if (inst->dst.reladdr || inst->dst.offset >= REG_SIZE)
+            no_spill[inst->dst.nr] = true;
+
+         /* We don't support spills of partial DF writes.
+          *
+          * Our 64-bit spills are implemented with two 32-bit scratch messages,
+          * each one writing data for both SIMD4x2 threads. Ensure that we
+          * are writing data for both threads.
+          */
+         if (type_sz(inst->dst.type) == 8 && inst->exec_size != 8)
+            no_spill[inst->dst.nr] = true;
+
+         /* FROM_DOUBLE opcodes are setup so that they use a dst register
+          * with a size of 2 even if they only produce a single-precision
+          * result (this is so that the opcode can use the larger register to
+          * produce a 64-bit aligned intermediary result as required by the
+          * hardware during the conversion process). This creates a problem for
+          * spilling though, because when we attempt to emit a spill for the
+          * dst we see a 32-bit destination and emit a scratch write that
+          * allocates a single spill register.
+          */
+         if (inst->opcode == VEC4_OPCODE_FROM_DOUBLE)
+            no_spill[inst->dst.nr] = true;
+
+         /* We can't spill registers that mix 32-bit and 64-bit access (that
+          * contain 64-bit data that is operated on via 32-bit instructions)
+          */
+         unsigned type_size = type_sz(inst->dst.type);
+         if (reg_type_size[inst->dst.nr] == 0)
+            reg_type_size[inst->dst.nr] = type_size;
+         else if (reg_type_size[inst->dst.nr] != type_size)
+            no_spill[inst->dst.nr] = true;
+      }
+
+      switch (inst->opcode) {
+
+      case BRW_OPCODE_DO:
+         loop_scale *= 10;
+         break;
+
+      case BRW_OPCODE_WHILE:
+         loop_scale /= 10;
+         break;
+
+      case SHADER_OPCODE_GEN4_SCRATCH_READ:
+      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
+         for (int i = 0; i < 3; i++) {
+            if (inst->src[i].file == VGRF)
+               no_spill[inst->src[i].nr] = true;
+         }
+         if (inst->dst.file == VGRF)
+            no_spill[inst->dst.nr] = true;
+         break;
+
+      default:
+         break;
+      }
+   }
+
+   ralloc_free(reg_type_size);
+}
+
+int
+vec4_visitor::choose_spill_reg(struct ra_graph *g)
+{
+   const unsigned num_vgrfs = this->alloc.count;
+   float costs[num_vgrfs];
+   bool unspillable[num_vgrfs];
+   evaluate_spill_costs(costs, unspillable);
+
+   /* Only registers that may be spilled are given a cost in the graph. */
+   for (unsigned node = 0; node < num_vgrfs; node++) {
+      if (unspillable[node])
+         continue;
+      ra_set_node_spill_cost(g, node, costs[node]);
+   }
+   return ra_get_best_spill_node(g);
+}
+
+void
+vec4_visitor::spill_reg(int spill_reg_nr)
+{
+   assert(alloc.sizes[spill_reg_nr] == 1 || alloc.sizes[spill_reg_nr] == 2);
+   unsigned int spill_offset = last_scratch;
+   last_scratch += alloc.sizes[spill_reg_nr];
+
+   /* Generate spill/unspill instructions for the objects being spilled. */
+   int scratch_reg = -1;
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (unsigned int i = 0; i < 3; i++) {
+         if (inst->src[i].file == VGRF && inst->src[i].nr == spill_reg_nr) {
+            if (scratch_reg == -1 ||
+                !can_use_scratch_for_source(inst, i, scratch_reg)) {
+               /* We need to unspill anyway so make sure we read the full vec4
+                * in any case. This way, the cached register can be reused
+                * for consecutive instructions that read different channels of
+                * the same vec4.
+                */
+               scratch_reg = alloc.allocate(alloc.sizes[spill_reg_nr]);
+               src_reg temp = inst->src[i];
+               temp.nr = scratch_reg;
+               temp.offset = 0;
+               temp.swizzle = BRW_SWIZZLE_XYZW;
+               emit_scratch_read(block, inst,
+                                 dst_reg(temp), inst->src[i], spill_offset);
+            }
+            /* Redirect the source to the register holding the unspilled data. */
+            assert(scratch_reg != -1);
+            inst->src[i].nr = scratch_reg;
+         }
+      }
+
+      if (inst->dst.file == VGRF && inst->dst.nr == spill_reg_nr) {
+         emit_scratch_write(block, inst, spill_offset);
+         scratch_reg = inst->dst.nr;
+      }
+   }
+
+   invalidate_live_intervals();
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_surface_builder.cpp b/src/intel/compiler/brw_vec4_surface_builder.cpp
new file mode 100644
index 00000000000..00c94fedca2
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_surface_builder.cpp
@@ -0,0 +1,332 @@
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4_surface_builder.h"
+
+using namespace brw;
+
+namespace {
+   namespace array_utils {
+      /**
+       * Gather every \p src_stride-th logical component of \p src and place
+       * them at every \p dst_stride-th logical component of the result.
+       */
+      src_reg
+      emit_stride(const vec4_builder &bld, const src_reg &src, unsigned size,
+                  unsigned dst_stride, unsigned src_stride)
+      {
+         /* Unit strides on both sides: the argument is returned as is. */
+         if (dst_stride == 1 && src_stride == 1)
+            return src;
+
+         const unsigned num_regs = DIV_ROUND_UP(size * dst_stride, 4);
+         const dst_reg result = bld.vgrf(src.type, num_regs);
+
+         for (unsigned c = 0; c < size; ++c) {
+            const unsigned to = c * dst_stride, from = c * src_stride;
+            bld.MOV(writemask(offset(result, 8, to / 4), 1 << (to % 4)),
+                    swizzle(offset(src, 8, from / 4),
+                            brw_swizzle_for_mask(1 << (from % 4))));
+         }
+         return src_reg(result);
+      }
+
+      /**
+       * Pack the first \p n components of a VEC4 into the register layout
+       * expected by the recipient shared unit, zero-filling the remainder.
+       * With \p has_simd4x2 set the value stays in SIMD4x2 form, otherwise
+       * it is rearranged into a SIMD8 vector.
+       */
+      src_reg
+      emit_insert(const vec4_builder &bld, const src_reg &src,
+                  unsigned n, bool has_simd4x2)
+      {
+         /* Nothing to insert: return a default-constructed register. */
+         if (n == 0 || src.file == BAD_FILE)
+            return src_reg();
+
+         const unsigned used = (1 << n) - 1;
+         const dst_reg padded = bld.vgrf(src.type);
+         bld.MOV(writemask(padded, used), src);
+
+         /* Pad unused components with zeroes. */
+         if (n < 4)
+            bld.MOV(writemask(padded, ~used), brw_imm_d(0));
+
+         const unsigned dst_stride = has_simd4x2 ? 1 : 4;
+         return emit_stride(bld, src_reg(padded), n, dst_stride, 1);
+      }
+
+      /**
+       * Convert an array of registers back into a VEC4 according to the
+       * layout expected from some shared unit.  With \p has_simd4x2 the
+       * argument is returned unmodified, otherwise it is rearranged from SIMD8
+       * form.  A BAD_FILE \p src or zero \p n yields a default src_reg().
+       */
+      src_reg
+      emit_extract(const vec4_builder &bld, const src_reg &src,
+                   unsigned n, bool has_simd4x2)
+      {
+         if (src.file == BAD_FILE || n == 0) {
+            return src_reg();
+
+         } else {
+            return emit_stride(bld, src, n, 1, has_simd4x2 ? 1 : 4);
+         }
+      }
+   }
+}
+
+namespace brw {
+   namespace surface_access {
+      namespace {
+         using namespace array_utils;
+
+         /**
+          * Assemble the payload of a surface message (header, address and
+          * data, in that order), emit the send opcode and return the result.
+          */
+         src_reg
+         emit_send(const vec4_builder &bld, enum opcode op,
+                   const src_reg &header,
+                   const src_reg &addr, unsigned addr_sz,
+                   const src_reg &src, unsigned src_sz,
+                   const src_reg &surface,
+                   unsigned arg, unsigned ret_sz,
+                   brw_predicate pred = BRW_PREDICATE_NONE)
+         {
+            /* Register offsets of each section within the payload. */
+            const bool has_header = header.file != BAD_FILE;
+            const unsigned header_len = has_header ? 1 : 0;
+            const unsigned addr_base = header_len;
+            const unsigned data_base = addr_base + addr_sz;
+            const unsigned msg_len = data_base + src_sz;
+            const dst_reg payload = bld.vgrf(BRW_REGISTER_TYPE_UD, msg_len);
+
+            if (has_header)
+               bld.exec_all().MOV(offset(payload, 8, 0u),
+                                  retype(header, BRW_REGISTER_TYPE_UD));
+
+            for (unsigned r = 0; r < addr_sz; r++)
+               bld.MOV(offset(payload, 8, addr_base + r),
+                       offset(retype(addr, BRW_REGISTER_TYPE_UD), 8, r));
+
+            for (unsigned r = 0; r < src_sz; r++)
+               bld.MOV(offset(payload, 8, data_base + r),
+                       offset(retype(src, BRW_REGISTER_TYPE_UD), 8, r));
+
+            /* Reduce the dynamically uniform surface index to a single
+             * scalar.
+             */
+            const src_reg surf_idx = bld.emit_uniformize(surface);
+
+            /* Emit the message send instruction. */
+            const dst_reg result = bld.vgrf(BRW_REGISTER_TYPE_UD, ret_sz);
+            vec4_instruction *send =
+               bld.emit(op, result, src_reg(payload), surf_idx, brw_imm_ud(arg));
+            send->mlen = msg_len;
+            send->size_written = ret_sz * REG_SIZE;
+            send->header_size = header_len;
+            send->predicate = pred;
+
+            return src_reg(result);
+         }
+      }
+
+      /**
+       * Emit an untyped surface read opcode.  \p dims is the number of address
+       * components and \p size the number of components of the returned
+       * value.  The address is always passed to emit_insert() as SIMD4x2.
+       */
+      src_reg
+      emit_untyped_read(const vec4_builder &bld,
+                        const src_reg &surface, const src_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred)
+      {
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_READ, src_reg(),
+                          emit_insert(bld, addr, dims, true), 1,
+                          src_reg(), 0, /* no data payload */
+                          surface, size, 1, pred);
+      }
+
+      /**
+       * Emit an untyped surface write opcode.  \p dims is the number of address
+       * components and \p size the number of components of \p src.  Gen8+ and
+       * Haswell use SIMD4x2 payloads, anything else gets them spread to SIMD8.
+       */
+      void
+      emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
+                         const src_reg &addr, const src_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred)
+      {
+         const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+                                   bld.shader->devinfo->is_haswell);
+         emit_send(bld, SHADER_OPCODE_UNTYPED_SURFACE_WRITE, src_reg(),
+                   emit_insert(bld, addr, dims, has_simd4x2),
+                   has_simd4x2 ? 1 : dims,
+                   emit_insert(bld, src, size, has_simd4x2),
+                   has_simd4x2 ? 1 : size,
+                   surface, size, 0, pred); /* writes return nothing */
+      }
+
+      /**
+       * Emit an untyped surface atomic opcode \p op.  \p dims is the number of
+       * address components and \p rsize the number of returned components
+       * (either zero or one).  BAD_FILE sources are not counted as data.
+       */
+      src_reg
+      emit_untyped_atomic(const vec4_builder &bld,
+                          const src_reg &surface, const src_reg &addr,
+                          const src_reg &src0, const src_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred)
+      {
+         const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+                                   bld.shader->devinfo->is_haswell);
+
+         /* Zip the components of both sources, they are represented as the X
+          * and Y components of the same vector.
+          */
+         const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         if (size >= 1)
+            bld.MOV(writemask(srcs, WRITEMASK_X), src0);
+         if (size >= 2)
+            bld.MOV(writemask(srcs, WRITEMASK_Y), src1);
+
+         return emit_send(bld, SHADER_OPCODE_UNTYPED_ATOMIC, src_reg(),
+                          emit_insert(bld, addr, dims, has_simd4x2),
+                          has_simd4x2 ? 1 : dims,
+                          emit_insert(bld, src_reg(srcs), size, has_simd4x2),
+                          has_simd4x2 && size ? 1 : size, /* 0 if no sources */
+                          surface, op, rsize, pred);
+      }
+
+      namespace {
+         /**
+          * Build the header register that typed surface messages carry.
+          */
+         src_reg
+         emit_typed_message_header(const vec4_builder &bld)
+         {
+            const vec4_builder all_channels = bld.exec_all();
+            const dst_reg header = bld.vgrf(BRW_REGISTER_TYPE_UD);
+            const bool gen7_non_hsw = bld.shader->devinfo->gen == 7 &&
+                                      !bld.shader->devinfo->is_haswell;
+
+            all_channels.MOV(header, brw_imm_d(0));
+            if (gen7_non_hsw) {
+               /* The sample mask is used on IVB for the SIMD8 messages that
+                * have no SIMD4x2 variant.  We only use the two X channels
+                * in that case, mask everything else out.
+                */
+               all_channels.MOV(writemask(header, WRITEMASK_W), brw_imm_d(0x11));
+            }
+
+            return src_reg(header);
+         }
+      }
+
+      /**
+       * Emit a typed surface read opcode.  \p dims is the number of address
+       * components and \p size the number of components of the returned
+       * value, which is converted back to VEC4 layout with emit_extract().
+       */
+      src_reg
+      emit_typed_read(const vec4_builder &bld, const src_reg &surface,
+                      const src_reg &addr, unsigned dims, unsigned size)
+      {
+         const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+                                   bld.shader->devinfo->is_haswell);
+         const src_reg tmp =
+            emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_READ,
+                      emit_typed_message_header(bld),
+                      emit_insert(bld, addr, dims, has_simd4x2),
+                      has_simd4x2 ? 1 : dims,
+                      src_reg(), 0, /* no data payload */
+                      surface, size,
+                      has_simd4x2 ? 1 : size);
+
+         return emit_extract(bld, tmp, size, has_simd4x2);
+      }
+
+      /**
+       * Emit a typed surface write opcode.  \p dims is the number of address
+       * components and \p size the number of components of \p src.  The
+       * message carries the header built by emit_typed_message_header().
+       */
+      void
+      emit_typed_write(const vec4_builder &bld, const src_reg &surface,
+                       const src_reg &addr, const src_reg &src,
+                       unsigned dims, unsigned size)
+      {
+         const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+                                   bld.shader->devinfo->is_haswell);
+         emit_send(bld, SHADER_OPCODE_TYPED_SURFACE_WRITE,
+                   emit_typed_message_header(bld),
+                   emit_insert(bld, addr, dims, has_simd4x2),
+                   has_simd4x2 ? 1 : dims,
+                   emit_insert(bld, src, size, has_simd4x2),
+                   has_simd4x2 ? 1 : size,
+                   surface, size, 0); /* writes return nothing */
+      }
+
+      /**
+       * Emit a typed surface atomic opcode \p op.  \p dims is the number of
+       * address components and \p rsize the number of returned components
+       * (either zero or one).  BAD_FILE sources are not counted as data.
+       */
+      src_reg
+      emit_typed_atomic(const vec4_builder &bld,
+                        const src_reg &surface, const src_reg &addr,
+                        const src_reg &src0, const src_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred)
+      {
+         const bool has_simd4x2 = (bld.shader->devinfo->gen >= 8 ||
+                                   bld.shader->devinfo->is_haswell);
+
+         /* Zip the components of both sources, they are represented as the X
+          * and Y components of the same vector.
+          */
+         const unsigned size = (src0.file != BAD_FILE) + (src1.file != BAD_FILE);
+         const dst_reg srcs = bld.vgrf(BRW_REGISTER_TYPE_UD);
+
+         if (size >= 1)
+            bld.MOV(writemask(srcs, WRITEMASK_X), src0);
+         if (size >= 2)
+            bld.MOV(writemask(srcs, WRITEMASK_Y), src1);
+         /* No sources means no data registers, even for SIMD4x2 payloads. */
+         return emit_send(bld, SHADER_OPCODE_TYPED_ATOMIC,
+                          emit_typed_message_header(bld),
+                          emit_insert(bld, addr, dims, has_simd4x2),
+                          has_simd4x2 ? 1 : dims,
+                          emit_insert(bld, src_reg(srcs), size, has_simd4x2),
+                          (has_simd4x2 && size) ? 1 : size,
+                          surface, op, rsize, pred);
+      }
+   }
+}
diff --git a/src/intel/compiler/brw_vec4_surface_builder.h b/src/intel/compiler/brw_vec4_surface_builder.h
new file mode 100644
index 00000000000..6e61c0fce9b
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_surface_builder.h
@@ -0,0 +1,69 @@
+/* -*- c++ -*- */
+/*
+ * Copyright © 2013-2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_SURFACE_BUILDER_H
+#define BRW_VEC4_SURFACE_BUILDER_H
+
+#include "brw_vec4_builder.h"
+
+namespace brw {
+   namespace surface_access {
+      src_reg
+      emit_untyped_read(const vec4_builder &bld,
+                        const src_reg &surface, const src_reg &addr,
+                        unsigned dims, unsigned size,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+
+      void
+      emit_untyped_write(const vec4_builder &bld, const src_reg &surface,
+                         const src_reg &addr, const src_reg &src,
+                         unsigned dims, unsigned size,
+                         brw_predicate pred = BRW_PREDICATE_NONE);
+
+      src_reg
+      emit_untyped_atomic(const vec4_builder &bld,
+                          const src_reg &surface, const src_reg &addr,
+                          const src_reg &src0, const src_reg &src1,
+                          unsigned dims, unsigned rsize, unsigned op,
+                          brw_predicate pred = BRW_PREDICATE_NONE);
+
+      src_reg
+      emit_typed_read(const vec4_builder &bld, const src_reg &surface,
+                      const src_reg &addr, unsigned dims, unsigned size);
+
+      void
+      emit_typed_write(const vec4_builder &bld, const src_reg &surface,
+                       const src_reg &addr, const src_reg &src,
+                       unsigned dims, unsigned size);
+
+      src_reg
+      emit_typed_atomic(const vec4_builder &bld, const src_reg &surface,
+                        const src_reg &addr,
+                        const src_reg &src0, const src_reg &src1,
+                        unsigned dims, unsigned rsize, unsigned op,
+                        brw_predicate pred = BRW_PREDICATE_NONE);
+   } /* namespace surface_access */
+} /* namespace brw */
+
+#endif
diff --git a/src/intel/compiler/brw_vec4_tcs.cpp b/src/intel/compiler/brw_vec4_tcs.cpp
new file mode 100644
index 00000000000..d4a647d029f
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tcs.cpp
@@ -0,0 +1,516 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.cpp
+ *
+ * Tessellation control shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_nir.h"
+#include "brw_vec4_tcs.h"
+#include "brw_fs.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+vec4_tcs_visitor::vec4_tcs_visitor(const struct brw_compiler *compiler,
+                                   void *log_data,
+                                   const struct brw_tcs_prog_key *key,
+                                   struct brw_tcs_prog_data *prog_data,
+                                   const nir_shader *nir,
+                                   void *mem_ctx,
+                                   int shader_time_index,
+                                   const struct brw_vue_map *input_vue_map)
+   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+                  nir, mem_ctx, false, shader_time_index), /* NOTE(review): meaning of 'false' is defined by vec4_visitor's ctor -- confirm */
+     input_vue_map(input_vue_map), key(key) /* both pointers are stored, not copied */
+{ /* Nothing else to do: all state is set by the initializers above. */
+}
+
+
+void
+vec4_tcs_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{ /* Empty override: performs no setup, whatever intrinsic is passed in. */
+}
+
+dst_reg *
+vec4_tcs_visitor::make_reg_for_system_value(int location)
+{ /* Stub override: 'location' is ignored. */
+   return NULL; /* this override never creates a register */
+}
+
+
+void
+vec4_tcs_visitor::setup_payload()
+{
+   /* r0 is always part of the payload; it carries the URB handles that
+    * are passed on to the URB write at the end of the thread.
+    */
+   const int header_regs = 1;
+
+   /* r1.0 - r4.7 may hold the input control point URB handles that are
+    * used to pull vertex data.
+    */
+   const int icp_handle_regs = 4;
+
+   /* Push constants may start at r5.0, directly after the fixed registers
+    * counted above; whatever setup_uniforms() returns becomes the first
+    * non-payload GRF.
+    */
+   const int after_uniforms = setup_uniforms(header_regs + icp_handle_regs);
+
+   this->first_non_payload_grf = after_uniforms;
+}
+
+
+void
+vec4_tcs_visitor::emit_prolog()
+{
+   invocation_id = src_reg(this, glsl_type::uint_type);
+   emit(TCS_OPCODE_GET_INSTANCE_ID, dst_reg(invocation_id));
+
+   /* Only an odd output vertex count needs the extra predication below. */
+   if ((nir->info->tess.tcs_vertices_out % 2) == 0)
+      return;
+
+   /* HS threads are dispatched with the dispatch mask set to 0xFF, so with
+    * an odd vertex count only the bottom half of the last instance does
+    * real work.  Compare invocation_id against the vertex count and open
+    * an IF on the result; emit_thread_end() emits the matching ENDIF.
+    */
+   const unsigned vertices_out = nir->info->tess.tcs_vertices_out;
+   emit(CMP(dst_null_d(), invocation_id, brw_imm_ud(vertices_out),
+            BRW_CONDITIONAL_L));
+   emit(IF(BRW_PREDICATE_NORMAL));
+}
+
+
+void
+vec4_tcs_visitor::emit_thread_end()
+{ /* Emits the end-of-thread sequence; counterpart of emit_prolog(). */
+   vec4_instruction *inst;
+   current_annotation = "thread end";
+
+   if (nir->info->tess.tcs_vertices_out % 2) { /* same test as in emit_prolog() */
+      emit(BRW_OPCODE_ENDIF); /* closes the IF opened in emit_prolog() */
+   }
+
+   if (devinfo->gen == 7) { /* the input release below is emitted for gen 7 only */
+      struct brw_tcs_prog_data *tcs_prog_data =
+         (struct brw_tcs_prog_data *) prog_data;
+
+      current_annotation = "release input vertices";
+
+      /* Synchronize all threads, so we know that no one is still
+       * using the input URB handles.
+       */
+      if (tcs_prog_data->instances > 1) { /* barrier is skipped for a single instance */
+         dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+         emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+         emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      }
+
+      /* Make thread 0 (invocations <1, 0>) release pairs of ICP handles.
+       * We want to compare the bottom half of invocation_id with 0, but
+       * use that truth value for the top half as well.  Unfortunately,
+       * we don't have stride in the vec4 world, nor UV immediates in
+       * align16, so we need an opcode to get invocation_id<0,4,0>.
+       */
+      set_condmod(BRW_CONDITIONAL_Z,
+                  emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
+                       invocation_id));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      for (unsigned i = 0; i < key->input_vertices; i += 2) { /* two handles per step */
+         /* If we have an odd number of input vertices, the last will be
+          * unpaired.  We don't want to use an interleaved URB write in
+          * that case.
+          */
+         const bool is_unpaired = i == key->input_vertices - 1;
+
+         dst_reg header(this, glsl_type::uvec4_type);
+         emit(TCS_OPCODE_RELEASE_INPUT, header, brw_imm_ud(i),
+              brw_imm_ud(is_unpaired));
+      }
+      emit(BRW_OPCODE_ENDIF);
+   }
+
+   if (unlikely(INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   inst = emit(TCS_OPCODE_THREAD_END);
+   inst->base_mrf = 14; /* NOTE(review): fixed MRF 14 -- rationale not visible in this file */
+   inst->mlen = 2;
+}
+
+
+void
+vec4_tcs_visitor::emit_input_urb_read(const dst_reg &dst,
+                                      const src_reg &vertex_index,
+                                      unsigned base_offset,
+                                      unsigned first_component,
+                                      const src_reg &indirect_offset)
+{ /* Reads one input slot into a temporary and copies it into dst. */
+   vec4_instruction *inst;
+   dst_reg temp(this, glsl_type::ivec4_type);
+   temp.type = dst.type; /* the temporary takes on the destination's type */
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_INPUT_URB_OFFSETS, header, vertex_index,
+               indirect_offset);
+   inst->force_writemask_all = true;
+
+   /* Read into a temporary, ignoring writemasking. */
+   inst = emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+   inst->offset = base_offset;
+   inst->mlen = 1; /* the message consists of the header register only */
+   inst->base_mrf = -1;
+
+   /* Copy the temporary to the destination to deal with writemasking.
+    *
+    * Also attempt to deal with gl_PointSize being in the .w component.
+    */
+   if (inst->offset == 0 && indirect_offset.file == BAD_FILE) {
+      emit(MOV(dst, swizzle(src_reg(temp), BRW_SWIZZLE_WWWW))); /* first_component is not consulted on this path */
+   } else {
+      src_reg src = src_reg(temp);
+      src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+      emit(MOV(dst, src));
+   }
+}
+
+void
+vec4_tcs_visitor::emit_output_urb_read(const dst_reg &dst,
+                                       unsigned base_offset,
+                                       unsigned first_component,
+                                       const src_reg &indirect_offset)
+{ /* Reads one output slot back from the URB into dst. */
+   vec4_instruction *inst;
+
+   /* Set up the message header to reference the proper parts of the URB */
+   dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+   inst = emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, header,
+               brw_imm_ud(dst.writemask << first_component), indirect_offset);
+   inst->force_writemask_all = true;
+
+   vec4_instruction *read = emit(VEC4_OPCODE_URB_READ, dst, src_reg(header));
+   read->offset = base_offset;
+   read->mlen = 1; /* the message consists of the header register only */
+   read->base_mrf = -1;
+
+   if (first_component) { /* non-zero start: the read is redirected after being emitted */
+      /* Read into a temporary and copy with a swizzle and writemask. */
+      read->dst = retype(dst_reg(this, glsl_type::ivec4_type), dst.type);
+      emit(MOV(dst, swizzle(src_reg(read->dst),
+                            BRW_SWZ_COMP_INPUT(first_component))));
+   }
+}
+
+void
+vec4_tcs_visitor::emit_urb_write(const src_reg &value,
+                                 unsigned writemask,
+                                 unsigned base_offset,
+                                 const src_reg &indirect_offset)
+{
+   if (!writemask)
+      return;
+
+   /* First payload register: URB offsets; data sits REG_SIZE bytes later. */
+   src_reg payload(this, glsl_type::uvec4_type, 2);
+
+   vec4_instruction *set_offsets =
+      emit(TCS_OPCODE_SET_OUTPUT_URB_OFFSETS, dst_reg(payload),
+           brw_imm_ud(writemask), indirect_offset);
+   set_offsets->force_writemask_all = true;
+   const dst_reg data = dst_reg(retype(payload, value.type));
+   emit(MOV(byte_offset(data, REG_SIZE), value))->force_writemask_all = true;
+
+   vec4_instruction *write = emit(TCS_OPCODE_URB_WRITE, dst_null_f(), payload);
+   write->offset = base_offset;
+   write->mlen = 2;
+   write->base_mrf = -1;
+}
+
+void
+vec4_tcs_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{ /* Handles TCS-specific intrinsics; anything else goes to vec4_visitor. */
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_invocation_id: /* value was fetched in emit_prolog() */
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD),
+               invocation_id));
+      break;
+   case nir_intrinsic_load_primitive_id:
+      emit(TCS_OPCODE_GET_PRIMITIVE_ID,
+           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+      break;
+   case nir_intrinsic_load_patch_vertices_in: /* a compile-time constant from the key */
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D),
+               brw_imm_d(key->input_vertices)));
+      break;
+   case nir_intrinsic_load_per_vertex_input: {
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0]; /* becomes the URB read's offset */
+
+      nir_const_value *vertex_const = nir_src_as_const_value(instr->src[0]);
+      src_reg vertex_index = /* immediate when the vertex index is constant */
+         vertex_const ? src_reg(brw_imm_ud(vertex_const->u32[0]))
+                      : get_nir_src(instr->src[0], BRW_REGISTER_TYPE_UD, 1);
+
+      unsigned first_component = nir_intrinsic_component(instr);
+      if (nir_dest_bit_size(instr->dest) == 64) {
+         /* We need to emit up to two 32-bit URB reads, then shuffle
+          * the result into a temporary, then move to the destination
+          * honoring the writemask
+          *
+          * We don't need to divide first_component by 2 because
+          * emit_input_urb_read takes a 32-bit type.
+          */
+         dst_reg tmp = dst_reg(this, glsl_type::dvec4_type);
+         dst_reg tmp_d = retype(tmp, BRW_REGISTER_TYPE_D);
+         emit_input_urb_read(tmp_d, vertex_index, imm_offset,
+                             first_component, indirect_offset);
+         if (instr->num_components > 2) { /* second read, from the next offset */
+            emit_input_urb_read(byte_offset(tmp_d, REG_SIZE), vertex_index,
+                                imm_offset + 1, 0, indirect_offset);
+         }
+
+         src_reg tmp_src = retype(src_reg(tmp_d), BRW_REGISTER_TYPE_DF);
+         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(shuffled, tmp_src, false);
+
+         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
+         dst.writemask = brw_writemask_for_size(instr->num_components);
+         emit(MOV(dst, src_reg(shuffled)));
+      } else {
+         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+         dst.writemask = brw_writemask_for_size(instr->num_components);
+         emit_input_urb_read(dst, vertex_index, imm_offset,
+                             first_component, indirect_offset);
+      }
+      break;
+   }
+   case nir_intrinsic_load_input:
+      unreachable("nir_lower_io should use load_per_vertex_input intrinsics");
+      break;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output: { /* both forms share one read path */
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+      dst.writemask = brw_writemask_for_size(instr->num_components);
+
+      emit_output_urb_read(dst, imm_offset, nir_intrinsic_component(instr),
+                           indirect_offset);
+      break;
+   }
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output: { /* both forms share one write path */
+      src_reg value = get_nir_src(instr->src[0]);
+      unsigned mask = instr->const_index[1]; /* used below as the URB write mask */
+      unsigned swiz = BRW_SWIZZLE_XYZW;
+
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+
+      unsigned first_component = nir_intrinsic_component(instr);
+      if (first_component) {
+         if (nir_src_bit_size(instr->src[0]) == 64)
+            first_component /= 2; /* halved for 64-bit sources */
+         assert(swiz == BRW_SWIZZLE_XYZW);
+         swiz = BRW_SWZ_COMP_OUTPUT(first_component);
+         mask = mask << first_component;
+      }
+
+      if (nir_src_bit_size(instr->src[0]) == 64) {
+         /* For 64-bit data we need to shuffle the data before we write and
+          * emit two messages. Also, since each channel is twice as large we
+          * need to fix the writemask in each 32-bit message to account for it.
+          */
+         value = swizzle(retype(value, BRW_REGISTER_TYPE_DF), swiz);
+         dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(shuffled, value, true);
+         src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
+
+         for (int n = 0; n < 2; n++) { /* one message per pair of 64-bit channels */
+            unsigned fixed_mask = 0;
+            if (mask & WRITEMASK_X)
+               fixed_mask |= WRITEMASK_XY;
+            if (mask & WRITEMASK_Y)
+               fixed_mask |= WRITEMASK_ZW;
+            emit_urb_write(shuffled_float, fixed_mask, /* emits nothing if the mask is 0 */
+                           imm_offset, indirect_offset);
+
+            shuffled_float = byte_offset(shuffled_float, REG_SIZE);
+            mask >>= 2; /* move on to the next two channels */
+            imm_offset++;
+         }
+      } else {
+         emit_urb_write(swizzle(value, swiz), mask,
+                        imm_offset, indirect_offset);
+      }
+      break;
+   }
+
+   case nir_intrinsic_barrier: { /* same header + barrier pair as in emit_thread_end() */
+      dst_reg header = dst_reg(this, glsl_type::uvec4_type);
+      emit(TCS_OPCODE_CREATE_BARRIER_HEADER, header);
+      emit(SHADER_OPCODE_BARRIER, dst_null_ud(), src_reg(header));
+      break;
+   }
+
+   default: /* not TCS-specific: defer to the generic vec4 handling */
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+
+
+extern "C" const unsigned *
+brw_compile_tcs(const struct brw_compiler *compiler,
+                void *log_data,
+                void *mem_ctx,
+                const struct brw_tcs_prog_key *key,
+                struct brw_tcs_prog_data *prog_data,
+                const nir_shader *src_shader,
+                int shader_time_index,
+                unsigned *final_assembly_size,
+                char **error_str)
+{ /* Compiles a TCS; returns the generated assembly, or NULL on failure. */
+   const struct gen_device_info *devinfo = compiler->devinfo;
+   struct brw_vue_prog_data *vue_prog_data = &prog_data->base;
+   const bool is_scalar = compiler->scalar_stage[MESA_SHADER_TESS_CTRL];
+
+   nir_shader *nir = nir_shader_clone(mem_ctx, src_shader); /* src_shader itself is const */
+   nir->info->outputs_written = key->outputs_written; /* the key overrides the clone's masks */
+   nir->info->patch_outputs_written = key->patch_outputs_written;
+
+   struct brw_vue_map input_vue_map;
+   brw_compute_vue_map(devinfo, &input_vue_map, nir->info->inputs_read,
+                       nir->info->separate_shader);
+   brw_compute_tess_vue_map(&vue_prog_data->vue_map,
+                            nir->info->outputs_written,
+                            nir->info->patch_outputs_written);
+
+   nir = brw_nir_apply_sampler_key(nir, compiler, &key->tex, is_scalar);
+   brw_nir_lower_vue_inputs(nir, is_scalar, &input_vue_map);
+   brw_nir_lower_tcs_outputs(nir, &vue_prog_data->vue_map,
+                             key->tes_primitive_mode);
+   if (key->quads_workaround)
+      brw_nir_apply_tcs_quads_workaround(nir);
+
+   nir = brw_postprocess_nir(nir, compiler, is_scalar);
+
+   if (is_scalar) /* scalar path: up to 8 output vertices per instance */
+      prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 8);
+   else /* vec4 path: up to 2 output vertices per instance */
+      prog_data->instances = DIV_ROUND_UP(nir->info->tess.tcs_vertices_out, 2);
+
+   /* Compute URB entry size.  The maximum allowed URB entry size is 32k.
+    * That divides up as follows:
+    *
+    *     32 bytes for the patch header (tessellation factors)
+    *    480 bytes for per-patch varyings (a varying component is 4 bytes and
+    *              gl_MaxTessPatchComponents = 120)
+    *  16384 bytes for per-vertex varyings (a varying component is 4 bytes,
+    *              gl_MaxPatchVertices = 32 and
+    *              gl_MaxTessControlOutputComponents = 128)
+    *
+    *  15808 bytes left for varying packing overhead
+    */
+   const int num_per_patch_slots = vue_prog_data->vue_map.num_per_patch_slots;
+   const int num_per_vertex_slots = vue_prog_data->vue_map.num_per_vertex_slots;
+   unsigned output_size_bytes = 0;
+   /* Note that the patch header is counted in num_per_patch_slots. */
+   output_size_bytes += num_per_patch_slots * 16; /* each slot counts as 16 bytes */
+   output_size_bytes += nir->info->tess.tcs_vertices_out *
+                        num_per_vertex_slots * 16;
+
+   assert(output_size_bytes >= 1);
+   if (output_size_bytes > GEN7_MAX_HS_URB_ENTRY_SIZE_BYTES)
+      return NULL; /* NOTE(review): unlike the failures below, *error_str is left untouched here */
+
+   /* URB entry sizes are stored as a multiple of 64 bytes. */
+   vue_prog_data->urb_entry_size = ALIGN(output_size_bytes, 64) / 64;
+
+   /* HS does not use the usual payload pushing from URB to GRFs,
+    * because we don't have enough registers for a full-size payload, and
+    * the hardware is broken on Haswell anyway.
+    */
+   vue_prog_data->urb_read_length = 0;
+
+   if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+      fprintf(stderr, "TCS Input ");
+      brw_print_vue_map(stderr, &input_vue_map);
+      fprintf(stderr, "TCS Output ");
+      brw_print_vue_map(stderr, &vue_prog_data->vue_map);
+   }
+
+   if (is_scalar) { /* scalar back-end: fs_visitor + fs_generator at width 8 */
+      fs_visitor v(compiler, log_data, mem_ctx, (void *) key,
+                   &prog_data->base.base, NULL, nir, 8,
+                   shader_time_index, &input_vue_map);
+      if (!v.run_tcs_single_patch()) {
+         if (error_str) /* error_str is optional */
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      prog_data->base.base.dispatch_grf_start_reg = v.payload.num_regs;
+      prog_data->base.dispatch_mode = DISPATCH_MODE_SIMD8;
+
+      fs_generator g(compiler, log_data, mem_ctx, (void *) key,
+                     &prog_data->base.base, v.promoted_constants, false,
+                     MESA_SHADER_TESS_CTRL);
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS)) {
+         g.enable_debug(ralloc_asprintf(mem_ctx,
+                                        "%s tessellation control shader %s",
+                                        nir->info->label ? nir->info->label
+                                                        : "unnamed",
+                                        nir->info->name));
+      }
+
+      g.generate_code(v.cfg, 8);
+
+      return g.get_assembly(final_assembly_size);
+   } else { /* vec4 back-end: the visitor defined in this file */
+      vec4_tcs_visitor v(compiler, log_data, key, prog_data,
+                         nir, mem_ctx, shader_time_index, &input_vue_map);
+      if (!v.run()) {
+         if (error_str) /* error_str is optional */
+            *error_str = ralloc_strdup(mem_ctx, v.fail_msg);
+         return NULL;
+      }
+
+      if (unlikely(INTEL_DEBUG & DEBUG_TCS))
+         v.dump_instructions();
+
+
+      return brw_vec4_generate_assembly(compiler, log_data, mem_ctx, nir,
+                                        &prog_data->base, v.cfg,
+                                        final_assembly_size);
+   }
+}
+
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_tcs.h b/src/intel/compiler/brw_vec4_tcs.h
new file mode 100644
index 00000000000..030eb5e6603
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tcs.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tcs.h
+ *
+ * The vec4-mode tessellation control shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TCS_H
+#define BRW_VEC4_TCS_H
+
+#include "brw_compiler.h"
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+class vec4_tcs_visitor : public vec4_visitor
+{ /* vec4_visitor specialisation used for tessellation control shaders. */
+public:
+   vec4_tcs_visitor(const struct brw_compiler *compiler,
+                    void *log_data,
+                    const struct brw_tcs_prog_key *key,
+                    struct brw_tcs_prog_data *prog_data,
+                    const nir_shader *nir,
+                    void *mem_ctx,
+                    int shader_time_index,
+                    const struct brw_vue_map *input_vue_map);
+
+protected:
+   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr);
+
+   void emit_input_urb_read(const dst_reg &dst,
+                            const src_reg &vertex_index,
+                            unsigned base_offset,
+                            unsigned first_component,
+                            const src_reg &indirect_offset);
+   void emit_output_urb_read(const dst_reg &dst,
+                             unsigned base_offset,
+                             unsigned first_component,
+                             const src_reg &indirect_offset);
+
+   void emit_urb_write(const src_reg &value, unsigned writemask,
+                       unsigned base_offset, const src_reg &indirect_offset);
+
+   /* The TCS does not use the normal end-of-shader URB write mechanism, but
+    * every vec4 stage must implement these two hooks, so they are no-ops here:
+    */
+   virtual void emit_urb_write_header(int mrf) {}
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete) { return NULL; }
+
+   const struct brw_vue_map *input_vue_map; /* pointer handed to the constructor */
+
+   const struct brw_tcs_prog_key *key; /* pointer handed to the constructor */
+   src_reg invocation_id; /* written by emit_prolog() */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TCS_H */
diff --git a/src/intel/compiler/brw_vec4_tes.cpp b/src/intel/compiler/brw_vec4_tes.cpp
new file mode 100644
index 00000000000..bcf9a87eb01
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tes.cpp
@@ -0,0 +1,296 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tes.cpp
+ *
+ * Tessellation evaluation shader specific code derived from the vec4_visitor class.
+ */
+
+#include "brw_vec4_tes.h"
+#include "brw_cfg.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+vec4_tes_visitor::vec4_tes_visitor(const struct brw_compiler *compiler,
+                                  void *log_data,
+                                  const struct brw_tes_prog_key *key,
+                                  struct brw_tes_prog_data *prog_data,
+                                  const nir_shader *shader,
+                                  void *mem_ctx,
+                                  int shader_time_index)
+   : vec4_visitor(compiler, log_data, &key->tex, &prog_data->base,
+                  shader, mem_ctx, false, shader_time_index) /* NOTE(review): meaning of 'false' is defined by vec4_visitor's ctor -- confirm */
+{ /* Nothing else to do: everything is forwarded to the base class. */
+}
+
+
+dst_reg *
+vec4_tes_visitor::make_reg_for_system_value(int location)
+{ /* Stub override: 'location' is ignored. */
+   return NULL; /* this override never creates a register */
+}
+
+void
+vec4_tes_visitor::nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr)
+{
+   /* Tess levels are skipped here; all other intrinsics use the base class. */
+   const bool is_tess_level =
+      instr->intrinsic == nir_intrinsic_load_tess_level_outer ||
+      instr->intrinsic == nir_intrinsic_load_tess_level_inner;
+
+   if (!is_tess_level)
+      vec4_visitor::nir_setup_system_value_intrinsic(instr);
+}
+
+
+void
+vec4_tes_visitor::setup_payload()
+{
+   int reg = 0;
+
+   /* The payload always contains important data in r0 and r1, which contain
+    * the URB handles that are passed on to the URB write at the end
+    * of the thread.
+    */
+   reg += 2;
+
+   reg = setup_uniforms(reg);
+
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      for (int i = 0; i < 3; i++) { /* rewrite every ATTR source as a fixed GRF */
+         if (inst->src[i].file != ATTR)
+            continue;
+
+         bool is_64bit = type_sz(inst->src[i].type) == 8;
+
+         unsigned slot = inst->src[i].nr + inst->src[i].offset / 16;
+         struct brw_reg grf = brw_vec4_grf(reg + slot / 2, 4 * (slot % 2)); /* two slots per GRF */
+         grf = stride(grf, 0, is_64bit ? 2 : 4, 1);
+         grf.swizzle = inst->src[i].swizzle;
+         grf.type = inst->src[i].type;
+         grf.abs = inst->src[i].abs;
+         grf.negate = inst->src[i].negate;
+
+         /* For 64-bit attributes we can end up with components XY in the
+          * second half of a register and components ZW in the first half
+          * of the next. Fix it up here.
+          */
+         if (is_64bit && grf.subnr > 0) {
+            /* The swizzle must use exactly one of the XY / ZW halves: the
+             * ZZZZ subtraction below is only valid when every channel is Z
+             * or W.  Mixed swizzles are the scalarization pass's job.
+             */
+            assert(!(brw_mask_for_swizzle(grf.swizzle) & 0x3) !=
+                   !(brw_mask_for_swizzle(grf.swizzle) & 0xc));
+            if (brw_mask_for_swizzle(grf.swizzle) & 0xc) {
+               grf.subnr = 0;
+               grf.nr++; /* ZW live at the start of the next register */
+               grf.swizzle -= BRW_SWIZZLE_ZZZZ; /* maps Z->X and W->Y in every channel */
+            }
+         }
+
+         inst->src[i] = grf;
+      }
+   }
+
+   reg += 8 * prog_data->urb_read_length; /* NOTE(review): factor 8 is not explained here -- confirm the units */
+
+   this->first_non_payload_grf = reg;
+}
+
+
+void
+vec4_tes_visitor::emit_prolog()
+{ /* Builds the input read header once; nir_emit_intrinsic() reuses it. */
+   input_read_header = src_reg(this, glsl_type::uvec4_type);
+   emit(TES_OPCODE_CREATE_INPUT_READ_HEADER, dst_reg(input_read_header));
+
+   this->current_annotation = NULL; /* reset the annotation for the code that follows */
+}
+
+
+void
+vec4_tes_visitor::emit_urb_write_header(int mrf)
+{
+   /* Nothing needs to be emitted for DS; an implied write to this MRF will
+    * be performed by VS_OPCODE_URB_WRITE.
+    */
+   (void) mrf; /* the parameter is deliberately unused */
+}
+
+
+vec4_instruction *
+vec4_tes_visitor::emit_urb_write_opcode(bool complete)
+{
+   /* For DS the completing URB write ends the thread: time it first. */
+   if (complete && (INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   vec4_instruction *urb_write = emit(VS_OPCODE_URB_WRITE);
+   if (complete)
+      urb_write->urb_write_flags = BRW_URB_WRITE_EOT_COMPLETE;
+   else
+      urb_write->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+
+   return urb_write;
+}
+
+void
+vec4_tes_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr)
+{ /* Handles TES-specific intrinsics; anything else goes to vec4_visitor. */
+   const struct brw_tes_prog_data *tes_prog_data =
+      (const struct brw_tes_prog_data *) prog_data;
+
+   switch (instr->intrinsic) {
+   case nir_intrinsic_load_tess_coord:
+      /* gl_TessCoord is part of the payload in g1 channels 0-2 and 4-6. */
+      emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+               src_reg(brw_vec8_grf(1, 0))));
+      break;
+   case nir_intrinsic_load_tess_level_outer: /* read from ATTR 1; swizzle depends on the domain */
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_ISOLINE) {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_ZWZW)));
+      } else {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 1, glsl_type::vec4_type),
+                          BRW_SWIZZLE_WZYX)));
+      }
+      break;
+   case nir_intrinsic_load_tess_level_inner: /* ATTR 0 for quads, ATTR 1 otherwise */
+      if (tes_prog_data->domain == BRW_TESS_DOMAIN_QUAD) {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  swizzle(src_reg(ATTR, 0, glsl_type::vec4_type),
+                          BRW_SWIZZLE_WZYX)));
+      } else {
+         emit(MOV(get_nir_dest(instr->dest, BRW_REGISTER_TYPE_F),
+                  src_reg(ATTR, 1, glsl_type::float_type)));
+      }
+      break;
+   case nir_intrinsic_load_primitive_id:
+      emit(TES_OPCODE_GET_PRIMITIVE_ID,
+           get_nir_dest(instr->dest, BRW_REGISTER_TYPE_UD));
+      break;
+
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_per_vertex_input: { /* both forms share one path */
+      src_reg indirect_offset = get_indirect_offset(instr);
+      unsigned imm_offset = instr->const_index[0];
+      src_reg header = input_read_header; /* header built in emit_prolog() */
+      bool is_64bit = nir_dest_bit_size(instr->dest) == 64;
+      unsigned first_component = nir_intrinsic_component(instr);
+      if (is_64bit)
+         first_component /= 2; /* halved for 64-bit destinations */
+
+      if (indirect_offset.file != BAD_FILE) { /* indirect: needs its own header */
+         header = src_reg(this, glsl_type::uvec4_type);
+         emit(TES_OPCODE_ADD_INDIRECT_URB_OFFSET, dst_reg(header),
+              input_read_header, indirect_offset);
+      } else {
+         /* Arbitrarily only push up to 24 vec4 slots worth of data,
+          * which is 12 registers (since each holds 2 vec4 slots).
+          */
+         const unsigned max_push_slots = 24;
+         if (imm_offset < max_push_slots) {
+            const glsl_type *src_glsl_type =
+               is_64bit ? glsl_type::dvec4_type : glsl_type::ivec4_type;
+            src_reg src = src_reg(ATTR, imm_offset, src_glsl_type);
+            src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+
+            const brw_reg_type dst_reg_type =
+               is_64bit ? BRW_REGISTER_TYPE_DF : BRW_REGISTER_TYPE_D;
+            emit(MOV(get_nir_dest(instr->dest, dst_reg_type), src));
+
+            prog_data->urb_read_length = /* grow the read length to cover this slot */
+               MAX2(prog_data->urb_read_length,
+                    DIV_ROUND_UP(imm_offset + (is_64bit ? 2 : 1), 2));
+            break; /* pushed input handled; the URB reads below are skipped */
+         }
+      }
+
+      if (!is_64bit) {
+         dst_reg temp(this, glsl_type::ivec4_type);
+         vec4_instruction *read =
+            emit(VEC4_OPCODE_URB_READ, temp, src_reg(header));
+         read->offset = imm_offset;
+         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+         src_reg src = src_reg(temp);
+         src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+
+         /* Copy to target.  We might end up with some funky writemasks landing
+          * in here, but we really don't want them in the above pseudo-ops.
+          */
+         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_D);
+         dst.writemask = brw_writemask_for_size(instr->num_components);
+         emit(MOV(dst, src));
+      } else {
+         /* For 64-bit we need to load twice as many 32-bit components, and for
+          * dvec3/4 we need to emit 2 URB Read messages
+          */
+         dst_reg temp(this, glsl_type::dvec4_type);
+         dst_reg temp_d = retype(temp, BRW_REGISTER_TYPE_D);
+
+         vec4_instruction *read =
+            emit(VEC4_OPCODE_URB_READ, temp_d, src_reg(header));
+         read->offset = imm_offset;
+         read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+
+         if (instr->num_components > 2) { /* second read, from the next offset */
+            read = emit(VEC4_OPCODE_URB_READ, byte_offset(temp_d, REG_SIZE),
+                        src_reg(header));
+            read->offset = imm_offset + 1;
+            read->urb_write_flags = BRW_URB_WRITE_PER_SLOT_OFFSET;
+         }
+
+         src_reg temp_as_src = src_reg(temp);
+         temp_as_src.swizzle = BRW_SWZ_COMP_INPUT(first_component);
+
+         dst_reg shuffled(this, glsl_type::dvec4_type);
+         shuffle_64bit_data(shuffled, temp_as_src, false);
+
+         dst_reg dst = get_nir_dest(instr->dest, BRW_REGISTER_TYPE_DF);
+         dst.writemask = brw_writemask_for_size(instr->num_components);
+         emit(MOV(dst, src_reg(shuffled)));
+      }
+      break;
+   }
+   default: /* not TES-specific: defer to the generic vec4 handling */
+      vec4_visitor::nir_emit_intrinsic(instr);
+   }
+}
+
+
+void
+vec4_tes_visitor::emit_thread_end()
+{
+   /* For DS, we always end the thread by emitting a single vertex.
+    * emit_urb_write_opcode() will take care of setting the eot flag on the
+    * SEND instruction.
+    */
+   emit_vertex(); /* the only instruction sequence emitted at thread end */
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_tes.h b/src/intel/compiler/brw_vec4_tes.h
new file mode 100644
index 00000000000..31a28f35974
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_tes.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * \file brw_vec4_tes.h
+ *
+ * The vec4 mode tessellation evaluation shader compiler backend.
+ */
+
+#ifndef BRW_VEC4_TES_H
+#define BRW_VEC4_TES_H
+
+#include "brw_vec4.h"
+
+#ifdef __cplusplus
+namespace brw {
+
+class vec4_tes_visitor : public vec4_visitor /* vec4-mode TES backend */
+{
+public:
+   vec4_tes_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
+                   const struct brw_tes_prog_key *key,
+                   struct brw_tes_prog_data *prog_data,
+                   const nir_shader *nir,
+                   void *mem_ctx,
+                   int shader_time_index);
+
+protected:
+   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual void nir_setup_system_value_intrinsic(nir_intrinsic_instr *instr);
+   virtual void nir_emit_intrinsic(nir_intrinsic_instr *instr); /* unhandled ones go to vec4_visitor */
+
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end(); /* ends the thread via emit_vertex() */
+
+   virtual void emit_urb_write_header(int mrf);
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+
+private:
+   src_reg input_read_header; /* NOTE(review): presumably the URB read header -- confirm */
+};
+
+} /* namespace brw */
+#endif /* __cplusplus */
+
+#endif /* BRW_VEC4_TES_H */
diff --git a/src/intel/compiler/brw_vec4_visitor.cpp b/src/intel/compiler/brw_vec4_visitor.cpp
new file mode 100644
index 00000000000..262a084ca87
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_visitor.cpp
@@ -0,0 +1,1917 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_vec4.h"
+#include "brw_cfg.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+vec4_instruction::vec4_instruction(enum opcode opcode, const dst_reg &dst,
+                                   const src_reg &src0, const src_reg &src1,
+                                   const src_reg &src2)
+{
+   this->opcode = opcode; /* opcode, dst and src[] are the only inputs */
+   this->dst = dst;
+   this->src[0] = src0;
+   this->src[1] = src1;
+   this->src[2] = src2;
+   this->saturate = false; /* everything from here on is a fixed default */
+   this->force_writemask_all = false;
+   this->no_dd_clear = false;
+   this->no_dd_check = false;
+   this->writes_accumulator = false;
+   this->conditional_mod = BRW_CONDITIONAL_NONE;
+   this->predicate = BRW_PREDICATE_NONE;
+   this->predicate_inverse = false;
+   this->target = 0;
+   this->shadow_compare = false;
+   this->ir = NULL; /* ir and annotation are set by emit() / emit_before() */
+   this->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+   this->header_size = 0;
+   this->flag_subreg = 0;
+   this->mlen = 0;
+   this->base_mrf = 0;
+   this->offset = 0;
+   this->exec_size = 8; /* feeds the size_written computation below */
+   this->group = 0;
+   this->size_written = (dst.file == BAD_FILE ?
+                         0 : this->exec_size * type_sz(dst.type)); /* 0: no dst */
+   this->annotation = NULL;
+}
+
+vec4_instruction *
+vec4_visitor::emit(vec4_instruction *inst)
+{
+   inst->ir = this->base_ir;
+   inst->annotation = this->current_annotation;
+   /* Append to the tail of the visitor's instruction list. */
+   this->instructions.push_tail(inst);
+   /* Hand the same pointer back so callers can keep adjusting its fields. */
+   return inst;
+}
+
+vec4_instruction *
+vec4_visitor::emit_before(bblock_t *block, vec4_instruction *inst,
+                          vec4_instruction *new_inst)
+{
+   new_inst->ir = inst->ir;
+   new_inst->annotation = inst->annotation;
+   /* new_inst inherits inst's ir/annotation, then goes in right before it. */
+   inst->insert_before(block, new_inst);
+   /* NOTE(review): returns the anchor inst, not new_inst -- confirm intent. */
+   return inst;
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                   const src_reg &src1, const src_reg &src2)
+{
+   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1, src2));
+}
+
+/* The overloads below do the same with fewer source operands. */
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0,
+                   const src_reg &src1)
+{
+   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0, src1));
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst, const src_reg &src0)
+{
+   return emit(new(mem_ctx) vec4_instruction(opcode, dst, src0));
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode, const dst_reg &dst)
+{
+   return emit(new(mem_ctx) vec4_instruction(opcode, dst));
+}
+
+vec4_instruction *
+vec4_visitor::emit(enum opcode opcode)
+{
+   return emit(new(mem_ctx) vec4_instruction(opcode, dst_reg())); /* default dst */
+}
+
+#define ALU1(op) \
+   vec4_instruction * \
+   vec4_visitor::op(const dst_reg &dst, const src_reg &src0) \
+   { /* one source; the instruction is built here, not emitted */ \
+      return new(mem_ctx) vec4_instruction(BRW_OPCODE_##op, dst, src0); \
+   }
+
+#define ALU2(op) \
+   vec4_instruction * \
+   vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
+                    const src_reg &src1) \
+   { /* two sources; the instruction is built here, not emitted */ \
+      return new(mem_ctx) \
+         vec4_instruction(BRW_OPCODE_##op, dst, src0, src1); \
+   }
+
+#define ALU2_ACC(op) \
+   vec4_instruction * \
+   vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
+                    const src_reg &src1) \
+   { /* like ALU2, but the result is flagged as writing the accumulator */ \
+      vec4_instruction *acc_inst = new(mem_ctx) \
+         vec4_instruction(BRW_OPCODE_##op, dst, src0, src1); \
+      acc_inst->writes_accumulator = true; \
+      return acc_inst; \
+   }
+
+#define ALU3(op) \
+   vec4_instruction * \
+   vec4_visitor::op(const dst_reg &dst, const src_reg &src0, \
+                    const src_reg &src1, const src_reg &src2) \
+   { /* three sources; asserts gen >= 6 before building */ \
+      assert(devinfo->gen >= 6); \
+      return new(mem_ctx) \
+         vec4_instruction(BRW_OPCODE_##op, dst, src0, src1, src2); \
+   }
+
+ALU1(NOT)
+ALU1(MOV)
+ALU1(FRC)
+ALU1(RNDD)
+ALU1(RNDE)
+ALU1(RNDZ)
+ALU1(F32TO16)
+ALU1(F16TO32)
+ALU2(ADD)
+ALU2(MUL)
+ALU2_ACC(MACH) /* sets writes_accumulator */
+ALU2(AND)
+ALU2(OR)
+ALU2(XOR)
+ALU2(DP3)
+ALU2(DP4)
+ALU2(DPH)
+ALU2(SHL)
+ALU2(SHR)
+ALU2(ASR)
+ALU3(LRP) /* every ALU3 builder asserts gen >= 6 */
+ALU1(BFREV)
+ALU3(BFE)
+ALU2(BFI1)
+ALU3(BFI2)
+ALU1(FBH)
+ALU1(FBL)
+ALU1(CBIT)
+ALU3(MAD)
+ALU2_ACC(ADDC) /* sets writes_accumulator */
+ALU2_ACC(SUBB) /* sets writes_accumulator */
+ALU2(MAC)
+ALU1(DIM)
+
+/** Gen4 predicated IF. */
+vec4_instruction *
+vec4_visitor::IF(enum brw_predicate predicate)
+{
+   /* Built but not emitted here; the caller gets the bare instruction. */
+   vec4_instruction *const if_inst =
+      new(mem_ctx) vec4_instruction(BRW_OPCODE_IF);
+
+   if_inst->predicate = predicate;
+   return if_inst;
+}
+
+/** Gen6 IF with embedded comparison. */
+vec4_instruction *
+vec4_visitor::IF(src_reg src0, src_reg src1,
+                 enum brw_conditional_mod condition)
+{
+   assert(devinfo->gen == 6);
+
+   /* The operands are by-value copies, so resolving UD negates in place
+    * does not touch the caller's operands.
+    */
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   vec4_instruction *const if_inst = new(mem_ctx)
+      vec4_instruction(BRW_OPCODE_IF, dst_null_d(), src0, src1);
+   if_inst->conditional_mod = condition;
+   return if_inst;
+}
+
+/**
+ * CMP: writes the comparison result to the low bit of each destination
+ * channel (the upper bits are undefined) and updates the flag register with
+ * the packed 16 bits of the result.
+ */
+vec4_instruction *
+vec4_visitor::CMP(dst_reg dst, src_reg src0, src_reg src1,
+                  enum brw_conditional_mod condition)
+{
+   /* Consider:
+    *
+    * CMP null<d> src0<f> src1<f>
+    *
+    * Original gen4 converts to the destination type before comparing, which
+    * yields garbage for floating point comparisons.
+    *
+    * Newer generations don't care about the destination type, so always make
+    * it match src0; that also lets the instruction be compacted.
+    */
+   dst.type = src0.type;
+
+   /* dst/src0/src1 are by-value copies, so none of this reaches the caller. */
+   resolve_ud_negate(&src0);
+   resolve_ud_negate(&src1);
+
+   vec4_instruction *const cmp =
+      new(mem_ctx) vec4_instruction(BRW_OPCODE_CMP, dst, src0, src1);
+   cmp->conditional_mod = condition;
+
+   return cmp;
+}
+
+vec4_instruction *
+vec4_visitor::SCRATCH_READ(const dst_reg &dst, const src_reg &index)
+{
+   /* Builds (without emitting) the scratch read of \p index into \p dst. */
+   vec4_instruction *const scratch_rd = new(mem_ctx)
+      vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_READ, dst, index);
+
+   /* mlen 2, based one MRF past FIRST_SPILL_MRF for this gen. */
+   scratch_rd->mlen = 2;
+   scratch_rd->base_mrf = FIRST_SPILL_MRF(devinfo->gen) + 1;
+   return scratch_rd;
+}
+
+vec4_instruction *
+vec4_visitor::SCRATCH_WRITE(const dst_reg &dst, const src_reg &src,
+                            const src_reg &index)
+{
+   /* Builds (without emitting) the scratch write of \p src at \p index. */
+   vec4_instruction *const scratch_wr = new(mem_ctx)
+      vec4_instruction(SHADER_OPCODE_GEN4_SCRATCH_WRITE, dst, src, index);
+
+   /* mlen 3, based exactly at FIRST_SPILL_MRF for this gen. */
+   scratch_wr->mlen = 3;
+   scratch_wr->base_mrf = FIRST_SPILL_MRF(devinfo->gen);
+   return scratch_wr;
+}
+
+src_reg
+vec4_visitor::fix_3src_operand(const src_reg &src)
+{
+   /* Three-source instructions always use a vertical stride of four, so a
+    * vec4 uniform cannot be replicated in a SIMD4x2 program with a zero
+    * vertical stride such as
+    *
+    *    g3<0;4,1>:f - [0, 4][1, 5][2, 6][3, 7]
+    *
+    * Instead the operand is unpacked into a temporary that the three-source
+    * instruction can consume.
+    */
+   const bool from_uniform = src.file == UNIFORM;
+   /* Only uniforms and immediates need this, and a uniform read through a
+    * single-value swizzle does not.
+    */
+   if ((!from_uniform && src.file != IMM) ||
+       (from_uniform && brw_is_single_value_swizzle(src.swizzle)))
+      return src;
+
+   dst_reg replicated(this, glsl_type::vec4_type);
+   replicated.type = src.type;
+   emit(VEC4_OPCODE_UNPACK_UNIFORM, replicated, src);
+   return src_reg(replicated);
+}
+
+src_reg
+vec4_visitor::resolve_source_modifiers(const src_reg &src)
+{
+   /* Nothing to resolve unless abs or negate is set. */
+   if (!(src.abs || src.negate))
+      return src;
+   /* MOV through a temporary of src's type and hand that back instead. */
+   dst_reg plain(this, glsl_type::ivec4_type);
+   plain.type = src.type;
+   emit(MOV(plain, src));
+   return src_reg(plain);
+}
+
+src_reg
+vec4_visitor::fix_math_operand(const src_reg &src)
+{
+   const int gen = devinfo->gen;
+   /* Only gen6 and gen7 need fixing, and only for operands that exist. */
+   if (gen < 6 || gen >= 8 || src.file == BAD_FILE)
+      return src;
+
+   /* The gen6 math instruction ignores the source modifiers -- swizzle, abs,
+    * negate, and at least some parts of the register region description --
+    * so on gen6 the operand is *always* expanded to a temp GRF rather than
+    * enumerating the problematic cases.
+    *
+    * Gen7 takes the operand as-is unless it is an immediate, which gen7
+    * still can't use.
+    */
+   const bool needs_temp = gen == 6 || src.file == IMM;
+   if (!needs_temp)
+      return src;
+
+   dst_reg temp(this, glsl_type::vec4_type);
+   temp.type = src.type;
+   emit(MOV(temp, src));
+   return src_reg(temp);
+}
+
+vec4_instruction *
+vec4_visitor::emit_math(enum opcode opcode,
+                        const dst_reg &dst,
+                        const src_reg &src0, const src_reg &src1)
+{
+   vec4_instruction *math =
+      emit(opcode, dst, fix_math_operand(src0), fix_math_operand(src1));
+
+   if (devinfo->gen == 6 && dst.writemask != WRITEMASK_XYZW) {
+      /* MATH on Gen6 must be align1, so we can't do writemasks. */
+      math->dst = dst_reg(this, glsl_type::vec4_type); /* retarget to a temp */
+      math->dst.type = dst.type;
+      math = emit(MOV(dst, src_reg(math->dst))); /* masked copy into dst */
+   } else if (devinfo->gen < 6) {
+      math->base_mrf = 1;
+      math->mlen = src1.file == BAD_FILE ? 1 : 2; /* 1 without src1, else 2 */
+   }
+   /* On the Gen6 writemask path this is the trailing MOV, not the math op. */
+   return math;
+}
+
+void
+vec4_visitor::emit_pack_half_2x16(dst_reg dst, src_reg src0)
+{
+   if (devinfo->gen < 7) {
+      unreachable("ir_unop_pack_half_2x16 should be lowered");
+   }
+   /* Result: src0's 2nd component (as half) in the high word, 1st in the low. */
+   assert(dst.type == BRW_REGISTER_TYPE_UD);
+   assert(src0.type == BRW_REGISTER_TYPE_F);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the destination data type must be Word (W).
+    *
+    *   The destination must be DWord-aligned and specify a horizontal stride
+    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
+    *   each destination channel and the upper word is not modified.
+    *
+    * The above restriction implies that the f32to16 instruction must use
+    * align1 mode, because only in align1 mode is it possible to specify
+    * horizontal stride.  We choose here to defy the hardware docs and emit
+    * align16 instructions.
+    *
+    * (I [chadv] did attempt to emit align1 instructions for VS f32to16
+    * instructions. I was partially successful in that the code passed all
+    * tests.  However, the code was dubiously correct and fragile, and the
+    * tests were not harsh enough to probe that frailty. Not trusting the
+    * code, I chose instead to remain in align16 mode in defiance of the hw
+    * docs).
+    *
+    * I've [chadv] experimentally confirmed that, on gen7 hardware and the
+    * simulator, emitting a f32to16 in align16 mode with UD as destination
+    * data type is safe. The behavior differs from that specified in the PRM
+    * in that the upper word of each destination channel is cleared to 0.
+    */
+
+   dst_reg tmp_dst(this, glsl_type::uvec2_type);
+   src_reg tmp_src(tmp_dst); /* same register as tmp_dst, viewed as a source */
+
+#if 0
+   /* Verify the undocumented behavior on which the following instructions
+    * rely.  If f32to16 fails to clear the upper word of the X and Y channels,
+    * then the result of the bit-or instruction below will be incorrect.
+    *
+    * You should inspect the disasm output in order to verify that the MOV is
+    * not optimized away.
+    */
+   emit(MOV(tmp_dst, brw_imm_ud(0x12345678u)));
+#endif
+
+   /* Give tmp the form below, where "." means untouched.
+    *
+    *     w z          y          x w z          y          x
+    *   |.|.|0x0000hhhh|0x0000llll|.|.|0x0000hhhh|0x0000llll|
+    *
+    * That the upper word of each write-channel be 0 is required for the
+    * following bit-shift and bit-or instructions to work. Note that this
+    * relies on the undocumented hardware behavior mentioned above.
+    */
+   tmp_dst.writemask = WRITEMASK_XY;
+   emit(F32TO16(tmp_dst, src0));
+
+   /* Give the write-channels of dst the form:
+    *   0xhhhh0000
+    */
+   tmp_src.swizzle = BRW_SWIZZLE_YYYY;
+   emit(SHL(dst, tmp_src, brw_imm_ud(16u)));
+
+   /* Finally, give the write-channels of dst the form of packHalf2x16's
+    * output:
+    *   0xhhhhllll
+    */
+   tmp_src.swizzle = BRW_SWIZZLE_XXXX;
+   emit(OR(dst, src_reg(dst), tmp_src));
+}
+
+void
+vec4_visitor::emit_unpack_half_2x16(dst_reg dst, src_reg src0)
+{
+   if (devinfo->gen < 7) {
+      unreachable("ir_unop_unpack_half_2x16 should be lowered");
+   }
+   /* dst.x comes from the low 16 bits of src0, dst.y from the high 16 bits. */
+   assert(dst.type == BRW_REGISTER_TYPE_F);
+   assert(src0.type == BRW_REGISTER_TYPE_UD);
+
+   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
+    *
+    *   Because this instruction does not have a 16-bit floating-point type,
+    *   the source data type must be Word (W). The destination type must be
+    *   F (Float).
+    *
+    * To use W as the source data type, we must adjust horizontal strides,
+    * which is only possible in align1 mode. All my [chadv] attempts at
+    * emitting align1 instructions for unpackHalf2x16 failed to pass the
+    * Piglit tests, so I gave up.
+    *
+    * I've verified that, on gen7 hardware and the simulator, it is safe to
+    * emit f16to32 in align16 mode with UD as source data type.
+    */
+
+   dst_reg tmp_dst(this, glsl_type::uvec2_type);
+   src_reg tmp_src(tmp_dst);
+
+   tmp_dst.writemask = WRITEMASK_X;
+   emit(AND(tmp_dst, src0, brw_imm_ud(0xffffu))); /* low half -> tmp.x */
+
+   tmp_dst.writemask = WRITEMASK_Y;
+   emit(SHR(tmp_dst, src0, brw_imm_ud(16u))); /* high half -> tmp.y */
+
+   dst.writemask = WRITEMASK_XY; /* local copy: replaces the passed-in mask */
+   emit(F16TO32(dst, tmp_src));
+}
+
+void
+vec4_visitor::emit_unpack_unorm_4x8(const dst_reg &dst, src_reg src0)
+{
+   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
+    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
+    * is not suitable to generate the shift values, but we can use the packed
+    * vector float and a type-converting MOV.
+    */
+   dst_reg shift(this, glsl_type::uvec4_type);
+   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); /* VF: 0, 8, 16, 24 */
+
+   dst_reg shifted(this, glsl_type::uvec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX; /* src0 is a copy; every channel reads .x */
+   emit(SHR(shifted, src0, src_reg(shift)));
+   /* Read the shifted values as unsigned bytes (UB) for the MOV_BYTES below. */
+   shifted.type = BRW_REGISTER_TYPE_UB;
+   dst_reg f(this, glsl_type::vec4_type);
+   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
+
+   emit(MUL(dst, src_reg(f), brw_imm_f(1.0f / 255.0f)));
+}
+
+void
+vec4_visitor::emit_unpack_snorm_4x8(const dst_reg &dst, src_reg src0)
+{
+   /* Instead of splitting the 32-bit integer, shifting, and ORing it back
+    * together, we can shift it by <0, 8, 16, 24>. The packed integer immediate
+    * is not suitable to generate the shift values, but we can use the packed
+    * vector float and a type-converting MOV.
+    */
+   dst_reg shift(this, glsl_type::uvec4_type);
+   emit(MOV(shift, brw_imm_vf4(0x00, 0x60, 0x70, 0x78))); /* VF: 0, 8, 16, 24 */
+
+   dst_reg shifted(this, glsl_type::uvec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX; /* src0 is a copy; every channel reads .x */
+   emit(SHR(shifted, src0, src_reg(shift)));
+   /* Read the shifted values as signed bytes (B) for the MOV_BYTES below. */
+   shifted.type = BRW_REGISTER_TYPE_B;
+   dst_reg f(this, glsl_type::vec4_type);
+   emit(VEC4_OPCODE_MOV_BYTES, f, src_reg(shifted));
+
+   dst_reg scaled(this, glsl_type::vec4_type);
+   emit(MUL(scaled, src_reg(f), brw_imm_f(1.0f / 127.0f)));
+   /* Clamp to [-1, 1]: SEL.ge against -1.0, then SEL.l against 1.0. */
+   dst_reg max(this, glsl_type::vec4_type);
+   emit_minmax(BRW_CONDITIONAL_GE, max, src_reg(scaled), brw_imm_f(-1.0f));
+   emit_minmax(BRW_CONDITIONAL_L, dst, src_reg(max), brw_imm_f(1.0f));
+}
+
+void
+vec4_visitor::emit_pack_unorm_4x8(const dst_reg &dst, const src_reg &src0)
+{
+   /* Start with a saturating MOV of the input. */
+   dst_reg clamped(this, glsl_type::vec4_type);
+   emit(MOV(clamped, src0))->saturate = true;
+
+   /* Scale by 255 and round (RNDE). */
+   dst_reg scaled_f(this, glsl_type::vec4_type);
+   emit(MUL(scaled_f, src_reg(clamped), brw_imm_f(255.0f)));
+
+   dst_reg rounded_f(this, glsl_type::vec4_type);
+   emit(RNDE(rounded_f, src_reg(scaled_f)));
+
+   /* Convert to unsigned integers and pack them as bytes into dst. */
+   dst_reg as_uint(this, glsl_type::uvec4_type);
+   emit(MOV(as_uint, src_reg(rounded_f)));
+   emit(VEC4_OPCODE_PACK_BYTES, dst, src_reg(as_uint));
+}
+
+void
+vec4_visitor::emit_pack_snorm_4x8(const dst_reg &dst, const src_reg &src0)
+{
+   /* Clamp to [-1, 1] with a pair of SELs. */
+   dst_reg lower_clamped(this, glsl_type::vec4_type);
+   emit_minmax(BRW_CONDITIONAL_GE, lower_clamped, src0, brw_imm_f(-1.0f));
+   dst_reg clamped(this, glsl_type::vec4_type);
+   emit_minmax(BRW_CONDITIONAL_L, clamped, src_reg(lower_clamped),
+               brw_imm_f(1.0f));
+
+   /* Scale by 127 and round (RNDE). */
+   dst_reg scaled_f(this, glsl_type::vec4_type);
+   emit(MUL(scaled_f, src_reg(clamped), brw_imm_f(127.0f)));
+   dst_reg rounded_f(this, glsl_type::vec4_type);
+   emit(RNDE(rounded_f, src_reg(scaled_f)));
+
+   /* Convert to signed integers and pack them as bytes into dst. */
+   dst_reg as_int(this, glsl_type::ivec4_type);
+   emit(MOV(as_int, src_reg(rounded_f)));
+   emit(VEC4_OPCODE_PACK_BYTES, dst, src_reg(as_int));
+}
+
+/*
+ * Counts the vec4 (as_vec4 == true) or dvec4 (as_vec4 == false) slots that
+ * are needed, at minimum, to pack a value of the given type.
+ */
+static int
+type_size_xvec4(const struct glsl_type *type, bool as_vec4)
+{
+   switch (type->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_BOOL:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_UINT64:
+   case GLSL_TYPE_INT64:
+      if (!type->is_matrix()) {
+         /* Every scalar or vector gets a whole slot, whatever its width
+          * (two when counting vec4s for a dual-slot type).  That packs
+          * floats poorly, but anything tighter makes arrays a mess; a
+          * later pass may still pack scalars down.
+          */
+         return (as_vec4 && type->is_dual_slot()) ? 2 : 1;
+      } else {
+         /* Matrices are counted column by column. */
+         const glsl_type *column = type->column_type();
+         unsigned slots_per_column =
+            (as_vec4 && column->is_dual_slot()) ? 2 : 1;
+         return type->matrix_columns * slots_per_column;
+      }
+   case GLSL_TYPE_ARRAY:
+      assert(type->length > 0);
+      return type_size_xvec4(type->fields.array, as_vec4) * type->length;
+   case GLSL_TYPE_STRUCT: {
+      /* Structs are the sum of their fields. */
+      int total = 0;
+      for (unsigned int field = 0; field < type->length; field++) {
+         total += type_size_xvec4(type->fields.structure[field].type, as_vec4);
+      }
+      return total;
+   }
+   case GLSL_TYPE_SUBROUTINE:
+      return 1;
+
+   case GLSL_TYPE_SAMPLER:
+      /* Samplers are baked in at link time and so occupy no register
+       * space.
+       */
+      return 0;
+   case GLSL_TYPE_ATOMIC_UINT:
+      return 0;
+   case GLSL_TYPE_IMAGE:
+      return DIV_ROUND_UP(BRW_IMAGE_PARAM_SIZE, 4);
+   case GLSL_TYPE_VOID:
+   case GLSL_TYPE_ERROR:
+   case GLSL_TYPE_INTERFACE:
+   case GLSL_TYPE_FUNCTION:
+      unreachable("not reached");
+   }
+
+   return 0;
+}
+
+/**
+ * Returns the minimum number of vec4 elements needed to pack a type.
+ *
+ * For simple types, it will return 1 (2 if dual-slot); for matrices, that
+ * per-column count times the number of columns; for array and struct, the sum
+ * of the vec4_size of each of its elements; for sampler and atomic, zero.
+ *
+ * This method is useful to calculate how much register space is needed to
+ * store a particular type.
+ */
+extern "C" int
+type_size_vec4(const struct glsl_type *type)
+{
+   return type_size_xvec4(type, true);
+}
+
+/**
+ * Returns the minimum number of dvec4 elements needed to pack a type.
+ *
+ * For simple types, it will return 1 (a single dvec4); for matrices, the
+ * number of columns; for array and struct, the sum of the dvec4_size of
+ * each of its elements; and for sampler and atomic, zero.
+ *
+ * This method is useful to calculate how much register space is needed to
+ * store a particular type.
+ *
+ * Measuring double-precision vertex inputs as dvec4 is required because
+ * ARB_vertex_attrib_64bit states that these use the same number of locations
+ * as the single-precision version. That is, two consecutive dvec4s would be
+ * located in location "x" and location "x+1", not "x+2".
+ *
+ * In order to map vec4/dvec4 vertex inputs in the proper ATTRs,
+ * remap_vs_attrs() will take into account both the location and also whether
+ * the type fits in one or two vec4 slots.
+ */
+extern "C" int
+type_size_dvec4(const struct glsl_type *type)
+{
+   return type_size_xvec4(type, false);
+}
+
+src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+   /* Back the register with a newly allocated VGRF sized for the type. */
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
+   this->type = brw_type_for_base_type(type);
+
+   /* Arrays and records keep the no-op swizzle; anything else gets the
+    * swizzle for its number of vector elements.
+    */
+   const bool aggregate = type->is_array() || type->is_record();
+   this->swizzle = aggregate ? BRW_SWIZZLE_NOOP
+                             : brw_swizzle_for_size(type->vector_elements);
+}
+
+src_reg::src_reg(class vec4_visitor *v, const struct glsl_type *type, int size)
+{
+   assert(size > 0);
+
+   init();
+   /* A single VGRF allocation holds `size` copies of the type. */
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type) * size);
+   /* Always the no-op swizzle, unlike the two-argument constructor. */
+   this->swizzle = BRW_SWIZZLE_NOOP;
+
+   this->type = brw_type_for_base_type(type);
+}
+
+dst_reg::dst_reg(class vec4_visitor *v, const struct glsl_type *type)
+{
+   init();
+   /* Back the register with a newly allocated VGRF sized for the type. */
+   this->file = VGRF;
+   this->nr = v->alloc.allocate(type_size_vec4(type));
+   this->type = brw_type_for_base_type(type);
+
+   /* Arrays and records get the full XYZW mask; anything else sets the low
+    * vector_elements bits of the mask.
+    */
+   const bool aggregate = type->is_array() || type->is_record();
+   this->writemask = aggregate ? WRITEMASK_XYZW
+                               : (1 << type->vector_elements) - 1;
+}
+
+vec4_instruction *
+vec4_visitor::emit_minmax(enum brw_conditional_mod conditionalmod, dst_reg dst,
+                          src_reg src0, src_reg src1)
+{
+   vec4_instruction *const sel = emit(BRW_OPCODE_SEL, dst, src0, src1);
+   sel->conditional_mod = conditionalmod; /* the mod picks min versus max */
+   return sel;
+}
+
+vec4_instruction *
+vec4_visitor::emit_lrp(const dst_reg &dst,
+                       const src_reg &x, const src_reg &y, const src_reg &a)
+{
+   if (devinfo->gen < 6) {
+      /* No three-source instructions before Gen6, so expand the
+       * interpolation by hand as x*(1-a) + y*a.
+       */
+      dst_reg ya(this, glsl_type::vec4_type);
+      dst_reg inv_a(this, glsl_type::vec4_type);
+      dst_reg x_inv_a(this, glsl_type::vec4_type);
+      ya.writemask      = dst.writemask;
+      inv_a.writemask   = dst.writemask;
+      x_inv_a.writemask = dst.writemask;
+
+      emit(MUL(ya, y, a));
+      emit(ADD(inv_a, negate(a), brw_imm_f(1.0f)));
+      emit(MUL(x_inv_a, x, src_reg(inv_a)));
+      return emit(ADD(dst, src_reg(x_inv_a), src_reg(ya)));
+   }
+
+   /* Gen6+ has a native LRP.  Its operand order is the reverse of GLSL's and
+    * the IR's, hence a, y, x.
+    */
+   return emit(LRP(dst, fix_3src_operand(a), fix_3src_operand(y),
+                   fix_3src_operand(x)));
+}
+
+/**
+ * Emits the instructions needed to perform a pull constant load into dst.
+ * before_block and before_inst must be both set or both NULL; when NULL, the
+ * instructions are appended to the end of the instruction list.
+ */
+void
+vec4_visitor::emit_pull_constant_load_reg(dst_reg dst,
+                                          src_reg surf_index,
+                                          src_reg offset_reg,
+                                          bblock_t *before_block,
+                                          vec4_instruction *before_inst)
+{
+   assert((before_inst == NULL && before_block == NULL) ||
+          (before_inst && before_block));
+
+   vec4_instruction *pull; /* reused for each helper instruction and the load */
+
+   if (devinfo->gen >= 9) {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      src_reg header(this, glsl_type::uvec4_type, 2); /* header, then offset */
+
+      pull = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(header));
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      dst_reg index_reg = retype(byte_offset(dst_reg(header), REG_SIZE),
+                                 offset_reg.type);
+      pull = MOV(writemask(index_reg, WRITEMASK_X), offset_reg);
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+                                           dst,
+                                           surf_index,
+                                           header);
+      pull->mlen = 2;
+      pull->header_size = 1;
+   } else if (devinfo->gen >= 7) {
+      dst_reg grf_offset = dst_reg(this, glsl_type::uint_type);
+
+      grf_offset.type = offset_reg.type;
+
+      pull = MOV(grf_offset, offset_reg); /* copy the offset into a fresh VGRF */
+
+      if (before_inst)
+         emit_before(before_block, before_inst, pull);
+      else
+         emit(pull);
+
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD_GEN7,
+                                           dst,
+                                           surf_index,
+                                           src_reg(grf_offset));
+      pull->mlen = 1;
+   } else {
+      pull = new(mem_ctx) vec4_instruction(VS_OPCODE_PULL_CONSTANT_LOAD,
+                                           dst,
+                                           surf_index,
+                                           offset_reg);
+      pull->base_mrf = FIRST_PULL_LOAD_MRF(devinfo->gen) + 1;
+      pull->mlen = 1;
+   }
+   /* Finally place the load itself, built by one of the branches above. */
+   if (before_inst)
+      emit_before(before_block, before_inst, pull);
+   else
+      emit(pull);
+}
+
+src_reg
+vec4_visitor::emit_uniformize(const src_reg &src)
+{
+   const src_reg live_chan(this, glsl_type::uint_type);
+   const dst_reg result = retype(dst_reg(this, glsl_type::uint_type),
+                                 src.type);
+   vec4_instruction *const find =
+      emit(SHADER_OPCODE_FIND_LIVE_CHANNEL, dst_reg(live_chan));
+   find->force_writemask_all = true; /* first pick a live channel ... */
+   vec4_instruction *const bcast =
+      emit(SHADER_OPCODE_BROADCAST, result, src, live_chan);
+   bcast->force_writemask_all = true; /* ... then broadcast src from it */
+   return src_reg(result);
+}
+
+src_reg
+vec4_visitor::emit_mcs_fetch(const glsl_type *coordinate_type,
+                             src_reg coordinate, src_reg surface)
+{
+   vec4_instruction *inst =
+      new(mem_ctx) vec4_instruction(SHADER_OPCODE_TXF_MCS,
+                                    dst_reg(this, glsl_type::uvec4_type));
+   inst->base_mrf = 2;
+   inst->src[1] = surface;
+   inst->src[2] = surface; /* NOTE(review): same value as src[1] -- confirm */
+
+   int param_base; /* MRF that receives the coordinate parameters */
+
+   if (devinfo->gen >= 9) {
+      /* Gen9+ needs a message header in order to use SIMD4x2 mode */
+      vec4_instruction *header_inst = new(mem_ctx)
+         vec4_instruction(VS_OPCODE_SET_SIMD4X2_HEADER_GEN9,
+                          dst_reg(MRF, inst->base_mrf));
+
+      emit(header_inst);
+
+      inst->mlen = 2;
+      inst->header_size = 1;
+      param_base = inst->base_mrf + 1; /* parameters follow the header */
+   } else {
+      inst->mlen = 1;
+      param_base = inst->base_mrf;
+   }
+
+   /* parameters are: u, v, r, lod; lod will always be zero due to api restrictions */
+   int coord_mask = (1 << coordinate_type->vector_elements) - 1;
+   int zero_mask = 0xf & ~coord_mask;
+
+   emit(MOV(dst_reg(MRF, param_base, coordinate_type, coord_mask),
+            coordinate));
+   /* Zero the channels that the coordinate does not cover. */
+   emit(MOV(dst_reg(MRF, param_base, coordinate_type, zero_mask),
+            brw_imm_d(0)));
+   /* The fetch itself goes last; its destination register is returned. */
+   emit(inst);
+   return src_reg(inst->dst);
+}
+
+bool
+vec4_visitor::is_high_sampler(src_reg sampler)
+{
+   /* Only Haswell and Gen8+ can have one; there, any non-immediate index
+    * or an immediate of 16 and up counts as high. */
+   const bool capable = devinfo->gen >= 8 || devinfo->is_haswell;
+   return capable && (sampler.file != IMM || sampler.ud >= 16);
+}
+
+void
+vec4_visitor::emit_texture(ir_texture_opcode op,
+                           dst_reg dest,
+                           const glsl_type *dest_type,
+                           src_reg coordinate,
+                           int coord_components,
+                           src_reg shadow_comparator,
+                           src_reg lod, src_reg lod2,
+                           src_reg sample_index,
+                           uint32_t constant_offset,
+                           src_reg offset_value,
+                           src_reg mcs,
+                           uint32_t surface,
+                           src_reg surface_reg,
+                           src_reg sampler_reg)
+{
+   /* The sampler can only meaningfully compute LOD for fragment shader
+    * messages. For all other stages, we change the opcode to TXL and hardcode
+    * the LOD to 0.
+    *
+    * textureQueryLevels() is implemented in terms of TXS so we need to pass a
+    * valid LOD argument.
+    */
+   if (op == ir_tex || op == ir_query_levels) {
+      assert(lod.file == BAD_FILE);
+      lod = brw_imm_f(0.0f);
+   }
+
+   enum opcode opcode;
+   switch (op) {
+   case ir_tex: opcode = SHADER_OPCODE_TXL; break;
+   case ir_txl: opcode = SHADER_OPCODE_TXL; break;
+   case ir_txd: opcode = SHADER_OPCODE_TXD; break;
+   case ir_txf: opcode = SHADER_OPCODE_TXF; break;
+   case ir_txf_ms: opcode = (devinfo->gen >= 9 ? SHADER_OPCODE_TXF_CMS_W :
+                             SHADER_OPCODE_TXF_CMS); break;
+   case ir_txs: opcode = SHADER_OPCODE_TXS; break;
+   case ir_tg4: opcode = offset_value.file != BAD_FILE
+                         ? SHADER_OPCODE_TG4_OFFSET : SHADER_OPCODE_TG4; break;
+   case ir_query_levels: opcode = SHADER_OPCODE_TXS; break;
+   case ir_texture_samples: opcode = SHADER_OPCODE_SAMPLEINFO; break;
+   case ir_txb:
+      unreachable("TXB is not valid for vertex shaders.");
+   case ir_lod:
+      unreachable("LOD is not valid for vertex shaders.");
+   case ir_samples_identical: {
+      /* There are some challenges implementing this for vec4, and it seems
+       * unlikely to be used anyway.  For now, just return false always.
+       */
+      emit(MOV(dest, brw_imm_ud(0u)));
+      return;
+   }
+   default:
+      unreachable("Unrecognized tex op");
+   }
+
+   vec4_instruction *inst = new(mem_ctx) vec4_instruction(opcode, dest);
+
+   inst->offset = constant_offset;
+
+   /* The message header is necessary for:
+    * - Gen4 (always)
+    * - Gen9+ for selecting SIMD4x2
+    * - Texel offsets
+    * - Gather channel selection
+    * - Sampler indices too large to fit in a 4-bit value.
+    * - Sampleinfo message - takes no parameters, but mlen = 0 is illegal
+    */
+   inst->header_size =
+      (devinfo->gen < 5 || devinfo->gen >= 9 ||
+       inst->offset != 0 || op == ir_tg4 ||
+       op == ir_texture_samples ||
+       is_high_sampler(sampler_reg)) ? 1 : 0;
+   inst->base_mrf = 2;
+   inst->mlen = inst->header_size;
+   inst->dst.writemask = WRITEMASK_XYZW;
+   inst->shadow_compare = shadow_comparator.file != BAD_FILE;
+
+   inst->src[1] = surface_reg;
+   inst->src[2] = sampler_reg;
+
+   /* MRF for the first parameter */
+   int param_base = inst->base_mrf + inst->header_size;
+
+   if (op == ir_txs || op == ir_query_levels) {
+      int writemask = devinfo->gen == 4 ? WRITEMASK_W : WRITEMASK_X;
+      emit(MOV(dst_reg(MRF, param_base, lod.type, writemask), lod));
+      inst->mlen++;
+   } else if (op == ir_texture_samples) {
+      inst->dst.writemask = WRITEMASK_X;
+   } else {
+      /* Load the coordinate */
+      /* FINISHME: gl_clamp_mask and saturate */
+      int coord_mask = (1 << coord_components) - 1;
+      int zero_mask = 0xf & ~coord_mask;
+
+      emit(MOV(dst_reg(MRF, param_base, coordinate.type, coord_mask),
+               coordinate));
+      inst->mlen++;
+
+      if (zero_mask != 0) {
+         emit(MOV(dst_reg(MRF, param_base, coordinate.type, zero_mask),
+                  brw_imm_d(0)));
+      }
+      /* Load the shadow comparator */
+      if (shadow_comparator.file != BAD_FILE && op != ir_txd && (op != ir_tg4 || offset_value.file == BAD_FILE)) {
+	 emit(MOV(dst_reg(MRF, param_base + 1, shadow_comparator.type,
+			  WRITEMASK_X),
+		  shadow_comparator));
+	 inst->mlen++;
+      }
+
+      /* Load the LOD info */
+      if (op == ir_tex || op == ir_txl) {
+	 int mrf, writemask;
+	 if (devinfo->gen >= 5) {
+	    mrf = param_base + 1;
+	    if (shadow_comparator.file != BAD_FILE) {
+	       writemask = WRITEMASK_Y;
+	       /* mlen already incremented */
+	    } else {
+	       writemask = WRITEMASK_X;
+	       inst->mlen++;
+	    }
+	 } else /* devinfo->gen == 4 */ {
+	    mrf = param_base;
+	    writemask = WRITEMASK_W;
+	 }
+	 emit(MOV(dst_reg(MRF, mrf, lod.type, writemask), lod));
+      } else if (op == ir_txf) {
+         emit(MOV(dst_reg(MRF, param_base, lod.type, WRITEMASK_W), lod));
+      } else if (op == ir_txf_ms) {
+         emit(MOV(dst_reg(MRF, param_base + 1, sample_index.type, WRITEMASK_X),
+                  sample_index));
+         if (opcode == SHADER_OPCODE_TXF_CMS_W) {
+            /* MCS data is stored in the first two channels of `mcs`, but we
+             * need to get it into the .y and .z channels of the second vec4
+             * of params.
+             */
+            mcs.swizzle = BRW_SWIZZLE4(0, 0, 1, 1);
+            emit(MOV(dst_reg(MRF, param_base + 1,
+                             glsl_type::uint_type, WRITEMASK_YZ),
+                     mcs));
+         } else if (devinfo->gen >= 7) {
+            /* MCS data is in the first channel of `mcs`, but we need to get it into
+             * the .y channel of the second vec4 of params, so replicate .x across
+             * the whole vec4 and then mask off everything except .y
+             */
+            mcs.swizzle = BRW_SWIZZLE_XXXX;
+            emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::uint_type, WRITEMASK_Y),
+                     mcs));
+         }
+         inst->mlen++;
+      } else if (op == ir_txd) {
+         const brw_reg_type type = lod.type;
+
+	 if (devinfo->gen >= 5) {
+	    lod.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+	    lod2.swizzle = BRW_SWIZZLE4(SWIZZLE_X,SWIZZLE_X,SWIZZLE_Y,SWIZZLE_Y);
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XZ), lod));
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_YW), lod2));
+	    inst->mlen++;
+	    /* NOTE(review): keys on dest_type, not coord_components; confirm. */
+	    if (dest_type->vector_elements == 3 || shadow_comparator.file != BAD_FILE) {
+	       lod.swizzle = BRW_SWIZZLE_ZZZZ;
+	       lod2.swizzle = BRW_SWIZZLE_ZZZZ;
+	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_X), lod));
+	       emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_Y), lod2));
+	       inst->mlen++;
+
+               if (shadow_comparator.file != BAD_FILE) {
+                  emit(MOV(dst_reg(MRF, param_base + 2,
+                                   shadow_comparator.type, WRITEMASK_Z),
+                           shadow_comparator));
+               }
+	    }
+	 } else /* devinfo->gen == 4 */ {
+	    emit(MOV(dst_reg(MRF, param_base + 1, type, WRITEMASK_XYZ), lod));
+	    emit(MOV(dst_reg(MRF, param_base + 2, type, WRITEMASK_XYZ), lod2));
+	    inst->mlen += 2;
+	 }
+      } else if (op == ir_tg4 && offset_value.file != BAD_FILE) {
+         if (shadow_comparator.file != BAD_FILE) {
+            emit(MOV(dst_reg(MRF, param_base, shadow_comparator.type, WRITEMASK_W),
+                     shadow_comparator));
+         }
+
+         emit(MOV(dst_reg(MRF, param_base + 1, glsl_type::ivec2_type, WRITEMASK_XY),
+                  offset_value));
+         inst->mlen++;
+      }
+   }
+
+   emit(inst);
+
+   /* Post-process the result: clamp the TXS .z result on Gen4-6, apply the
+    * Gen6 gather workaround, and splat .w for textureQueryLevels().
+    */
+   if (op == ir_txs && devinfo->gen < 7) {
+      /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+      emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
+                  src_reg(inst->dst), brw_imm_d(1));
+   }
+
+   if (devinfo->gen == 6 && op == ir_tg4) {
+      emit_gen6_gather_wa(key_tex->gen6_gather_wa[surface], inst->dst);
+   }
+
+   if (op == ir_query_levels) {
+      /* # levels is in .w */
+      src_reg swizzled(dest);
+      swizzled.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W,
+                                      SWIZZLE_W, SWIZZLE_W);
+      emit(MOV(dest, swizzled));
+   }
+}
+
+/**
+ * Apply the Gen6 gather workarounds selected by \p wa (UINT/SINT) to \p dst.
+ */
+void
+vec4_visitor::emit_gen6_gather_wa(uint8_t wa, dst_reg dst)
+{
+   if (!wa)
+      return;
+   /* WA_8BIT selects 8-bit channels; any other workaround uses 16 bits. */
+   int width = (wa & WA_8BIT) ? 8 : 16;
+   dst_reg dst_f = dst;
+   dst_f.type = BRW_REGISTER_TYPE_F;
+
+   /* Convert from UNORM to UINT */
+   emit(MUL(dst_f, src_reg(dst_f), brw_imm_f((float)((1 << width) - 1))));
+   emit(MOV(dst, src_reg(dst_f)));
+
+   if (wa & WA_SIGN) {
+      /* Reinterpret the UINT value as a signed INT value by
+       * shifting the sign bit into place, then shifting back
+       * preserving sign.
+       */
+      emit(SHL(dst, src_reg(dst), brw_imm_d(32 - width)));
+      emit(ASR(dst, src_reg(dst), brw_imm_d(32 - width)));
+   }
+}
+
+void
+vec4_visitor::gs_emit_vertex(int /* stream_id */)
+{
+   unreachable("not reached"); /* presumably overridden for GS; confirm */
+}
+
+void
+vec4_visitor::gs_end_primitive()
+{
+   unreachable("not reached"); /* presumably overridden for GS; confirm */
+}
+
+void
+vec4_visitor::emit_ndc_computation()
+{
+   if (output_reg[VARYING_SLOT_POS][0].file == BAD_FILE)
+      return;
+
+   /* Get the position */
+   src_reg pos = src_reg(output_reg[VARYING_SLOT_POS][0]);
+
+   /* Build ndc coords, which are (x/w, y/w, z/w, 1/w) */
+   dst_reg ndc = dst_reg(this, glsl_type::vec4_type);
+   output_reg[BRW_VARYING_SLOT_NDC][0] = ndc;
+   output_num_components[BRW_VARYING_SLOT_NDC][0] = 4;
+
+   current_annotation = "NDC";
+   dst_reg ndc_w = ndc;
+   ndc_w.writemask = WRITEMASK_W;
+   src_reg pos_w = pos;
+   pos_w.swizzle = BRW_SWIZZLE4(SWIZZLE_W, SWIZZLE_W, SWIZZLE_W, SWIZZLE_W);
+   emit_math(SHADER_OPCODE_RCP, ndc_w, pos_w);
+
+   dst_reg ndc_xyz = ndc;
+   ndc_xyz.writemask = WRITEMASK_XYZ;
+   /* Multiply the position by the reciprocal just written to ndc_w. */
+   emit(MUL(ndc_xyz, pos, src_reg(ndc_w)));
+}
+
+void
+vec4_visitor::emit_psiz_and_flags(dst_reg reg)
+{
+   if (devinfo->gen < 6 &&
+       ((prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) ||
+        output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE ||
+        devinfo->has_negative_rhw_bug)) {
+      dst_reg header1 = dst_reg(this, glsl_type::uvec4_type);
+      dst_reg header1_w = header1;
+      header1_w.writemask = WRITEMASK_W;
+      /* Start from a zeroed header; only its .w channel is updated below. */
+      emit(MOV(header1, brw_imm_ud(0u)));
+
+      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
+	 src_reg psiz = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
+
+	 current_annotation = "Point size";
+	 emit(MUL(header1_w, psiz, brw_imm_f((float)(1 << 11))));
+	 emit(AND(header1_w, src_reg(header1_w), brw_imm_d(0x7ff << 8)));
+      }
+
+      if (output_reg[VARYING_SLOT_CLIP_DIST0][0].file != BAD_FILE) {
+         current_annotation = "Clipping flags";
+         dst_reg flags0 = dst_reg(this, glsl_type::uint_type);
+         dst_reg flags1 = dst_reg(this, glsl_type::uint_type);
+
+         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST0][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags0, brw_imm_d(0));
+         emit(OR(header1_w, src_reg(header1_w), src_reg(flags0)));
+
+         emit(CMP(dst_null_f(), src_reg(output_reg[VARYING_SLOT_CLIP_DIST1][0]), brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+         emit(VS_OPCODE_UNPACK_FLAGS_SIMD4X2, flags1, brw_imm_d(0));
+         emit(SHL(flags1, src_reg(flags1), brw_imm_d(4)));
+         emit(OR(header1_w, src_reg(header1_w), src_reg(flags1)));
+      }
+
+      /* i965 clipping workaround:
+       * 1) Test for negative rhw
+       * 2) If set,
+       *      set ndc = (0,0,0,0)
+       *      set ucp[6] = 1
+       *
+       * Later, clipping will detect ucp[6] and ensure the primitive is
+       * clipped against all fixed planes.
+       */
+      if (devinfo->has_negative_rhw_bug &&
+          output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE) {
+         src_reg ndc_w = src_reg(output_reg[BRW_VARYING_SLOT_NDC][0]);
+         ndc_w.swizzle = BRW_SWIZZLE_WWWW;
+         emit(CMP(dst_null_f(), ndc_w, brw_imm_f(0.0f), BRW_CONDITIONAL_L));
+         vec4_instruction *inst;
+         inst = emit(OR(header1_w, src_reg(header1_w), brw_imm_ud(1u << 6)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+         output_reg[BRW_VARYING_SLOT_NDC][0].type = BRW_REGISTER_TYPE_F;
+         inst = emit(MOV(output_reg[BRW_VARYING_SLOT_NDC][0], brw_imm_f(0.0f)));
+         inst->predicate = BRW_PREDICATE_NORMAL;
+      }
+      /* Copy the assembled header into the destination register. */
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), src_reg(header1)));
+   } else if (devinfo->gen < 6) {
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_UD), brw_imm_ud(0u)));
+   } else {
+      emit(MOV(retype(reg, BRW_REGISTER_TYPE_D), brw_imm_d(0)));
+      if (prog_data->vue_map.slots_valid & VARYING_BIT_PSIZ) {
+         dst_reg reg_w = reg;
+         reg_w.writemask = WRITEMASK_W;
+         src_reg reg_as_src = src_reg(output_reg[VARYING_SLOT_PSIZ][0]);
+         reg_as_src.type = reg_w.type;
+         reg_as_src.swizzle = brw_swizzle_for_size(1);
+         emit(MOV(reg_w, reg_as_src));
+      }
+      if (prog_data->vue_map.slots_valid & VARYING_BIT_LAYER) {
+         dst_reg reg_y = reg;
+         reg_y.writemask = WRITEMASK_Y;
+         reg_y.type = BRW_REGISTER_TYPE_D;
+         output_reg[VARYING_SLOT_LAYER][0].type = reg_y.type;
+         emit(MOV(reg_y, src_reg(output_reg[VARYING_SLOT_LAYER][0])));
+      }
+      if (prog_data->vue_map.slots_valid & VARYING_BIT_VIEWPORT) {
+         dst_reg reg_z = reg;
+         reg_z.writemask = WRITEMASK_Z;
+         reg_z.type = BRW_REGISTER_TYPE_D;
+         output_reg[VARYING_SLOT_VIEWPORT][0].type = reg_z.type;
+         emit(MOV(reg_z, src_reg(output_reg[VARYING_SLOT_VIEWPORT][0])));
+      }
+   }
+}
+
+vec4_instruction *
+vec4_visitor::emit_generic_urb_slot(dst_reg reg, int varying, int component)
+{
+   assert(varying < VARYING_SLOT_MAX);
+   /* A recorded component count of zero means there is nothing to copy. */
+   unsigned num_comps = output_num_components[varying][component];
+   if (num_comps == 0)
+      return NULL;
+
+   assert(output_reg[varying][component].type == reg.type);
+   current_annotation = output_reg_annotation[varying];
+   if (output_reg[varying][component].file != BAD_FILE) {
+      src_reg src = src_reg(output_reg[varying][component]);
+      src.swizzle = BRW_SWZ_COMP_OUTPUT(component);
+      reg.writemask =
+         brw_writemask_for_component_packing(num_comps, component);
+      return emit(MOV(reg, src));
+   }
+   return NULL;
+}
+
+void
+vec4_visitor::emit_urb_slot(dst_reg reg, int varying)
+{
+   reg.type = BRW_REGISTER_TYPE_F;
+   output_reg[varying][0].type = reg.type;
+   /* Both the MRF and the stored output are now typed float for the copy. */
+   switch (varying) {
+   case VARYING_SLOT_PSIZ:
+   {
+      /* PSIZ is always in slot 0, and is coupled with other flags. */
+      current_annotation = "indices, point width, clip flags";
+      emit_psiz_and_flags(reg);
+      break;
+   }
+   case BRW_VARYING_SLOT_NDC:
+      current_annotation = "NDC";
+      if (output_reg[BRW_VARYING_SLOT_NDC][0].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[BRW_VARYING_SLOT_NDC][0])));
+      break;
+   case VARYING_SLOT_POS:
+      current_annotation = "gl_Position";
+      if (output_reg[VARYING_SLOT_POS][0].file != BAD_FILE)
+         emit(MOV(reg, src_reg(output_reg[VARYING_SLOT_POS][0])));
+      break;
+   case VARYING_SLOT_EDGE:
+      /* This is present when doing unfilled polygons.  We're supposed to copy
+       * the edge flag from the user-provided vertex array
+       * (glEdgeFlagPointer), or otherwise we'll copy from the current value
+       * of that attribute (starts as 1.0f).  This is then used in clipping to
+       * determine which edges should be drawn as wireframe.
+       */
+      current_annotation = "edge flag";
+      emit(MOV(reg, src_reg(dst_reg(ATTR, VERT_ATTRIB_EDGEFLAG,
+                                    glsl_type::float_type, WRITEMASK_XYZW))));
+      break;
+   case BRW_VARYING_SLOT_PAD:
+      /* No need to write to this slot */
+      break;
+   default:
+      for (int i = 0; i < 4; i++) {
+         emit_generic_urb_slot(reg, varying, i);
+      }
+      break;
+   }
+}
+
+static int
+align_interleaved_urb_mlen(const struct gen_device_info *devinfo, int mlen)
+{
+   if (devinfo->gen >= 6) {
+      /* URB data written (does not include the message header reg) must
+       * be a multiple of 256 bits, or 2 VS registers.  See vol5c.5,
+       * section 5.4.3.2.2: URB_INTERLEAVED.  mlen counts the header as
+       * well, so it is padded up to the next odd value.
+       *
+       * URB entries are allocated on a multiple of 1024 bits, so an
+       * extra 128 bits written here is no problem.
+       */
+      if ((mlen % 2) != 1)
+	 mlen++;
+   }
+
+   return mlen;
+}
+
+
+/**
+ * Generates the VUE payload plus the necessary URB write instructions to
+ * output it.
+ *
+ * The VUE layout is documented in Volume 2a.
+ */
+void
+vec4_visitor::emit_vertex()
+{
+   /* MRF 0 is reserved for the debugger, so start with message header
+    * in MRF 1.
+    */
+   int base_mrf = 1;
+   int mrf = base_mrf;
+   /* In the process of generating our URB write message contents, we
+    * may need to unspill a register or load from an array.  Those
+    * reads would use MRFs 14-15.
+    */
+   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
+
+   /* The following assertion verifies that max_usable_mrf causes an
+    * even-numbered amount of URB write data, which will meet gen6's
+    * requirements for length alignment.
+    */
+   assert ((max_usable_mrf - base_mrf) % 2 == 0);
+
+   /* First mrf is the g0-based message header containing URB handles and
+    * such.
+    */
+   emit_urb_write_header(mrf++);
+
+   if (devinfo->gen < 6) {
+      emit_ndc_computation();
+   }
+
+   /* We may need to split this up into several URB writes, so do them in a
+    * loop.
+    */
+   int slot = 0;
+   bool complete = false;
+   do {
+      /* URB offset is in URB row increments, and each of our MRFs is half of
+       * one of those, since we're doing interleaved writes.
+       */
+      int offset = slot / 2;
+      /* Payload registers start right after the header MRF. */
+      mrf = base_mrf + 1;
+      for (; slot < prog_data->vue_map.num_slots; ++slot) {
+         emit_urb_slot(dst_reg(MRF, mrf++),
+                       prog_data->vue_map.slot_to_varying[slot]);
+
+         /* If this was max_usable_mrf, we can't fit anything more into this
+          * URB WRITE. Same thing if we reached the maximum length available.
+          */
+         if (mrf > max_usable_mrf ||
+             align_interleaved_urb_mlen(devinfo, mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
+            slot++;
+            break;
+         }
+      }
+      /* Finished once every VUE slot has been emitted. */
+      complete = slot >= prog_data->vue_map.num_slots;
+      current_annotation = "URB write";
+      vec4_instruction *inst = emit_urb_write_opcode(complete);
+      inst->base_mrf = base_mrf;
+      inst->mlen = align_interleaved_urb_mlen(devinfo, mrf - base_mrf);
+      inst->offset += offset;
+   } while(!complete);
+}
+
+
+src_reg
+vec4_visitor::get_scratch_offset(bblock_t *block, vec4_instruction *inst,
+				 src_reg *reladdr, int reg_offset)
+{
+   /* Because we store the values to scratch interleaved like our
+    * vertex data, we need to scale the vec4 index by 2.
+    */
+   int message_header_scale = 2;
+
+   /* Pre-gen6, the message header uses byte offsets instead of vec4
+    * (16-byte) offset units.
+    */
+   if (devinfo->gen < 6)
+      message_header_scale *= 16;
+   /* With a relative address, instructions computing the offset are emitted. */
+   if (reladdr) {
+      /* A vec4 is 16 bytes and a dvec4 is 32 bytes so for doubles we have
+       * to multiply the reladdr by 2. Notice that the reg_offset part
+       * is in units of 16 bytes and is used to select the low/high 16-byte
+       * chunk of a full dvec4, so we don't want to multiply that part.
+       */
+      src_reg index = src_reg(this, glsl_type::int_type);
+      if (type_sz(inst->dst.type) < 8) {
+         emit_before(block, inst, ADD(dst_reg(index), *reladdr,
+                                      brw_imm_d(reg_offset)));
+         emit_before(block, inst, MUL(dst_reg(index), index,
+                                      brw_imm_d(message_header_scale)));
+      } else {
+         emit_before(block, inst, MUL(dst_reg(index), *reladdr,
+                                      brw_imm_d(message_header_scale * 2)));
+         emit_before(block, inst, ADD(dst_reg(index), index,
+                                      brw_imm_d(reg_offset * message_header_scale)));
+      }
+      return index;
+   } else {
+      return brw_imm_d(reg_offset * message_header_scale);
+   }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from scratch space at @base_offset to @temp.
+ *
+ * @base_offset is measured in 32-byte units (the size of a register).
+ */
+void
+vec4_visitor::emit_scratch_read(bblock_t *block, vec4_instruction *inst,
+				dst_reg temp, src_reg orig_src,
+				int base_offset)
+{
+   assert(orig_src.offset % REG_SIZE == 0);
+   int reg_offset = base_offset + orig_src.offset / REG_SIZE;
+   src_reg index = get_scratch_offset(block, inst, orig_src.reladdr,
+                                      reg_offset);
+   /* 64-bit values take two reads followed by a shuffle into @temp. */
+   if (type_sz(orig_src.type) < 8) {
+      emit_before(block, inst, SCRATCH_READ(temp, index));
+   } else {
+      dst_reg shuffled = dst_reg(this, glsl_type::dvec4_type);
+      dst_reg shuffled_float = retype(shuffled, BRW_REGISTER_TYPE_F);
+      emit_before(block, inst, SCRATCH_READ(shuffled_float, index));
+      index = get_scratch_offset(block, inst, orig_src.reladdr, reg_offset + 1);
+      vec4_instruction *last_read =
+         SCRATCH_READ(byte_offset(shuffled_float, REG_SIZE), index);
+      emit_before(block, inst, last_read);
+      shuffle_64bit_data(temp, src_reg(shuffled), false, block, last_read);
+   }
+}
+
+/**
+ * Emits instruction(s) after @inst to store the value written by @inst
+ * to scratch space at @base_offset, redirecting @inst to a temporary.
+ *
+ * @base_offset is measured in 32-byte units (the size of a register).
+ */
+void
+vec4_visitor::emit_scratch_write(bblock_t *block, vec4_instruction *inst,
+                                 int base_offset)
+{
+   assert(inst->dst.offset % REG_SIZE == 0);
+   int reg_offset = base_offset + inst->dst.offset / REG_SIZE;
+   src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
+                                      reg_offset);
+
+   /* Create a temporary register to store *inst's result in.
+    *
+    * We have to be careful in MOVing from our temporary result register in
+    * the scratch write.  If we swizzle from channels of the temporary that
+    * weren't initialized, it will confuse live interval analysis, which will
+    * make spilling fail to make progress.
+    */
+   bool is_64bit = type_sz(inst->dst.type) == 8;
+   const glsl_type *alloc_type =
+      is_64bit ? glsl_type::dvec4_type : glsl_type::vec4_type;
+   const src_reg temp = swizzle(retype(src_reg(this, alloc_type),
+                                       inst->dst.type),
+                                brw_swizzle_for_mask(inst->dst.writemask));
+
+   if (!is_64bit) {
+      dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0),
+				          inst->dst.writemask));
+      vec4_instruction *write = SCRATCH_WRITE(dst, temp, index);
+      if (inst->opcode != BRW_OPCODE_SEL)
+         write->predicate = inst->predicate;
+      write->ir = inst->ir;
+      write->annotation = inst->annotation;
+      inst->insert_after(block, write);
+   } else {
+      dst_reg shuffled = dst_reg(this, alloc_type);
+      vec4_instruction *last =
+         shuffle_64bit_data(shuffled, temp, true, block, inst);
+      src_reg shuffled_float = src_reg(retype(shuffled, BRW_REGISTER_TYPE_F));
+
+      uint8_t mask = 0;
+      if (inst->dst.writemask & WRITEMASK_X)
+         mask |= WRITEMASK_XY;
+      if (inst->dst.writemask & WRITEMASK_Y)
+         mask |= WRITEMASK_ZW;
+      if (mask) {
+         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
+
+         vec4_instruction *write = SCRATCH_WRITE(dst, shuffled_float, index);
+         if (inst->opcode != BRW_OPCODE_SEL)
+            write->predicate = inst->predicate;
+         write->ir = inst->ir;
+         write->annotation = inst->annotation;
+         last->insert_after(block, write);
+      }
+
+      mask = 0;
+      if (inst->dst.writemask & WRITEMASK_Z)
+         mask |= WRITEMASK_XY;
+      if (inst->dst.writemask & WRITEMASK_W)
+         mask |= WRITEMASK_ZW;
+      if (mask) {
+         dst_reg dst = dst_reg(brw_writemask(brw_vec8_grf(0, 0), mask));
+
+         src_reg index = get_scratch_offset(block, inst, inst->dst.reladdr,
+                                            reg_offset + 1);
+         vec4_instruction *write =
+            SCRATCH_WRITE(dst, byte_offset(shuffled_float, REG_SIZE), index);
+         if (inst->opcode != BRW_OPCODE_SEL)
+            write->predicate = inst->predicate;
+         write->ir = inst->ir;
+         write->annotation = inst->annotation;
+         last->insert_after(block, write);
+      }
+   }
+   /* Finally make @inst write the temporary instead of its old destination. */
+   inst->dst.file = temp.file;
+   inst->dst.nr = temp.nr;
+   inst->dst.offset %= REG_SIZE;
+   inst->dst.reladdr = NULL;
+}
+
+/**
+ * Checks if \p src and/or \p src.reladdr require a scratch read, and if so,
+ * adds the scratch read(s) before \p inst. The function also checks for
+ * recursive reladdr scratch accesses, issuing the corresponding scratch
+ * loads and rewriting reladdr references accordingly.
+ *
+ * \return \p src if it did not require a scratch load, otherwise, the
+ * register holding the result of the scratch load that the caller should
+ * use to rewrite src.
+ */
+src_reg
+vec4_visitor::emit_resolve_reladdr(int scratch_loc[], bblock_t *block,
+                                   vec4_instruction *inst, src_reg src)
+{
+   /* Resolve recursive reladdr scratch access by calling ourselves
+    * with src.reladdr
+    */
+   if (src.reladdr)
+      *src.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
+                                          *src.reladdr);
+
+   /* Now handle scratch access on src itself: load it into a fresh temp. */
+   if (src.file == VGRF && scratch_loc[src.nr] != -1) {
+      dst_reg temp = dst_reg(this, type_sz(src.type) == 8 ?
+         glsl_type::dvec4_type : glsl_type::vec4_type);
+      emit_scratch_read(block, inst, temp, src, scratch_loc[src.nr]);
+      src.nr = temp.nr;
+      src.offset %= REG_SIZE;
+      src.reladdr = NULL;
+   }
+
+   return src;
+}
+
+/**
+ * We can't generally support array access in GRF space, because a
+ * single instruction's destination can only span 2 contiguous
+ * registers.  So, we send all GRF arrays that get variable index
+ * access to scratch space.
+ */
+void
+vec4_visitor::move_grf_array_access_to_scratch()
+{
+   int scratch_loc[this->alloc.count];
+   memset(scratch_loc, -1, sizeof(scratch_loc));
+   /* NOTE(review): scratch_loc is a VLA (a compiler extension in C++). */
+   /* First, calculate the set of virtual GRFs that need to be punted
+    * to scratch due to having any array access on them, and where in
+    * scratch.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      if (inst->dst.file == VGRF && inst->dst.reladdr) {
+         if (scratch_loc[inst->dst.nr] == -1) {
+            scratch_loc[inst->dst.nr] = last_scratch;
+            last_scratch += this->alloc.sizes[inst->dst.nr];
+         }
+
+         for (src_reg *iter = inst->dst.reladdr;
+              iter->reladdr;
+              iter = iter->reladdr) {
+            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+               scratch_loc[iter->nr] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->nr];
+            }
+         }
+      }
+
+      for (int i = 0 ; i < 3; i++) {
+         for (src_reg *iter = &inst->src[i];
+              iter->reladdr;
+              iter = iter->reladdr) {
+            if (iter->file == VGRF && scratch_loc[iter->nr] == -1) {
+               scratch_loc[iter->nr] = last_scratch;
+               last_scratch += this->alloc.sizes[iter->nr];
+            }
+         }
+      }
+   }
+
+   /* Now, for anything that will be accessed through scratch, rewrite
+    * it to load/store.  Note that this is a _safe list walk, because
+    * we may generate a new scratch_write instruction after the one
+    * we're processing.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* Set up the annotation tracking for new generated instructions. */
+      base_ir = inst->ir;
+      current_annotation = inst->annotation;
+
+      /* First handle scratch access on the dst. Notice we have to handle
+       * the case where the dst's reladdr also points to scratch space.
+       */
+      if (inst->dst.reladdr)
+         *inst->dst.reladdr = emit_resolve_reladdr(scratch_loc, block, inst,
+                                                   *inst->dst.reladdr);
+
+      /* Now that we have handled any (possibly recursive) reladdr scratch
+       * accesses for dst we can safely do the scratch write for dst itself
+       */
+      if (inst->dst.file == VGRF && scratch_loc[inst->dst.nr] != -1)
+         emit_scratch_write(block, inst, scratch_loc[inst->dst.nr]);
+
+      /* Now handle scratch access on any src. In this case, since inst->src[i]
+       * already is a src_reg, we can just call emit_resolve_reladdr with
+       * inst->src[i] and it will take care of handling scratch loads for
+       * both src and src.reladdr (recursively).
+       */
+      for (int i = 0 ; i < 3; i++) {
+         inst->src[i] = emit_resolve_reladdr(scratch_loc, block, inst,
+                                             inst->src[i]);
+      }
+   }
+}
+
+/**
+ * Emits an instruction before @inst to load the value named by @orig_src
+ * from the pull constant buffer (surface) at @base_offset to @temp.
+ */
+void
+vec4_visitor::emit_pull_constant_load(bblock_t *block, vec4_instruction *inst,
+                                      dst_reg temp, src_reg orig_src,
+                                      int base_offset, src_reg indirect)
+{
+   assert(orig_src.offset % 16 == 0);
+   const unsigned index = prog_data->base.binding_table.pull_constants_start;
+
+   /* For 64bit loads we need to emit two 32-bit load messages and we also
+    * need to shuffle the 32-bit data result into proper 64-bit data. To do
+    * that we emit the 32-bit loads into a temporary and we shuffle the result
+    * into the original destination.
+    */
+   dst_reg orig_temp = temp;
+   bool is_64bit = type_sz(orig_src.type) == 8;
+   if (is_64bit) {
+      assert(type_sz(temp.type) == 8);
+      dst_reg temp_df = dst_reg(this, glsl_type::dvec4_type);
+      temp = retype(temp_df, BRW_REGISTER_TYPE_F);
+   }
+   /* Issue one load per 16 bytes of source: two for 64-bit data. */
+   src_reg src = orig_src;
+   for (int i = 0; i < (is_64bit ? 2 : 1); i++) {
+      int reg_offset = base_offset + src.offset / 16;
+
+      src_reg offset;
+      if (indirect.file != BAD_FILE) {
+         offset = src_reg(this, glsl_type::uint_type);
+         emit_before(block, inst, ADD(dst_reg(offset), indirect,
+                                      brw_imm_ud(reg_offset * 16)));
+      } else if (devinfo->gen >= 8) {
+         /* Store the offset in a GRF so we can send-from-GRF. */
+         offset = src_reg(this, glsl_type::uint_type);
+         emit_before(block, inst, MOV(dst_reg(offset),
+                                      brw_imm_ud(reg_offset * 16)));
+      } else {
+         offset = brw_imm_d(reg_offset * 16);
+      }
+
+      emit_pull_constant_load_reg(byte_offset(temp, i * REG_SIZE),
+                                  brw_imm_ud(index),
+                                  offset,
+                                  block, inst);
+
+      src = byte_offset(src, 16);
+   }
+
+   brw_mark_surface_used(&prog_data->base, index);
+
+   if (is_64bit) {
+      temp = retype(temp, BRW_REGISTER_TYPE_DF);
+      shuffle_64bit_data(orig_temp, src_reg(temp), false, block, inst);
+   }
+}
+
+/**
+ * Implements array access of uniforms by inserting a
+ * PULL_CONSTANT_LOAD instruction.
+ *
+ * Unlike temporary GRF array access (where we don't support it due to
+ * the difficulty of doing relative addressing on instruction
+ * destinations), we could potentially do array access of uniforms
+ * that were loaded in GRF space as push constants.  In real-world
+ * usage we've seen, though, the arrays being used are always larger
+ * than we could load as push constants, so just always move all
+ * uniform array access out to a pull constant buffer.
+ */
+void
+vec4_visitor::move_uniform_array_access_to_pull_constants()
+{
+   /* The Vulkan driver doesn't support pull constants other than UBOs so
+    * everything has to be pushed regardless.
+    */
+   if (stage_prog_data->pull_param == NULL) {
+      split_uniform_registers();
+      return;
+   }
+
+   int pull_constant_loc[this->uniforms];
+   memset(pull_constant_loc, -1, sizeof(pull_constant_loc));
+
+   /* First, walk through the instructions and determine which things need to
+    * be pulled.  We mark something as needing to be pulled by setting
+    * pull_constant_loc to 0.
+    */
+   foreach_block_and_inst(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
+
+      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
+
+      for (unsigned j = 0; j < DIV_ROUND_UP(inst->src[2].ud, 16); j++)
+         pull_constant_loc[uniform_nr + j] = 0;
+   }
+
+   /* Next, we walk the list of uniforms and assign real pull constant
+    * locations and set their corresponding entries in pull_param.
+    */
+   for (int j = 0; j < this->uniforms; j++) {
+      if (pull_constant_loc[j] < 0)
+         continue;
+
+      pull_constant_loc[j] = stage_prog_data->nr_pull_params / 4;
+
+      for (int i = 0; i < 4; i++) {
+         stage_prog_data->pull_param[stage_prog_data->nr_pull_params++]
+            = stage_prog_data->param[j * 4 + i];
+      }
+   }
+
+   /* Finally, we can walk through the instructions and lower MOV_INDIRECT
+    * instructions to actual uniform pulls.
+    */
+   foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) {
+      /* We only care about MOV_INDIRECT of a uniform */
+      if (inst->opcode != SHADER_OPCODE_MOV_INDIRECT ||
+          inst->src[0].file != UNIFORM)
+         continue;
+
+      int uniform_nr = inst->src[0].nr + inst->src[0].offset / 16;
+
+      assert(inst->src[0].swizzle == BRW_SWIZZLE_NOOP);
+
+      emit_pull_constant_load(block, inst, inst->dst, inst->src[0],
+                              pull_constant_loc[uniform_nr], inst->src[1]);
+      inst->remove(block);
+   }
+
+   /* Now there are no accesses of the UNIFORM file with a reladdr, so
+    * no need to track them as larger-than-vec4 objects.  This will be
+    * relied on in cutting out unused uniform vectors from push
+    * constants.
+    */
+   split_uniform_registers();
+}
+
+/* Replace a negated UD source with a temporary holding the MOV'd value. */
+void
+vec4_visitor::resolve_ud_negate(src_reg *reg)
+{
+   const bool negated_ud = reg->type == BRW_REGISTER_TYPE_UD && reg->negate;
+   if (!negated_ud)
+      return;
+   src_reg resolved = src_reg(this, glsl_type::uvec4_type);
+   emit(BRW_OPCODE_MOV, dst_reg(resolved), *reg);
+   *reg = resolved;
+}
+
+vec4_visitor::vec4_visitor(const struct brw_compiler *compiler,
+                           void *log_data,
+                           const struct brw_sampler_prog_key_data *key_tex,
+                           struct brw_vue_prog_data *prog_data,
+                           const nir_shader *shader,
+			   void *mem_ctx,
+                           bool no_spills,
+                           int shader_time_index)
+   : backend_shader(compiler, log_data, mem_ctx, shader, &prog_data->base),
+     key_tex(key_tex),
+     prog_data(prog_data),
+     fail_msg(NULL),
+     first_non_payload_grf(0),
+     need_all_constants_in_pull_buffer(false),
+     no_spills(no_spills),
+     shader_time_index(shader_time_index),
+     last_scratch(0)
+{
+   this->failed = false;
+
+   this->base_ir = NULL;
+   this->current_annotation = NULL;
+   memset(this->output_reg_annotation, 0, sizeof(this->output_reg_annotation));
+
+   memset(this->output_num_components, 0, sizeof(this->output_num_components));
+
+   this->virtual_grf_start = NULL;
+   this->virtual_grf_end = NULL;
+   this->live_intervals = NULL;
+   /* Gen7+ uses GEN7_MRF_HACK_START as the limit (presumably for MRFs). */
+   this->max_grf = devinfo->gen >= 7 ? GEN7_MRF_HACK_START : BRW_MAX_GRF;
+
+   this->uniforms = 0;
+}
+
+vec4_visitor::~vec4_visitor()
+{
+}
+
+
+/* Latch the first failure and record its formatted message in fail_msg. */
+void
+vec4_visitor::fail(const char *format, ...)
+{
+   /* Only the first failure is recorded; later ones are ignored. */
+   if (failed)
+      return;
+
+   failed = true;
+
+   va_list args;
+   va_start(args, format);
+   char *reason = ralloc_vasprintf(mem_ctx, format, args);
+   va_end(args);
+
+   char *full_msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n",
+                                    stage_abbrev, reason);
+   this->fail_msg = full_msg;
+
+   if (debug_enabled)
+      fprintf(stderr, "%s",  full_msg);
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vec4_vs.h b/src/intel/compiler/brw_vec4_vs.h
new file mode 100644
index 00000000000..8c346d7636a
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_vs.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright © 2006 - 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef BRW_VEC4_VS_VISITOR_H
+#define BRW_VEC4_VS_VISITOR_H
+
+#include "brw_vec4.h"
+
+namespace brw {
+/** vec4_visitor specialization used to compile vertex shaders. */
+class vec4_vs_visitor : public vec4_visitor
+{
+public:
+   vec4_vs_visitor(const struct brw_compiler *compiler,
+                   void *log_data,
+                   const struct brw_vs_prog_key *key,
+                   struct brw_vs_prog_data *vs_prog_data,
+                   const nir_shader *shader,
+                   gl_clip_plane *clip_planes,
+                   void *mem_ctx,
+                   int shader_time_index,
+                   bool use_legacy_snorm_formula);
+
+protected:
+   virtual dst_reg *make_reg_for_system_value(int location);
+   virtual void setup_payload();
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+   virtual void emit_urb_write_header(int mrf);
+   virtual void emit_urb_slot(dst_reg reg, int varying);
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete);
+
+private:
+   int setup_attributes(int payload_reg);
+   void setup_uniform_clipplane_values();
+   void emit_clip_distances(dst_reg reg, int offset);
+
+   const struct brw_vs_prog_key *const key;
+   struct brw_vs_prog_data * const vs_prog_data;
+   /* User clip planes; their components back the clip-plane uniforms. */
+   gl_clip_plane *clip_planes;
+
+   bool use_legacy_snorm_formula;
+};
+
+} /* namespace brw */
+
+#endif /* BRW_VEC4_VS_VISITOR_H */
diff --git a/src/intel/compiler/brw_vec4_vs_visitor.cpp b/src/intel/compiler/brw_vec4_vs_visitor.cpp
new file mode 100644
index 00000000000..0cec77990d6
--- /dev/null
+++ b/src/intel/compiler/brw_vec4_vs_visitor.cpp
@@ -0,0 +1,221 @@
+/*
+ * Copyright © 2013 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+
+#include "brw_vec4_vs.h"
+#include "common/gen_debug.h"
+
+namespace brw {
+
+void
+vec4_vs_visitor::emit_prolog()
+{
+}
+
+/* Return a new ATTR dst_reg (with writemask) for the given system value. */
+dst_reg *
+vec4_vs_visitor::make_reg_for_system_value(int location)
+{
+   /* VertexID is stored by the VF as the last vertex element, but
+    * we don't represent it with a flag in inputs_read, so we call
+    * it VERT_ATTRIB_MAX, which setup_attributes() picks up on.
+    */
+   dst_reg *reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX);
+
+   switch (location) {
+   case SYSTEM_VALUE_BASE_VERTEX:
+      reg->writemask = WRITEMASK_X;
+      vs_prog_data->uses_basevertex = true;
+      break;
+   case SYSTEM_VALUE_BASE_INSTANCE:
+      reg->writemask = WRITEMASK_Y;
+      vs_prog_data->uses_baseinstance = true;
+      break;
+   case SYSTEM_VALUE_VERTEX_ID:
+   case SYSTEM_VALUE_VERTEX_ID_ZERO_BASE:
+      reg->writemask = WRITEMASK_Z;
+      vs_prog_data->uses_vertexid = true;
+      break;
+   case SYSTEM_VALUE_INSTANCE_ID:
+      reg->writemask = WRITEMASK_W;
+      vs_prog_data->uses_instanceid = true;
+      break;
+   case SYSTEM_VALUE_DRAW_ID:
+      reg = new(mem_ctx) dst_reg(ATTR, VERT_ATTRIB_MAX + 1);
+      reg->writemask = WRITEMASK_X;
+      vs_prog_data->uses_drawid = true;
+      break;
+   default:
+      unreachable("not reached");
+   }
+
+   return reg;
+}
+
+
+void
+vec4_vs_visitor::emit_urb_write_header(int mrf)
+{
+   /* No need to do anything for VS; an implied write to this MRF will be
+    * performed by VS_OPCODE_URB_WRITE.
+    */
+   (void) mrf;
+}
+
+
+/* Emit the VS URB write; when `complete`, flag it as the thread-ending write. */
+vec4_instruction *
+vec4_vs_visitor::emit_urb_write_opcode(bool complete)
+{
+   /* The final URB write terminates a VS thread, so close out shader-time
+    * accounting (if enabled) before emitting it.
+    */
+   if (complete && (INTEL_DEBUG & DEBUG_SHADER_TIME))
+      emit_shader_time_end();
+
+   vec4_instruction *urb_write = emit(VS_OPCODE_URB_WRITE);
+   urb_write->urb_write_flags =
+      complete ? BRW_URB_WRITE_EOT_COMPLETE : BRW_URB_WRITE_NO_FLAGS;
+   return urb_write;
+}
+
+
+/* Emit one URB slot; legacy color outputs optionally get saturated. */
+void
+vec4_vs_visitor::emit_urb_slot(dst_reg reg, int varying)
+{
+   reg.type = BRW_REGISTER_TYPE_F;
+   output_reg[varying][0].type = reg.type;
+
+   const bool legacy_color = varying == VARYING_SLOT_COL0 ||
+                             varying == VARYING_SLOT_COL1 ||
+                             varying == VARYING_SLOT_BFC0 ||
+                             varying == VARYING_SLOT_BFC1;
+   if (!legacy_color) {
+      vec4_visitor::emit_urb_slot(reg, varying);
+      return;
+   }
+
+   /* These built-in varyings are only supported in compatibility mode,
+    * and we only support GS in core profile.  So, this must be a vertex
+    * shader.
+    */
+   vec4_instruction *color_write = emit_generic_urb_slot(reg, varying, 0);
+   if (color_write && key->clamp_vertex_color)
+      color_write->saturate = true;
+}
+
+/* Write up to four DP4 clip distances for planes offset..offset+3 into reg. */
+void
+vec4_vs_visitor::emit_clip_distances(dst_reg reg, int offset)
+{
+   /* From the GLSL 1.30 spec, section 7.1 (Vertex Shader Special Variables):
+    *
+    *     "If a linked set of shaders forming the vertex stage contains no
+    *     static write to gl_ClipVertex or gl_ClipDistance, but the
+    *     application has requested clipping against user clip planes through
+    *     the API, then the coordinate written to gl_Position is used for
+    *     comparison against the user clip planes."
+    *
+    * This function is only called if the shader didn't write to
+    * gl_ClipDistance.  Accordingly, we use gl_ClipVertex to perform clipping
+    * if the user wrote to it; otherwise we use gl_Position.
+    */
+   gl_varying_slot clip_vertex = VARYING_SLOT_CLIP_VERTEX;
+   if (!(prog_data->vue_map.slots_valid & VARYING_BIT_CLIP_VERTEX)) {
+      clip_vertex = VARYING_SLOT_POS;
+   }
+
+   for (int i = 0; i + offset < key->nr_userclip_plane_consts && i < 4;
+        ++i) {
+      reg.writemask = 1 << i;
+      emit(DP4(reg,
+               src_reg(output_reg[clip_vertex][0]),
+               src_reg(this->userplane[i + offset])));
+   }
+}
+
+
+/* Allocate one float vec4 uniform per user clip plane, fed by clip_planes. */
+void
+vec4_vs_visitor::setup_uniform_clipplane_values()
+{
+   for (int plane = 0; plane < key->nr_userclip_plane_consts; plane++) {
+      this->userplane[plane] = dst_reg(UNIFORM, this->uniforms);
+      this->userplane[plane].type = BRW_REGISTER_TYPE_F;
+      for (int comp = 0; comp < 4; comp++)
+         stage_prog_data->param[this->uniforms * 4 + comp] =
+            (gl_constant_value *) &clip_planes[plane][comp];
+      this->uniforms++;
+   }
+}
+
+/* Lower user clip planes to clip distances, then emit the final vertex. */
+void
+vec4_vs_visitor::emit_thread_end()
+{
+   setup_uniform_clipplane_values();
+
+   /* Lower legacy ff and ClipVertex clipping to clip distances */
+   if (key->nr_userclip_plane_consts > 0) {
+      current_annotation = "user clip distances";
+
+      output_reg[VARYING_SLOT_CLIP_DIST0][0] =
+         dst_reg(this, glsl_type::vec4_type);
+      output_reg[VARYING_SLOT_CLIP_DIST1][0] =
+         dst_reg(this, glsl_type::vec4_type);
+      output_num_components[VARYING_SLOT_CLIP_DIST0][0] = 4;
+      output_num_components[VARYING_SLOT_CLIP_DIST1][0] = 4;
+
+      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST0][0], 0);
+      emit_clip_distances(output_reg[VARYING_SLOT_CLIP_DIST1][0], 4);
+   }
+
+   /* For VS, we always end the thread by emitting a single vertex.
+    * emit_urb_write_opcode() will take care of setting the eot flag on the
+    * SEND instruction.
+    */
+   emit_vertex();
+}
+
+/* Forward the shared arguments to vec4_visitor and keep the VS-only ones. */
+vec4_vs_visitor::vec4_vs_visitor(const struct brw_compiler *compiler,
+                                 void *log_data,
+                                 const struct brw_vs_prog_key *key,
+                                 struct brw_vs_prog_data *vs_prog_data,
+                                 const nir_shader *shader,
+                                 gl_clip_plane *clip_planes,
+                                 void *mem_ctx,
+                                 int shader_time_index,
+                                 bool use_legacy_snorm_formula)
+   : vec4_visitor(compiler, log_data, &key->tex, &vs_prog_data->base, shader,
+                  mem_ctx, false /* no_spills */, shader_time_index),
+     key(key),
+     vs_prog_data(vs_prog_data),
+     clip_planes(clip_planes),
+     use_legacy_snorm_formula(use_legacy_snorm_formula)
+{
+}
+
+
+} /* namespace brw */
diff --git a/src/intel/compiler/brw_vue_map.c b/src/intel/compiler/brw_vue_map.c
new file mode 100644
index 00000000000..e14cba8f67d
--- /dev/null
+++ b/src/intel/compiler/brw_vue_map.c
@@ -0,0 +1,307 @@
+/*
+ * Copyright © 2011 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+/**
+ * @file brw_vue_map.c
+ *
+ * This file computes the "VUE map" for a (non-fragment) shader stage, which
+ * describes the layout of its output varyings.  The VUE map is used to match
+ * outputs from one stage with the inputs of the next.
+ *
+ * Largely, varyings can be placed however we like - producers/consumers simply
+ * have to agree on the layout.  However, there is also a "VUE Header" that
+ * prescribes a fixed-layout for items that interact with fixed function
+ * hardware, such as the clipper and rasterizer.
+ *
+ * Authors:
+ *   Paul Berry <stereotype441@gmail.com>
+ *   Chris Forbes <chrisf@ijw.co.nz>
+ *   Eric Anholt <eric@anholt.net>
+ */
+
+
+#include "brw_compiler.h"
+#include "common/gen_debug.h"
+
+static inline void
+assign_vue_slot(struct brw_vue_map *vue_map, int varying, int slot)
+{
+   /* Make sure this varying hasn't been assigned a slot already */
+   assert (vue_map->varying_to_slot[varying] == -1);
+   /* Record the mapping in both directions. */
+   vue_map->varying_to_slot[varying] = slot;
+   vue_map->slot_to_varying[slot] = varying;
+}
+
+/**
+ * Compute the VUE map for a shader stage from its valid-slot bitfield.
+ */
+void
+brw_compute_vue_map(const struct gen_device_info *devinfo,
+                    struct brw_vue_map *vue_map,
+                    uint64_t slots_valid,
+                    bool separate)
+{
+   /* Keep using the packed/contiguous layout on old hardware - we only need
+    * the SSO layout when using geometry/tessellation shaders or 32 FS input
+    * varyings, which only exist on Gen >= 6.  It's also a bit more efficient.
+    */
+   if (devinfo->gen < 6)
+      separate = false;
+
+   if (separate) {
+      /* In SSO mode, we don't know whether the adjacent stage will
+       * read/write gl_ClipDistance, which has a fixed slot location.
+       * We have to assume the worst and reserve a slot for it, or else
+       * the rest of our varyings will be off by a slot.
+       *
+       * Note that we don't have to worry about COL/BFC, as those built-in
+       * variables only exist in legacy GL, which only supports VS and FS.
+       */
+      slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0);
+      slots_valid |= BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1);
+   }
+
+   vue_map->slots_valid = slots_valid;
+   vue_map->separate = separate;
+
+   /* gl_Layer and gl_ViewportIndex don't get their own varying slots -- they
+    * are stored in the first VUE slot (VARYING_SLOT_PSIZ).
+    */
+   slots_valid &= ~(VARYING_BIT_LAYER | VARYING_BIT_VIEWPORT);
+
+   /* Make sure that the values we store in vue_map->varying_to_slot and
+    * vue_map->slot_to_varying won't overflow the signed chars that are used
+    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
+    * values equal to BRW_VARYING_SLOT_COUNT, we need to ensure that
+    * BRW_VARYING_SLOT_COUNT is <= 127, not 128.
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 127);
+
+   for (int i = 0; i < BRW_VARYING_SLOT_COUNT; ++i) {
+      vue_map->varying_to_slot[i] = -1;
+      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
+   }
+
+   int slot = 0;
+
+   /* VUE header: format depends on chip generation and whether clipping is
+    * enabled.
+    *
+    * See the Sandybridge PRM, Volume 2 Part 1, section 1.5.1 (page 30),
+    * "Vertex URB Entry (VUE) Formats" which describes the VUE header layout.
+    */
+   if (devinfo->gen < 6) {
+      /* There are 8 dwords in VUE header pre-Ironlake:
+       * dword 0-3 is indices, point width, clip flags.
+       * dword 4-7 is ndc position
+       * dword 8-11 is the first vertex data.
+       *
+       * On Ironlake the VUE header is nominally 20 dwords, but the hardware
+       * will accept the same header layout as Gen4 [and should be a bit faster]
+       */
+      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++);
+      assign_vue_slot(vue_map, BRW_VARYING_SLOT_NDC, slot++);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++);
+   } else {
+      /* There are 8 or 16 DWs (D0-D15) in VUE header on Sandybridge:
+       * dword 0-3 of the header is indices, point width, clip flags.
+       * dword 4-7 is the 4D space position
+       * dword 8-15 of the vertex header is the user clip distance if
+       * enabled.
+       * dword 8-11 or 16-19 is the first vertex element data we fill.
+       */
+      assign_vue_slot(vue_map, VARYING_SLOT_PSIZ, slot++);
+      assign_vue_slot(vue_map, VARYING_SLOT_POS, slot++);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST0))
+         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST0, slot++);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_CLIP_DIST1))
+         assign_vue_slot(vue_map, VARYING_SLOT_CLIP_DIST1, slot++);
+
+      /* front and back colors need to be consecutive so that we can use
+       * ATTRIBUTE_SWIZZLE_INPUTATTR_FACING to swizzle them when doing
+       * two-sided color.
+       */
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL0))
+         assign_vue_slot(vue_map, VARYING_SLOT_COL0, slot++);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC0))
+         assign_vue_slot(vue_map, VARYING_SLOT_BFC0, slot++);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_COL1))
+         assign_vue_slot(vue_map, VARYING_SLOT_COL1, slot++);
+      if (slots_valid & BITFIELD64_BIT(VARYING_SLOT_BFC1))
+         assign_vue_slot(vue_map, VARYING_SLOT_BFC1, slot++);
+   }
+
+   /* The hardware doesn't care about the rest of the vertex outputs, so we
+    * can assign them however we like.  For normal programs, we simply assign
+    * them contiguously.
+    *
+    * For separate shader pipelines, we first assign built-in varyings
+    * contiguous slots.  This works because ARB_separate_shader_objects
+    * requires that all shaders have matching built-in varying interface
+    * blocks.  Next, we assign generic varyings based on their location
+    * (either explicit or linker assigned).  This guarantees a fixed layout.
+    *
+    * We generally don't need to assign a slot for VARYING_SLOT_CLIP_VERTEX,
+    * since it's encoded as the clip distances by emit_clip_distances().
+    * However, it may be output by transform feedback, and we'd rather not
+    * recompute state when TF changes, so we just always include it.
+    */
+   uint64_t builtins = slots_valid & BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (builtins != 0) {
+      const int varying = ffsll(builtins) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      builtins &= ~BITFIELD64_BIT(varying);
+   }
+
+   const int first_generic_slot = slot;
+   uint64_t generics = slots_valid & ~BITFIELD64_MASK(VARYING_SLOT_VAR0);
+   while (generics != 0) {
+      const int varying = ffsll(generics) - 1;
+      if (separate) {
+         slot = first_generic_slot + varying - VARYING_SLOT_VAR0;
+      }
+      assign_vue_slot(vue_map, varying, slot++);
+      generics &= ~BITFIELD64_BIT(varying);
+   }
+   /* Zero counts below make brw_print_vue_map() label this a VUE map. */
+   vue_map->num_slots = slot;
+   vue_map->num_per_vertex_slots = 0;
+   vue_map->num_per_patch_slots = 0;
+}
+
+/**
+ * Compute the VUE map for tessellation control shader outputs and
+ * tessellation evaluation shader inputs.
+ */
+void
+brw_compute_tess_vue_map(struct brw_vue_map *vue_map,
+                         uint64_t vertex_slots,
+                         uint32_t patch_slots)
+{
+   /* I don't think anything actually uses this... */
+   vue_map->slots_valid = vertex_slots;
+
+   /* separate isn't really meaningful, but make sure it's initialized */
+   vue_map->separate = false;
+
+   vertex_slots &= ~(VARYING_BIT_TESS_LEVEL_OUTER |
+                     VARYING_BIT_TESS_LEVEL_INNER);
+
+   /* Make sure that the values we store in vue_map->varying_to_slot and
+    * vue_map->slot_to_varying won't overflow the signed chars that are used
+    * to store them.  Note that since vue_map->slot_to_varying sometimes holds
+    * values equal to VARYING_SLOT_TESS_MAX, we need to ensure that
+    * VARYING_SLOT_TESS_MAX is <= 127, not 128.
+    */
+   STATIC_ASSERT(VARYING_SLOT_TESS_MAX <= 127);
+
+   for (int i = 0; i < VARYING_SLOT_TESS_MAX ; ++i) {
+      vue_map->varying_to_slot[i] = -1;
+      vue_map->slot_to_varying[i] = BRW_VARYING_SLOT_PAD;
+   }
+
+   int slot = 0;
+
+   /* The first 8 DWords are reserved for the "Patch Header".
+    *
+    * VARYING_SLOT_TESS_LEVEL_OUTER / INNER live here, but the exact layout
+    * depends on the domain type.  They might not be in slots 0 and 1 as
+    * described here, but pretending they're separate allows us to uniquely
+    * identify them by distinct slot locations.
+    */
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_INNER, slot++);
+   assign_vue_slot(vue_map, VARYING_SLOT_TESS_LEVEL_OUTER, slot++);
+
+   /* first assign per-patch varyings */
+   while (patch_slots != 0) {
+      const int varying = ffsll(patch_slots) - 1;
+      if (vue_map->varying_to_slot[varying + VARYING_SLOT_PATCH0] == -1) {
+         assign_vue_slot(vue_map, varying + VARYING_SLOT_PATCH0, slot++);
+      }
+      patch_slots &= ~BITFIELD64_BIT(varying);
+   }
+
+   /* apparently, including the patch header... */
+   vue_map->num_per_patch_slots = slot;
+
+   /* then assign per-vertex varyings for each vertex in our patch */
+   while (vertex_slots != 0) {
+      const int varying = ffsll(vertex_slots) - 1;
+      if (vue_map->varying_to_slot[varying] == -1) {
+         assign_vue_slot(vue_map, varying, slot++);
+      }
+      vertex_slots &= ~BITFIELD64_BIT(varying);
+   }
+
+   vue_map->num_per_vertex_slots = slot - vue_map->num_per_patch_slots;
+   vue_map->num_slots = slot;
+}
+
+static const char *
+varying_name(brw_varying_slot slot)
+{
+   assume(slot < BRW_VARYING_SLOT_COUNT);
+
+   if (slot < VARYING_SLOT_MAX)
+      return gl_varying_slot_name(slot);
+   /* BRW_VARYING_SLOT_* names, indexed by (slot - VARYING_SLOT_MAX). */
+   static const char *brw_names[] = {
+      [BRW_VARYING_SLOT_NDC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_NDC",
+      [BRW_VARYING_SLOT_PAD - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PAD",
+      [BRW_VARYING_SLOT_PNTC - VARYING_SLOT_MAX] = "BRW_VARYING_SLOT_PNTC",
+   };
+
+   return brw_names[slot - VARYING_SLOT_MAX];
+}
+
+/* Dump the slot-to-varying table of a VUE map (or tessellation PUE map). */
+void
+brw_print_vue_map(FILE *fp, const struct brw_vue_map *vue_map)
+{
+   const char *layout = vue_map->separate ? "SSO" : "non-SSO";
+   const bool is_pue = vue_map->num_per_vertex_slots > 0 ||
+                       vue_map->num_per_patch_slots > 0;
+
+   if (is_pue) {
+      fprintf(fp, "PUE map (%d slots, %d/patch, %d/vertex, %s)\n",
+              vue_map->num_slots, vue_map->num_per_patch_slots,
+              vue_map->num_per_vertex_slots, layout);
+   } else {
+      fprintf(fp, "VUE map (%d slots, %s)\n", vue_map->num_slots, layout);
+   }
+
+   for (int slot = 0; slot < vue_map->num_slots; slot++) {
+      const int varying = vue_map->slot_to_varying[slot];
+      /* Per-patch varyings are only named specially in the PUE dump. */
+      if (is_pue && varying >= VARYING_SLOT_PATCH0) {
+         fprintf(fp, "  [%d] VARYING_SLOT_PATCH%d\n", slot,
+                 varying - VARYING_SLOT_PATCH0);
+      } else {
+         fprintf(fp, "  [%d] %s\n", slot, varying_name(varying));
+      }
+   }
+   fprintf(fp, "\n");
+}
diff --git a/src/intel/compiler/brw_wm_iz.cpp b/src/intel/compiler/brw_wm_iz.cpp
new file mode 100644
index 00000000000..5162a369765
--- /dev/null
+++ b/src/intel/compiler/brw_wm_iz.cpp
@@ -0,0 +1,169 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <keithw@vmware.com>
+  */
+
+
+#include "brw_fs.h"
+
+
+#undef P                        /* promoted depth */
+#undef C                        /* computed */
+#undef N                        /* non-promoted? */
+
+#define P 0
+#define C 1
+#define N 2
+
+static const struct {
+   GLuint mode:2;          /* P, C or N (defined above) */
+   GLuint sd_present:1;    /* payload includes source depth */
+   GLuint sd_to_rt:1;      /* sets source_depth_to_render_target */
+   GLuint dd_present:1;    /* payload includes destination depth */
+   GLuint ds_present:1;    /* payload includes the AA/dest-stencil register */
+} wm_iz_table[BRW_WM_IZ_BIT_MAX] =
+{
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { N, 0, 1, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 0 },
+ { C, 0, 1, 1, 0 },
+ { C, 0, 1, 1, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 1 },
+ { N, 0, 1, 0, 1 },
+ { N, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { N, 1, 1, 0, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 0, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 0, 1 },
+ { C, 0, 1, 0, 1 },
+ { C, 0, 1, 0, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { P, 0, 0, 0, 0 },
+ { C, 1, 1, 1, 1 },
+ { C, 0, 1, 1, 1 },
+ { C, 0, 1, 1, 1 }
+};
+
+/**
+ * Assign Gen4 FS payload registers from the key's iz_lookup (a bitmask of
+ * BRW_WM_IZ_* flags) and line_aa (BRW_WM_AA_NEVER/ALWAYS/SOMETIMES) fields.
+ */
+void fs_visitor::setup_fs_payload_gen4()
+{
+   assert(stage == MESA_SHADER_FRAGMENT);
+   struct brw_wm_prog_data *prog_data = brw_wm_prog_data(this->prog_data);
+   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
+   GLuint reg = 2;
+   bool kill_stats_promoted_workaround = false;
+   int lookup = key->iz_lookup;
+
+   assert(lookup < BRW_WM_IZ_BIT_MAX);
+
+   /* Crazy workaround in the windowizer, which we need to track in
+    * our register allocation and render target writes.  See the "If
+    * statistics are enabled..." paragraph of 11.5.3.2: Early Depth
+    * Test Cases [Pre-DevGT] of the 3D Pipeline - Windower B-Spec.
+    */
+   if (key->stats_wm &&
+       (lookup & BRW_WM_IZ_PS_KILL_ALPHATEST_BIT) &&
+       wm_iz_table[lookup].mode == P) {
+      kill_stats_promoted_workaround = true;
+   }
+
+   prog_data->uses_src_depth =
+      (nir->info->inputs_read & (1 << VARYING_SLOT_POS)) != 0;
+   if (wm_iz_table[lookup].sd_present || prog_data->uses_src_depth ||
+       kill_stats_promoted_workaround) {
+      payload.source_depth_reg = reg;
+      reg += 2;
+   }
+
+   if (wm_iz_table[lookup].sd_to_rt || kill_stats_promoted_workaround)
+      source_depth_to_render_target = true;
+
+   if (wm_iz_table[lookup].ds_present || key->line_aa != BRW_WM_AA_NEVER) {
+      payload.aa_dest_stencil_reg = reg;
+      runtime_check_aads_emit =
+         !wm_iz_table[lookup].ds_present && key->line_aa == BRW_WM_AA_SOMETIMES;
+      reg++;
+   }
+
+   if (wm_iz_table[lookup].dd_present) {
+      payload.dest_depth_reg = reg;
+      reg+=2;
+   }
+
+   payload.num_regs = reg;
+}
+
diff --git a/src/intel/compiler/gen6_gs_visitor.cpp b/src/intel/compiler/gen6_gs_visitor.cpp
new file mode 100644
index 00000000000..075bc4ad487
--- /dev/null
+++ b/src/intel/compiler/gen6_gs_visitor.cpp
@@ -0,0 +1,753 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * This code is based on original work by Ilia Mirkin.
+ */
+
+/**
+ * \file gen6_gs_visitor.cpp
+ *
+ * Gen6 geometry shader implementation
+ */
+
+#include "gen6_gs_visitor.h"
+#include "brw_eu.h"
+
+namespace brw {
+
+void
+gen6_gs_visitor::emit_prolog()
+{
+   vec4_gs_visitor::emit_prolog();
+
+   /* Gen6 geometry shaders require to allocate an initial VUE handle via
+    * FF_SYNC message, however the documentation remarks that only one thread
+    * can write to the URB simultaneously and the FF_SYNC message provides the
+    * synchronization mechanism for this, so using this message effectively
+    * stalls the thread until it is its turn to write to the URB. Because of
+    * this, the best way to implement geometry shader algorithms in gen6 is to
+    * execute the algorithm before the FF_SYNC message to maximize parallelism.
+    *
+    * To achieve this we buffer the geometry shader outputs for each emitted
+    * vertex in vertex_output during operation. Then, when we have processed
+    * the last vertex (that is, at thread end time), we send the FF_SYNC
+    * message to allocate the initial VUE handle and write all buffered vertex
+    * data to the URB in one go.
+    *
+    * For each emitted vertex, vertex_output will hold vue_map.num_slots
+    * data items plus one additional item to hold required flags
+    * (PrimType, PrimStart, PrimEnd, as expected by the URB_WRITE message)
+    * which come right after the data items for that vertex. Vertex data and
+    * flags for the next vertex come right after the data items and flags for
+    * the previous vertex.
+    */
+   this->current_annotation = "gen6 prolog";
+   this->vertex_output = src_reg(this,
+                                 glsl_type::uint_type,
+                                 (prog_data->vue_map.num_slots + 1) *
+                                 nir->info->gs.vertices_out);
+   this->vertex_output_offset = src_reg(this, glsl_type::uint_type);
+   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+
+   /* MRF 1 will be the header for all messages (FF_SYNC and URB_WRITES),
+    * so initialize it once to R0.
+    */
+   vec4_instruction *inst = emit(MOV(dst_reg(MRF, 1),
+                                     retype(brw_vec8_grf(0, 0),
+                                            BRW_REGISTER_TYPE_UD)));
+   inst->force_writemask_all = true;
+
+   /* This will be used as a temporary to store writeback data of FF_SYNC
+    * and URB_WRITE messages.
+    */
+   this->temp = src_reg(this, glsl_type::uint_type);
+
+   /* This will be used to know when we are processing the first vertex of
+    * a primitive. We will set this to URB_WRITE_PRIM_START only when we know
+    * that we are processing the first vertex in the primitive and to zero
+    * otherwise. This way we can use its value directly in the URB write
+    * headers.
+    */
+   this->first_vertex = src_reg(this, glsl_type::uint_type);
+   emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(URB_WRITE_PRIM_START)));
+
+   /* The FF_SYNC message requires to know the number of primitives generated,
+    * so keep a counter for this.
+    */
+   this->prim_count = src_reg(this, glsl_type::uint_type);
+   emit(MOV(dst_reg(this->prim_count), brw_imm_ud(0u)));
+
+   if (prog->info.has_transform_feedback_varyings) {
+      /* Create a virtual register to hold destination indices in SOL */
+      this->destination_indices = src_reg(this, glsl_type::uvec4_type);
+      /* Create a virtual register to hold number of written primitives */
+      this->sol_prim_written = src_reg(this, glsl_type::uint_type);
+      /* Create a virtual register to hold Streamed Vertex Buffer Indices */
+      this->svbi = src_reg(this, glsl_type::uvec4_type);
+      /* Create a virtual register to hold max values of SVBI */
+      this->max_svbi = src_reg(this, glsl_type::uvec4_type);
+      emit(MOV(dst_reg(this->max_svbi),
+               src_reg(retype(brw_vec1_grf(1, 4), BRW_REGISTER_TYPE_UD))));
+
+      xfb_setup();
+   }
+
+   /* PrimitiveID is delivered in r0.1 of the thread payload. If the program
+    * needs it we have to move it to a separate register where we can map
+    * the attribute.
+    *
+    * Notice that we cannot use a virtual register for this, because we need to
+    * map all input attributes to hardware registers in setup_payload(),
+    * which happens before virtual registers are mapped to hardware registers.
+    * We could work around that issue if we were able to compute the first
+    * non-payload register here and move the PrimitiveID information to that
+    * register, but we can't because at this point we don't know the final
+    * number of uniforms that will be included in the payload.
+    *
+    * So, what we do is to place PrimitiveID information in r1, which is always
+    * delivered as part of the payload, but it's only populated with data
+    * relevant for transform feedback when we set GEN6_GS_SVBI_PAYLOAD_ENABLE
+    * in the 3DSTATE_GS state packet. That information can be obtained by other
+    * means though, so we can safely use r1 for this purpose.
+    */
+   if (gs_prog_data->include_primitive_id) {
+      this->primitive_id =
+         src_reg(retype(brw_vec8_grf(1, 0), BRW_REGISTER_TYPE_UD));
+      emit(GS_OPCODE_SET_PRIMITIVE_ID, dst_reg(this->primitive_id));
+   }
+}
+
+void
+gen6_gs_visitor::gs_emit_vertex(int stream_id)
+{
+   this->current_annotation = "gen6 emit vertex";
+
+   /* Buffer all output slots for this vertex in vertex_output */
+   for (int slot = 0; slot < prog_data->vue_map.num_slots; ++slot) {
+      int varying = prog_data->vue_map.slot_to_varying[slot];
+      if (varying != VARYING_SLOT_PSIZ) {
+         dst_reg dst(this->vertex_output);
+         dst.reladdr = ralloc(mem_ctx, src_reg);
+         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+         emit_urb_slot(dst, varying);
+      } else {
+         /* The PSIZ slot can pack multiple varyings in different channels
+          * and emit_urb_slot() will produce a MOV instruction for each of
+          * them. Since we are writing to an array, that will translate to
+          * possibly multiple MOV instructions with an array destination and
+          * each will generate a scratch write with the same offset into
+          * scratch space (thus, each one overwriting the previous). This is
+          * not what we want. What we will do instead is emit PSIZ to a
+          * regular temporary register, then move that register into the
+          * array. This way we only have one instruction with an array
+          * destination and we only produce a single scratch write.
+          */
+         dst_reg tmp = dst_reg(src_reg(this, glsl_type::uvec4_type));
+         emit_urb_slot(tmp, varying);
+         dst_reg dst(this->vertex_output);
+         dst.reladdr = ralloc(mem_ctx, src_reg);
+         memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+         vec4_instruction *inst = emit(MOV(dst, src_reg(tmp)));
+         inst->force_writemask_all = true;
+      }
+
+      emit(ADD(dst_reg(this->vertex_output_offset),
+               this->vertex_output_offset, brw_imm_ud(1u)));
+   }
+
+   /* Now buffer flags for this vertex */
+   dst_reg dst(this->vertex_output);
+   dst.reladdr = ralloc(mem_ctx, src_reg);
+   memcpy(dst.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+   if (nir->info->gs.output_primitive == GL_POINTS) {
+      /* If we are outputting points, then every vertex has PrimStart and
+       * PrimEnd set.
+       */
+      emit(MOV(dst, brw_imm_d((_3DPRIM_POINTLIST << URB_WRITE_PRIM_TYPE_SHIFT) |
+                              URB_WRITE_PRIM_START | URB_WRITE_PRIM_END)));
+      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
+   } else {
+      /* Otherwise, we can only set the PrimStart flag, which we have stored
+       * in the first_vertex register. We will have to wait until we execute
+       * EndPrimitive() or we end the thread to set the PrimEnd flag on a
+       * vertex.
+       */
+      emit(OR(dst, this->first_vertex,
+              brw_imm_ud(gs_prog_data->output_topology <<
+                         URB_WRITE_PRIM_TYPE_SHIFT)));
+      emit(MOV(dst_reg(this->first_vertex), brw_imm_ud(0u)));
+   }
+   emit(ADD(dst_reg(this->vertex_output_offset),
+            this->vertex_output_offset, brw_imm_ud(1u)));
+}
+
+void
+gen6_gs_visitor::gs_end_primitive()
+{
+   this->current_annotation = "gen6 end primitive";
+   /* Calling EndPrimitive() is optional for point output. In this case we set
+    * the PrimEnd flag when we process EmitVertex().
+    */
+   if (nir->info->gs.output_primitive == GL_POINTS)
+      return;
+
+   /* Otherwise we know that the last vertex we have processed was the last
+    * vertex in the primitive and we need to set its PrimEnd flag, but only
+    * if at least one vertex has been emitted (vertex_count != 0).
+    *
+    * Notice that we have already incremented vertex_count when we processed
+    * the last emit_vertex, so we need to take that into account in the
+    * first comparison below (hence the num_output_vertices + 1 used as its
+    * upper bound).
+    */
+   unsigned num_output_vertices = nir->info->gs.vertices_out;
+   emit(CMP(dst_null_ud(), this->vertex_count,
+            brw_imm_ud(num_output_vertices + 1), BRW_CONDITIONAL_L));
+   vec4_instruction *inst = emit(CMP(dst_null_ud(),
+                                     this->vertex_count, brw_imm_ud(0u),
+                                     BRW_CONDITIONAL_NEQ));
+   inst->predicate = BRW_PREDICATE_NORMAL;
+   emit(IF(BRW_PREDICATE_NORMAL));
+   {
+      /* vertex_output_offset is already pointing at the first entry of the
+       * next vertex. So subtract 1 to modify the flags for the previous
+       * vertex.
+       */
+      src_reg offset(this, glsl_type::uint_type);
+      emit(ADD(dst_reg(offset), this->vertex_output_offset, brw_imm_d(-1)));
+
+      src_reg dst(this->vertex_output);
+      dst.reladdr = ralloc(mem_ctx, src_reg);
+      memcpy(dst.reladdr, &offset, sizeof(src_reg));
+
+      emit(OR(dst_reg(dst), dst, brw_imm_d(URB_WRITE_PRIM_END)));
+      emit(ADD(dst_reg(this->prim_count), this->prim_count, brw_imm_ud(1u)));
+
+      /* Set the first vertex flag to indicate that the next vertex will start
+       * a primitive.
+       */
+      emit(MOV(dst_reg(this->first_vertex), brw_imm_d(URB_WRITE_PRIM_START)));
+   }
+   emit(BRW_OPCODE_ENDIF);
+}
+
+void
+gen6_gs_visitor::emit_urb_write_header(int mrf)
+{
+   this->current_annotation = "gen6 urb header";
+   /* Compute offset of the flags for the current vertex in vertex_output and
+    * write them into DWord 2 of the message header.
+    *
+    * Notice that by the time that emit_thread_end() calls here
+    * vertex_output_offset should point to the first data item of the current
+    * vertex in vertex_output, thus we only need to add the number of output
+    * slots per vertex to that offset to obtain the flags data offset.
+    */
+   src_reg flags_offset(this, glsl_type::uint_type);
+   emit(ADD(dst_reg(flags_offset),
+            this->vertex_output_offset,
+            brw_imm_d(prog_data->vue_map.num_slots)));
+
+   src_reg flags_data(this->vertex_output);
+   flags_data.reladdr = ralloc(mem_ctx, src_reg);
+   memcpy(flags_data.reladdr, &flags_offset, sizeof(src_reg));
+
+   emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, mrf), flags_data);
+}
+
+static int
+align_interleaved_urb_mlen(int mlen)
+{
+   /* URB data written (excluding the message header reg counted in mlen)
+    * must be a multiple of 256 bits, or 2 VS registers, so round mlen up
+    * to an odd value.  See vol5c.5, section 5.4.3.2.2: URB_INTERLEAVED.
+    */
+   if ((mlen % 2) != 1)
+      mlen++;
+   return mlen;
+}
+
+void
+gen6_gs_visitor::emit_urb_write_opcode(bool complete, int base_mrf,
+                                       int last_mrf, int urb_offset)
+{
+   vec4_instruction *inst = NULL;
+
+   if (!complete) {
+      /* Vertex not complete yet: emit a plain URB write with no flags set */
+      inst = emit(GS_OPCODE_URB_WRITE);
+      inst->urb_write_flags = BRW_URB_WRITE_NO_FLAGS;
+   } else {
+      /* Otherwise we always request to allocate a new VUE handle. If this is
+       * the last write before the EOT message and the new handle never gets
+       * used it will be dereferenced when we send the EOT message. This is
+       * necessary to avoid different setups for the EOT message (one for the
+       * case when there is no output and another for the case when there is)
+       * which would require to end the program with an IF/ELSE/ENDIF block,
+       * something we do not want.
+       */
+      inst = emit(GS_OPCODE_URB_WRITE_ALLOCATE);
+      inst->urb_write_flags = BRW_URB_WRITE_COMPLETE;
+      inst->dst = dst_reg(MRF, base_mrf);
+      inst->src[0] = this->temp;
+   }
+
+   inst->base_mrf = base_mrf;
+   inst->mlen = align_interleaved_urb_mlen(last_mrf - base_mrf);
+   inst->offset = urb_offset;
+}
+
+void
+gen6_gs_visitor::emit_thread_end()
+{
+   /* Make sure the current primitive is ended: we know it is not ended when
+    * first_vertex is zero. This is only relevant for outputs other than
+    * points because in the point case we set PrimEnd on all vertices.
+    */
+   if (nir->info->gs.output_primitive != GL_POINTS) {
+      emit(CMP(dst_null_ud(), this->first_vertex, brw_imm_ud(0u), BRW_CONDITIONAL_Z));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      gs_end_primitive();
+      emit(BRW_OPCODE_ENDIF);
+   }
+
+   /* Here we have to:
+    * 1) Emit an FF_SYNC message to obtain an initial VUE handle.
+    * 2) Loop over all buffered vertex data and write it to corresponding
+    *    URB entries.
+    * 3) Allocate new VUE handles for all vertices other than the first.
+    * 4) Send a final EOT message.
+    */
+
+   /* MRF 0 is reserved for the debugger, so start with message header
+    * in MRF 1.
+    */
+   int base_mrf = 1;
+
+   /* In the process of generating our URB write message contents, we
+    * may need to unspill a register or load from an array.  Those
+    * reads would use MRFs 21..23
+    */
+   int max_usable_mrf = FIRST_SPILL_MRF(devinfo->gen);
+
+   /* Issue the FF_SYNC message and obtain the initial VUE handle. */
+   emit(CMP(dst_null_ud(), this->vertex_count, brw_imm_ud(0u), BRW_CONDITIONAL_G));
+   emit(IF(BRW_PREDICATE_NORMAL));
+   {
+      this->current_annotation = "gen6 thread end: ff_sync";
+
+      vec4_instruction *inst;
+      if (prog->info.has_transform_feedback_varyings) {
+         src_reg sol_temp(this, glsl_type::uvec4_type);
+         emit(GS_OPCODE_FF_SYNC_SET_PRIMITIVES,
+              dst_reg(this->svbi),
+              this->vertex_count,
+              this->prim_count,
+              sol_temp);
+         inst = emit(GS_OPCODE_FF_SYNC,
+                     dst_reg(this->temp), this->prim_count, this->svbi);
+      } else {
+         inst = emit(GS_OPCODE_FF_SYNC,
+                     dst_reg(this->temp), this->prim_count, brw_imm_ud(0u));
+      }
+      inst->base_mrf = base_mrf;
+
+      /* Loop over all buffered vertices and emit URB write messages */
+      this->current_annotation = "gen6 thread end: urb writes init";
+      src_reg vertex(this, glsl_type::uint_type);
+      emit(MOV(dst_reg(vertex), brw_imm_ud(0u)));
+      emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+
+      this->current_annotation = "gen6 thread end: urb writes";
+      emit(BRW_OPCODE_DO);
+      {
+         emit(CMP(dst_null_d(), vertex, this->vertex_count, BRW_CONDITIONAL_GE));
+         inst = emit(BRW_OPCODE_BREAK);
+         inst->predicate = BRW_PREDICATE_NORMAL;
+
+         /* First we prepare the message header */
+         emit_urb_write_header(base_mrf);
+
+         /* Then add vertex data to the message in interleaved fashion */
+         int slot = 0;
+         bool complete = false;
+         do {
+            int mrf = base_mrf + 1;
+
+            /* URB offset is in URB row increments, and each of our MRFs is half
+             * of one of those, since we're doing interleaved writes.
+             */
+            int urb_offset = slot / 2;
+
+            for (; slot < prog_data->vue_map.num_slots; ++slot) {
+               int varying = prog_data->vue_map.slot_to_varying[slot];
+               current_annotation = output_reg_annotation[varying];
+
+               /* Compute offset of this slot for the current vertex
+                * in vertex_output
+                */
+               src_reg data(this->vertex_output);
+               data.reladdr = ralloc(mem_ctx, src_reg);
+               memcpy(data.reladdr, &this->vertex_output_offset,
+                      sizeof(src_reg));
+
+               /* Copy this slot to the appropriate message register */
+               dst_reg reg = dst_reg(MRF, mrf);
+               reg.type = output_reg[varying][0].type;
+               data.type = reg.type;
+               vec4_instruction *inst = emit(MOV(reg, data));
+               inst->force_writemask_all = true;
+
+               mrf++;
+               emit(ADD(dst_reg(this->vertex_output_offset),
+                        this->vertex_output_offset, brw_imm_ud(1u)));
+
+               /* If this was max_usable_mrf, we can't fit anything more into
+                * this URB WRITE. Same if we reached the max. message length.
+                */
+               if (mrf > max_usable_mrf ||
+                   align_interleaved_urb_mlen(mrf - base_mrf + 1) > BRW_MAX_MSG_LENGTH) {
+                  slot++;
+                  break;
+               }
+            }
+
+            complete = slot >= prog_data->vue_map.num_slots;
+            emit_urb_write_opcode(complete, base_mrf, mrf, urb_offset);
+         } while (!complete);
+
+         /* Skip over the flags data item so that vertex_output_offset points
+          * to the first data item of the next vertex, so that we can start
+          * writing the next vertex.
+          */
+         emit(ADD(dst_reg(this->vertex_output_offset),
+                  this->vertex_output_offset, brw_imm_ud(1u)));
+
+         emit(ADD(dst_reg(vertex), vertex, brw_imm_ud(1u)));
+      }
+      emit(BRW_OPCODE_WHILE);
+
+      if (prog->info.has_transform_feedback_varyings)
+         xfb_write();
+   }
+   emit(BRW_OPCODE_ENDIF);
+
+   /* Finally, emit EOT message.
+    *
+    * In gen6 we need to end the thread differently depending on whether we have
+    * emitted at least one vertex or not. In case we did, the EOT message must
+    * always include the COMPLETE flag or else the GPU hangs. If we have not
+    * produced any output we can't use the COMPLETE flag.
+    *
+    * However, this would lead us to end the program with an ENDIF opcode,
+    * which we want to avoid, so what we do is that we always request a new
+    * VUE handle every time we do a URB WRITE, even for the last vertex we emit.
+    * With this we make sure that whether we have emitted at least one vertex
+    * or none at all, we have to finish the thread without writing to the URB,
+    * which works for both cases by setting the COMPLETE and UNUSED flags in
+    * the EOT message.
+    */
+   this->current_annotation = "gen6 thread end: EOT";
+
+   if (prog->info.has_transform_feedback_varyings) {
+      /* When emitting EOT, set SONumPrimsWritten Increment Value. */
+      src_reg data(this, glsl_type::uint_type);
+      emit(AND(dst_reg(data), this->sol_prim_written, brw_imm_ud(0xffffu)));
+      emit(SHL(dst_reg(data), data, brw_imm_ud(16u)));
+      emit(GS_OPCODE_SET_DWORD_2, dst_reg(MRF, base_mrf), data);
+   }
+
+   vec4_instruction *inst = emit(GS_OPCODE_THREAD_END);
+   inst->urb_write_flags = BRW_URB_WRITE_COMPLETE | BRW_URB_WRITE_UNUSED;
+   inst->base_mrf = base_mrf;
+   inst->mlen = 1;
+}
+
+void
+gen6_gs_visitor::setup_payload()
+{
+   int attribute_map[BRW_VARYING_SLOT_COUNT * MAX_GS_INPUT_VERTICES];
+
+   /* Attributes are going to be interleaved, so one register contains two
+    * attribute slots.
+    */
+   int attributes_per_reg = 2;
+
+   /* If a geometry shader tries to read from an input that wasn't written by
+    * the vertex shader, that produces undefined results, but it shouldn't
+    * crash anything.  So initialize attribute_map to zeros--that ensures that
+    * these undefined results are read from r0.
+    */
+   memset(attribute_map, 0, sizeof(attribute_map));
+
+   int reg = 0;
+
+   /* r0 is always part of the payload and holds important data: skip it. */
+   reg++;
+
+   /* r1 is always part of the payload and it holds information relevant
+    * for transform feedback when we set the GEN6_GS_SVBI_PAYLOAD_ENABLE bit in
+    * the 3DSTATE_GS packet. We will overwrite it with the PrimitiveID
+    * information (and move the original value to a virtual register if
+    * necessary).
+    */
+   if (gs_prog_data->include_primitive_id)
+      attribute_map[VARYING_SLOT_PRIMITIVE_ID] = attributes_per_reg * reg;
+   reg++;
+
+   reg = setup_uniforms(reg);
+
+   reg = setup_varying_inputs(reg, attribute_map, attributes_per_reg);
+
+   lower_attributes_to_hw_regs(attribute_map, true);
+
+   this->first_non_payload_grf = reg;
+}
+
+void
+gen6_gs_visitor::xfb_setup()
+{
+   static const unsigned swizzle_for_offset[4] = {
+      BRW_SWIZZLE4(0, 1, 2, 3),
+      BRW_SWIZZLE4(1, 2, 3, 3),
+      BRW_SWIZZLE4(2, 3, 3, 3),
+      BRW_SWIZZLE4(3, 3, 3, 3)
+   };
+
+   const struct gl_transform_feedback_info *linked_xfb_info =
+      this->prog->sh.LinkedTransformFeedback;
+   int i;
+
+   /* Make sure that the VUE slots won't overflow the unsigned chars in
+    * gs_prog_data->transform_feedback_bindings[].
+    */
+   STATIC_ASSERT(BRW_VARYING_SLOT_COUNT <= 256);
+
+   /* Make sure that we don't need more binding table entries than we've
+    * set aside for use in transform feedback.  (We shouldn't, since we
+    * set aside enough binding table entries to have one per component).
+    */
+   assert(linked_xfb_info->NumOutputs <= BRW_MAX_SOL_BINDINGS);
+
+   gs_prog_data->num_transform_feedback_bindings = linked_xfb_info->NumOutputs;
+   for (i = 0; i < gs_prog_data->num_transform_feedback_bindings; i++) {
+      gs_prog_data->transform_feedback_bindings[i] =
+         linked_xfb_info->Outputs[i].OutputRegister;
+      gs_prog_data->transform_feedback_swizzles[i] =
+         swizzle_for_offset[linked_xfb_info->Outputs[i].ComponentOffset];
+   }
+}
+
+void
+gen6_gs_visitor::xfb_write()
+{
+   unsigned num_verts;
+
+   if (!gs_prog_data->num_transform_feedback_bindings)
+      return;
+
+   switch (gs_prog_data->output_topology) {
+   case _3DPRIM_POINTLIST:
+      num_verts = 1;
+      break;
+   case _3DPRIM_LINELIST:
+   case _3DPRIM_LINESTRIP:
+   case _3DPRIM_LINELOOP:
+      num_verts = 2;
+      break;
+   case _3DPRIM_TRILIST:
+   case _3DPRIM_TRIFAN:
+   case _3DPRIM_TRISTRIP:
+   case _3DPRIM_RECTLIST:
+      num_verts = 3;
+      break;
+   case _3DPRIM_QUADLIST:
+   case _3DPRIM_QUADSTRIP:
+   case _3DPRIM_POLYGON:
+      num_verts = 3;
+      break;
+   default:
+      unreachable("Unexpected primitive type in Gen6 SOL program.");
+   }
+
+   this->current_annotation = "gen6 thread end: svb writes init";
+
+   emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_ud(0u)));
+   emit(MOV(dst_reg(this->sol_prim_written), brw_imm_ud(0u)));
+
+   /* Check that at least one primitive can be written
+    *
+    * Note: since we use the binding table to keep track of buffer offsets
+    * and stride, the GS doesn't need to keep track of a separate pointer
+    * into each buffer; it uses a single pointer which increments by 1 for
+    * each vertex.  So we use SVBI0 for this pointer, regardless of whether
+    * transform feedback is in interleaved or separate attribs mode.
+    */
+   src_reg sol_temp(this, glsl_type::uvec4_type);
+   emit(ADD(dst_reg(sol_temp), this->svbi, brw_imm_ud(num_verts)));
+
+   /* Compare SVBI calculated number with the maximum value, which is
+    * in R1.4 (previously saved in this->max_svbi) for gen6.
+    */
+   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
+   emit(IF(BRW_PREDICATE_NORMAL));
+   {
+      vec4_instruction *inst = emit(MOV(dst_reg(destination_indices),
+                                        brw_imm_vf4(brw_float_to_vf(0.0),
+                                                    brw_float_to_vf(1.0),
+                                                    brw_float_to_vf(2.0),
+                                                    brw_float_to_vf(0.0))));
+      inst->force_writemask_all = true;
+
+      emit(ADD(dst_reg(this->destination_indices),
+               this->destination_indices,
+               this->svbi));
+   }
+   emit(BRW_OPCODE_ENDIF);
+
+   /* Write transform feedback data for every vertex below vertex_count. */
+   for (int i = 0; i < (int)nir->info->gs.vertices_out; i++) {
+      emit(MOV(dst_reg(sol_temp), brw_imm_d(i)));
+      emit(CMP(dst_null_d(), sol_temp, this->vertex_count,
+               BRW_CONDITIONAL_L));
+      emit(IF(BRW_PREDICATE_NORMAL));
+      {
+         xfb_program(i, num_verts);
+      }
+      emit(BRW_OPCODE_ENDIF);
+   }
+}
+
+void
+gen6_gs_visitor::xfb_program(unsigned vertex, unsigned num_verts)
+{
+   unsigned binding;
+   unsigned num_bindings = gs_prog_data->num_transform_feedback_bindings;
+   src_reg sol_temp(this, glsl_type::uvec4_type);
+
+   /* Check for buffer overflow: we need room to write the complete primitive
+    * (all vertices). Otherwise, avoid writing any vertices for it.
+    */
+   emit(ADD(dst_reg(sol_temp), this->sol_prim_written, brw_imm_ud(1u)));
+   emit(MUL(dst_reg(sol_temp), sol_temp, brw_imm_ud(num_verts)));
+   emit(ADD(dst_reg(sol_temp), sol_temp, this->svbi));
+   emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE));
+   emit(IF(BRW_PREDICATE_NORMAL));
+   {
+      /* Avoid overwriting MRF 1 as it is used as URB write message header */
+      dst_reg mrf_reg(MRF, 2);
+
+      this->current_annotation = "gen6: emit SOL vertex data";
+      /* For each vertex, generate code to output each varying using the
+       * appropriate binding table entry.
+       */
+      for (binding = 0; binding < num_bindings; ++binding) {
+         unsigned char varying =
+            gs_prog_data->transform_feedback_bindings[binding];
+
+         /* Set up the correct destination index for this vertex */
+         vec4_instruction *inst = emit(GS_OPCODE_SVB_SET_DST_INDEX,
+                                       mrf_reg,
+                                       this->destination_indices);
+         inst->sol_vertex = vertex % num_verts;
+
+         /* From the Sandybridge PRM, Volume 2, Part 1, Section 4.5.1:
+          *
+          *   "Prior to End of Thread with a URB_WRITE, the kernel must
+          *   ensure that all writes are complete by sending the final
+          *   write as a committed write."
+          */
+         bool final_write = binding == (unsigned) num_bindings - 1 &&
+                            inst->sol_vertex == num_verts - 1;
+
+         /* Compute offset of this varying for the current vertex
+          * in vertex_output
+          */
+         this->current_annotation = output_reg_annotation[varying];
+         src_reg data(this->vertex_output);
+         data.reladdr = ralloc(mem_ctx, src_reg);
+         int offset = get_vertex_output_offset_for_varying(vertex, varying);
+         emit(MOV(dst_reg(this->vertex_output_offset), brw_imm_d(offset)));
+         memcpy(data.reladdr, &this->vertex_output_offset, sizeof(src_reg));
+         data.type = output_reg[varying][0].type;
+
+         /* PSIZ, LAYER and VIEWPORT are packed in different channels of the
+          * same slot, so make sure we write the appropriate channel
+          */
+         if (varying == VARYING_SLOT_PSIZ)
+            data.swizzle = BRW_SWIZZLE_WWWW;
+         else if (varying == VARYING_SLOT_LAYER)
+            data.swizzle = BRW_SWIZZLE_YYYY;
+         else if (varying == VARYING_SLOT_VIEWPORT)
+            data.swizzle = BRW_SWIZZLE_ZZZZ;
+         else
+            data.swizzle = gs_prog_data->transform_feedback_swizzles[binding];
+
+         /* Write data */
+         inst = emit(GS_OPCODE_SVB_WRITE, mrf_reg, data, sol_temp);
+         inst->sol_binding = binding;
+         inst->sol_final_write = final_write;
+
+         if (final_write) {
+            /* This was the final write for the last vertex of the primitive,
+             * so increment the SO primitive counter and destination indices.
+             */
+            emit(ADD(dst_reg(this->destination_indices),
+                     this->destination_indices,
+                     brw_imm_ud(num_verts)));
+            emit(ADD(dst_reg(this->sol_prim_written),
+                     this->sol_prim_written, brw_imm_ud(1u)));
+         }
+
+      }
+      this->current_annotation = NULL;
+   }
+   emit(BRW_OPCODE_ENDIF);
+}
+
+int
+gen6_gs_visitor::get_vertex_output_offset_for_varying(int vertex, int varying)
+{
+   /* Find the output slot assigned to this varying.
+    *
+    * VARYING_SLOT_LAYER and VARYING_SLOT_VIEWPORT are packed in the same slot
+    * as VARYING_SLOT_PSIZ.
+    */
+   if (varying == VARYING_SLOT_LAYER || varying == VARYING_SLOT_VIEWPORT)
+      varying = VARYING_SLOT_PSIZ;
+   int slot = prog_data->vue_map.varying_to_slot[varying];
+
+   if (slot < 0) {
+      /* This varying does not exist in the VUE so we are not writing to it
+       * and its value is undefined. We still want to return a valid offset
+       * into vertex_output though, to prevent any out-of-bound accesses into
+       * the vertex_output array. Since the value for this varying is undefined
+       * we don't really care about the value we assign to it, so any offset
+       * within the limits of vertex_output will do.
+       */
+      slot = 0;
+   }
+
+   return vertex * (prog_data->vue_map.num_slots + 1) + slot;
+}
+
+} /* namespace brw */
diff --git a/src/intel/compiler/gen6_gs_visitor.h b/src/intel/compiler/gen6_gs_visitor.h
new file mode 100644
index 00000000000..1bdcf925880
--- /dev/null
+++ b/src/intel/compiler/gen6_gs_visitor.h
@@ -0,0 +1,91 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ */
+
+#ifndef GEN6_GS_VISITOR_H
+#define GEN6_GS_VISITOR_H
+
+#include "brw_vec4.h"
+#include "brw_vec4_gs_visitor.h"
+
+#ifdef __cplusplus
+
+namespace brw {
+
+class gen6_gs_visitor : public vec4_gs_visitor
+{
+public:
+   gen6_gs_visitor(const struct brw_compiler *comp,
+                   void *log_data,
+                   struct brw_gs_compile *c,
+                   struct brw_gs_prog_data *prog_data,
+                   struct gl_program *prog,
+                   const nir_shader *shader,
+                   void *mem_ctx,
+                   bool no_spills,
+                   int shader_time_index) :
+      vec4_gs_visitor(comp, log_data, c, prog_data, shader, mem_ctx, no_spills,
+                      shader_time_index),
+      prog(prog)
+      {
+      }
+
+protected:
+   virtual void emit_prolog();
+   virtual void emit_thread_end();
+   virtual void gs_emit_vertex(int stream_id);
+   virtual void gs_end_primitive();
+   virtual void emit_urb_write_header(int mrf);
+   virtual void emit_urb_write_opcode(bool complete,
+                                      int base_mrf,
+                                      int last_mrf,
+                                      int urb_offset);
+   virtual void setup_payload();
+
+private:
+   void xfb_write();
+   void xfb_program(unsigned vertex, unsigned num_verts);
+   void xfb_setup();
+   int get_vertex_output_offset_for_varying(int vertex, int varying);
+
+   const struct gl_program *prog;
+
+   src_reg vertex_output;
+   src_reg vertex_output_offset;
+   src_reg temp;
+   src_reg first_vertex;
+   src_reg prim_count;
+   src_reg primitive_id;
+
+   /* Transform Feedback members */
+   src_reg sol_prim_written;
+   src_reg svbi;
+   src_reg max_svbi;
+   src_reg destination_indices;
+};
+
+} /* namespace brw */
+
+#endif /* __cplusplus */
+
+#endif /* GEN6_GS_VISITOR_H */
diff --git a/src/intel/compiler/intel_asm_annotation.c b/src/intel/compiler/intel_asm_annotation.c
new file mode 100644
index 00000000000..1f3b78476e3
--- /dev/null
+++ b/src/intel/compiler/intel_asm_annotation.c
@@ -0,0 +1,198 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "brw_cfg.h"
+#include "brw_eu.h"
+#include "common/gen_debug.h"
+#include "intel_asm_annotation.h"
+#include "compiler/nir/nir.h"
+
+void
+dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
+              const struct gen_device_info *devinfo)
+{
+   const char *last_annotation_string = NULL;
+   const void *last_annotation_ir = NULL;
+
+   for (int i = 0; i < num_annotations; i++) {
+      int start_offset = annotation[i].offset;
+      int end_offset = annotation[i + 1].offset;
+
+      if (annotation[i].block_start) {
+         fprintf(stderr, "   START B%d", annotation[i].block_start->num);
+         foreach_list_typed(struct bblock_link, predecessor_link, link,
+                            &annotation[i].block_start->parents) {
+            struct bblock_t *predecessor_block = predecessor_link->block;
+            fprintf(stderr, " <-B%d", predecessor_block->num);
+         }
+         fprintf(stderr, " (%u cycles)\n", annotation[i].block_start->cycle_count);
+      }
+
+      if (last_annotation_ir != annotation[i].ir) {
+         last_annotation_ir = annotation[i].ir;
+         if (last_annotation_ir) {
+            fprintf(stderr, "   ");
+            nir_print_instr(annotation[i].ir, stderr);
+            fprintf(stderr, "\n");
+         }
+      }
+
+      if (last_annotation_string != annotation[i].annotation) {
+         last_annotation_string = annotation[i].annotation;
+         if (last_annotation_string)
+            fprintf(stderr, "   %s\n", last_annotation_string);
+      }
+
+      brw_disassemble(devinfo, assembly, start_offset, end_offset, stderr);
+
+      if (annotation[i].error) {
+         fputs(annotation[i].error, stderr);
+      }
+
+      if (annotation[i].block_end) {
+         fprintf(stderr, "   END B%d", annotation[i].block_end->num);
+         foreach_list_typed(struct bblock_link, successor_link, link,
+                            &annotation[i].block_end->children) {
+            struct bblock_t *successor_block = successor_link->block;
+            fprintf(stderr, " ->B%d", successor_block->num);
+         }
+         fprintf(stderr, "\n");
+      }
+   }
+   fprintf(stderr, "\n");
+}
+
+static bool
+annotation_array_ensure_space(struct annotation_info *annotation)
+{
+   if (annotation->ann_size <= annotation->ann_count) {
+      /* Publish the new buffer and size only if the allocation succeeds. */
+      const int new_size = MAX2(1024, annotation->ann_size * 2);
+      struct annotation *grown = reralloc(annotation->mem_ctx, annotation->ann,
+                                          struct annotation, new_size);
+      if (!grown)
+         return false;
+      memset(grown + annotation->ann_size, 0,
+             (new_size - annotation->ann_size) * sizeof(struct annotation));
+      annotation->ann = grown;
+      annotation->ann_size = new_size;
+   }
+   return true;
+}
+
+void annotate(const struct gen_device_info *devinfo,
+              struct annotation_info *annotation, const struct cfg_t *cfg,
+              struct backend_instruction *inst, unsigned offset)
+{
+   if (annotation->mem_ctx == NULL)
+      annotation->mem_ctx = ralloc_context(NULL);
+
+   if (!annotation_array_ensure_space(annotation))
+      return;
+
+   struct annotation *ann = &annotation->ann[annotation->ann_count++];
+   ann->offset = offset;
+   if ((INTEL_DEBUG & DEBUG_ANNOTATION) != 0) {
+      ann->ir = inst->ir;
+      ann->annotation = inst->annotation;
+   }
+
+   if (bblock_start(cfg->blocks[annotation->cur_block]) == inst) {
+      ann->block_start = cfg->blocks[annotation->cur_block];
+   }
+
+   /* There is no hardware DO instruction on Gen6+, so since DO always
+    * starts a basic block, we need to set the .block_start of the next
+    * instruction's annotation with a pointer to the bblock started by
+    * the DO.
+    *
+    * Emitting an annotation without a corresponding hardware instruction
+    * to disassemble would also only add complication.
+    */
+   if (devinfo->gen >= 6 && inst->opcode == BRW_OPCODE_DO) {
+      annotation->ann_count--;
+   }
+
+   if (bblock_end(cfg->blocks[annotation->cur_block]) == inst) {
+      ann->block_end = cfg->blocks[annotation->cur_block];
+      annotation->cur_block++;
+   }
+}
+
+void
+annotation_finalize(struct annotation_info *annotation,
+                    unsigned next_inst_offset)
+{
+   /* Nothing to terminate when no group was recorded.  Otherwise make room
+    * for the sentinel stored at ann[ann_count]; growing through the shared
+    * helper keeps ann_size in step with the buffer and catches OOM.
+    */
+   if (!annotation->ann_count || !annotation_array_ensure_space(annotation))
+      return;
+
+   annotation->ann[annotation->ann_count].offset = next_inst_offset;
+}
+
+void
+annotation_insert_error(struct annotation_info *annotation, unsigned offset,
+                        const char *error)
+{
+   if (!annotation->ann_count)
+      return;
+
+   /* Splitting a group adds one entry and moves the end sentinel stored at
+    * ann[ann_count] up by one, so ann_count + 2 slots must be allocated.
+    */
+   if (annotation->ann_size < annotation->ann_count + 2) {
+      void *grown = reralloc(annotation->mem_ctx, annotation->ann,
+                             struct annotation, annotation->ann_count + 2);
+      if (!grown)
+         return;
+      annotation->ann = grown;
+      annotation->ann_size = annotation->ann_count + 2;
+   }
+
+   /* Pick the first group that ends past offset, else the last group. */
+   int i = 0;
+   while (i < annotation->ann_count - 1 &&
+          annotation->ann[i + 1].offset <= offset)
+      i++;
+   struct annotation *ann = &annotation->ann[i];
+   struct annotation *next = ann + 1;
+
+   if (next->offset > offset && offset + sizeof(brw_inst) != next->offset) {
+      memmove(next, ann,
+              (annotation->ann_count - i + 1) * sizeof(struct annotation));
+      ann->error = NULL;
+      ann->error_length = 0;
+      ann->block_end = NULL;
+      next->offset = offset + sizeof(brw_inst);
+      next->block_start = NULL;
+      annotation->ann_count++;
+   }
+
+   if (ann->error)
+      ralloc_strcat(&ann->error, error);
+   else
+      ann->error = ralloc_strdup(annotation->mem_ctx, error);
+}
diff --git a/src/intel/compiler/intel_asm_annotation.h b/src/intel/compiler/intel_asm_annotation.h
new file mode 100644
index 00000000000..2d905b10a96
--- /dev/null
+++ b/src/intel/compiler/intel_asm_annotation.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#ifndef _INTEL_ASM_ANNOTATION_H
+#define _INTEL_ASM_ANNOTATION_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct backend_instruction;
+struct cfg_t;
+
+struct annotation {
+   int offset;
+
+   size_t error_length;
+   char *error;
+
+   /* Pointers to the basic block in the CFG if the instruction group starts
+    * or ends a basic block.
+    */
+   struct bblock_t *block_start;
+   struct bblock_t *block_end;
+
+   /* Annotation for the generated IR.  One of the two can be set. */
+   const void *ir;
+   const char *annotation;
+};
+
+struct annotation_info {
+   void *mem_ctx;
+   struct annotation *ann;
+   int ann_count;
+   int ann_size;
+
+   /** Block index in the cfg. */
+   int cur_block;
+};
+
+void
+dump_assembly(void *assembly, int num_annotations, struct annotation *annotation,
+              const struct gen_device_info *devinfo);
+
+void
+annotate(const struct gen_device_info *devinfo,
+         struct annotation_info *annotation, const struct cfg_t *cfg,
+         struct backend_instruction *inst, unsigned offset);
+void
+annotation_finalize(struct annotation_info *annotation, unsigned offset);
+
+void
+annotation_insert_error(struct annotation_info *annotation, unsigned offset,
+                        const char *error);
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* _INTEL_ASM_ANNOTATION_H */
diff --git a/src/intel/compiler/test_eu_compact.c b/src/intel/compiler/test_eu_compact.c
new file mode 100644
index 00000000000..77a57f4aa65
--- /dev/null
+++ b/src/intel/compiler/test_eu_compact.c
@@ -0,0 +1,300 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdbool.h>
+#include "util/ralloc.h"
+#include "brw_eu.h"
+
+static bool
+test_compact_instruction(struct brw_codegen *p, brw_inst src)
+{
+   brw_compact_inst dst;
+   memset(&dst, 0xd0, sizeof(dst));
+
+   if (brw_try_compact_instruction(p->devinfo, &dst, &src)) {
+      brw_inst uncompacted;
+
+      brw_uncompact_instruction(p->devinfo, &uncompacted, &dst);
+      if (memcmp(&uncompacted, &src, sizeof(src))) {
+	 brw_debug_compact_uncompact(p->devinfo, &src, &uncompacted);
+	 return false;
+      }
+   } else {
+      brw_compact_inst unchanged;
+      memset(&unchanged, 0xd0, sizeof(unchanged));
+      /* It's not supposed to change dst unless it compacted. */
+      if (memcmp(&unchanged, &dst, sizeof(dst))) {
+	 fprintf(stderr, "Failed to compact, but dst changed\n");
+	 fprintf(stderr, "  Instruction: ");
+	 brw_disassemble_inst(stderr, p->devinfo, &src, false);
+	 return false;
+      }
+   }
+
+   return true;
+}
+
+/**
+ * When doing fuzz testing, pad bits won't round-trip.
+ *
+ * This is sort of a superset of skip_bit, which identifies bits that aren't
+ * worth twiddling when fuzzing.  We also just want to clear bits that
+ * become meaningless once fuzzing twiddles a related bit.
+ */
+static void
+clear_pad_bits(const struct gen_device_info *devinfo, brw_inst *inst)
+{
+   if (brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SEND &&
+       brw_inst_opcode(devinfo, inst) != BRW_OPCODE_SENDC &&
+       brw_inst_opcode(devinfo, inst) != BRW_OPCODE_BREAK &&
+       brw_inst_opcode(devinfo, inst) != BRW_OPCODE_CONTINUE &&
+       brw_inst_src0_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE &&
+       brw_inst_src1_reg_file(devinfo, inst) != BRW_IMMEDIATE_VALUE) {
+      brw_inst_set_bits(inst, 127, 111, 0);
+   }
+}
+
+static bool
+skip_bit(const struct gen_device_info *devinfo, brw_inst *src, int bit)
+{
+   /* pad bit */
+   if (bit == 7)
+      return true;
+
+   /* The compact bit -- uncompacted can't have it set. */
+   if (bit == 29)
+      return true;
+
+   /* pad bit */
+   if (bit == 47)
+      return true;
+
+   /* pad bits */
+   if (bit >= 90 && bit <= 95)
+      return true;
+
+   /* sometimes these are pad bits. */
+   if (brw_inst_opcode(devinfo, src) != BRW_OPCODE_SEND &&
+       brw_inst_opcode(devinfo, src) != BRW_OPCODE_SENDC &&
+       brw_inst_opcode(devinfo, src) != BRW_OPCODE_BREAK &&
+       brw_inst_opcode(devinfo, src) != BRW_OPCODE_CONTINUE &&
+       brw_inst_src0_reg_file(devinfo, src) != BRW_IMMEDIATE_VALUE &&
+       brw_inst_src1_reg_file(devinfo, src) != BRW_IMMEDIATE_VALUE &&
+       bit >= 121) {
+      return true;
+   }
+
+   return false;
+}
+
+static bool
+test_fuzz_compact_instruction(struct brw_codegen *p, brw_inst src)
+{
+   for (int bit0 = 0; bit0 < 128; bit0++) {
+      if (skip_bit(p->devinfo, &src, bit0))
+	 continue;
+
+      for (int bit1 = 0; bit1 < 128; bit1++) {
+         brw_inst instr = src;
+	 uint32_t *bits = (uint32_t *)&instr;
+
+         if (skip_bit(p->devinfo, &src, bit1))
+	    continue;
+	 /* Unsigned masks: shifting a signed 1 into bit 31 is undefined. */
+	 bits[bit0 / 32] ^= (1u << (bit0 & 31));
+	 bits[bit1 / 32] ^= (1u << (bit1 & 31));
+
+         clear_pad_bits(p->devinfo, &instr);
+
+	 if (!test_compact_instruction(p, instr)) {
+	    printf("  twiddled bits for fuzzing %d, %d\n", bit0, bit1);
+	    return false;
+	 }
+      }
+   }
+
+   return true;
+}
+
+static void
+gen_ADD_GRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+   struct brw_reg g4 = brw_vec8_grf(4, 0);
+
+   brw_ADD(p, g0, g2, g4);
+}
+
+static void
+gen_ADD_GRF_GRF_IMM(struct brw_codegen *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_ADD(p, g0, g2, brw_imm_f(1.0));
+}
+
+static void
+gen_ADD_GRF_GRF_IMM_d(struct brw_codegen *p)
+{
+   struct brw_reg g0 = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_D);
+   struct brw_reg g2 = retype(brw_vec8_grf(2, 0), BRW_REGISTER_TYPE_D);
+
+   brw_ADD(p, g0, g2, brw_imm_d(1));
+}
+
+static void
+gen_MOV_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_MOV(p, g0, g2);
+}
+
+static void
+gen_ADD_MRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+   struct brw_reg g4 = brw_vec8_grf(4, 0);
+
+   brw_ADD(p, m6, g2, g4);
+}
+
+static void
+gen_ADD_vec1_GRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg g0 = brw_vec1_grf(0, 0);
+   struct brw_reg g2 = brw_vec1_grf(2, 0);
+   struct brw_reg g4 = brw_vec1_grf(4, 0);
+
+   brw_ADD(p, g0, g2, g4);
+}
+
+static void
+gen_PLN_MRF_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg m6 = brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE, 6, 0);
+   struct brw_reg interp = brw_vec1_grf(2, 0);
+   struct brw_reg g4 = brw_vec8_grf(4, 0);
+
+   brw_PLN(p, m6, interp, g4);
+}
+
+static void
+gen_f0_0_MOV_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_predicate_control(p, true);
+   brw_MOV(p, g0, g2);
+   brw_pop_insn_state(p);
+}
+
+/* The handling of f0.1 vs f0.0 changes between gen6 and gen7.  Explicitly test
+ * it, so that the fuzzing can run over all the other bits that might
+ * interact with it.
+ */
+static void
+gen_f0_1_MOV_GRF_GRF(struct brw_codegen *p)
+{
+   struct brw_reg g0 = brw_vec8_grf(0, 0);
+   struct brw_reg g2 = brw_vec8_grf(2, 0);
+
+   brw_push_insn_state(p);
+   brw_set_default_predicate_control(p, true);
+   brw_inst *mov = brw_MOV(p, g0, g2);
+   brw_inst_set_flag_subreg_nr(p->devinfo, mov, 1);
+   brw_pop_insn_state(p);
+}
+
+struct {
+   void (*func)(struct brw_codegen *p);
+} tests[] = {
+   { gen_MOV_GRF_GRF },
+   { gen_ADD_GRF_GRF_GRF },
+   { gen_ADD_GRF_GRF_IMM },
+   { gen_ADD_GRF_GRF_IMM_d },
+   { gen_ADD_MRF_GRF_GRF },
+   { gen_ADD_vec1_GRF_GRF_GRF },
+   { gen_PLN_MRF_GRF_GRF },
+   { gen_f0_0_MOV_GRF_GRF },
+   { gen_f0_1_MOV_GRF_GRF },
+};
+
+static bool
+run_tests(const struct gen_device_info *devinfo)
+{
+   brw_init_compaction_tables(devinfo);
+   bool fail = false;
+
+   /* Every generator is exercised in both Align1 and Align16 mode. */
+   for (int i = 0; i < ARRAY_SIZE(tests); i++) {
+      for (int align_16 = 0; align_16 <= 1; align_16++) {
+	 struct brw_codegen *p = rzalloc(NULL, struct brw_codegen);
+	 brw_init_codegen(devinfo, p, p);
+
+	 brw_set_default_predicate_control(p, BRW_PREDICATE_NONE);
+	 if (align_16)
+	    brw_set_default_access_mode(p, BRW_ALIGN_16);
+	 else
+	    brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+	 tests[i].func(p);
+	 assert(p->nr_insn == 1);
+
+	 /* The fuzz pass only runs when the plain round trip succeeded.
+	  * Either failure is recorded without skipping the cleanup below,
+	  * so the codegen context of a failing case is released as well.
+	  */
+	 if (!test_compact_instruction(p, p->store[0]) ||
+	     !test_fuzz_compact_instruction(p, p->store[0])) {
+	    fail = true;
+	 }
+
+	 ralloc_free(p);
+      }
+   }
+
+   return fail;
+}
+
+int
+main(int argc, char **argv)
+{
+   /* Zero-initialised on the stack: nothing to leak, no NULL to check. */
+   struct gen_device_info devinfo = { 0 };
+   bool fail = false;
+
+   for (devinfo.gen = 6; devinfo.gen <= 7; devinfo.gen++) {
+      fail |= run_tests(&devinfo);
+   }
+
+   return fail;
+}
diff --git a/src/intel/compiler/test_eu_validate.cpp b/src/intel/compiler/test_eu_validate.cpp
new file mode 100644
index 00000000000..76652dc43d0
--- /dev/null
+++ b/src/intel/compiler/test_eu_validate.cpp
@@ -0,0 +1,847 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_eu.h"
+#include "util/ralloc.h"
+
+enum subgen {
+   IS_G45 = 1,
+   IS_BYT,
+   IS_HSW,
+   IS_CHV,
+   IS_BXT,
+   IS_KBL,
+};
+
+static const struct gen_info {
+   const char *name;
+   int gen;
+   enum subgen subgen;
+} gens[] = {
+   { "brw", 4 },
+   { "g45", 4, IS_G45 },
+   { "ilk", 5 },
+   { "snb", 6 },
+   { "ivb", 7 },
+   { "byt", 7, IS_BYT },
+   { "hsw", 7, IS_HSW },
+   { "bdw", 8 },
+   { "chv", 8, IS_CHV },
+   { "skl", 9 },
+   { "bxt", 9, IS_BXT },
+   { "kbl", 9, IS_KBL },
+};
+
+class validation_test: public ::testing::TestWithParam<struct gen_info> {
+   virtual void SetUp();
+
+public:
+   validation_test();
+   virtual ~validation_test();
+
+   struct brw_codegen *p;
+   struct gen_device_info devinfo;
+};
+
+validation_test::validation_test()
+{
+   p = rzalloc(NULL, struct brw_codegen);
+   memset(&devinfo, 0, sizeof(devinfo));
+}
+
+validation_test::~validation_test()
+{
+   ralloc_free(p);
+}
+
+void validation_test::SetUp()
+{
+   struct gen_info info = GetParam();
+
+   devinfo.gen           = info.gen;
+   devinfo.is_g4x        = info.subgen == IS_G45;
+   devinfo.is_baytrail   = info.subgen == IS_BYT;
+   devinfo.is_haswell    = info.subgen == IS_HSW;
+   devinfo.is_cherryview = info.subgen == IS_CHV;
+   devinfo.is_broxton    = info.subgen == IS_BXT;
+   devinfo.is_kabylake   = info.subgen == IS_KBL;
+
+   brw_init_codegen(&devinfo, p, p);
+}
+
+struct gen_name {
+   template <class ParamType>
+   std::string
+   operator()(const ::testing::TestParamInfo<ParamType>& info) const {
+      return info.param.name;
+   }
+};
+
+INSTANTIATE_TEST_CASE_P(eu_assembly, validation_test,
+                        ::testing::ValuesIn(gens),
+                        gen_name());
+
+static bool
+validate(struct brw_codegen *p)
+{
+   const bool print = getenv("TEST_DEBUG");
+   struct annotation_info annotation;
+   memset(&annotation, 0, sizeof(annotation));
+
+   if (print) {
+      annotation.mem_ctx = ralloc_context(NULL);
+      annotation.ann_count = 1;
+      annotation.ann_size = 2;
+      annotation.ann = rzalloc_array(annotation.mem_ctx, struct annotation,
+                                     annotation.ann_size);
+      annotation.ann[annotation.ann_count].offset = p->next_insn_offset;
+   }
+
+   bool ret = brw_validate_instructions(p, 0, &annotation);
+
+   if (print) {
+      dump_assembly(p->store, annotation.ann_count, annotation.ann, p->devinfo);
+      ralloc_free(annotation.mem_ctx);
+   }
+
+   return ret;
+}
+
+#define last_inst    (&p->store[p->nr_insn - 1])
+#define g0           brw_vec8_grf(0, 0)
+#define null         brw_null_reg()
+
+static void
+clear_instructions(struct brw_codegen *p)
+{
+   p->next_insn_offset = 0;
+   p->nr_insn = 0;
+}
+
+TEST_P(validation_test, sanity)
+{
+   brw_ADD(p, g0, g0, g0);
+
+   EXPECT_TRUE(validate(p));
+}
+
+TEST_P(validation_test, src0_null_reg)
+{
+   brw_MOV(p, g0, null);
+
+   EXPECT_FALSE(validate(p));
+}
+
+TEST_P(validation_test, src1_null_reg)
+{
+   brw_ADD(p, g0, g0, null);
+
+   EXPECT_FALSE(validate(p));
+}
+
+TEST_P(validation_test, math_src0_null_reg)
+{
+   if (devinfo.gen >= 6) {
+      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, null, null);
+   } else {
+      gen4_math(p, g0, BRW_MATH_FUNCTION_SIN, 0, null, BRW_MATH_PRECISION_FULL);
+   }
+
+   EXPECT_FALSE(validate(p));
+}
+
+TEST_P(validation_test, math_src1_null_reg)
+{
+   if (devinfo.gen >= 6) {
+      gen6_math(p, g0, BRW_MATH_FUNCTION_POW, g0, null);
+      EXPECT_FALSE(validate(p));
+   } else {
+      /* Math instructions on Gen4/5 are actually SEND messages with payloads.
+       * src1 is an immediate message descriptor set by gen4_math.
+       */
+   }
+}
+
+TEST_P(validation_test, opcode46)
+{
+   /* opcode 46 is "push" on Gen 4 and 5
+    *              "fork" on Gen 6
+    *              reserved on Gen 7
+    *              "goto" on Gen8+
+    */
+   brw_next_insn(p, 46);
+
+   if (devinfo.gen == 7) {
+      EXPECT_FALSE(validate(p));
+   } else {
+      EXPECT_TRUE(validate(p));
+   }
+}
+
+/* When the Execution Data Type is wider than the destination data type, the
+ * destination must [...] specify a HorzStride equal to the ratio in sizes of
+ * the two data types.
+ */
+TEST_P(validation_test, dest_stride_must_be_equal_to_the_ratio_of_exec_size_to_dest_size)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+
+   EXPECT_TRUE(validate(p));
+}
+
+/* When the Execution Data Type is wider than the destination data type, the
+ * destination must be aligned as required by the wider execution data type
+ * [...]
+ */
+TEST_P(validation_test, dst_subreg_must_be_aligned_to_exec_type_size)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 2);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 8);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p));
+}
+
+/* ExecSize must be greater than or equal to Width. */
+TEST_P(validation_test, exec_size_less_than_width)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_16);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_16);
+
+   EXPECT_FALSE(validate(p));
+}
+
+/* If ExecSize = Width and HorzStride ≠ 0,
+ * VertStride must be set to Width * HorzStride.
+ */
+TEST_P(validation_test, vertical_stride_is_width_by_horizontal_stride)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+
+   EXPECT_FALSE(validate(p));
+}
+
+/* If Width = 1, HorzStride must be 0 regardless of the values
+ * of ExecSize and VertStride.
+ */
+TEST_P(validation_test, horizontal_stride_must_be_0_if_width_is_1)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_FALSE(validate(p));
+}
+
+/* If ExecSize = Width = 1 (a scalar), VertStride and HorzStride must be 0. */
+TEST_P(validation_test, scalar_region_must_be_0_1_0)
+{
+   struct brw_reg g0_0 = brw_vec1_grf(0, 0);
+
+   brw_ADD(p, g0, g0, g0_0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_1);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_1);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_FALSE(validate(p)); /* src0 <1;1,0> with ExecSize 1 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0_0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_1);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_1);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_FALSE(validate(p)); /* src1 <1;1,0> with ExecSize 1 */
+}
+
+/* Region rule: if VertStride = HorzStride = 0, Width must be 1 whatever
+ * ExecSize is.  Each source in turn is given the region <0;2,0>.
+ */
+TEST_P(validation_test, zero_stride_implies_0_1_0)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_FALSE(validate(p)); /* src0 <0;2,0> */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_FALSE(validate(p)); /* src1 <0;2,0> */
+}
+
+/* Dst.HorzStride must not be 0 (default access mode, then Align16). */
+TEST_P(validation_test, dst_horizontal_stride_0)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_FALSE(validate(p)); /* default access mode */
+
+   clear_instructions(p);
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   EXPECT_FALSE(validate(p)); /* same violation under Align16 */
+}
+
+/* VertStride must be used to cross GRF register boundaries, i.e. elements
+ * within one 'Width' cannot cross a GRF boundary (via offset or stride).
+ */
+TEST_P(validation_test, must_not_cross_grf_boundary_in_a_width)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 4);
+
+   EXPECT_FALSE(validate(p)); /* src0 subreg offset 4 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 4);
+
+   EXPECT_FALSE(validate(p)); /* src1 subreg offset 4 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_FALSE(validate(p)); /* src0 region <4;4,2> */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_FALSE(validate(p)); /* src1 region <4;4,2> */
+}
+
+/* Destination HorzStride must be 1 in Align16 mode. */
+TEST_P(validation_test, dst_hstride_on_align16_must_be_1)
+{
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_FALSE(validate(p)); /* dst HorzStride 2 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p)); /* dst HorzStride 1 */
+}
+
+/* Align16 VertStride must be 0 or 4; 2 is also accepted on HSW and Gen8+. */
+TEST_P(validation_test, vstride_on_align16_must_be_0_or_4)
+{
+   const struct {
+      enum brw_vertical_stride vstride;
+      bool expected_result; /* expected validate() result */
+   } vstride[] = {
+      { BRW_VERTICAL_STRIDE_0, true },
+      { BRW_VERTICAL_STRIDE_1, false },
+      { BRW_VERTICAL_STRIDE_2, devinfo.is_haswell || devinfo.gen >= 8 },
+      { BRW_VERTICAL_STRIDE_4, true },
+      { BRW_VERTICAL_STRIDE_8, false },
+      { BRW_VERTICAL_STRIDE_16, false },
+      { BRW_VERTICAL_STRIDE_32, false },
+      { BRW_VERTICAL_STRIDE_ONE_DIMENSIONAL, false },
+   };
+
+   brw_set_default_access_mode(p, BRW_ALIGN_16);
+
+   for (unsigned i = 0; i < sizeof(vstride) / sizeof(vstride[0]); i++) {
+      brw_ADD(p, g0, g0, g0); /* table entry applied to src0 */
+      brw_inst_set_src0_vstride(&devinfo, last_inst, vstride[i].vstride);
+
+      EXPECT_EQ(vstride[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+
+   for (unsigned i = 0; i < sizeof(vstride) / sizeof(vstride[0]); i++) {
+      brw_ADD(p, g0, g0, g0); /* same table applied to src1 */
+      brw_inst_set_src1_vstride(&devinfo, last_inst, vstride[i].vstride);
+
+      EXPECT_EQ(vstride[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+}
+
+/* In Direct Addressing mode, a source cannot span more than 2 adjacent GRF
+ * registers.  Exercised on src1 with W-typed regions of the form <16;8,2>.
+ */
+TEST_P(validation_test, source_cannot_span_more_than_2_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_FALSE(validate(p)); /* ExecSize 32 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 2);
+
+   EXPECT_TRUE(validate(p)); /* ExecSize 16, src1 subreg 2 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p)); /* only ExecSize changed to 16 */
+}
+
+/* A destination cannot span more than 2 adjacent GRF registers. */
+TEST_P(validation_test, destination_cannot_span_more_than_2_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_32);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   EXPECT_FALSE(validate(p)); /* ExecSize 32, W dst, HorzStride 2 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_8);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 6);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p)); /* ExecSize 8, W dst, HorzStride 4, subreg 6 */
+}
+
+TEST_P(validation_test, src_region_spans_two_regs_dst_region_spans_one)
+{
+   /* Case 1: writes to dest are to the lower OWord */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p)); /* src1 <16;4,2>:w, dst at subreg 0 */
+
+   clear_instructions(p);
+
+   /* Case 2: writes to dest are to the upper OWord */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p)); /* same regions, dst at subreg 16 */
+
+   clear_instructions(p);
+
+   /* Case 3: writes to dest are evenly split between OWords */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_8);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p)); /* ExecSize 16, src1 <16;8,2>:w */
+
+   clear_instructions(p);
+
+   /* Case 4: writes to dest are uneven between OWords */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 10);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_16);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   if (devinfo.gen >= 9) { /* uneven split is accepted from Gen9 on */
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+}
+
+TEST_P(validation_test, dst_elements_must_be_evenly_split_between_registers)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4);
+
+   if (devinfo.gen >= 9) { /* dst subreg 4: accepted from Gen9 on */
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p)); /* ExecSize 16, no offset override */
+
+   clear_instructions(p);
+
+   if (devinfo.gen >= 6) { /* gen6_math() cases run on Gen6+ only */
+      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null);
+
+      EXPECT_TRUE(validate(p)); /* math, dst untouched */
+
+      clear_instructions(p);
+
+      gen6_math(p, g0, BRW_MATH_FUNCTION_SIN, g0, null);
+      brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 4);
+
+      EXPECT_FALSE(validate(p)); /* math, dst subreg 4: no Gen9+ exemption */
+   }
+}
+
+TEST_P(validation_test, two_src_two_dst_source_offsets_must_be_same)
+{
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_2);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   if (devinfo.gen <= 7) { /* src0 at subreg 16: rejected up to Gen7 */
+      EXPECT_FALSE(validate(p));
+   } else {
+      EXPECT_TRUE(validate(p));
+   }
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_4);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_8);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_2);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_TRUE(validate(p)); /* no subreg override on either source */
+}
+
+#if 0 /* whole test is compiled out */
+TEST_P(validation_test, two_src_two_dst_each_dst_must_be_derived_from_one_src)
+{
+   // mov (16) r10.0<2>:w r12.4<4;4,1>:w -- NOTE(review): code sets hstride 4
+
+   brw_MOV(p, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_da1_subreg_nr(&devinfo, last_inst, 8);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_4);
+
+   EXPECT_FALSE(validate(p));
+
+   clear_instructions(p);
+
+#if 0 /* second case is disabled separately, nested in the outer #if 0 */
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_src1_da1_subreg_nr(&devinfo, last_inst, 16);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_4);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_4);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_1);
+
+   EXPECT_FALSE(validate(p));
+   #endif
+}
+#endif /* disabled test */
+
+TEST_P(validation_test, one_src_two_dst)
+{
+   struct brw_reg g0_0 = brw_vec1_grf(0, 0);
+
+   brw_ADD(p, g0, g0_0, g0_0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+
+   EXPECT_TRUE(validate(p)); /* both sources are g0_0 */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_D);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   EXPECT_TRUE(validate(p)); /* D destination, W sources */
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src1_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src1_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src1_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+
+   if (devinfo.gen >= 8) { /* src1 <0;1,0>: accepted from Gen8 on */
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+
+   clear_instructions(p);
+
+   brw_ADD(p, g0, g0, g0);
+   brw_inst_set_exec_size(&devinfo, last_inst, BRW_EXECUTE_16);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+   brw_inst_set_src0_vstride(&devinfo, last_inst, BRW_VERTICAL_STRIDE_0);
+   brw_inst_set_src0_width(&devinfo, last_inst, BRW_WIDTH_1);
+   brw_inst_set_src0_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_0);
+   brw_inst_set_src1_reg_type(&devinfo, last_inst, BRW_HW_REG_TYPE_W);
+
+   if (devinfo.gen >= 8) { /* src0 <0;1,0>: accepted from Gen8 on */
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+}
+
+TEST_P(validation_test, packed_byte_destination)
+{
+   static const struct {
+      enum brw_reg_type dst_type;
+      enum brw_reg_type src_type;
+      bool neg, abs, sat; /* src0 negate, src0 abs, saturate */
+      bool expected_result; /* expected validate() result */
+   } move[] = {
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 0, 0, true },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 0, 0, true },
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 0, 0, true },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 0, 0, true },
+
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 1, 0, 0, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 1, 0, 0, false },
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 1, 0, 0, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 1, 0, 0, false },
+
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 1, 0, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 1, 0, false },
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 1, 0, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 1, 0, false },
+
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UB, 0, 0, 1, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_B , 0, 0, 1, false },
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_B , 0, 0, 1, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_UB, 0, 0, 1, false },
+
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UW, 0, 0, 0, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_W , 0, 0, 0, false },
+      { BRW_REGISTER_TYPE_UB, BRW_REGISTER_TYPE_UD, 0, 0, 0, false },
+      { BRW_REGISTER_TYPE_B , BRW_REGISTER_TYPE_D , 0, 0, 0, false },
+   };
+
+   for (unsigned i = 0; i < sizeof(move) / sizeof(move[0]); i++) {
+      brw_MOV(p, retype(g0, move[i].dst_type), retype(g0, move[i].src_type));
+      brw_inst_set_src0_negate(&devinfo, last_inst, move[i].neg);
+      brw_inst_set_src0_abs(&devinfo, last_inst, move[i].abs);
+      brw_inst_set_saturate(&devinfo, last_inst, move[i].sat);
+
+      EXPECT_EQ(move[i].expected_result, validate(p));
+
+      clear_instructions(p);
+   }
+
+   brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_UB),
+              retype(g0, BRW_REGISTER_TYPE_UB),
+              retype(g0, BRW_REGISTER_TYPE_UB));
+   brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+
+   EXPECT_FALSE(validate(p)); /* predicated SEL, all operands UB */
+
+   clear_instructions(p);
+
+   brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B),
+              retype(g0, BRW_REGISTER_TYPE_B),
+              retype(g0, BRW_REGISTER_TYPE_B));
+   brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+
+   EXPECT_FALSE(validate(p)); /* predicated SEL, all operands B */
+}
+
+TEST_P(validation_test, byte_destination_relaxed_alignment)
+{
+   brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B),
+              retype(g0, BRW_REGISTER_TYPE_W),
+              retype(g0, BRW_REGISTER_TYPE_W));
+   brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+
+   EXPECT_TRUE(validate(p)); /* B dst, HorzStride 2, no subreg override */
+
+   clear_instructions(p);
+
+   brw_SEL(p, retype(g0, BRW_REGISTER_TYPE_B),
+              retype(g0, BRW_REGISTER_TYPE_W),
+              retype(g0, BRW_REGISTER_TYPE_W));
+   brw_inst_set_pred_control(&devinfo, last_inst, BRW_PREDICATE_NORMAL);
+   brw_inst_set_dst_hstride(&devinfo, last_inst, BRW_HORIZONTAL_STRIDE_2);
+   brw_inst_set_dst_da1_subreg_nr(&devinfo, last_inst, 1);
+
+   if (devinfo.gen > 4 || devinfo.is_g4x) { /* dst subreg 1: G4X and Gen5+ */
+      EXPECT_TRUE(validate(p));
+   } else {
+      EXPECT_FALSE(validate(p));
+   }
+
+}
diff --git a/src/intel/compiler/test_fs_cmod_propagation.cpp b/src/intel/compiler/test_fs_cmod_propagation.cpp
new file mode 100644
index 00000000000..a97e374f74e
--- /dev/null
+++ b/src/intel/compiler/test_fs_cmod_propagation.cpp
@@ -0,0 +1,556 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class cmod_propagation_test : public ::testing::Test {
+   virtual void SetUp(); /* allocates everything below except shader_prog */
+
+public: /* NOTE(review): no TearDown(); nothing here frees the allocations */
+   struct brw_compiler *compiler;
+   struct gen_device_info *devinfo;
+   struct gl_context *ctx;
+   struct brw_wm_prog_data *prog_data;
+   struct gl_shader_program *shader_prog; /* not assigned by SetUp() */
+   fs_visitor *v; /* visitor the tests build instructions on */
+};
+
+class cmod_propagation_fs_visitor : public fs_visitor
+{
+public: /* thin fs_visitor subclass used only to construct a test visitor */
+   cmod_propagation_fs_visitor(struct brw_compiler *compiler,
+                               struct brw_wm_prog_data *prog_data,
+                               nir_shader *shader)
+      : fs_visitor(compiler, NULL, NULL, NULL,
+                   &prog_data->base, (struct gl_program *) NULL,
+                   shader, 8, -1) {} /* presumably dispatch width 8 -- confirm */
+};
+
+/* Per-test setup: allocate context, compiler, devinfo, prog_data, visitor. */
+void cmod_propagation_test::SetUp()
+{
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo; /* compiler points at the zeroed devinfo */
+
+   prog_data = ralloc(NULL, struct brw_wm_prog_data);
+   nir_shader *shader =
+      nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL);
+
+   v = new cmod_propagation_fs_visitor(compiler, prog_data, shader);
+
+   devinfo->gen = 4; /* NOTE(review): set only after v is constructed */
+}
+
+/* Return the num-th (0-based) instruction of block, following ->next links. */
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+   fs_inst *cur = (fs_inst *)block->start();
+   while (num-- > 0)
+      cur = (fs_inst *)cur->next;
+   return cur;
+}
+
+/* Run opt_cmod_propagation() on v; dump the CFG around it if TEST_DEBUG set. */
+static bool
+cmod_propagation(fs_visitor *v)
+{
+   const bool verbose = getenv("TEST_DEBUG") != NULL;
+
+   if (verbose) {
+      fprintf(stderr, "= Before =\n");
+      v->cfg->dump(v);
+   }
+   const bool progress = v->opt_cmod_propagation();
+
+   if (verbose) {
+      fprintf(stderr, "\n= After =\n");
+      v->cfg->dump(v);
+   }
+
+   return progress;
+}
+
+TEST_F(cmod_propagation_test, basic)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest  src0  src1
+    * 1: cmp.ge.f0(8)  null  dest  0.0f
+    *
+    * = After =
+    * 0: add.ge.f0(8)  dest  src0  src1
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* two instructions before the pass */
+
+   EXPECT_TRUE(cmod_propagation(v)); /* pass reports progress */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip); /* CMP removed, only the ADD is left */
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_nonzero)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg nonzero(brw_imm_f(1.0f));
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), dest, nonzero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest  src0  src1
+    * 1: cmp.ge.f0(8)  null  dest  1.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v)); /* CMP against 1.0f is left alone */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* still two instructions */
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, non_cmod_instruction)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::uint_type);
+   fs_reg src0 = v->vgrf(glsl_type::uint_type);
+   fs_reg zero(brw_imm_ud(0u));
+   bld.FBL(dest, src0);
+   bld.CMP(bld.null_reg_ud(), dest, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: fbl(8)        dest  src0
+    * 1: cmp.ge.f0(8)  null  dest  0u
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v)); /* nothing is folded into the FBL */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* still two instructions */
+   EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_write)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE);
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest  src0  src1
+    * 1: cmp.ge.f0(8)  null  src2  0.0f
+    * 2: cmp.ge.f0(8)  null  dest  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip); /* three instructions before the pass */
+
+   EXPECT_FALSE(cmod_propagation(v)); /* CMP on src2 sits in between */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest0 = v->vgrf(glsl_type::float_type);
+   fs_reg dest1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest0, src0, src1);
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest0 src0  src1
+    * 1: (+f0) sel(8)  dest1 src2  0.0f
+    * 2: cmp.ge.f0(8)  null  dest0 0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip); /* three instructions before the pass */
+
+   EXPECT_FALSE(cmod_propagation(v)); /* predicated SEL sits in between */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_dest_write)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::vec4_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::vec2_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(offset(dest, bld, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dest, src2)
+      ->size_written = 4 * REG_SIZE; /* shown as "rlen 4" in the dump below */
+   bld.CMP(bld.null_reg_f(), offset(dest, bld, 2), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest+2  src0    src1
+    * 1: tex(8) rlen 4 dest+0  src2
+    * 2: cmp.ge.f0(8)  null    dest+2  0.0f
+    *
+    * = After =
+    * (no changes: the TEX writes dest between the ADD and the CMP)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip); /* three instructions before the pass */
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 1)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest0 = v->vgrf(glsl_type::float_type);
+   fs_reg dest1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1));
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(bld.null_reg_f(), dest0, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add.ge.f0(8)  dest0 src0  src1
+    * 1: (+f0) sel(8)  dest1 src2  0.0f
+    * 2: cmp.ge.f0(8)  null  dest0 0.0f
+    *
+    * = After =
+    * 0: add.ge.f0(8)  dest0 src0  src1
+    * 1: (+f0) sel(8)  dest1 src2  0.0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip); /* three instructions before the pass */
+
+   EXPECT_TRUE(cmod_propagation(v)); /* ADD already carries .ge */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* CMP removed, ADD and SEL remain */
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+}
+
+TEST_F(cmod_propagation_test, negate)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   dest.negate = true; /* the CMP below reads -dest */
+   bld.CMP(bld.null_reg_f(), dest, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =
+    *
+    * 0: add(8)        dest  src0  src1
+    * 1: cmp.ge.f0(8)  null  -dest 0.0f
+    *
+    * = After =
+    * 0: add.le.f0(8)  dest  src0  src1
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v)); /* GE on -dest becomes LE on dest */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip); /* CMP removed, only the ADD is left */
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, movnz)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.CMP(dest, src0, src1, BRW_CONDITIONAL_GE);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(bld.null_reg_f(), dest));
+
+   /* = Before =
+    *
+    * 0: cmp.ge.f0(8)  dest  src0  src1
+    * 1: mov.nz.f0(8)  null  dest
+    *
+    * = After =
+    * 0: cmp.ge.f0(8)  dest  src0  src1
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v)); /* the mov.nz is removed */
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip); /* only the CMP is left */
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::int_type);
+   fs_reg src0 = v->vgrf(glsl_type::int_type);
+   fs_reg src1 = v->vgrf(glsl_type::int_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), retype(dest, BRW_REGISTER_TYPE_F), zero,
+           BRW_CONDITIONAL_GE);
+
+   /* The CMP reads the integer ADD result retyped as float: expect no change.
+    * = Before =
+    * 0: add(8)        dest:D  src0:D  src1:D
+    * 1: cmp.ge.f0(8)  null:F  dest:F  0.0f
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* both instructions still present */
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andnz_one)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::int_type); /* written as :F, read as :D */
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   fs_reg one(brw_imm_d(1));
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), dest, one));
+   /* AND.nz of the CMP result with 1 is expected to be removed.
+    * = Before =
+    * 0: cmp.l.f0(8)     dest:F  src0:F  0F
+    * 1: and.nz.f0(8)    null:D  dest:D  1D
+    *
+    * = After =
+    * 0: cmp.l.f0(8)     dest:F  src0:F  0F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(0, block0->end_ip); /* only the CMP remains */
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_TRUE(retype(dest, BRW_REGISTER_TYPE_F) /* dst keeps its float type */
+               .equals(instruction(block0, 0)->dst));
+}
+
+TEST_F(cmod_propagation_test, andnz_non_one)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::int_type); /* written as :F, read as :D */
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   fs_reg nonone(brw_imm_d(38));
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), dest, nonone));
+   /* Same as andnz_one but the AND mask is 38 rather than 1: expect no change.
+    * = Before =
+    * 0: cmp.l.f0(8)     dest:F  src0:F  0F
+    * 1: and.nz.f0(8)    null:D  dest:D  38D
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* both instructions still present */
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andz_one)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dest = v->vgrf(glsl_type::int_type); /* written as :F, read as :D */
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg zero(brw_imm_f(0.0f));
+   fs_reg one(brw_imm_d(1));
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_Z,
+               bld.AND(bld.null_reg_d(), dest, one));
+   /* Same as andnz_one but the AND uses .z rather than .nz: expect no change.
+    * = Before =
+    * 0: cmp.l.f0(8)     dest:F  src0:F  0F
+    * 1: and.z.f0(8)     null:D  dest:D  1D
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* both instructions still present */
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_Z, instruction(block0, 1)->conditional_mod);
+}
diff --git a/src/intel/compiler/test_fs_copy_propagation.cpp b/src/intel/compiler/test_fs_copy_propagation.cpp
new file mode 100644
index 00000000000..37736ec86f4
--- /dev/null
+++ b/src/intel/compiler/test_fs_copy_propagation.cpp
@@ -0,0 +1,213 @@
+/*
+ * Copyright © 2016 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+/* gtest fixture for the FS copy-propagation tests; SetUp() allocates its state. */
+class copy_propagation_test : public ::testing::Test {
+   virtual void SetUp();
+
+public:
+   struct brw_compiler *compiler;
+   struct gen_device_info *devinfo; /* also reachable as compiler->devinfo */
+   struct gl_context *ctx;
+   struct brw_wm_prog_data *prog_data;
+   struct gl_shader_program *shader_prog; /* not assigned by SetUp() */
+   fs_visitor *v; /* visitor the test cases emit instructions into */
+};
+/* fs_visitor whose constructor takes only compiler, prog_data and shader. */
+class copy_propagation_fs_visitor : public fs_visitor
+{
+public:
+   copy_propagation_fs_visitor(struct brw_compiler *compiler,
+                               struct brw_wm_prog_data *prog_data,
+                               nir_shader *shader)
+      : fs_visitor(compiler, NULL, NULL, NULL,
+                   &prog_data->base, (struct gl_program *) NULL,
+                   shader, 8, -1) {} /* presumably SIMD8 and no shader-time index - confirm */
+};
+
+/* Build the per-test state; the fixture declares no TearDown() to release it. */
+void copy_propagation_test::SetUp()
+{
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
+
+   prog_data = ralloc(NULL, struct brw_wm_prog_data);
+   nir_shader *shader =
+      nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL);
+
+   v = new copy_propagation_fs_visitor(compiler, prog_data, shader);
+   /* NOTE(review): gen is assigned after the visitor was built - confirm intended. */
+   devinfo->gen = 4;
+}
+
+/* Return the num-th (zero-based) instruction of block, following ->next. */
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+   fs_inst *cursor = (fs_inst *)block->start();
+   while (num-- > 0)
+      cursor = (fs_inst *)cursor->next;
+   return cursor;
+}
+
+/* Run copy propagation on v and return whether it reported progress.  When
+ * the TEST_DEBUG environment variable is set, dump the CFG before and after.
+ */
+static bool
+copy_propagation(fs_visitor *v)
+{
+   const bool verbose = getenv("TEST_DEBUG") != NULL;
+
+   if (verbose) {
+      fprintf(stderr, "= Before =\n");
+      v->cfg->dump(v);
+   }
+   const bool progress = v->opt_copy_propagation();
+   if (verbose) {
+      fprintf(stderr, "\n= After =\n");
+      v->cfg->dump(v);
+   }
+   return progress;
+}
+
+TEST_F(copy_propagation_test, basic)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf2 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf3 = v->vgrf(glsl_type::float_type);
+   bld.MOV(vgrf0, vgrf2);
+   bld.ADD(vgrf1, vgrf0, vgrf3);
+
+   /* The ADD's read of vgrf0 becomes a read of vgrf2; the MOV itself stays.
+    * = Before =
+    * 0: mov(8)        vgrf0  vgrf2
+    * 1: add(8)        vgrf1  vgrf0  vgrf3
+    *
+    * = After =
+    * 0: mov(8)        vgrf0  vgrf2
+    * 1: add(8)        vgrf1  vgrf2  vgrf3
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(copy_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* instruction count is unchanged */
+
+   fs_inst *mov = instruction(block0, 0);
+   EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode);
+   EXPECT_TRUE(mov->dst.equals(vgrf0));
+   EXPECT_TRUE(mov->src[0].equals(vgrf2));
+
+   fs_inst *add = instruction(block0, 1);
+   EXPECT_EQ(BRW_OPCODE_ADD, add->opcode);
+   EXPECT_TRUE(add->dst.equals(vgrf1));
+   EXPECT_TRUE(add->src[0].equals(vgrf2)); /* was vgrf0 before the pass */
+   EXPECT_TRUE(add->src[1].equals(vgrf3));
+}
+
+TEST_F(copy_propagation_test, maxmax_sat_imm)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg vgrf0 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf1 = v->vgrf(glsl_type::float_type);
+   fs_reg vgrf2 = v->vgrf(glsl_type::float_type);
+   /* Can a saturated MOV be propagated into a SEL against an immediate? */
+   static const struct {
+      enum brw_conditional_mod conditional_mod;
+      float immediate;
+      bool expected_result;
+   } test[] = {
+      /*   conditional mod,     imm, expected_result */
+      { BRW_CONDITIONAL_GE  ,  0.1f, true },
+      { BRW_CONDITIONAL_L   ,  0.1f, true },
+      { BRW_CONDITIONAL_GE  ,  0.5f, true },
+      { BRW_CONDITIONAL_L   ,  0.5f, true },
+      { BRW_CONDITIONAL_GE  ,  0.9f, true },
+      { BRW_CONDITIONAL_L   ,  0.9f, true },
+      { BRW_CONDITIONAL_GE  , -1.5f, false },
+      { BRW_CONDITIONAL_L   , -1.5f, false },
+      { BRW_CONDITIONAL_GE  ,  1.5f, false },
+      { BRW_CONDITIONAL_L   ,  1.5f, false },
+
+      { BRW_CONDITIONAL_NONE, 0.5f, false },
+      { BRW_CONDITIONAL_Z   , 0.5f, false },
+      { BRW_CONDITIONAL_NZ  , 0.5f, false },
+      { BRW_CONDITIONAL_G   , 0.5f, false },
+      { BRW_CONDITIONAL_LE  , 0.5f, false },
+      { BRW_CONDITIONAL_R   , 0.5f, false },
+      { BRW_CONDITIONAL_O   , 0.5f, false },
+      { BRW_CONDITIONAL_U   , 0.5f, false },
+   };
+   /* Per row: emit mov.sat + sel, run the pass, then inspect both instructions. */
+   for (unsigned i = 0; i < sizeof(test) / sizeof(test[0]); i++) {
+      fs_inst *mov = set_saturate(true, bld.MOV(vgrf0, vgrf1));
+      fs_inst *sel = set_condmod(test[i].conditional_mod,
+                                 bld.SEL(vgrf2, vgrf0,
+                                         brw_imm_f(test[i].immediate)));
+
+      v->calculate_cfg();
+
+      bblock_t *block0 = v->cfg->blocks[0];
+
+      EXPECT_EQ(0, block0->start_ip);
+      EXPECT_EQ(1, block0->end_ip);
+
+      EXPECT_EQ(test[i].expected_result, copy_propagation(v));
+      EXPECT_EQ(0, block0->start_ip);
+      EXPECT_EQ(1, block0->end_ip);
+
+      EXPECT_EQ(BRW_OPCODE_MOV, mov->opcode); /* the MOV is never modified */
+      EXPECT_TRUE(mov->saturate);
+      EXPECT_TRUE(mov->dst.equals(vgrf0));
+      EXPECT_TRUE(mov->src[0].equals(vgrf1));
+
+      EXPECT_EQ(BRW_OPCODE_SEL, sel->opcode);
+      EXPECT_EQ(test[i].conditional_mod, sel->conditional_mod);
+      EXPECT_EQ(test[i].expected_result, sel->saturate); /* set iff propagated */
+      EXPECT_TRUE(sel->dst.equals(vgrf2));
+      if (test[i].expected_result) {
+         EXPECT_TRUE(sel->src[0].equals(vgrf1)); /* reads the MOV's source */
+      } else {
+         EXPECT_TRUE(sel->src[0].equals(vgrf0)); /* still reads the MOV's dst */
+      }
+      EXPECT_TRUE(sel->src[1].equals(brw_imm_f(test[i].immediate)));
+
+      delete v->cfg; /* discard this row's CFG before the next iteration */
+      v->cfg = NULL;
+   }
+}
diff --git a/src/intel/compiler/test_fs_saturate_propagation.cpp b/src/intel/compiler/test_fs_saturate_propagation.cpp
new file mode 100644
index 00000000000..db472143994
--- /dev/null
+++ b/src/intel/compiler/test_fs_saturate_propagation.cpp
@@ -0,0 +1,600 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_fs.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+/* gtest fixture for the FS saturate-propagation tests; SetUp() allocates its state. */
+class saturate_propagation_test : public ::testing::Test {
+   virtual void SetUp();
+
+public:
+   struct brw_compiler *compiler;
+   struct gen_device_info *devinfo; /* also reachable as compiler->devinfo */
+   struct gl_context *ctx;
+   struct brw_wm_prog_data *prog_data;
+   struct gl_shader_program *shader_prog; /* not assigned by SetUp() */
+   fs_visitor *v; /* visitor the test cases emit instructions into */
+};
+/* fs_visitor whose constructor takes only compiler, prog_data and shader. */
+class saturate_propagation_fs_visitor : public fs_visitor
+{
+public:
+   saturate_propagation_fs_visitor(struct brw_compiler *compiler,
+                                   struct brw_wm_prog_data *prog_data,
+                                   nir_shader *shader)
+      : fs_visitor(compiler, NULL, NULL, NULL,
+                   &prog_data->base, (struct gl_program *) NULL,
+                   shader, 8, -1) {} /* presumably SIMD8 and no shader-time index - confirm */
+};
+
+/* Build the per-test state; the fixture declares no TearDown() to release it. */
+void saturate_propagation_test::SetUp()
+{
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   compiler->devinfo = devinfo;
+
+   prog_data = ralloc(NULL, struct brw_wm_prog_data);
+   nir_shader *shader =
+      nir_shader_create(NULL, MESA_SHADER_FRAGMENT, NULL, NULL);
+
+   v = new saturate_propagation_fs_visitor(compiler, prog_data, shader);
+   /* NOTE(review): gen is assigned after the visitor was built - confirm intended. */
+   devinfo->gen = 4;
+}
+
+/* Return the num-th (zero-based) instruction of block, following ->next. */
+static fs_inst *
+instruction(bblock_t *block, int num)
+{
+   fs_inst *cursor = (fs_inst *)block->start();
+   while (num-- > 0)
+      cursor = (fs_inst *)cursor->next;
+   return cursor;
+}
+
+/* Run saturate propagation on v and return whether it reported progress.  Set
+ * the TEST_DEBUG environment variable to dump the CFG before and after.
+ */
+static bool
+saturate_propagation(fs_visitor *v)
+{
+   const bool print = getenv("TEST_DEBUG") != NULL;
+
+   if (print) {
+      fprintf(stderr, "= Before =\n");
+      v->cfg->dump(v);
+   }
+   bool ret = v->opt_saturate_propagation();
+   if (print) {
+      fprintf(stderr, "\n= After =\n");
+      v->cfg->dump(v);
+   }
+   return ret;
+}
+
+TEST_F(saturate_propagation_test, basic)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+
+   /* The saturate on the MOV is expected to move onto the ADD producing dst0.
+    * = Before =
+    * 0: add(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  dst0
+    *
+    * = After =
+    * 0: add.sat(8)    dst0  src0  src1
+    * 1: mov(8)        dst1  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip); /* instruction count is unchanged */
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, other_non_saturated_use)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg dst2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   bld.ADD(dst2, dst0, src0);
+
+   /* dst0 is also read by a non-saturating ADD: expect no change.
+    * = Before =
+    * 0: add(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  dst0
+    * 2: add(8)        dst2  dst0  src0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 2)->opcode);
+}
+
+TEST_F(saturate_propagation_test, predicated_instruction)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.ADD(dst0, src0, src1)
+      ->predicate = BRW_PREDICATE_NORMAL;
+   set_saturate(true, bld.MOV(dst1, dst0));
+
+   /* The ADD producing dst0 is predicated: expect no change.
+    * = Before =
+    * 0: (+f0) add(8)  dst0  src0  src1
+    * 1: mov.sat(8)    dst1  dst0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, neg_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   bld.RNDU(dst0, src0);
+   dst0.negate = true; /* only the MOV emitted below reads the negated copy */
+   set_saturate(true, bld.MOV(dst1, dst0));
+
+   /* A saturating MOV of the negated RNDU result: expect no change.
+    * = Before =
+    * 0: rndu(8)       dst0  src0
+    * 1: mov.sat(8)    dst1  -dst0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_RNDU, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, add_neg_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.ADD(dst0, src0, src1);
+   dst0.negate = true; /* only the MOV emitted below reads the negated copy */
+   set_saturate(true, bld.MOV(dst1, dst0));
+
+   /* The MOV's negate is pushed onto both ADD sources along with the saturate.
+    * = Before =
+    * 0: add(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  -dst0
+    *
+    * = After =
+    * 0: add.sat(8)    dst0  -src0 -src1
+    * 1: mov(8)        dst1  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_TRUE(instruction(block0, 0)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 0)->src[1].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, mul_neg_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.MUL(dst0, src0, src1);
+   dst0.negate = true; /* only the MOV emitted below reads the negated copy */
+   set_saturate(true, bld.MOV(dst1, dst0));
+
+   /* The MOV's negate is pushed onto the MUL's src[0] along with the saturate.
+    * = Before =
+    * 0: mul(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  -dst0
+    *
+    * = After =
+    * 0: mul.sat(8)    dst0  -src0 src1
+    * 1: mov(8)        dst1  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_TRUE(instruction(block0, 0)->src[0].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+   EXPECT_FALSE(instruction(block0, 1)->src[0].negate); /* negate left the MOV */
+}
+
+TEST_F(saturate_propagation_test, mul_mov_sat_neg_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg dst2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.MUL(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   dst0.negate = true; /* only the second MOV reads the negated copy */
+   set_saturate(true, bld.MOV(dst2, dst0));
+
+   /* dst0 is read plain and then negated by saturating MOVs: expect no change.
+    * = Before =
+    * 0: mul(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  dst0
+    * 2: mov.sat(8)    dst2  -dst0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_FALSE(instruction(block0, 0)->src[0].negate); /* operand mul_neg_mov_sat flips */
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, mul_neg_mov_sat_neg_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg dst2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.MUL(dst0, src0, src1);
+   dst0.negate = true; /* both MOVs emitted below read the negated copy */
+   set_saturate(true, bld.MOV(dst1, dst0));
+   set_saturate(true, bld.MOV(dst2, dst0));
+
+   /* Two saturating MOVs both read -dst0: the test expects no change.
+    * = Before =
+    * 0: mul(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  -dst0
+    * 2: mov.sat(8)    dst2  -dst0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_FALSE(instruction(block0, 0)->src[0].negate); /* operand mul_neg_mov_sat flips */
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->src[0].negate);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, abs_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.ADD(dst0, src0, src1);
+   dst0.abs = true; /* only the MOV emitted below reads the (abs) copy */
+   set_saturate(true, bld.MOV(dst1, dst0));
+
+   /* A saturating MOV of (abs)dst0: expect no change.
+    * = Before =
+    * 0: add(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  (abs)dst0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, producer_saturates)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg dst2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   set_saturate(true, bld.ADD(dst0, src0, src1));
+   set_saturate(true, bld.MOV(dst1, dst0));
+   bld.MOV(dst2, dst0);
+
+   /* The ADD already saturates, so the MOV's saturate is expected to be dropped.
+    * = Before =
+    * 0: add.sat(8)    dst0  src0  src1
+    * 1: mov.sat(8)    dst1  dst0
+    * 2: mov(8)        dst2  dst0
+    *
+    * = After =
+    * 0: add.sat(8)    dst0  src0  src1
+    * 1: mov(8)        dst1  dst0
+    * 2: mov(8)        dst2  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+}
+
+TEST_F(saturate_propagation_test, intervening_saturating_copy)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg dst2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.ADD(dst0, src0, src1);
+   set_saturate(true, bld.MOV(dst1, dst0));
+   set_saturate(true, bld.MOV(dst2, dst0));
+
+   /* Both readers of dst0 are saturating MOVs: the saturate moves onto the ADD.
+    * = Before =
+    * 0: add(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  dst0
+    * 2: mov.sat(8)    dst2  dst0
+    *
+    * = After =
+    * 0: add.sat(8)    dst0  src0  src1
+    * 1: mov(8)        dst1  dst0
+    * 2: mov(8)        dst2  dst0
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_TRUE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_FALSE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, intervening_dest_write)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::vec4_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   fs_reg src2 = v->vgrf(glsl_type::vec2_type);
+   bld.ADD(offset(dst0, bld, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dst0, src2)
+      ->size_written = 4 * REG_SIZE; /* four registers from dst0+0 on */
+   set_saturate(true, bld.MOV(dst1, offset(dst0, bld, 2)));
+
+   /* A TEX rewrites dst0+2 between the ADD and the MOV.sat: expect no change.
+    * = Before =
+    * 0: add(8)        dst0+2  src0    src1
+    * 1: tex(8) rlen 4 dst0+0  src2
+    * 2: mov.sat(8)    dst1    dst0+2
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+   EXPECT_FALSE(instruction(block0, 1)->saturate); /* the TEX, not the ADD again */
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
+
+TEST_F(saturate_propagation_test, mul_neg_mov_sat_mov_sat)
+{
+   const fs_builder &bld = v->bld;
+   fs_reg dst0 = v->vgrf(glsl_type::float_type);
+   fs_reg dst1 = v->vgrf(glsl_type::float_type);
+   fs_reg dst2 = v->vgrf(glsl_type::float_type);
+   fs_reg src0 = v->vgrf(glsl_type::float_type);
+   fs_reg src1 = v->vgrf(glsl_type::float_type);
+   bld.MUL(dst0, src0, src1);
+   dst0.negate = true; /* the first MOV reads -dst0 ... */
+   set_saturate(true, bld.MOV(dst1, dst0));
+   dst0.negate = false; /* ... and the second reads dst0 unmodified */
+   set_saturate(true, bld.MOV(dst2, dst0));
+
+   /* dst0 is read negated and then plain by saturating MOVs: expect no change.
+    * = Before =
+    * 0: mul(8)        dst0  src0  src1
+    * 1: mov.sat(8)    dst1  -dst0
+    * 2: mov.sat(8)    dst2  dst0
+    *
+    * = After =
+    * (no changes)
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(saturate_propagation(v));
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_FALSE(instruction(block0, 0)->saturate);
+   EXPECT_FALSE(instruction(block0, 0)->src[0].negate); /* operand mul_neg_mov_sat flips */
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 1)->opcode);
+   EXPECT_TRUE(instruction(block0, 1)->saturate);
+   EXPECT_TRUE(instruction(block0, 1)->src[0].negate);
+   EXPECT_EQ(BRW_OPCODE_MOV, instruction(block0, 2)->opcode);
+   EXPECT_TRUE(instruction(block0, 2)->saturate);
+}
diff --git a/src/intel/compiler/test_vec4_cmod_propagation.cpp b/src/intel/compiler/test_vec4_cmod_propagation.cpp
new file mode 100644
index 00000000000..7d9792b4a55
--- /dev/null
+++ b/src/intel/compiler/test_vec4_cmod_propagation.cpp
@@ -0,0 +1,823 @@
+/*
+ * Copyright © 2015 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Based on test_fs_cmod_propagation.cpp
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "brw_vec4_builder.h"
+#include "brw_cfg.h"
+#include "program/program.h"
+
+using namespace brw;
+
+class cmod_propagation_test : public ::testing::Test { /* fixture for the TEST_F cases in this file */
+   virtual void SetUp(); /* allocates the state below and builds v */
+
+public:
+   struct brw_compiler *compiler; /* calloc'ed in SetUp(); its devinfo points at the member below */
+   struct gen_device_info *devinfo; /* calloc'ed in SetUp(), which sets gen to 4 */
+   struct gl_context *ctx; /* calloc'ed in SetUp(); not read by the tests in this file */
+   struct gl_shader_program *shader_prog; /* not read by the tests in this file */
+   struct brw_vue_prog_data *prog_data; /* calloc'ed in SetUp() and handed to the visitor */
+   vec4_visitor *v; /* visitor under test, constructed in SetUp() */
+};
+
+class cmod_propagation_vec4_visitor : public vec4_visitor /* concrete subclass the fixture can instantiate */
+{
+public:
+   cmod_propagation_vec4_visitor(struct brw_compiler *compiler,
+                                 nir_shader *shader,
+                                 struct brw_vue_prog_data *prog_data)
+      : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+                     false /* no_spills, as labelled in the copy-propagation test */, -1)
+      {
+         prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; /* only prog_data field set here */
+      }
+
+protected:
+   /* Dummy implementations for pure virtual methods; none is expected to run. */
+   virtual dst_reg *make_reg_for_system_value(int location)
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void setup_payload()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_prolog()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_program_code()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_thread_end()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_urb_write_header(int mrf)
+   {
+      unreachable("Not reached");
+   }
+
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+   {
+      unreachable("Not reached");
+   }
+};
+
+
+void cmod_propagation_test::SetUp()
+{
+   /* Zeroed stand-ins for the visitor's inputs; nothing here is freed (no TearDown). */
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
+   shader_prog = NULL; /* not read by the tests; avoid an indeterminate pointer */
+   devinfo->gen = 4; /* set before the visitor is built so its constructor sees gen 4 */
+   compiler->devinfo = devinfo;
+
+   nir_shader *shader =
+      nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL);
+   v = new cmod_propagation_vec4_visitor(compiler, shader, prog_data);
+}
+
+/* Return the num-th instruction of block (0-based), following ->next links. */
+static vec4_instruction *
+instruction(bblock_t *block, int num)
+{
+   vec4_instruction *cur = (vec4_instruction *)block->start();
+   for (int left = num; left > 0; left--)
+      cur = (vec4_instruction *)cur->next;
+   return cur;
+}
+
+/* Run vec4 cmod propagation on v and return whether it reported progress.
+ * When TEST_DEBUG is set in the environment, dump the IR before and after. */
+static bool
+cmod_propagation(vec4_visitor *v)
+{
+   const bool verbose = getenv("TEST_DEBUG") != NULL;
+
+   if (verbose) {
+      fputs("= Before =\n", stderr);
+      v->dump_instructions();
+   }
+
+   const bool progress = v->opt_cmod_propagation();
+   if (verbose) {
+      fputs("\n= After =\n", stderr);
+      v->dump_instructions();
+   }
+   return progress;
+}
+
+TEST_F(cmod_propagation_test, basic)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X; /* the null destination writes only .x (null.x in the listing) */
+
+   bld.ADD(dest, src0, src1);
+   bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (a CMP of the ADD result against 0.0f)
+    *
+    * 0: add        dest.x  src0.xxxx  src1.xxxx
+    * 1: cmp.ge.f0  null.x  dest.xxxx  0.0f
+    *
+    * = After =  (the CMP is gone and the ADD carries its conditional mod)
+    * 0: add.ge.f0  dest.x  src0.xxxx  src1.xxxx
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, basic_different_dst_writemask)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = bld.null_reg_f(); /* writemask left untouched, unlike in "basic" */
+
+   bld.ADD(dest, src0, src1);
+   bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (the ADD writes dest.x, the CMP writes null.xyzw)
+    *
+    * 0: add        dest.x     src0  src1
+    * 1: cmp.ge.f0  null.xyzw  dest  0.0f
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andz_one)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::int_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg one(brw_imm_d(1));
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_Z,
+               bld.AND(bld.null_reg_d(), src_reg(dest), one));
+
+   /* = Before =  (the float CMP result is ANDed with 1 under a .z cond mod)
+    * 0: cmp.l.f0     dest:F  src0:F  0F
+    * 1: and.z.f0     null:D  dest:D  1D
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_EQ, instruction(block0, 1)->conditional_mod); /* built with _Z above; presumably the same value as _EQ -- confirm */
+}
+
+TEST_F(cmod_propagation_test, non_cmod_instruction)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::uint_type);
+   src_reg src0 = src_reg(v, glsl_type::uint_type);
+   src_reg zero(brw_imm_ud(0u));
+   bld.FBL(dest, src0);
+   bld.CMP(bld.null_reg_ud(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (dest is produced by FBL rather than by an ALU op such as ADD)
+    *
+    * 0: fbl        dest  src0
+    * 1: cmp.ge.f0  null  dest  0u
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_FBL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_write)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src2, zero, BRW_CONDITIONAL_GE); /* unrelated CMP on src2 in between */
+   bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (another flag-writing CMP sits between the ADD and its CMP)
+    *
+    * 0: add        dest  src0  src1
+    * 1: cmp.ge.f0  null  src2  0.0f
+    * 2: cmp.ge.f0  null  dest  0.0f
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest0 = dst_reg(v, glsl_type::float_type);
+   dst_reg dest1 = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest0, src0, src1);
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero)); /* predicated, i.e. reads the flag */
+   bld.CMP(bld.null_reg_f(), src_reg(dest0), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (a predicated SEL sits between the ADD and its CMP)
+    *
+    * 0: add        dest0 src0  src1
+    * 1: (+f0) sel  dest1 src2  0.0f
+    * 2: cmp.ge.f0  null  dest0 0.0f
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_dest_write)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::vec2_type);
+   src_reg zero(brw_imm_f(0.0f));
+   bld.ADD(offset(dest, 8, 2), src0, src1);
+   bld.emit(SHADER_OPCODE_TEX, dest, src2)
+      ->size_written = 4 * REG_SIZE; /* "rlen 4" in the listing: the write spans dest+2 */
+   bld.CMP(bld.null_reg_f(), offset(src_reg(dest), 8, 2), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (a TEX overwrites dest+2 between the ADD and its CMP)
+    *
+    * 0: add        dest+2  src0    src1
+    * 1: tex rlen 4 dest+0  src2
+    * 2: cmp.ge.f0  null    dest+2  0.0f
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(2, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(SHADER_OPCODE_TEX, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 1)->conditional_mod); /* check the TEX itself, not the ADD again */
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 2)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 2)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, intervening_flag_read_same_value)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest0 = dst_reg(v, glsl_type::float_type);
+   dst_reg dest1 = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg src2 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   set_condmod(BRW_CONDITIONAL_GE, bld.ADD(dest0, src0, src1)); /* the ADD already sets the flag with .ge */
+   set_predicate(BRW_PREDICATE_NORMAL, bld.SEL(dest1, src2, zero));
+   bld.CMP(dest_null, src_reg(dest0), zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (the trailing CMP repeats the condition the ADD already has)
+    *
+    * 0: add.ge.f0  dest0   src0  src1
+    * 1: (+f0) sel  dest1   src2  0.0f
+    * 2: cmp.ge.f0  null.x  dest0 0.0f
+    *
+    * = After =  (the CMP is removed; the ADD and the predicated SEL remain)
+    * 0: add.ge.f0  dest0 src0  src1
+    * 1: (+f0) sel  dest1 src2  0.0f
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(2, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_SEL, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_PREDICATE_NORMAL, instruction(block0, 1)->predicate);
+}
+
+TEST_F(cmod_propagation_test, negate)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   src_reg tmp_src = src_reg(dest);
+   tmp_src.negate = true; /* the CMP reads -dest */
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+   bld.CMP(dest_null, tmp_src, zero, BRW_CONDITIONAL_GE);
+
+   /* = Before =  (the CMP tests the negated ADD result against 0.0f)
+    *
+    * 0: add        dest     src0  src1
+    * 1: cmp.ge.f0  null.x  -dest 0.0f
+    *
+    * = After =  (-dest >= 0 is dest <= 0, so the ADD ends up with .le)
+    * 0: add.le.f0  dest     src0  src1
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+   ASSERT_EQ(0, block0->start_ip); /* ASSERT, like the other tests in this file */
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_LE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, movnz)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::float_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg src1 = src_reg(v, glsl_type::float_type);
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X;
+
+   bld.CMP(dest, src0, src1, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(dest_null, src_reg(dest)));
+
+   /* = Before =  (a MOV.nz to null re-tests the value the CMP just wrote)
+    *
+    * 0: cmp.l.f0  dest:F  src0:F  src1:F
+    * 1: mov.nz.f0 null.x  dest:F
+    *
+    * = After =  (the MOV is removed; the CMP keeps its .l conditional mod)
+    * 0: cmp.l.f0  dest  src0:F  src1:F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, different_types_cmod_with_zero)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::int_type);
+   src_reg src0 = src_reg(v, glsl_type::int_type);
+   src_reg src1 = src_reg(v, glsl_type::int_type);
+   src_reg zero(brw_imm_f(0.0f));
+   bld.ADD(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), retype(src_reg(dest), BRW_REGISTER_TYPE_F), zero,
+           BRW_CONDITIONAL_GE); /* reads the integer ADD result retyped as float */
+
+   /* = Before =  (the ADD is :D but the CMP reads its result as :F)
+    *
+    * 0: add        dest:D  src0:D  src1:D
+    * 1: cmp.ge.f0  null:F  dest:F  0.0f
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_ADD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, andnz_non_one)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::int_type);
+   src_reg src0 = src_reg(v, glsl_type::float_type);
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg nonone(brw_imm_d(38)); /* any AND mask other than 1 */
+
+   bld.CMP(retype(dest, BRW_REGISTER_TYPE_F), src0, zero, BRW_CONDITIONAL_L);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.AND(bld.null_reg_d(), src_reg(dest), nonone));
+
+   /* = Before =  (the float CMP result is ANDed with 38 under a .nz cond mod)
+    * 0: cmp.l.f0     dest:F  src0:F  0F
+    * 1: and.nz.f0    null:D  dest:D  38D
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_AND, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+/* Note that "basic" uses glsl_type::float_type operands, while this test uses
+ * glsl_type::vec4_type. */
+TEST_F(cmod_propagation_test, basic_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg zero(brw_imm_f(0.0f));
+
+   bld.MUL(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), src_reg(dest), zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =  (the MUL and the CMP both cover all four channels)
+    * 0: mul         dest.xyzw  src0.xyzw  src1.xyzw
+    * 1: cmp.nz.f0.0 null.xyzw  dest.xyzw  0.0f
+    *
+    * = After =  (the CMP is gone and the MUL carries .nz)
+    * 0: mul.nz.f0.0 dest.xyzw  src0.xyzw  src1.xyzw
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, basic_vec4_different_dst_writemask)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_X; /* the MUL writes .x only */
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg zero(brw_imm_f(0.0f));
+   dst_reg dest_null = bld.null_reg_f(); /* writemask left untouched */
+
+   bld.MUL(dest, src0, src1);
+   bld.CMP(dest_null, src_reg(dest), zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =  (the MUL writes dest.x; the CMP's null dst keeps its own mask)
+    * 0: mul         dest.x  src0  src1
+    * 1: cmp.nz.f0.0 null    dest  0.0f
+    *
+    * = After =
+    * (no changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mad_one_component_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_X; /* the MAD writes a single channel */
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg src2 = src_reg(v, glsl_type::vec4_type);
+   src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
+   src2.negate = true;
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg tmp(dest);
+   tmp.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = bld.null_reg_f();
+   dest_null.writemask = WRITEMASK_X; /* and the CMP's null dst is narrowed to that channel too */
+
+   bld.MAD(dest, src0, src1, src2);
+   bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L);
+
+   /* = Before =  (MAD and CMP both write .x only)
+    *
+    * 0: mad         dest.x:F  src0.xxxx:F  src1.xxxx:F  -src2.xxxx:F
+    * 1: cmp.l.f0.0  null.x:F  dest.xxxx:F  0.0f
+    *
+    * = After =  (the CMP is gone and the MAD carries .l)
+    * 0: mad.l.f0    dest.x:F  src0.xxxx:F  src1.xxxx:F  -src2.xxxx:F
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mad_more_one_component_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   dest.writemask = WRITEMASK_XW; /* the MAD writes two channels */
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg src2 = src_reg(v, glsl_type::vec4_type);
+   src0.swizzle = src1.swizzle = src2.swizzle = BRW_SWIZZLE_XXXX;
+   src2.negate = true;
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg tmp(dest);
+   tmp.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = bld.null_reg_f(); /* writemask left untouched */
+
+   bld.MAD(dest, src0, src1, src2);
+   bld.CMP(dest_null, tmp, zero, BRW_CONDITIONAL_L);
+
+   /* = Before =  (the MAD writes .xw; the CMP's null dst keeps its own mask)
+    *
+    * 0: mad         dest.xw:F  src0.xxxx:F  src1.xxxx:F  -src2.xxxx:F
+    * 1: cmp.l.f0.0  null:F  dest.xxxx:F  0.0f
+    *
+    * = After =
+    * (No changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MAD, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_L, instruction(block0, 1)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, cmp_mov_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::ivec4_type);
+   dest.writemask = WRITEMASK_X;
+   src_reg src0 = src_reg(v, glsl_type::ivec4_type);
+   src0.swizzle = BRW_SWIZZLE_XXXX;
+   src0.file = UNIFORM; /* shown as "u" in the listing below */
+   src_reg nonone = retype(brw_imm_d(16), BRW_REGISTER_TYPE_D);
+   src_reg mov_src = src_reg(dest);
+   mov_src.swizzle = BRW_SWIZZLE_XXXX;
+   dst_reg dest_null = bld.null_reg_d();
+   dest_null.writemask = WRITEMASK_X;
+
+   bld.CMP(dest, src0, nonone, BRW_CONDITIONAL_GE);
+   set_condmod(BRW_CONDITIONAL_NZ,
+               bld.MOV(dest_null, mov_src));
+
+   /* = Before =  (integer variant of movnz: MOV.nz re-tests the CMP result)
+    *
+    * 0: cmp.ge.f0  dest.x:D  u.xxxx:D  16D
+    * 1: mov.nz.f0  null.x:D  dest.xxxx:D
+    *
+    * = After =  (the MOV is removed; the CMP keeps its .ge conditional mod)
+    * 0: cmp.ge.f0  dest.x:D  u.xxxx:D  16D
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_TRUE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(0, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_GE, instruction(block0, 0)->conditional_mod);
+}
+
+TEST_F(cmod_propagation_test, mul_cmp_different_channels_vec4)
+{
+   const vec4_builder bld = vec4_builder(v).at_end();
+   dst_reg dest = dst_reg(v, glsl_type::vec4_type);
+   src_reg src0 = src_reg(v, glsl_type::vec4_type);
+   src_reg src1 = src_reg(v, glsl_type::vec4_type);
+   src_reg zero(brw_imm_f(0.0f));
+   src_reg cmp_src = src_reg(dest);
+   cmp_src.swizzle = BRW_SWIZZLE4(0,1,3,2); /* .xywz: the last two channels are swapped */
+
+   bld.MUL(dest, src0, src1);
+   bld.CMP(bld.null_reg_f(), cmp_src, zero, BRW_CONDITIONAL_NZ);
+
+   /* = Before =  (the CMP reads dest through a non-identity swizzle)
+    * 0: mul         dest  src0       src1
+    * 1: cmp.nz.f0.0 null  dest.xywz  0.0f
+    *
+    * = After =
+    * (No changes): the pass must report no progress.
+    */
+
+   v->calculate_cfg();
+   bblock_t *block0 = v->cfg->blocks[0];
+
+   EXPECT_EQ(0, block0->start_ip);
+   EXPECT_EQ(1, block0->end_ip);
+
+   EXPECT_FALSE(cmod_propagation(v));
+
+   ASSERT_EQ(0, block0->start_ip);
+   ASSERT_EQ(1, block0->end_ip);
+   EXPECT_EQ(BRW_OPCODE_MUL, instruction(block0, 0)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NONE, instruction(block0, 0)->conditional_mod);
+   EXPECT_EQ(BRW_OPCODE_CMP, instruction(block0, 1)->opcode);
+   EXPECT_EQ(BRW_CONDITIONAL_NZ, instruction(block0, 1)->conditional_mod);
+}
diff --git a/src/intel/compiler/test_vec4_copy_propagation.cpp b/src/intel/compiler/test_vec4_copy_propagation.cpp
new file mode 100644
index 00000000000..f4f91d8c8c7
--- /dev/null
+++ b/src/intel/compiler/test_vec4_copy_propagation.cpp
@@ -0,0 +1,181 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "program/program.h"
+
+using namespace brw;
+
+int ret = 0; /* NOTE(review): not referenced anywhere else in this file */
+
+class copy_propagation_test : public ::testing::Test { /* fixture for the TEST_F cases in this file */
+   virtual void SetUp(); /* allocates the state below and builds v */
+
+public:
+   struct brw_compiler *compiler; /* calloc'ed in SetUp(); its devinfo points at the member below */
+   struct gen_device_info *devinfo; /* calloc'ed in SetUp(), which sets gen to 4 */
+   struct gl_context *ctx; /* calloc'ed in SetUp(); not read by the tests in this file */
+   struct gl_shader_program *shader_prog; /* not read by the tests in this file */
+   struct brw_vue_prog_data *prog_data; /* calloc'ed in SetUp() and handed to the visitor */
+   vec4_visitor *v; /* visitor under test, constructed in SetUp() */
+};
+
+class copy_propagation_vec4_visitor : public vec4_visitor /* concrete subclass the fixture can instantiate */
+{
+public:
+   copy_propagation_vec4_visitor(struct brw_compiler *compiler,
+                                 nir_shader *shader,
+                                 struct brw_vue_prog_data *prog_data)
+      : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+                     false /* no_spills */, -1)
+   {
+      prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT; /* only prog_data field set here */
+   }
+
+protected: /* stub overrides; none is expected to run, hence unreachable() */
+   virtual dst_reg *make_reg_for_system_value(int location)
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void setup_payload()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_prolog()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_thread_end()
+   {
+      unreachable("Not reached");
+   }
+
+   virtual void emit_urb_write_header(int mrf)
+   {
+      unreachable("Not reached");
+   }
+
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+   {
+      unreachable("Not reached");
+   }
+};
+
+
+void copy_propagation_test::SetUp()
+{
+   /* Zeroed stand-ins for the visitor's inputs; nothing here is freed (no TearDown). */
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
+   shader_prog = NULL; /* not read by the tests; avoid an indeterminate pointer */
+   devinfo->gen = 4; /* set before the visitor is built so its constructor sees gen 4 */
+   compiler->devinfo = devinfo;
+
+   nir_shader *shader =
+      nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL);
+   v = new copy_propagation_vec4_visitor(compiler, shader, prog_data);
+}
+
+/* Build the CFG for v and run vec4 copy propagation on it.  Setting TEST_DEBUG
+ * in the environment dumps the instruction list before and after the pass. */
+static void
+copy_propagation(vec4_visitor *v)
+{
+   const bool print = getenv("TEST_DEBUG") != NULL;
+
+   if (print) {
+      fprintf(stderr, "instructions before:\n");
+      v->dump_instructions();
+   }
+   v->calculate_cfg();
+   v->opt_copy_propagation();
+   if (print) {
+      fprintf(stderr, "instructions after:\n");
+      v->dump_instructions();
+   }
+}
+
+TEST_F(copy_propagation_test, test_swizzle_swizzle)
+{
+   dst_reg a = dst_reg(v, glsl_type::vec4_type);
+   dst_reg b = dst_reg(v, glsl_type::vec4_type);
+   dst_reg c = dst_reg(v, glsl_type::vec4_type);
+
+   v->emit(v->ADD(a, src_reg(a), src_reg(a)));
+
+   v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_Y,
+                                                      SWIZZLE_Z,
+                                                      SWIZZLE_W,
+                                                      SWIZZLE_X)))); /* b = a.yzwx */
+
+   vec4_instruction *test_mov =
+      v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_Y,
+                                                 SWIZZLE_Z,
+                                                 SWIZZLE_W,
+                                                 SWIZZLE_X))); /* c = b.yzwx */
+   v->emit(test_mov);
+
+   copy_propagation(v);
+   /* b.yzwx of (a.yzwx) composes to a.zwxy, so test_mov should now read a. */
+   EXPECT_EQ(test_mov->src[0].nr, a.nr);
+   EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_Z,
+                                                    SWIZZLE_W,
+                                                    SWIZZLE_X,
+                                                    SWIZZLE_Y));
+}
+
+TEST_F(copy_propagation_test, test_swizzle_writemask)
+{
+   dst_reg a = dst_reg(v, glsl_type::vec4_type);
+   dst_reg b = dst_reg(v, glsl_type::vec4_type);
+   dst_reg c = dst_reg(v, glsl_type::vec4_type);
+
+   v->emit(v->MOV(b, swizzle(src_reg(a), BRW_SWIZZLE4(SWIZZLE_X,
+                                                      SWIZZLE_Y,
+                                                      SWIZZLE_X,
+                                                      SWIZZLE_Z)))); /* b = a.xyxz, so b.w holds a.z */
+
+   v->emit(v->MOV(writemask(a, WRITEMASK_XYZ), brw_imm_f(1.0f))); /* then a.xyz is overwritten */
+
+   vec4_instruction *test_mov =
+      v->MOV(c, swizzle(src_reg(b), BRW_SWIZZLE4(SWIZZLE_W,
+                                                 SWIZZLE_W,
+                                                 SWIZZLE_W,
+                                                 SWIZZLE_W))); /* c = b.wwww */
+   v->emit(test_mov);
+
+   copy_propagation(v);
+
+   /* should not copy propagate: a.z, which b.w was copied from, was overwritten */
+   EXPECT_EQ(test_mov->src[0].nr, b.nr);
+   EXPECT_EQ(test_mov->src[0].swizzle, BRW_SWIZZLE4(SWIZZLE_W,
+                                                    SWIZZLE_W,
+                                                    SWIZZLE_W,
+                                                    SWIZZLE_W));
+}
diff --git a/src/intel/compiler/test_vec4_register_coalesce.cpp b/src/intel/compiler/test_vec4_register_coalesce.cpp
new file mode 100644
index 00000000000..a3dbb0a72e4
--- /dev/null
+++ b/src/intel/compiler/test_vec4_register_coalesce.cpp
@@ -0,0 +1,242 @@
+/*
+ * Copyright © 2012 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include "brw_vec4.h"
+#include "program/program.h"
+
+using namespace brw;
+
+int ret = 0; /* NOTE(review): not referenced anywhere else in this file */
+/* Forwards the caller's __func__ as the label printed with the dumps. */
+#define register_coalesce(v) _register_coalesce(v, __func__)
+
+class register_coalesce_test : public ::testing::Test {
+   virtual void SetUp();
+   /* Fixture state; everything except shader_prog is assigned in SetUp(). */
+public:
+   struct brw_compiler *compiler;          /* zero-filled; devinfo is set */
+   struct gen_device_info *devinfo;        /* zero-filled, then gen = 4 */
+   struct gl_context *ctx;                 /* zero-filled */
+   struct gl_shader_program *shader_prog;  /* never assigned in this file */
+   struct brw_vue_prog_data *prog_data;    /* zero-filled; given to v */
+   vec4_visitor *v;                        /* visitor the tests emit into */
+};
+
+
+/**
+ * Concrete vec4_visitor used by the register coalescing tests.
+ *
+ * The constructor forwards compiler, prog_data and shader to the base
+ * class and passes NULL, false and -1 for its remaining arguments.
+ *
+ * Every hook overridden here is a stub whose whole body is
+ * unreachable("Not reached").
+ */
+class register_coalesce_vec4_visitor : public vec4_visitor
+{
+public:
+   register_coalesce_vec4_visitor(struct brw_compiler *compiler,
+                                  nir_shader *shader,
+                                  struct brw_vue_prog_data *prog_data)
+      : vec4_visitor(compiler, NULL, NULL, prog_data, shader, NULL,
+                     false /* no_spills */, -1)
+   {
+      /* The base class was handed prog_data above; the only field this
+       * constructor writes itself is the dispatch mode.
+       */
+      prog_data->dispatch_mode = DISPATCH_MODE_4X2_DUAL_OBJECT;
+   }
+
+protected:
+   virtual dst_reg *make_reg_for_system_value(int location)
+   { unreachable("Not reached"); }
+
+   virtual void setup_payload()
+   { unreachable("Not reached"); }
+
+   virtual void emit_prolog()
+   { unreachable("Not reached"); }
+
+   virtual void emit_thread_end()
+   { unreachable("Not reached"); }
+
+   virtual void emit_urb_write_header(int mrf)
+   { unreachable("Not reached"); }
+
+   virtual vec4_instruction *emit_urb_write_opcode(bool complete)
+   { unreachable("Not reached"); }
+};
+
+
+void register_coalesce_test::SetUp()
+{
+   /* Zero-filled fixture state.  NOTE(review): never freed in this file. */
+   prog_data = (struct brw_vue_prog_data *)calloc(1, sizeof(*prog_data));
+   devinfo = (struct gen_device_info *)calloc(1, sizeof(*devinfo));
+   compiler = (struct brw_compiler *)calloc(1, sizeof(*compiler));
+   ctx = (struct gl_context *)calloc(1, sizeof(*ctx));
+   compiler->devinfo = devinfo;
+
+   nir_shader *vs =
+      nir_shader_create(NULL, MESA_SHADER_VERTEX, NULL, NULL);
+   v = new register_coalesce_vec4_visitor(compiler, vs, prog_data);
+
+   devinfo->gen = 4; /* seen via compiler->devinfo, which aliases devinfo */
+}
+
+/* Calls calculate_cfg() and opt_register_coalesce() on v, optionally dumping. */
+static void
+_register_coalesce(vec4_visitor *v, const char *func)
+{
+   const bool dump = false; /* set to true for before/after dumps */
+
+   if (dump) {
+      printf("%s: instructions before:\n", func);
+      v->dump_instructions();
+   }
+   v->calculate_cfg();
+   v->opt_register_coalesce();
+   if (!dump)
+      return;
+
+   printf("%s: instructions after:\n", func);
+   v->dump_instructions();
+}
+
+TEST_F(register_coalesce_test, test_compute_to_mrf)
+{
+   /* mul temp, something, 1.0f ; mov m0.x, temp */
+   src_reg something = src_reg(v, glsl_type::float_type);
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+   dst_reg m0 = dst_reg(MRF, 0);
+   m0.writemask = WRITEMASK_X;
+   m0.type = BRW_REGISTER_TYPE_F;
+
+   vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
+   v->emit(v->MOV(m0, src_reg(temp)));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(mul->dst.file, MRF); /* the MUL should now write the MRF */
+}
+
+
+TEST_F(register_coalesce_test, test_multiple_use)
+{
+   /* temp is read by two MOVs, each one targeting a different MRF. */
+   src_reg something = src_reg(v, glsl_type::float_type);
+   dst_reg temp = dst_reg(v, glsl_type::vec4_type);
+
+   dst_reg m0 = dst_reg(MRF, 0);
+   m0.writemask = WRITEMASK_X;
+   m0.type = BRW_REGISTER_TYPE_F;
+
+   dst_reg m1 = dst_reg(MRF, 1);
+   m1.writemask = WRITEMASK_XYZW;
+   m1.type = BRW_REGISTER_TYPE_F;
+
+   src_reg src = src_reg(temp);
+   vec4_instruction *mul = v->emit(v->MUL(temp, something, brw_imm_f(1.0f)));
+   src.swizzle = BRW_SWIZZLE_XXXX;
+   v->emit(v->MOV(m0, src));
+   src.swizzle = BRW_SWIZZLE_XYZW;
+   v->emit(v->MOV(m1, src));
+
+   register_coalesce(v);
+
+   EXPECT_NE(mul->dst.file, MRF); /* the MUL must not be moved to an MRF */
+}
+
+TEST_F(register_coalesce_test, test_dp4_mrf)
+{
+   /* dp4 temp, src1, src2 ; mov m0.y, temp  =>  expect the DP4 on m0.y */
+   src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+   src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+   dst_reg m0 = dst_reg(MRF, 0);
+   m0.writemask = WRITEMASK_Y;
+   m0.type = BRW_REGISTER_TYPE_F;
+
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+   vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
+   v->emit(v->MOV(m0, src_reg(temp)));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(dp4->dst.file, MRF);
+   EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
+}
+
+TEST_F(register_coalesce_test, test_dp4_grf)
+{
+   /* dp4 temp, src1, src2 ; mov to.y, temp  =>  expect the DP4 on to.y */
+   src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+   src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+   dst_reg to = dst_reg(v, glsl_type::vec4_type);
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+   vec4_instruction *dp4 = v->emit(v->DP4(temp, some_src_1, some_src_2));
+   to.writemask = WRITEMASK_Y;
+   v->emit(v->MOV(to, src_reg(temp)));
+
+   /* if we don't do something with the result, the automatic dead code
+    * elimination will remove all our instructions.
+    */
+   src_reg src = src_reg(to);
+   src.negate = true;
+   v->emit(v->MOV(dst_reg(MRF, 0), src));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(dp4->dst.nr, to.nr);
+   EXPECT_EQ(dp4->dst.writemask, WRITEMASK_Y);
+}
+
+TEST_F(register_coalesce_test, test_channel_mul_grf)
+{
+   /* mul temp, src1, src2 ; mov to.y, temp  =>  expect the MUL to write to */
+   src_reg some_src_1 = src_reg(v, glsl_type::vec4_type);
+   src_reg some_src_2 = src_reg(v, glsl_type::vec4_type);
+
+   dst_reg to = dst_reg(v, glsl_type::vec4_type);
+   dst_reg temp = dst_reg(v, glsl_type::float_type);
+
+   vec4_instruction *mul = v->emit(v->MUL(temp, some_src_1, some_src_2));
+   to.writemask = WRITEMASK_Y;
+   v->emit(v->MOV(to, src_reg(temp)));
+
+   /* if we don't do something with the result, the automatic dead code
+    * elimination will remove all our instructions.
+    */
+   src_reg src = src_reg(to);
+   src.negate = true;
+   v->emit(v->MOV(dst_reg(MRF, 0), src));
+
+   register_coalesce(v);
+
+   EXPECT_EQ(mul->dst.nr, to.nr);
+}
diff --git a/src/intel/compiler/test_vf_float_conversions.cpp b/src/intel/compiler/test_vf_float_conversions.cpp
new file mode 100644
index 00000000000..7af97d0d097
--- /dev/null
+++ b/src/intel/compiler/test_vf_float_conversions.cpp
@@ -0,0 +1,110 @@
+/*
+ * Copyright © 2014 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include <gtest/gtest.h>
+#include <math.h>
+#include "brw_reg.h"
+
+class vf_float_conversion_test : public ::testing::Test {
+   virtual void SetUp(); /* fills vf_to_float before each test */
+
+public:
+   float vf_to_float[128]; /* reference float for VF encodings 0..127 */
+};
+
+void vf_float_conversion_test::SetUp() {
+   /* Encoding 0 does not follow the formula below: it is exactly 0.0. */
+   vf_to_float[0] = 0.0;
+
+   /* Bits 6:4 give the exponent (biased by 3), bits 3:0 the fraction. */
+   for (int enc = 1; enc < 128; ++enc) {
+      const int exponent = ((enc >> 4) & 0x7) - 3;
+      const int fraction = enc & 0xf;
+
+      /* value = (1 + fraction / 16) * 2^exponent */
+      const float significand = 1.0f + fraction / 16.0f;
+      vf_to_float[enc] = ldexpf(significand, exponent);
+   }
+}
+
+union fu {
+   float f;
+   unsigned u;
+};
+
+/* Bit pattern of f via union fu; distinguishes +0.0f from -0.0f. */
+static unsigned
+f2u(float f)
+{
+   union fu pun = { f };
+   return pun.u;
+}
+
+TEST_F(vf_float_conversion_test, test_vf_to_float)
+{
+   /* Encodings 128..255 are checked as the negation of 0..127. */
+   for (int vf = 0; vf < 256; ++vf) {
+      const float magnitude = vf_to_float[vf % 128];
+      const float expected = (vf > 127) ? -magnitude : magnitude;
+
+      EXPECT_EQ(f2u(expected), f2u(brw_vf_to_float(vf)));
+   }
+}
+
+TEST_F(vf_float_conversion_test, test_float_to_vf)
+{
+   /* Each reference value must convert back to its own encoding. */
+   for (int vf = 0; vf < 256; ++vf) {
+      const float magnitude = vf_to_float[vf % 128];
+      const float f = (vf > 127) ? -magnitude : magnitude;
+
+      EXPECT_EQ(vf, brw_float_to_vf(f));
+   }
+}
+
+TEST_F(vf_float_conversion_test, test_special_case_0)
+{
+   /* ±0.0f are special cased to the VFs that would otherwise correspond
+    * to ±0.125f, so ±0.125f must be rejected (-1 is expected below).
+    */
+   EXPECT_EQ(brw_float_to_vf(+0.125f), -1);
+   EXPECT_EQ(brw_float_to_vf(-0.125f), -1);
+   /* Each zero must round-trip bit-for-bit, sign included. */
+   EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(+0.0f))), f2u(+0.0f));
+   EXPECT_EQ(f2u(brw_vf_to_float(brw_float_to_vf(-0.0f))), f2u(-0.0f));
+}
+
+TEST_F(vf_float_conversion_test, test_nonrepresentable_float_input)
+{
+   EXPECT_EQ(brw_float_to_vf(+32.0f), -1); /* reference table tops out at 31 */
+   EXPECT_EQ(brw_float_to_vf(-32.0f), -1);
+   /* Reference values in [16, 32) are 1.0 apart. */
+   EXPECT_EQ(brw_float_to_vf(+16.5f), -1);
+   EXPECT_EQ(brw_float_to_vf(-16.5f), -1);
+   /* Reference values in [8, 16) are 0.5 apart. */
+   EXPECT_EQ(brw_float_to_vf(+8.25f), -1);
+   EXPECT_EQ(brw_float_to_vf(-8.25f), -1);
+   /* Reference values in [4, 8) are 0.25 apart. */
+   EXPECT_EQ(brw_float_to_vf(+4.125f), -1);
+   EXPECT_EQ(brw_float_to_vf(-4.125f), -1);
+}
-- 
cgit v1.2.3