Merge remote-tracking branch 'public/master' into vulkan

author: Jason Ekstrand <[email protected]> 2016-04-01 14:59:38 -0700
committer: Jason Ekstrand <[email protected]> 2016-04-01 15:16:21 -0700
commit: 95106f6bfbbb87b702e4bbba98e2eaea71924cd9 (patch)
tree: 9650d284ec7f7417b2bcf8a906dfa43dfc547cf7 /src
parent: cf2257069cbde19fd177a02c079206914aac5d14 (diff)
parent: 14c46954c910efb1db94a068a866c7259deaa9d9 (diff)
247 files changed, 8668 insertions, 3186 deletions
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 43377f18ad8..120ef2935a7 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -179,10 +179,10 @@ NIR_FILES = \
 	nir/nir_gather_info.c \
 	nir/nir_gs_count_vertices.c \
 	nir/nir_inline_functions.c \
-	nir/nir_intrinsics.c \
-	nir/nir_intrinsics.h \
 	nir/nir_instr_set.c \
 	nir/nir_instr_set.h \
+	nir/nir_intrinsics.c \
+	nir/nir_intrinsics.h \
 	nir/nir_liveness.c \
 	nir/nir_lower_alu_to_scalar.c \
 	nir/nir_lower_atomics.c \
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 727aa432631..7436edce88a 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -214,6 +214,7 @@ public:
       subexpressions[2] = NULL;
       primary_expression.identifier = identifier;
       this->non_lvalue_description = NULL;
+      this->is_lhs = false;
    }
 
    static const char *operator_string(enum ast_operators op);
@@ -263,6 +264,11 @@ public:
     * This pointer may be \c NULL.
     */
    const char *non_lvalue_description;
+
+   void set_is_lhs(bool new_value);
+
+private:
+   bool is_lhs;
 };
 
 class ast_expression_bin : public ast_expression {
@@ -556,6 +562,15 @@ struct ast_type_qualifier {
          unsigned explicit_stream:1; /**< stream value assigned explicitly by shader code */
          /** \} */
 
+         /** \name Layout qualifiers for GL_ARB_enhanced_layouts */
+         /** \{ */
+         unsigned explicit_xfb_offset:1; /**< xfb_offset value assigned explicitly by shader code */
+         unsigned xfb_buffer:1; /**< Has xfb_buffer value assigned  */
+         unsigned explicit_xfb_buffer:1; /**< xfb_buffer value assigned explicitly by shader code */
+         unsigned xfb_stride:1; /**< Is xfb_stride value yet to be merged with global values  */
+         unsigned explicit_xfb_stride:1; /**< xfb_stride value assigned explicitly by shader code */
+         /** \} */
+
 	 /** \name Layout qualifiers for GL_ARB_tessellation_shader */
 	 /** \{ */
 	 /* tess eval input layout */
@@ -612,6 +627,15 @@ struct ast_type_qualifier {
    /** Stream in GLSL 1.50 geometry shaders. */
    ast_expression *stream;
 
+   /** xfb_buffer specified via the GL_ARB_enhanced_layouts keyword. */
+   ast_expression *xfb_buffer;
+
+   /** xfb_stride specified via the GL_ARB_enhanced_layouts keyword. */
+   ast_expression *xfb_stride;
+
+   /** global xfb_stride values for each buffer */
+   ast_layout_expression *out_xfb_stride[MAX_FEEDBACK_BUFFERS];
+
    /**
     * Input or output primitive type in GLSL 1.50 geometry shaders
     * and tessellation shaders.
@@ -627,8 +651,9 @@ struct ast_type_qualifier {
    ast_expression *binding;
 
    /**
-    * Offset specified via GL_ARB_shader_atomic_counter's "offset"
-    * keyword.
+    * Offset specified via GL_ARB_shader_atomic_counter's or
+    * GL_ARB_enhanced_layouts "offset" keyword, or by GL_ARB_enhanced_layouts
+    * "xfb_offset" keyword.
     *
     * \note
     * This field is only valid if \c explicit_offset is set.
@@ -1199,4 +1224,10 @@ extern void _mesa_ast_process_interface_block(YYLTYPE *locp,
                                               ast_interface_block *const block,
                                               const struct ast_type_qualifier &q);
 
+extern bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value);
 #endif /* AST_H */
diff --git a/src/compiler/glsl/ast_function.cpp b/src/compiler/glsl/ast_function.cpp
index 1a440203cfc..db68d5dfa48 100644
--- a/src/compiler/glsl/ast_function.cpp
+++ b/src/compiler/glsl/ast_function.cpp
@@ -1727,6 +1727,10 @@ ast_function_expression::handle_method(exec_list *instructions,
    const char *method;
    method = field->primary_expression.identifier;
 
+   /* This would prevent to raise "uninitialized variable" warnings when
+    * calling array.length.
+    */
+   field->subexpressions[0]->set_is_lhs(true);
    op = field->subexpressions[0]->hir(instructions, state);
    if (strcmp(method, "length") == 0) {
       if (!this->expressions.is_empty()) {
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index 35def8e3ae0..3fe90079420 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -54,6 +54,7 @@
 #include "ast.h"
 #include "compiler/glsl_types.h"
 #include "program/hash_table.h"
+#include "main/macros.h"
 #include "main/shaderobj.h"
 #include "ir.h"
 #include "ir_builder.h"
@@ -819,7 +820,7 @@ validate_assignment(struct _mesa_glsl_parse_state *state,
     * if the expression indicating the vertex number is not the identifier
     * `gl_InvocationID`.
     */
-   if (state->stage == MESA_SHADER_TESS_CTRL) {
+   if (state->stage == MESA_SHADER_TESS_CTRL && !lhs->type->is_error()) {
       ir_variable *var = lhs->variable_referenced();
       if (var->data.mode == ir_var_shader_out && !var->data.patch) {
          ir_rvalue *index = find_innermost_array_index(lhs);
@@ -1248,6 +1249,24 @@ ast_expression::hir_no_rvalue(exec_list *instructions,
    do_hir(instructions, state, false);
 }
 
+void
+ast_expression::set_is_lhs(bool new_value)
+{
+   /* is_lhs is tracked only to print "variable used uninitialized" warnings,
+    * if we lack a identifier we can just skip it.
+    */
+   if (this->primary_expression.identifier == NULL)
+      return;
+
+   this->is_lhs = new_value;
+
+   /* We need to go through the subexpressions tree to cover cases like
+    * ast_field_selection
+    */
+   if (this->subexpressions[0] != NULL)
+      this->subexpressions[0]->set_is_lhs(new_value);
+}
+
 ir_rvalue *
 ast_expression::do_hir(exec_list *instructions,
                        struct _mesa_glsl_parse_state *state,
@@ -1323,6 +1342,7 @@ ast_expression::do_hir(exec_list *instructions,
       break;
 
    case ast_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
@@ -1592,6 +1612,7 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_div_assign:
    case ast_add_assign:
    case ast_sub_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
@@ -1618,6 +1639,7 @@ ast_expression::do_hir(exec_list *instructions,
    }
 
    case ast_mod_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
 
@@ -1640,6 +1662,7 @@ ast_expression::do_hir(exec_list *instructions,
 
    case ast_ls_assign:
    case ast_rs_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
       type = shift_result_type(op[0]->type, op[1]->type, this->oper, state,
@@ -1658,6 +1681,7 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_and_assign:
    case ast_xor_assign:
    case ast_or_assign: {
+      this->subexpressions[0]->set_is_lhs(true);
       op[0] = this->subexpressions[0]->hir(instructions, state);
       op[1] = this->subexpressions[1]->hir(instructions, state);
       type = bit_logic_result_type(op[0], op[1], this->oper, state, &loc);
@@ -1839,6 +1863,11 @@ ast_expression::do_hir(exec_list *instructions,
    case ast_array_index: {
       YYLTYPE index_loc = subexpressions[1]->get_location();
 
+      /* Getting if an array is being used uninitialized is beyond what we get
+       * from ir_value.data.assigned. Setting is_lhs as true would force to
+       * not raise a uninitialized warning when using an array
+       */
+      subexpressions[0]->set_is_lhs(true);
       op[0] = subexpressions[0]->hir(instructions, state);
       op[1] = subexpressions[1]->hir(instructions, state);
 
@@ -1873,6 +1902,14 @@ ast_expression::do_hir(exec_list *instructions,
       if (var != NULL) {
          var->data.used = true;
          result = new(ctx) ir_dereference_variable(var);
+
+         if ((var->data.mode == ir_var_auto || var->data.mode == ir_var_shader_out)
+             && !this->is_lhs
+             && result->variable_referenced()->data.assigned != true
+             && !is_gl_identifier(var->name)) {
+            _mesa_glsl_warning(&loc, state, "`%s' used uninitialized",
+                               this->primary_expression.identifier);
+         }
       } else {
          _mesa_glsl_error(& loc, state, "`%s' undeclared",
                           this->primary_expression.identifier);
@@ -2318,11 +2355,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
             return names[type_idx];
          }
          case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "samplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "samplerBuffer", NULL, NULL, NULL,
+              "imageBuffer", NULL, NULL, NULL
             };
-            return names[type_idx];
+            return names[offset + type_idx];
          }
          case GLSL_SAMPLER_DIM_EXTERNAL: {
             assert(type->base_type == GLSL_TYPE_SAMPLER);
@@ -2380,11 +2417,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
             return names[type_idx];
          }
          case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "isamplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "isamplerBuffer", NULL, NULL, NULL,
+              "iimageBuffer", NULL, NULL, NULL
             };
-            return names[type_idx];
+            return names[offset + type_idx];
          }
          default:
             unreachable("Unsupported isampler/iimage dimensionality");
@@ -2435,11 +2472,11 @@ get_type_name_for_precision_qualifier(const glsl_type *type)
             return names[type_idx];
          }
          case GLSL_SAMPLER_DIM_BUF: {
-            assert(type->base_type == GLSL_TYPE_SAMPLER);
-            static const char *const names[4] = {
-              "usamplerBuffer", NULL, NULL, NULL
+            static const char *const names[8] = {
+              "usamplerBuffer", NULL, NULL, NULL,
+              "uimageBuffer", NULL, NULL, NULL
             };
-            return names[type_idx];
+            return names[offset + type_idx];
          }
          default:
             unreachable("Unsupported usampler/uimage dimensionality");
@@ -2550,43 +2587,79 @@ validate_matrix_layout_for_type(struct _mesa_glsl_parse_state *state,
 }
 
 static bool
-process_qualifier_constant(struct _mesa_glsl_parse_state *state,
-                           YYLTYPE *loc,
-                           const char *qual_indentifier,
-                           ast_expression *const_expression,
-                           unsigned *value)
-{
-   exec_list dummy_instructions;
-
-   if (const_expression == NULL) {
-      *value = 0;
-      return true;
+validate_xfb_buffer_qualifier(YYLTYPE *loc,
+                              struct _mesa_glsl_parse_state *state,
+                              unsigned xfb_buffer) {
+   if (xfb_buffer >= state->Const.MaxTransformFeedbackBuffers) {
+      _mesa_glsl_error(loc, state,
+                       "invalid xfb_buffer specified %d is larger than "
+                       "MAX_TRANSFORM_FEEDBACK_BUFFERS - 1 (%d).",
+                       xfb_buffer,
+                       state->Const.MaxTransformFeedbackBuffers - 1);
+      return false;
    }
 
-   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+   return true;
+}
 
-   ir_constant *const const_int = ir->constant_expression_value();
-   if (const_int == NULL || !const_int->type->is_integer()) {
-      _mesa_glsl_error(loc, state, "%s must be an integral constant "
-                       "expression", qual_indentifier);
-      return false;
-   }
+/* From the ARB_enhanced_layouts spec:
+ *
+ *    "Variables and block members qualified with *xfb_offset* can be
+ *    scalars, vectors, matrices, structures, and (sized) arrays of these.
+ *    The offset must be a multiple of the size of the first component of
+ *    the first qualified variable or block member, or a compile-time error
+ *    results.  Further, if applied to an aggregate containing a double,
+ *    the offset must also be a multiple of 8, and the space taken in the
+ *    buffer will be a multiple of 8.
+ */
+static bool
+validate_xfb_offset_qualifier(YYLTYPE *loc,
+                              struct _mesa_glsl_parse_state *state,
+                              int xfb_offset, const glsl_type *type,
+                              unsigned component_size) {
+  const glsl_type *t_without_array = type->without_array();
 
-   if (const_int->value.i[0] < 0) {
-      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
-                       qual_indentifier, const_int->value.u[0]);
+   if (xfb_offset != -1 && type->is_unsized_array()) {
+      _mesa_glsl_error(loc, state,
+                       "xfb_offset can't be used with unsized arrays.");
       return false;
    }
 
-   /* If the location is const (and we've verified that
-    * it is) then no instructions should have been emitted
-    * when we converted it to HIR. If they were emitted,
-    * then either the location isn't const after all, or
-    * we are emitting unnecessary instructions.
+   /* Make sure nested structs don't contain unsized arrays, and validate
+    * any xfb_offsets on interface members.
     */
-   assert(dummy_instructions.is_empty());
+   if (t_without_array->is_record() || t_without_array->is_interface())
+      for (unsigned int i = 0; i < t_without_array->length; i++) {
+         const glsl_type *member_t = t_without_array->fields.structure[i].type;
+
+         /* When the interface block doesn't have an xfb_offset qualifier then
+          * we apply the component size rules at the member level.
+          */
+         if (xfb_offset == -1)
+            component_size = member_t->contains_double() ? 8 : 4;
+
+         int xfb_offset = t_without_array->fields.structure[i].offset;
+         validate_xfb_offset_qualifier(loc, state, xfb_offset, member_t,
+                                       component_size);
+      }
+
+  /* Nested structs or interface block without offset may not have had an
+   * offset applied yet so return.
+   */
+   if (xfb_offset == -1) {
+     return true;
+   }
+
+   if (xfb_offset % component_size) {
+      _mesa_glsl_error(loc, state,
+                       "invalid qualifier xfb_offset=%d must be a multiple "
+                       "of the first component size of the first qualified "
+                       "variable or block member. Or double if an aggregate "
+                       "that contains a double (%d).",
+                       xfb_offset, component_size);
+      return false;
+   }
 
-   *value = const_int->value.u[0];
    return true;
 }
 
@@ -3151,6 +3224,39 @@ apply_layout_qualifier_to_variable(const struct ast_type_qualifier *qual,
       }
    }
 
+   if (qual->flags.q.out && qual->flags.q.xfb_buffer) {
+      unsigned qual_xfb_buffer;
+      if (process_qualifier_constant(state, loc, "xfb_buffer",
+                                     qual->xfb_buffer, &qual_xfb_buffer) &&
+          validate_xfb_buffer_qualifier(loc, state, qual_xfb_buffer)) {
+         var->data.xfb_buffer = qual_xfb_buffer;
+         if (qual->flags.q.explicit_xfb_buffer)
+            var->data.explicit_xfb_buffer = true;
+      }
+   }
+
+   if (qual->flags.q.explicit_xfb_offset) {
+      unsigned qual_xfb_offset;
+      unsigned component_size = var->type->contains_double() ? 8 : 4;
+
+      if (process_qualifier_constant(state, loc, "xfb_offset",
+                                     qual->offset, &qual_xfb_offset) &&
+          validate_xfb_offset_qualifier(loc, state, (int) qual_xfb_offset,
+                                        var->type, component_size)) {
+         var->data.offset = qual_xfb_offset;
+         var->data.explicit_xfb_offset = true;
+      }
+   }
+
+   if (qual->flags.q.explicit_xfb_stride) {
+      unsigned qual_xfb_stride;
+      if (process_qualifier_constant(state, loc, "xfb_stride",
+                                     qual->xfb_stride, &qual_xfb_stride)) {
+         var->data.xfb_stride = qual_xfb_stride;
+         var->data.explicit_xfb_stride = true;
+      }
+   }
+
    if (var->type->contains_atomic()) {
       if (var->data.mode == ir_var_uniform) {
          if (var->data.explicit_binding) {
@@ -5746,6 +5852,11 @@ ast_switch_statement::test_to_hir(exec_list *instructions,
 {
    void *ctx = state;
 
+   /* set to true to avoid a duplicate "use of uninitialized variable" warning
+    * on the switch test case. The first one would be already raised when
+    * getting the test_expression at ast_switch_statement::hir
+    */
+   test_expression->set_is_lhs(true);
    /* Cache value of test expression. */
    ir_rvalue *const test_val =
       test_expression->hir(instructions,
@@ -6258,6 +6369,8 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           ir_variable_mode var_mode,
                                           ast_type_qualifier *layout,
                                           unsigned block_stream,
+                                          unsigned block_xfb_buffer,
+                                          unsigned block_xfb_offset,
                                           unsigned expl_location,
                                           unsigned expl_align)
 {
@@ -6413,6 +6526,35 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          }
       }
 
+      int xfb_buffer;
+      unsigned explicit_xfb_buffer = 0;
+      if (qual->flags.q.explicit_xfb_buffer) {
+         unsigned qual_xfb_buffer;
+         if (process_qualifier_constant(state, &loc, "xfb_buffer",
+                                        qual->xfb_buffer, &qual_xfb_buffer)) {
+            explicit_xfb_buffer = 1;
+            if (qual_xfb_buffer != block_xfb_buffer)
+               _mesa_glsl_error(&loc, state, "xfb_buffer layout qualifier on "
+                                "interface block member does not match "
+                                "the interface block (%u vs %u)",
+                                qual_xfb_buffer, block_xfb_buffer);
+         }
+         xfb_buffer = (int) qual_xfb_buffer;
+      } else {
+         if (layout)
+            explicit_xfb_buffer = layout->flags.q.xfb_buffer;
+         xfb_buffer = (int) block_xfb_buffer;
+      }
+
+      int xfb_stride = -1;
+      if (qual->flags.q.explicit_xfb_stride) {
+         unsigned qual_xfb_stride;
+         if (process_qualifier_constant(state, &loc, "xfb_stride",
+                                        qual->xfb_stride, &qual_xfb_stride)) {
+            xfb_stride = (int) qual_xfb_stride;
+         }
+      }
+
       if (qual->flags.q.uniform && qual->has_interpolation()) {
          _mesa_glsl_error(&loc, state,
                           "interpolation qualifiers cannot be used "
@@ -6458,6 +6600,10 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          fields[i].sample = qual->flags.q.sample ? 1 : 0;
          fields[i].patch = qual->flags.q.patch ? 1 : 0;
          fields[i].precision = qual->precision;
+         fields[i].offset = -1;
+         fields[i].explicit_xfb_buffer = explicit_xfb_buffer;
+         fields[i].xfb_buffer = xfb_buffer;
+         fields[i].xfb_stride = xfb_stride;
 
          if (qual->flags.q.explicit_location) {
             unsigned qual_location;
@@ -6520,8 +6666,6 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                    "with std430 and std140 layouts");
                }
             }
-         } else {
-            fields[i].offset = -1;
          }
 
          if (qual->flags.q.explicit_align || expl_align != 0) {
@@ -6554,6 +6698,32 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                next_offset = glsl_align(next_offset + size, align);
          }
 
+         /* From the ARB_enhanced_layouts spec:
+          *
+          *    "The given offset applies to the first component of the first
+          *    member of the qualified entity.  Then, within the qualified
+          *    entity, subsequent components are each assigned, in order, to
+          *    the next available offset aligned to a multiple of that
+          *    component's size.  Aggregate types are flattened down to the
+          *    component level to get this sequence of components."
+          */
+         if (qual->flags.q.explicit_xfb_offset) {
+            unsigned xfb_offset;
+            if (process_qualifier_constant(state, &loc, "xfb_offset",
+                                           qual->offset, &xfb_offset)) {
+               fields[i].offset = xfb_offset;
+               block_xfb_offset = fields[i].offset +
+                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
+            }
+         } else {
+            if (layout && layout->flags.q.explicit_xfb_offset) {
+               unsigned align = field_type->is_double() ? 8 : 4;
+               fields[i].offset = glsl_align(block_xfb_offset, align);
+               block_xfb_offset +=
+                  MAX2(xfb_stride, (int) (4 * field_type->component_slots()));
+            }
+         }
+
          /* Propogate row- / column-major information down the fields of the
           * structure or interface block.  Structures need this data because
           * the structure may contain a structure that contains ... a matrix
@@ -6648,6 +6818,8 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 ir_var_auto,
                                                 layout,
                                                 0, /* for interface only */
+                                                0, /* for interface only */
+                                                0, /* for interface only */
                                                 expl_location,
                                                 0 /* for interface only */);
 
@@ -6807,6 +6979,29 @@ ast_interface_block::hir(exec_list *instructions,
       return NULL;
    }
 
+   unsigned qual_xfb_buffer;
+   if (!process_qualifier_constant(state, &loc, "xfb_buffer",
+                                   layout.xfb_buffer, &qual_xfb_buffer) ||
+       !validate_xfb_buffer_qualifier(&loc, state, qual_xfb_buffer)) {
+      return NULL;
+   }
+
+   unsigned qual_xfb_offset;
+   if (layout.flags.q.explicit_xfb_offset) {
+      if (!process_qualifier_constant(state, &loc, "xfb_offset",
+                                      layout.offset, &qual_xfb_offset)) {
+         return NULL;
+      }
+   }
+
+   unsigned qual_xfb_stride;
+   if (layout.flags.q.explicit_xfb_stride) {
+      if (!process_qualifier_constant(state, &loc, "xfb_stride",
+                                      layout.xfb_stride, &qual_xfb_stride)) {
+         return NULL;
+      }
+   }
+
    unsigned expl_location = 0;
    if (layout.flags.q.explicit_location) {
       if (!process_qualifier_constant(state, &loc, "location",
@@ -6842,6 +7037,8 @@ ast_interface_block::hir(exec_list *instructions,
                                                 var_mode,
                                                 &this->layout,
                                                 qual_stream,
+                                                qual_xfb_buffer,
+                                                qual_xfb_offset,
                                                 expl_location,
                                                 expl_align);
 
@@ -6956,6 +7153,12 @@ ast_interface_block::hir(exec_list *instructions,
                earlier_per_vertex->fields.structure[j].patch;
             fields[i].precision =
                earlier_per_vertex->fields.structure[j].precision;
+            fields[i].explicit_xfb_buffer =
+               earlier_per_vertex->fields.structure[j].explicit_xfb_buffer;
+            fields[i].xfb_buffer =
+               earlier_per_vertex->fields.structure[j].xfb_buffer;
+            fields[i].xfb_stride =
+               earlier_per_vertex->fields.structure[j].xfb_stride;
          }
       }
 
@@ -6986,6 +7189,12 @@ ast_interface_block::hir(exec_list *instructions,
                                         packing,
                                         this->block_name);
 
+   unsigned component_size = block_type->contains_double() ? 8 : 4;
+   int xfb_offset =
+      layout.flags.q.explicit_xfb_offset ? (int) qual_xfb_offset : -1;
+   validate_xfb_offset_qualifier(&loc, state, xfb_offset, block_type,
+                                 component_size);
+
    if (!state->symbols->add_interface(block_type->name, block_type, var_mode)) {
       YYLTYPE loc = this->get_location();
       _mesa_glsl_error(&loc, state, "interface block `%s' with type `%s' "
@@ -7207,8 +7416,17 @@ ast_interface_block::hir(exec_list *instructions,
          var->data.patch = fields[i].patch;
          var->data.stream = qual_stream;
          var->data.location = fields[i].location;
+
          if (fields[i].location != -1)
             var->data.explicit_location = true;
+
+         var->data.explicit_xfb_buffer = fields[i].explicit_xfb_buffer;
+         var->data.xfb_buffer = fields[i].xfb_buffer;
+
+         if (fields[i].offset != -1)
+            var->data.explicit_xfb_offset = true;
+         var->data.offset = fields[i].offset;
+
          var->init_interface_type(block_type);
 
          if (var_mode == ir_var_shader_in || var_mode == ir_var_uniform)
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index 07ed4f2356c..c3d38cbbf8a 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -79,7 +79,10 @@ ast_type_qualifier::has_layout() const
           || this->flags.q.explicit_index
           || this->flags.q.explicit_binding
           || this->flags.q.explicit_offset
-          || this->flags.q.explicit_stream;
+          || this->flags.q.explicit_stream
+          || this->flags.q.explicit_xfb_buffer
+          || this->flags.q.explicit_xfb_offset
+          || this->flags.q.explicit_xfb_stride;
 }
 
 bool
@@ -229,6 +232,43 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
       }
    }
 
+   if (state->has_enhanced_layouts()) {
+      if (!this->flags.q.explicit_xfb_buffer) {
+         if (q.flags.q.xfb_buffer) {
+            this->flags.q.xfb_buffer = 1;
+            this->xfb_buffer = q.xfb_buffer;
+         } else if (!this->flags.q.xfb_buffer && this->flags.q.out) {
+            /* Assign global xfb_buffer value */
+            this->flags.q.xfb_buffer = 1;
+            this->xfb_buffer = state->out_qualifier->xfb_buffer;
+         }
+      }
+
+      if (q.flags.q.explicit_xfb_stride)
+         this->xfb_stride = q.xfb_stride;
+
+      /* Merge all we xfb_stride qualifiers into the global out */
+      if (q.flags.q.explicit_xfb_stride || this->flags.q.xfb_stride) {
+
+         /* Set xfb_stride flag to 0 to avoid adding duplicates every time
+          * there is a merge.
+          */
+         this->flags.q.xfb_stride = 0;
+
+         unsigned buff_idx;
+         if (process_qualifier_constant(state, loc, "xfb_buffer",
+                                        this->xfb_buffer, &buff_idx)) {
+            if (state->out_qualifier->out_xfb_stride[buff_idx]) {
+               state->out_qualifier->out_xfb_stride[buff_idx]->merge_qualifier(
+                  new(state) ast_layout_expression(*loc, this->xfb_stride));
+            } else {
+               state->out_qualifier->out_xfb_stride[buff_idx] =
+                  new(state) ast_layout_expression(*loc, this->xfb_stride);
+            }
+         }
+      }
+   }
+
    if (q.flags.q.vertices) {
       if (this->vertices) {
          this->vertices->merge_qualifier(q.vertices);
@@ -300,7 +340,7 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
    if (q.flags.q.explicit_binding)
       this->binding = q.binding;
 
-   if (q.flags.q.explicit_offset)
+   if (q.flags.q.explicit_offset || q.flags.q.explicit_xfb_offset)
       this->offset = q.offset;
 
    if (q.precision != ast_precision_none)
@@ -322,6 +362,8 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
 {
    void *mem_ctx = state;
    const bool r = this->merge_qualifier(loc, state, q, false);
+   ast_type_qualifier valid_out_mask;
+   valid_out_mask.flags.i = 0;
 
    if (state->stage == MESA_SHADER_GEOMETRY) {
       if (q.flags.q.prim_type) {
@@ -340,13 +382,45 @@ ast_type_qualifier::merge_out_qualifier(YYLTYPE *loc,
 
       /* Allow future assigments of global out's stream id value */
       this->flags.q.explicit_stream = 0;
+
+      valid_out_mask.flags.q.stream = 1;
+      valid_out_mask.flags.q.explicit_stream = 1;
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+      valid_out_mask.flags.q.max_vertices = 1;
+      valid_out_mask.flags.q.prim_type = 1;
    } else if (state->stage == MESA_SHADER_TESS_CTRL) {
       if (create_node) {
          node = new(mem_ctx) ast_tcs_output_layout(*loc);
       }
+      valid_out_mask.flags.q.vertices = 1;
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
+   } else if (state->stage == MESA_SHADER_TESS_EVAL ||
+              state->stage == MESA_SHADER_VERTEX) {
+      valid_out_mask.flags.q.explicit_xfb_buffer = 1;
+      valid_out_mask.flags.q.xfb_buffer = 1;
+      valid_out_mask.flags.q.explicit_xfb_stride = 1;
+      valid_out_mask.flags.q.xfb_stride = 1;
    } else {
       _mesa_glsl_error(loc, state, "out layout qualifiers only valid in "
-                       "tessellation control or geometry shaders");
+                       "geometry, tessellation and vertex shaders");
+      return false;
+   }
+
+   /* Allow future assigments of global out's */
+   this->flags.q.explicit_xfb_buffer = 0;
+   this->flags.q.explicit_xfb_stride = 0;
+
+   /* Generate an error when invalid input layout qualifiers are used. */
+   if ((q.flags.i & ~valid_out_mask.flags.i) != 0) {
+      _mesa_glsl_error(loc, state,
+		       "invalid output layout qualifiers used");
+      return false;
    }
 
    return r;
@@ -566,3 +640,44 @@ ast_layout_expression::process_qualifier_constant(struct _mesa_glsl_parse_state
 
    return true;
 }
+
+bool
+process_qualifier_constant(struct _mesa_glsl_parse_state *state,
+                           YYLTYPE *loc,
+                           const char *qual_indentifier,
+                           ast_expression *const_expression,
+                           unsigned *value)
+{
+   exec_list dummy_instructions;
+
+   if (const_expression == NULL) {
+      *value = 0;
+      return true;
+   }
+
+   ir_rvalue *const ir = const_expression->hir(&dummy_instructions, state);
+
+   ir_constant *const const_int = ir->constant_expression_value();
+   if (const_int == NULL || !const_int->type->is_integer()) {
+      _mesa_glsl_error(loc, state, "%s must be an integral constant "
+                       "expression", qual_indentifier);
+      return false;
+   }
+
+   if (const_int->value.i[0] < 0) {
+      _mesa_glsl_error(loc, state, "%s layout qualifier is invalid (%d < 0)",
+                       qual_indentifier, const_int->value.u[0]);
+      return false;
+   }
+
+   /* If the location is const (and we've verified that
+    * it is) then no instructions should have been emitted
+    * when we converted it to HIR. If they were emitted,
+    * then either the location isn't const after all, or
+    * we are emitting unnecessary instructions.
+    */
+   assert(dummy_instructions.is_empty());
+
+   *value = const_int->value.u[0];
+   return true;
+}
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index ff6b628eb64..65309fdc09c 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -130,12 +130,6 @@ v130_fs_only(const _mesa_glsl_parse_state *state)
 }
 
 static bool
-v140(const _mesa_glsl_parse_state *state)
-{
-   return state->is_version(140, 0);
-}
-
-static bool
 v140_or_es3(const _mesa_glsl_parse_state *state)
 {
    return state->is_version(140, 300);
@@ -184,6 +178,14 @@ v110_lod(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+texture_buffer(const _mesa_glsl_parse_state *state)
+{
+   return state->is_version(140, 320) ||
+      state->EXT_texture_buffer_enable ||
+      state->OES_texture_buffer_enable;
+}
+
+static bool
 shader_texture_lod(const _mesa_glsl_parse_state *state)
 {
    return state->ARB_shader_texture_lod_enable;
@@ -262,10 +264,12 @@ shader_packing_or_es31_or_gpu_shader5(const _mesa_glsl_parse_state *state)
 }
 
 static bool
-fs_gpu_shader5(const _mesa_glsl_parse_state *state)
+fs_interpolate_at(const _mesa_glsl_parse_state *state)
 {
    return state->stage == MESA_SHADER_FRAGMENT &&
-          (state->is_version(400, 0) || state->ARB_gpu_shader5_enable);
+          (state->is_version(400, 320) ||
+           state->ARB_gpu_shader5_enable ||
+           state->OES_shader_multisample_interpolation_enable);
 }
 
 
@@ -1581,9 +1585,9 @@ builtin_builder::create_builtins()
                 _textureSize(v130, glsl_type::ivec2_type, glsl_type::usampler2DRect_type),
                 _textureSize(v130, glsl_type::ivec2_type, glsl_type::sampler2DRectShadow_type),
 
-                _textureSize(v140, glsl_type::int_type,   glsl_type::samplerBuffer_type),
-                _textureSize(v140, glsl_type::int_type,   glsl_type::isamplerBuffer_type),
-                _textureSize(v140, glsl_type::int_type,   glsl_type::usamplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::samplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::isamplerBuffer_type),
+                _textureSize(texture_buffer, glsl_type::int_type,   glsl_type::usamplerBuffer_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::sampler2DMS_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::isampler2DMS_type),
                 _textureSize(texture_multisample, glsl_type::ivec2_type, glsl_type::usampler2DMS_type),
@@ -1855,9 +1859,9 @@ builtin_builder::create_builtins()
                 _texelFetch(v130, glsl_type::ivec4_type, glsl_type::isampler2DArray_type, glsl_type::ivec3_type),
                 _texelFetch(v130, glsl_type::uvec4_type, glsl_type::usampler2DArray_type, glsl_type::ivec3_type),
 
-                _texelFetch(v140, glsl_type::vec4_type,  glsl_type::samplerBuffer_type,  glsl_type::int_type),
-                _texelFetch(v140, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
-                _texelFetch(v140, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::vec4_type,  glsl_type::samplerBuffer_type,  glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::ivec4_type, glsl_type::isamplerBuffer_type, glsl_type::int_type),
+                _texelFetch(texture_buffer, glsl_type::uvec4_type, glsl_type::usamplerBuffer_type, glsl_type::int_type),
 
                 _texelFetch(texture_multisample, glsl_type::vec4_type,  glsl_type::sampler2DMS_type,  glsl_type::ivec2_type),
                 _texelFetch(texture_multisample, glsl_type::ivec4_type, glsl_type::isampler2DMS_type, glsl_type::ivec2_type),
@@ -5163,7 +5167,7 @@ builtin_builder::_interpolateAtCentroid(const glsl_type *type)
 {
    ir_variable *interpolant = in_var(type, "interpolant");
    interpolant->data.must_be_shader_input = 1;
-   MAKE_SIG(type, fs_gpu_shader5, 1, interpolant);
+   MAKE_SIG(type, fs_interpolate_at, 1, interpolant);
 
    body.emit(ret(interpolate_at_centroid(interpolant)));
 
@@ -5176,7 +5180,7 @@ builtin_builder::_interpolateAtOffset(const glsl_type *type)
    ir_variable *interpolant = in_var(type, "interpolant");
    interpolant->data.must_be_shader_input = 1;
    ir_variable *offset = in_var(glsl_type::vec2_type, "offset");
-   MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, offset);
+   MAKE_SIG(type, fs_interpolate_at, 2, interpolant, offset);
 
    body.emit(ret(interpolate_at_offset(interpolant, offset)));
 
@@ -5189,7 +5193,7 @@ builtin_builder::_interpolateAtSample(const glsl_type *type)
    ir_variable *interpolant = in_var(type, "interpolant");
    interpolant->data.must_be_shader_input = 1;
    ir_variable *sample_num = in_var(glsl_type::int_type, "sample_num");
-   MAKE_SIG(type, fs_gpu_shader5, 2, interpolant, sample_num);
+   MAKE_SIG(type, fs_interpolate_at, 2, interpolant, sample_num);
 
    body.emit(ret(interpolate_at_sample(interpolant, sample_num)));
 
diff --git a/src/compiler/glsl/builtin_types.cpp b/src/compiler/glsl/builtin_types.cpp
index ee24bd5e411..d250234f652 100644
--- a/src/compiler/glsl/builtin_types.cpp
+++ b/src/compiler/glsl/builtin_types.cpp
@@ -179,7 +179,7 @@ static const struct builtin_type_versions {
    T(sampler2DArray,                  130, 300)
    T(samplerCubeArray,                400, 999)
    T(sampler2DRect,                   140, 999)
-   T(samplerBuffer,                   140, 999)
+   T(samplerBuffer,                   140, 320)
    T(sampler2DMS,                     150, 310)
    T(sampler2DMSArray,                150, 999)
 
@@ -191,7 +191,7 @@ static const struct builtin_type_versions {
    T(isampler2DArray,                 130, 300)
    T(isamplerCubeArray,               400, 999)
    T(isampler2DRect,                  140, 999)
-   T(isamplerBuffer,                  140, 999)
+   T(isamplerBuffer,                  140, 320)
    T(isampler2DMS,                    150, 310)
    T(isampler2DMSArray,               150, 999)
 
@@ -203,7 +203,7 @@ static const struct builtin_type_versions {
    T(usampler2DArray,                 130, 300)
    T(usamplerCubeArray,               400, 999)
    T(usampler2DRect,                  140, 999)
-   T(usamplerBuffer,                  140, 999)
+   T(usamplerBuffer,                  140, 320)
    T(usampler2DMS,                    150, 310)
    T(usampler2DMSArray,               150, 999)
 
@@ -222,7 +222,7 @@ static const struct builtin_type_versions {
    T(image3D,                         420, 310)
    T(image2DRect,                     420, 999)
    T(imageCube,                       420, 310)
-   T(imageBuffer,                     420, 999)
+   T(imageBuffer,                     420, 320)
    T(image1DArray,                    420, 999)
    T(image2DArray,                    420, 310)
    T(imageCubeArray,                  420, 999)
@@ -233,7 +233,7 @@ static const struct builtin_type_versions {
    T(iimage3D,                        420, 310)
    T(iimage2DRect,                    420, 999)
    T(iimageCube,                      420, 310)
-   T(iimageBuffer,                    420, 999)
+   T(iimageBuffer,                    420, 320)
    T(iimage1DArray,                   420, 999)
    T(iimage2DArray,                   420, 310)
    T(iimageCubeArray,                 420, 999)
@@ -244,7 +244,7 @@ static const struct builtin_type_versions {
    T(uimage3D,                        420, 310)
    T(uimage2DRect,                    420, 999)
    T(uimageCube,                      420, 310)
-   T(uimageBuffer,                    420, 999)
+   T(uimageBuffer,                    420, 320)
    T(uimage1DArray,                   420, 999)
    T(uimage2DArray,                   420, 310)
    T(uimageCubeArray,                 420, 999)
@@ -371,6 +371,16 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
       add_type(symbols, glsl_type::uimage2DMSArray_type);
    }
 
+   if (state->EXT_texture_buffer_enable || state->OES_texture_buffer_enable) {
+      add_type(symbols, glsl_type::samplerBuffer_type);
+      add_type(symbols, glsl_type::isamplerBuffer_type);
+      add_type(symbols, glsl_type::usamplerBuffer_type);
+
+      add_type(symbols, glsl_type::imageBuffer_type);
+      add_type(symbols, glsl_type::iimageBuffer_type);
+      add_type(symbols, glsl_type::uimageBuffer_type);
+   }
+
    if (state->has_atomic_counters()) {
       add_type(symbols, glsl_type::atomic_uint_type);
    }
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index 4e2de37fbba..7d77f705356 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -334,6 +334,9 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].image_coherent = 0;
    this->fields[this->num_fields].image_volatile = 0;
    this->fields[this->num_fields].image_restrict = 0;
+   this->fields[this->num_fields].explicit_xfb_buffer = 0;
+   this->fields[this->num_fields].xfb_buffer = -1;
+   this->fields[this->num_fields].xfb_stride = -1;
    this->num_fields++;
 }
 
@@ -812,6 +815,13 @@ builtin_variable_generator::generate_constants()
        */
    }
 
+   if (state->has_enhanced_layouts()) {
+      add_const("gl_MaxTransformFeedbackBuffers",
+                state->Const.MaxTransformFeedbackBuffers);
+      add_const("gl_MaxTransformFeedbackInterleavedComponents",
+                state->Const.MaxTransformFeedbackInterleavedComponents);
+   }
+
    if (state->is_version(420, 310) ||
        state->ARB_shader_image_load_store_enable) {
       add_const("gl_MaxImageUnits",
@@ -868,6 +878,10 @@ builtin_variable_generator::generate_constants()
       add_const("gl_MaxTessControlUniformComponents", state->Const.MaxTessControlUniformComponents);
       add_const("gl_MaxTessEvaluationUniformComponents", state->Const.MaxTessEvaluationUniformComponents);
    }
+
+   if (state->is_version(450, 320) ||
+       state->OES_sample_variables_enable)
+      add_const("gl_MaxSamples", state->Const.MaxSamples);
 }
 
 
@@ -877,7 +891,9 @@ builtin_variable_generator::generate_constants()
 void
 builtin_variable_generator::generate_uniforms()
 {
-   if (state->is_version(400, 0) || state->ARB_sample_shading_enable)
+   if (state->is_version(400, 320) ||
+       state->ARB_sample_shading_enable ||
+       state->OES_sample_variables_enable)
       add_uniform(int_t, "gl_NumSamples");
    add_uniform(type("gl_DepthRangeParameters"), "gl_DepthRange");
    add_uniform(array(vec4_t, VERT_ATTRIB_MAX), "gl_CurrentAttribVertMESA");
@@ -1130,7 +1146,9 @@ builtin_variable_generator::generate_fs_special_vars()
          var->enable_extension_warning("GL_AMD_shader_stencil_export");
    }
 
-   if (state->is_version(400, 0) || state->ARB_sample_shading_enable) {
+   if (state->is_version(400, 320) ||
+       state->ARB_sample_shading_enable ||
+       state->OES_sample_variables_enable) {
       add_system_value(SYSTEM_VALUE_SAMPLE_ID, int_t, "gl_SampleID");
       add_system_value(SYSTEM_VALUE_SAMPLE_POS, vec2_t, "gl_SamplePosition");
       /* From the ARB_sample_shading specification:
@@ -1143,7 +1161,9 @@ builtin_variable_generator::generate_fs_special_vars()
       add_output(FRAG_RESULT_SAMPLE_MASK, array(int_t, 1), "gl_SampleMask");
    }
 
-   if (state->is_version(400, 0) || state->ARB_gpu_shader5_enable) {
+   if (state->is_version(400, 320) ||
+       state->ARB_gpu_shader5_enable ||
+       state->OES_sample_variables_enable) {
       add_system_value(SYSTEM_VALUE_SAMPLE_MASK_IN, array(int_t, 1), "gl_SampleMaskIn");
    }
 
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 007b70b020d..e8646c0ad32 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -2371,6 +2371,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	   if (extensions != NULL) {
 	      if (extensions->OES_EGL_image_external)
 	         add_builtin_define(parser, "GL_OES_EGL_image_external", 1);
+              if (extensions->OES_sample_variables) {
+                 add_builtin_define(parser, "GL_OES_sample_variables", 1);
+                 add_builtin_define(parser, "GL_OES_shader_multisample_interpolation", 1);
+              }
               if (extensions->OES_standard_derivatives)
                  add_builtin_define(parser, "GL_OES_standard_derivatives", 1);
               if (extensions->ARB_texture_multisample)
@@ -2390,6 +2394,10 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
                     add_builtin_define(parser, "GL_EXT_gpu_shader5", 1);
                     add_builtin_define(parser, "GL_OES_gpu_shader5", 1);
                  }
+                 if (extensions->OES_texture_buffer) {
+                    add_builtin_define(parser, "GL_EXT_texture_buffer", 1);
+                    add_builtin_define(parser, "GL_OES_texture_buffer", 1);
+                 }
               }
 	   }
 	} else {
diff --git a/src/compiler/glsl/glsl_lexer.ll b/src/compiler/glsl/glsl_lexer.ll
index 1f122654340..0b7695f8d3e 100644
--- a/src/compiler/glsl/glsl_lexer.ll
+++ b/src/compiler/glsl/glsl_lexer.ll
@@ -369,7 +369,7 @@ image2D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 image3D         KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE3D);
 image2DRect     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE2DRECT);
 imageCube       KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGECUBE);
-imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGEBUFFER);
+imageBuffer     KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IMAGEBUFFER);
 image1DArray    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGE1DARRAY);
 image2DArray    KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IMAGE2DARRAY);
 imageCubeArray  KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IMAGECUBEARRAY);
@@ -380,7 +380,7 @@ iimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 iimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE3D);
 iimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DRECT);
 iimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBE);
-iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGEBUFFER);
+iimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, IIMAGEBUFFER);
 iimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGE1DARRAY);
 iimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, IIMAGE2DARRAY);
 iimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, IIMAGECUBEARRAY);
@@ -391,7 +391,7 @@ uimage2D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_l
 uimage3D        KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE3D);
 uimage2DRect    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DRECT);
 uimageCube      KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBE);
-uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGEBUFFER);
+uimageBuffer    KEYWORD_WITH_ALT(130, 300, 420, 320, yyextra->ARB_shader_image_load_store_enable || yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, UIMAGEBUFFER);
 uimage1DArray   KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGE1DARRAY);
 uimage2DArray   KEYWORD_WITH_ALT(130, 300, 420, 310, yyextra->ARB_shader_image_load_store_enable, UIMAGE2DARRAY);
 uimageCubeArray KEYWORD_WITH_ALT(130, 300, 420, 0, yyextra->ARB_shader_image_load_store_enable, UIMAGECUBEARRAY);
@@ -472,6 +472,13 @@ layout		{
 \.[0-9]+([eE][+-]?[0-9]+)?[fF]?		|
 [0-9]+\.([eE][+-]?[0-9]+)?[fF]?		|
 [0-9]+[eE][+-]?[0-9]+[fF]?		{
+			    struct _mesa_glsl_parse_state *state = yyextra;
+			    char suffix = yytext[strlen(yytext) - 1];
+			    if (!state->is_version(120, 300) &&
+			        (suffix == 'f' || suffix == 'F')) {
+			        _mesa_glsl_error(yylloc, state,
+			                         "Float suffixes are invalid in GLSL 1.10");
+			    }
 			    yylval->real = _mesa_strtof(yytext, NULL);
 			    return FLOATCONSTANT;
 			}
@@ -565,19 +572,19 @@ common		KEYWORD(130, 300, 0, 0, COMMON);
 partition	KEYWORD(130, 300, 0, 0, PARTITION);
 active		KEYWORD(130, 300, 0, 0, ACTIVE);
 superp		KEYWORD(130, 100, 0, 0, SUPERP);
-samplerBuffer	KEYWORD(130, 300, 140, 0, SAMPLERBUFFER);
+samplerBuffer	KEYWORD_WITH_ALT(130, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, SAMPLERBUFFER);
 filter		KEYWORD(130, 300, 0, 0, FILTER);
 row_major	KEYWORD_WITH_ALT(130, 0, 140, 0, yyextra->ARB_uniform_buffer_object_enable && !yyextra->es_shader, ROW_MAJOR);
 
     /* Additional reserved words in GLSL 1.40 */
 isampler2DRect	KEYWORD(140, 300, 140, 0, ISAMPLER2DRECT);
 usampler2DRect	KEYWORD(140, 300, 140, 0, USAMPLER2DRECT);
-isamplerBuffer	KEYWORD(140, 300, 140, 0, ISAMPLERBUFFER);
-usamplerBuffer	KEYWORD(140, 300, 140, 0, USAMPLERBUFFER);
+isamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, ISAMPLERBUFFER);
+usamplerBuffer	KEYWORD_WITH_ALT(140, 300, 140, 320, yyextra->EXT_texture_buffer_enable || yyextra->OES_texture_buffer_enable, USAMPLERBUFFER);
 
     /* Additional reserved words in GLSL ES 3.00 */
 resource	KEYWORD(0, 300, 0, 0, RESOURCE);
-sample		KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_gpu_shader5_enable, SAMPLE);
+sample		KEYWORD_WITH_ALT(400, 300, 400, 320, yyextra->ARB_gpu_shader5_enable || yyextra->OES_shader_multisample_interpolation_enable, SAMPLE);
 subroutine	KEYWORD_WITH_ALT(400, 300, 400, 0, yyextra->ARB_shader_subroutine_enable, SUBROUTINE);
 
 
diff --git a/src/compiler/glsl/glsl_parser.yy b/src/compiler/glsl/glsl_parser.yy
index 5ed051a6705..1cecc09b8c8 100644
--- a/src/compiler/glsl/glsl_parser.yy
+++ b/src/compiler/glsl/glsl_parser.yy
@@ -1541,6 +1541,25 @@ layout_qualifier_id:
          }
       }
 
+      if (state->has_enhanced_layouts()) {
+         if (match_layout_qualifier("xfb_buffer", $1, state) == 0) {
+            $$.flags.q.xfb_buffer = 1;
+            $$.flags.q.explicit_xfb_buffer = 1;
+            $$.xfb_buffer = $3;
+         }
+
+         if (match_layout_qualifier("xfb_offset", $1, state) == 0) {
+            $$.flags.q.explicit_xfb_offset = 1;
+            $$.offset = $3;
+         }
+
+         if (match_layout_qualifier("xfb_stride", $1, state) == 0) {
+            $$.flags.q.xfb_stride = 1;
+            $$.flags.q.explicit_xfb_stride = 1;
+            $$.xfb_stride = $3;
+         }
+      }
+
       static const char * const local_size_qualifiers[3] = {
          "local_size_x",
          "local_size_y",
@@ -1915,6 +1934,12 @@ storage_qualifier:
           $$.flags.q.explicit_stream = 0;
           $$.stream = state->out_qualifier->stream;
       }
+
+      if (state->has_enhanced_layouts()) {
+          $$.flags.q.xfb_buffer = 1;
+          $$.flags.q.explicit_xfb_buffer = 0;
+          $$.xfb_buffer = state->out_qualifier->xfb_buffer;
+      }
    }
    | UNIFORM
    {
diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp
index 5d010fd47e5..2941277b525 100644
--- a/src/compiler/glsl/glsl_parser_extras.cpp
+++ b/src/compiler/glsl/glsl_parser_extras.cpp
@@ -140,6 +140,10 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxAtomicCounterBufferSize =
       ctx->Const.MaxAtomicBufferSize;
 
+   /* ARB_enhanced_layouts constants */
+   this->Const.MaxTransformFeedbackBuffers = ctx->Const.MaxTransformFeedbackBuffers;
+   this->Const.MaxTransformFeedbackInterleavedComponents = ctx->Const.MaxTransformFeedbackInterleavedComponents;
+
    /* Compute shader constants */
    for (unsigned i = 0; i < ARRAY_SIZE(this->Const.MaxComputeWorkGroupCount); i++)
       this->Const.MaxComputeWorkGroupCount[i] = ctx->Const.MaxComputeWorkGroupCount[i];
@@ -177,6 +181,9 @@ _mesa_glsl_parse_state::_mesa_glsl_parse_state(struct gl_context *_ctx,
    this->Const.MaxTessControlUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_CTRL].MaxUniformComponents;
    this->Const.MaxTessEvaluationUniformComponents = ctx->Const.Program[MESA_SHADER_TESS_EVAL].MaxUniformComponents;
 
+   /* GL 4.5 / OES_sample_variables */
+   this->Const.MaxSamples = ctx->Const.MaxSamples;
+
    this->current_function = NULL;
    this->toplevel_ir = NULL;
    this->found_return = false;
@@ -610,9 +617,12 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(OES_geometry_point_size,        false, true,      OES_geometry_shader),
    EXT(OES_geometry_shader,            false, true,      OES_geometry_shader),
    EXT(OES_gpu_shader5,                false, true,      ARB_gpu_shader5),
+   EXT(OES_sample_variables,           false, true,      OES_sample_variables),
    EXT(OES_shader_image_atomic,        false, true,      ARB_shader_image_load_store),
+   EXT(OES_shader_multisample_interpolation, false, true, OES_sample_variables),
    EXT(OES_standard_derivatives,       false, true,      OES_standard_derivatives),
    EXT(OES_texture_3D,                 false, true,      dummy_true),
+   EXT(OES_texture_buffer,             false, true,      OES_texture_buffer),
    EXT(OES_texture_storage_multisample_2d_array, false, true, ARB_texture_multisample),
 
    /* All other extensions go here, sorted alphabetically.
@@ -629,6 +639,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = {
    EXT(EXT_shader_integer_mix,         true,  true,      EXT_shader_integer_mix),
    EXT(EXT_shader_samples_identical,   true,  true,      EXT_shader_samples_identical),
    EXT(EXT_texture_array,              true,  false,     EXT_texture_array),
+   EXT(EXT_texture_buffer,             false, true,      OES_texture_buffer),
 };
 
 #undef EXT
@@ -935,6 +946,13 @@ _mesa_ast_process_interface_block(YYLTYPE *locp,
       block->layout.stream = state->out_qualifier->stream;
    }
 
+   if (state->has_enhanced_layouts() && block->layout.flags.q.out) {
+      /* Assign global layout's xfb_buffer value. */
+      block->layout.flags.q.xfb_buffer = 1;
+      block->layout.flags.q.explicit_xfb_buffer = 0;
+      block->layout.xfb_buffer = state->out_qualifier->xfb_buffer;
+   }
+
    foreach_list_typed (ast_declarator_list, member, link, &block->declarations) {
       ast_type_qualifier& qualifier = member->type->qualifier;
       if ((qualifier.flags.i & interface_type_mask) == 0) {
@@ -1206,6 +1224,7 @@ ast_expression::ast_expression(int oper,
    this->subexpressions[1] = ex1;
    this->subexpressions[2] = ex2;
    this->non_lvalue_description = NULL;
+   this->is_lhs = false;
 }
 
 
@@ -1583,13 +1602,12 @@ set_shader_inout_layout(struct gl_shader *shader,
 		     struct _mesa_glsl_parse_state *state)
 {
    /* Should have been prevented by the parser. */
-   if (shader->Stage == MESA_SHADER_TESS_CTRL) {
+   if (shader->Stage == MESA_SHADER_TESS_CTRL ||
+       shader->Stage == MESA_SHADER_VERTEX) {
       assert(!state->in_qualifier->flags.i);
-   } else if (shader->Stage == MESA_SHADER_TESS_EVAL) {
-      assert(!state->out_qualifier->flags.i);
-   } else if (shader->Stage != MESA_SHADER_GEOMETRY) {
+   } else if (shader->Stage != MESA_SHADER_GEOMETRY &&
+              shader->Stage != MESA_SHADER_TESS_EVAL) {
       assert(!state->in_qualifier->flags.i);
-      assert(!state->out_qualifier->flags.i);
    }
 
    if (shader->Stage != MESA_SHADER_COMPUTE) {
@@ -1606,6 +1624,17 @@ set_shader_inout_layout(struct gl_shader *shader,
       assert(!state->fs_early_fragment_tests);
    }
 
+   for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
+      if (state->out_qualifier->out_xfb_stride[i]) {
+         unsigned xfb_stride;
+         if (state->out_qualifier->out_xfb_stride[i]->
+                process_qualifier_constant(state, "xfb_stride", &xfb_stride,
+                true)) {
+            shader->TransformFeedback.BufferStride[i] = xfb_stride;
+         }
+      }
+   }
+
    switch (shader->Stage) {
    case MESA_SHADER_TESS_CTRL:
       shader->TessCtrl.VerticesOut = 0;
diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h
index 12a3a46928c..0cc2d259f3a 100644
--- a/src/compiler/glsl/glsl_parser_extras.h
+++ b/src/compiler/glsl/glsl_parser_extras.h
@@ -383,6 +383,10 @@ struct _mesa_glsl_parse_state {
       /* ARB_draw_buffers */
       unsigned MaxDrawBuffers;
 
+      /* ARB_enhanced_layouts */
+      unsigned MaxTransformFeedbackBuffers;
+      unsigned MaxTransformFeedbackInterleavedComponents;
+
       /* ARB_blend_func_extended */
       unsigned MaxDualSourceDrawBuffers;
 
@@ -457,6 +461,9 @@ struct _mesa_glsl_parse_state {
       unsigned MaxTessControlTotalOutputComponents;
       unsigned MaxTessControlUniformComponents;
       unsigned MaxTessEvaluationUniformComponents;
+
+      /* GL 4.5 / OES_sample_variables */
+      unsigned MaxSamples;
    } Const;
 
    /**
@@ -597,12 +604,18 @@ struct _mesa_glsl_parse_state {
    bool OES_geometry_shader_warn;
    bool OES_gpu_shader5_enable;
    bool OES_gpu_shader5_warn;
+   bool OES_sample_variables_enable;
+   bool OES_sample_variables_warn;
    bool OES_shader_image_atomic_enable;
    bool OES_shader_image_atomic_warn;
+   bool OES_shader_multisample_interpolation_enable;
+   bool OES_shader_multisample_interpolation_warn;
    bool OES_standard_derivatives_enable;
    bool OES_standard_derivatives_warn;
    bool OES_texture_3D_enable;
    bool OES_texture_3D_warn;
+   bool OES_texture_buffer_enable;
+   bool OES_texture_buffer_warn;
    bool OES_texture_storage_multisample_2d_array_enable;
    bool OES_texture_storage_multisample_2d_array_warn;
 
@@ -632,6 +645,8 @@ struct _mesa_glsl_parse_state {
    bool EXT_shader_samples_identical_warn;
    bool EXT_texture_array_enable;
    bool EXT_texture_array_warn;
+   bool EXT_texture_buffer_enable;
+   bool EXT_texture_buffer_warn;
    /*@}*/
 
    /** Extensions supported by the OpenGL implementation. */
diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h
index b74d68a605b..b1a1d5656d1 100644
--- a/src/compiler/glsl/ir.h
+++ b/src/compiler/glsl/ir.h
@@ -727,6 +727,21 @@ public:
       unsigned is_xfb_only:1;
 
       /**
+       * Was a transfor feedback buffer set in the shader?
+       */
+      unsigned explicit_xfb_buffer:1;
+
+      /**
+       * Was a transfor feedback offset set in the shader?
+       */
+      unsigned explicit_xfb_offset:1;
+
+      /**
+       * Was a transfor feedback stride set in the shader?
+       */
+      unsigned explicit_xfb_stride:1;
+
+      /**
        * If non-zero, then this variable may be packed along with other variables
        * into a single varying slot, so this offset should be applied when
        * accessing components.  For example, an offset of 1 means that the x
@@ -742,21 +757,9 @@ public:
 
       /**
        * Non-zero if this variable was created by lowering a named interface
-       * block which was not an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_array will never
-       * both be non-zero.
+       * block.
        */
-      unsigned from_named_ifc_block_nonarray:1;
-
-      /**
-       * Non-zero if this variable was created by lowering a named interface
-       * block which was an array.
-       *
-       * Note that this variable and \c from_named_ifc_block_nonarray will never
-       * both be non-zero.
-       */
-      unsigned from_named_ifc_block_array:1;
+      unsigned from_named_ifc_block:1;
 
       /**
        * Non-zero if the variable must be a shader input. This is useful for
@@ -873,7 +876,7 @@ public:
       unsigned stream;
 
       /**
-       * Atomic or block member offset.
+       * Atomic, transform feedback or block member offset.
        */
       unsigned offset;
 
@@ -885,6 +888,16 @@ public:
       unsigned max_array_access;
 
       /**
+       * Transform feedback buffer.
+       */
+      unsigned xfb_buffer;
+
+      /**
+       * Transform feedback stride.
+       */
+      unsigned xfb_stride;
+
+      /**
        * Allow (only) ir_variable direct access private members.
        */
       friend class ir_variable;
diff --git a/src/compiler/glsl/ir_uniform.h b/src/compiler/glsl/ir_uniform.h
index 1854279925b..e72e7b42c57 100644
--- a/src/compiler/glsl/ir_uniform.h
+++ b/src/compiler/glsl/ir_uniform.h
@@ -105,11 +105,6 @@ struct gl_uniform_storage {
     */
    unsigned array_elements;
 
-   /**
-    * Has this uniform ever been set?
-    */
-   bool initialized;
-
    struct gl_opaque_uniform_index opaque[MESA_SHADER_STAGES];
 
    /**
diff --git a/src/compiler/glsl/link_interface_blocks.cpp b/src/compiler/glsl/link_interface_blocks.cpp
index 4c6fb56f891..26072591b0e 100644
--- a/src/compiler/glsl/link_interface_blocks.cpp
+++ b/src/compiler/glsl/link_interface_blocks.cpp
@@ -242,7 +242,8 @@ public:
          return entry ? (ir_variable *) entry->data : NULL;
       } else {
          const struct hash_entry *entry =
-            _mesa_hash_table_search(ht, var->get_interface_type()->name);
+            _mesa_hash_table_search(ht,
+               var->get_interface_type()->without_array()->name);
          return entry ? (ir_variable *) entry->data : NULL;
       }
    }
@@ -263,7 +264,8 @@ public:
          snprintf(location_str, 11, "%d", var->data.location);
          _mesa_hash_table_insert(ht, ralloc_strdup(mem_ctx, location_str), var);
       } else {
-         _mesa_hash_table_insert(ht, var->get_interface_type()->name, var);
+         _mesa_hash_table_insert(ht,
+            var->get_interface_type()->without_array()->name, var);
       }
    }
 
diff --git a/src/compiler/glsl/link_uniform_initializers.cpp b/src/compiler/glsl/link_uniform_initializers.cpp
index 3609f81771e..870bc5bfebd 100644
--- a/src/compiler/glsl/link_uniform_initializers.cpp
+++ b/src/compiler/glsl/link_uniform_initializers.cpp
@@ -162,8 +162,6 @@ set_opaque_binding(void *mem_ctx, gl_shader_program *prog,
             }
          }
       }
-
-      storage->initialized = true;
    }
 }
 
@@ -183,7 +181,7 @@ set_block_binding(gl_shader_program *prog, const char *block_name, int binding)
 
          if (stage_index != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[i];
-            sh->BufferInterfaceBlocks[stage_index].Binding = binding;
+            sh->BufferInterfaceBlocks[stage_index]->Binding = binding;
          }
       }
 }
@@ -267,8 +265,6 @@ set_uniform_initializer(void *mem_ctx, gl_shader_program *prog,
          }
       }
    }
-
-   storage->initialized = true;
 }
 }
 
diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp
index 940cc61181d..0a230cad034 100644
--- a/src/compiler/glsl/link_uniforms.cpp
+++ b/src/compiler/glsl/link_uniforms.cpp
@@ -68,7 +68,7 @@ program_resource_visitor::process(const glsl_type *type, const char *name)
    unsigned packing = type->interface_packing;
 
    recursion(type, &name_copy, strlen(name), false, NULL, packing, false,
-             record_array_count);
+             record_array_count, NULL);
    ralloc_free(name_copy);
 }
 
@@ -76,8 +76,6 @@ void
 program_resource_visitor::process(ir_variable *var)
 {
    unsigned record_array_count = 1;
-   const glsl_type *t = var->type;
-   const glsl_type *t_without_array = var->type->without_array();
    const bool row_major =
       var->data.matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR;
 
@@ -85,80 +83,28 @@ program_resource_visitor::process(ir_variable *var)
       var->get_interface_type()->interface_packing :
       var->type->interface_packing;
 
+   const glsl_type *t =
+      var->data.from_named_ifc_block ? var->get_interface_type() : var->type;
+   const glsl_type *t_without_array = t->without_array();
+
    /* false is always passed for the row_major parameter to the other
     * processing functions because no information is available to do
     * otherwise.  See the warning in linker.h.
     */
-
-   /* Only strdup the name if we actually will need to modify it. */
-   if (var->data.from_named_ifc_block_array) {
-      /* lower_named_interface_blocks created this variable by lowering an
-       * interface block array to an array variable.  For example if the
-       * original source code was:
-       *
-       *     out Blk { vec4 bar } foo[3];
-       *
-       * Then the variable is now:
-       *
-       *     out vec4 bar[3];
-       *
-       * We need to visit each array element using the names constructed like
-       * so:
-       *
-       *     Blk[0].bar
-       *     Blk[1].bar
-       *     Blk[2].bar
-       */
-      assert(t->is_array());
-      const glsl_type *ifc_type = var->get_interface_type();
-      char *name = ralloc_strdup(NULL, ifc_type->name);
-      size_t name_length = strlen(name);
-      for (unsigned i = 0; i < t->length; i++) {
-         size_t new_length = name_length;
-         ralloc_asprintf_rewrite_tail(&name, &new_length, "[%u].%s", i,
-                                      var->name);
-         /* Note: row_major is only meaningful for uniform blocks, and
-          * lowering is only applied to non-uniform interface blocks, so we
-          * can safely pass false for row_major.
-          */
-         recursion(var->type, &name, new_length, row_major, NULL, packing,
-                   false, record_array_count);
-      }
-      ralloc_free(name);
-   } else if (var->data.from_named_ifc_block_nonarray) {
-      /* lower_named_interface_blocks created this variable by lowering a
-       * named interface block (non-array) to an ordinary variable.  For
-       * example if the original source code was:
-       *
-       *     out Blk { vec4 bar } foo;
-       *
-       * Then the variable is now:
-       *
-       *     out vec4 bar;
-       *
-       * We need to visit this variable using the name:
-       *
-       *     Blk.bar
-       */
-      const glsl_type *ifc_type = var->get_interface_type();
-      char *name = ralloc_asprintf(NULL, "%s.%s", ifc_type->name, var->name);
-      /* Note: row_major is only meaningful for uniform blocks, and lowering
-       * is only applied to non-uniform interface blocks, so we can safely
-       * pass false for row_major.
-       */
-      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
-      ralloc_free(name);
-   } else if (t_without_array->is_record() ||
+   if (t_without_array->is_record() ||
               (t->is_array() && t->fields.array->is_array())) {
       char *name = ralloc_strdup(NULL, var->name);
       recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
+                false, record_array_count, NULL);
       ralloc_free(name);
    } else if (t_without_array->is_interface()) {
       char *name = ralloc_strdup(NULL, t_without_array->name);
-      recursion(var->type, &name, strlen(name), row_major, NULL, packing,
-                false, record_array_count);
+      const glsl_struct_field *ifc_member = var->data.from_named_ifc_block ?
+         &t_without_array->
+            fields.structure[t_without_array->field_index(var->name)] : NULL;
+
+      recursion(t, &name, strlen(name), row_major, NULL, packing,
+                false, record_array_count, ifc_member);
       ralloc_free(name);
    } else {
       this->set_record_array_count(record_array_count);
@@ -172,7 +118,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
                                     const glsl_type *record_type,
                                     const unsigned packing,
                                     bool last_field,
-                                    unsigned record_array_count)
+                                    unsigned record_array_count,
+                                    const glsl_struct_field *named_ifc_member)
 {
    /* Records need to have each field processed individually.
     *
@@ -180,7 +127,12 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
     * individually, then each field of the resulting array elements processed
     * individually.
     */
-   if (t->is_record() || t->is_interface()) {
+   if (t->is_interface() && named_ifc_member) {
+      ralloc_asprintf_rewrite_tail(name, &name_length, ".%s",
+                                   named_ifc_member->name);
+      recursion(named_ifc_member->type, name, name_length, row_major, NULL,
+                packing, false, record_array_count, NULL);
+   } else if (t->is_record() || t->is_interface()) {
       if (record_type == NULL && t->is_record())
          record_type = t;
 
@@ -223,7 +175,7 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
                    field_row_major,
                    record_type,
                    packing,
-                   (i + 1) == t->length, record_array_count);
+                   (i + 1) == t->length, record_array_count, NULL);
 
          /* Only the first leaf-field of the record gets called with the
           * record type pointer.
@@ -258,7 +210,8 @@ program_resource_visitor::recursion(const glsl_type *t, char **name,
          recursion(t->fields.array, name, new_length, row_major,
                    record_type,
                    packing,
-                   (i + 1) == t->length, record_array_count);
+                   (i + 1) == t->length, record_array_count,
+                   named_ifc_member);
 
          /* Only the first leaf-field of the record gets called with the
           * record type pointer.
@@ -799,7 +752,6 @@ private:
 
       this->uniforms[id].name = ralloc_strdup(this->uniforms, name);
       this->uniforms[id].type = base_type;
-      this->uniforms[id].initialized = 0;
       this->uniforms[id].num_driver_storage = 0;
       this->uniforms[id].driver_storage = NULL;
       this->uniforms[id].atomic_buffer_index = -1;
@@ -954,6 +906,8 @@ link_cross_validate_uniform_block(void *mem_ctx,
           new_block->Uniforms,
           sizeof(*linked_block->Uniforms) * linked_block->NumUniforms);
 
+   linked_block->Name = ralloc_strdup(*linked_blocks, linked_block->Name);
+
    for (unsigned int i = 0; i < linked_block->NumUniforms; i++) {
       struct gl_uniform_buffer_variable *ubo_var =
          &linked_block->Uniforms[i];
@@ -1005,9 +959,9 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
 
       const unsigned l = strlen(var->name);
       for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) {
-         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) {
+         for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i]->NumUniforms; j++) {
             if (sentinel) {
-               const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name;
+               const char *begin = shader->BufferInterfaceBlocks[i]->Uniforms[j].Name;
                const char *end = strchr(begin, sentinel);
 
                if (end == NULL)
@@ -1022,7 +976,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader)
                   break;
                }
             } else if (!strcmp(var->name,
-                               shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) {
+                               shader->BufferInterfaceBlocks[i]->Uniforms[j].Name)) {
                found = true;
                var->data.location = j;
                break;
@@ -1148,9 +1102,9 @@ link_assign_uniform_locations(struct gl_shader_program *prog,
       sh->num_combined_uniform_components = sh->num_uniform_components;
 
       for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) {
-         if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) {
+         if (!sh->BufferInterfaceBlocks[i]->IsShaderStorage) {
             sh->num_combined_uniform_components +=
-               sh->BufferInterfaceBlocks[i].UniformBufferSize / 4;
+               sh->BufferInterfaceBlocks[i]->UniformBufferSize / 4;
          }
       }
    }
diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp
index 44fc8f617f8..848668c4381 100644
--- a/src/compiler/glsl/link_varyings.cpp
+++ b/src/compiler/glsl/link_varyings.cpp
@@ -63,6 +63,125 @@ get_varying_type(const ir_variable *var, gl_shader_stage stage)
    return type;
 }
 
+static void
+create_xfb_varying_names(void *mem_ctx, const glsl_type *t, char **name,
+                         size_t name_length, unsigned *count,
+                         const char *ifc_member_name,
+                         const glsl_type *ifc_member_t, char ***varying_names)
+{
+   if (t->is_interface()) {
+      size_t new_length = name_length;
+
+      assert(ifc_member_name && ifc_member_t);
+      ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", ifc_member_name);
+
+      create_xfb_varying_names(mem_ctx, ifc_member_t, name, new_length, count,
+                               NULL, NULL, varying_names);
+   } else if (t->is_record()) {
+      for (unsigned i = 0; i < t->length; i++) {
+         const char *field = t->fields.structure[i].name;
+         size_t new_length = name_length;
+
+         ralloc_asprintf_rewrite_tail(name, &new_length, ".%s", field);
+
+         create_xfb_varying_names(mem_ctx, t->fields.structure[i].type, name,
+                                  new_length, count, NULL, NULL,
+                                  varying_names);
+      }
+   } else if (t->without_array()->is_record() ||
+              t->without_array()->is_interface() ||
+              (t->is_array() && t->fields.array->is_array())) {
+      for (unsigned i = 0; i < t->length; i++) {
+         size_t new_length = name_length;
+
+         /* Append the subscript to the current variable name */
+         ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i);
+
+         create_xfb_varying_names(mem_ctx, t->fields.array, name, new_length,
+                                  count, ifc_member_name, ifc_member_t,
+                                  varying_names);
+      }
+   } else {
+      (*varying_names)[(*count)++] = ralloc_strdup(mem_ctx, *name);
+   }
+}
+
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names)
+{
+   bool has_xfb_qualifiers = false;
+
+   /* We still need to enable transform feedback mode even if xfb_stride is
+    * only applied to a global out. Also we don't bother to propagate
+    * xfb_stride to interface block members so this will catch that case also.
+    */
+   for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+      if (sh->TransformFeedback.BufferStride[j]) {
+         has_xfb_qualifiers = true;
+      }
+   }
+
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *var = node->as_variable();
+      if (!var || var->data.mode != ir_var_shader_out)
+         continue;
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "Any shader making any static use (after preprocessing) of any of
+       *     these *xfb_* qualifiers will cause the shader to be in a
+       *     transform feedback capturing mode and hence responsible for
+       *     describing the transform feedback setup.  This mode will capture
+       *     any output selected by *xfb_offset*, directly or indirectly, to
+       *     a transform feedback buffer."
+       */
+      if (var->data.explicit_xfb_buffer || var->data.explicit_xfb_stride) {
+         has_xfb_qualifiers = true;
+      }
+
+      if (var->data.explicit_xfb_offset) {
+         *num_tfeedback_decls += var->type->varying_count();
+         has_xfb_qualifiers = true;
+      }
+   }
+
+   if (*num_tfeedback_decls == 0)
+      return has_xfb_qualifiers;
+
+   unsigned i = 0;
+   *varying_names = ralloc_array(mem_ctx, char *, *num_tfeedback_decls);
+   foreach_in_list(ir_instruction, node, sh->ir) {
+      ir_variable *var = node->as_variable();
+      if (!var || var->data.mode != ir_var_shader_out)
+         continue;
+
+      if (var->data.explicit_xfb_offset) {
+         char *name;
+         const glsl_type *type, *member_type;
+
+         if (var->data.from_named_ifc_block) {
+            type = var->get_interface_type();
+            /* Find the member type before it was altered by lowering */
+            member_type =
+               type->fields.structure[type->field_index(var->name)].type;
+            name = ralloc_strdup(NULL, type->without_array()->name);
+         } else {
+            type = var->type;
+            member_type = NULL;
+            name = ralloc_strdup(NULL, var->name);
+         }
+         create_xfb_varying_names(mem_ctx, type, &name, strlen(name), &i,
+                                  var->name, member_type, varying_names);
+         ralloc_free(name);
+      }
+   }
+
+   assert(i == *num_tfeedback_decls);
+   return has_xfb_qualifiers;
+}
+
 /**
  * Validate the types and qualifiers of an output from one stage against the
  * matching input to another stage.
@@ -397,6 +516,8 @@ tfeedback_decl::init(struct gl_context *ctx, const void *mem_ctx,
    this->next_buffer_separator = false;
    this->matched_candidate = NULL;
    this->stream_id = 0;
+   this->buffer = 0;
+   this->offset = 0;
 
    if (ctx->Extensions.ARB_transform_feedback3) {
       /* Parse gl_NextBuffer. */
@@ -489,6 +610,8 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
       = this->matched_candidate->toplevel_var->data.location * 4
       + this->matched_candidate->toplevel_var->data.location_frac
       + this->matched_candidate->offset;
+   const unsigned dmul =
+      this->matched_candidate->type->without_array()->is_double() ? 2 : 1;
 
    if (this->matched_candidate->type->is_array()) {
       /* Array variable */
@@ -496,8 +619,6 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
          this->matched_candidate->type->fields.array->matrix_columns;
       const unsigned vector_elements =
          this->matched_candidate->type->fields.array->vector_elements;
-      const unsigned dmul =
-         this->matched_candidate->type->fields.array->is_double() ? 2 : 1;
       unsigned actual_array_size;
       switch (this->lowered_builtin_array_variable) {
       case clip_distance:
@@ -575,6 +696,12 @@ tfeedback_decl::assign_location(struct gl_context *ctx,
     */
    this->stream_id = this->matched_candidate->toplevel_var->data.stream;
 
+   unsigned array_offset = this->array_subscript * 4 * dmul;
+   unsigned struct_offset = this->matched_candidate->offset * 4 * dmul;
+   this->buffer = this->matched_candidate->toplevel_var->data.xfb_buffer;
+   this->offset = this->matched_candidate->toplevel_var->data.offset +
+      array_offset + struct_offset;
+
    return true;
 }
 
@@ -598,55 +725,108 @@ tfeedback_decl::get_num_outputs() const
 bool
 tfeedback_decl::store(struct gl_context *ctx, struct gl_shader_program *prog,
                       struct gl_transform_feedback_info *info,
-                      unsigned buffer, const unsigned max_outputs) const
+                      unsigned buffer, unsigned buffer_index,
+                      const unsigned max_outputs, bool *explicit_stride,
+                      bool has_xfb_qualifiers) const
 {
    assert(!this->next_buffer_separator);
 
    /* Handle gl_SkipComponents. */
    if (this->skip_components) {
-      info->BufferStride[buffer] += this->skip_components;
+      info->Buffers[buffer].Stride += this->skip_components;
       return true;
    }
 
+   unsigned xfb_offset = 0;
+   if (has_xfb_qualifiers) {
+      xfb_offset = this->offset / 4;
+   } else {
+      xfb_offset = info->Buffers[buffer].Stride;
+   }
+   info->Varyings[info->NumVarying].Offset = xfb_offset * 4;
+
+   unsigned location = this->location;
+   unsigned location_frac = this->location_frac;
+   unsigned num_components = this->num_components();
+   while (num_components > 0) {
+      unsigned output_size = MIN2(num_components, 4 - location_frac);
+      assert((info->NumOutputs == 0 && max_outputs == 0) ||
+             info->NumOutputs < max_outputs);
+
+      /* From the ARB_enhanced_layouts spec:
+       *
+       *    "If such a block member or variable is not written during a shader
+       *    invocation, the buffer contents at the assigned offset will be
+       *    undefined.  Even if there are no static writes to a variable or
+       *    member that is assigned a transform feedback offset, the space is
+       *    still allocated in the buffer and still affects the stride."
+       */
+      if (this->is_varying_written()) {
+         info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
+         info->Outputs[info->NumOutputs].OutputRegister = location;
+         info->Outputs[info->NumOutputs].NumComponents = output_size;
+         info->Outputs[info->NumOutputs].StreamId = stream_id;
+         info->Outputs[info->NumOutputs].OutputBuffer = buffer;
+         info->Outputs[info->NumOutputs].DstOffset = xfb_offset;
+         ++info->NumOutputs;
+      }
+      info->Buffers[buffer].Stream = this->stream_id;
+      xfb_offset += output_size;
+
+      num_components -= output_size;
+      location++;
+      location_frac = 0;
+   }
+
+   if (explicit_stride && explicit_stride[buffer]) {
+      if (this->is_double() && info->Buffers[buffer].Stride % 2) {
+         linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                      "multiple of 8 as its applied to a type that is or "
+                      "contains a double.",
+                      info->Buffers[buffer].Stride * 4);
+         return false;
+      }
+
+      if ((this->offset / 4) / info->Buffers[buffer].Stride !=
+          (xfb_offset - 1) / info->Buffers[buffer].Stride) {
+         linker_error(prog, "xfb_offset (%d) overflows xfb_stride (%d) for "
+                      "buffer (%d)", xfb_offset * 4,
+                      info->Buffers[buffer].Stride * 4, buffer);
+         return false;
+      }
+   } else {
+      info->Buffers[buffer].Stride = xfb_offset;
+   }
+
    /* From GL_EXT_transform_feedback:
     *   A program will fail to link if:
     *
     *     * the total number of components to capture is greater than
     *       the constant MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS_EXT
     *       and the buffer mode is INTERLEAVED_ATTRIBS_EXT.
+    *
+    * From GL_ARB_enhanced_layouts:
+    *
+    *   "The resulting stride (implicit or explicit) must be less than or
+    *   equal to the implementation-dependent constant
+    *   gl_MaxTransformFeedbackInterleavedComponents."
     */
-   if (prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS &&
-       info->BufferStride[buffer] + this->num_components() >
+   if ((prog->TransformFeedback.BufferMode == GL_INTERLEAVED_ATTRIBS ||
+        has_xfb_qualifiers) &&
+       info->Buffers[buffer].Stride >
        ctx->Const.MaxTransformFeedbackInterleavedComponents) {
       linker_error(prog, "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
                    "limit has been exceeded.");
       return false;
    }
 
-   unsigned location = this->location;
-   unsigned location_frac = this->location_frac;
-   unsigned num_components = this->num_components();
-   while (num_components > 0) {
-      unsigned output_size = MIN2(num_components, 4 - location_frac);
-      assert(info->NumOutputs < max_outputs);
-      info->Outputs[info->NumOutputs].ComponentOffset = location_frac;
-      info->Outputs[info->NumOutputs].OutputRegister = location;
-      info->Outputs[info->NumOutputs].NumComponents = output_size;
-      info->Outputs[info->NumOutputs].StreamId = stream_id;
-      info->Outputs[info->NumOutputs].OutputBuffer = buffer;
-      info->Outputs[info->NumOutputs].DstOffset = info->BufferStride[buffer];
-      ++info->NumOutputs;
-      info->BufferStride[buffer] += output_size;
-      info->BufferStream[buffer] = this->stream_id;
-      num_components -= output_size;
-      location++;
-      location_frac = 0;
-   }
-
-   info->Varyings[info->NumVarying].Name = ralloc_strdup(prog, this->orig_name);
+   info->Varyings[info->NumVarying].Name = ralloc_strdup(prog,
+                                                         this->orig_name);
    info->Varyings[info->NumVarying].Type = this->type;
    info->Varyings[info->NumVarying].Size = this->size;
+   info->Varyings[info->NumVarying].BufferIndex = buffer_index;
    info->NumVarying++;
+   info->Buffers[buffer].NumVaryings++;
 
    return true;
 }
@@ -731,6 +911,17 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
 }
 
 
+static int
+cmp_xfb_offset(const void * x_generic, const void * y_generic)
+{
+   tfeedback_decl *x = (tfeedback_decl *) x_generic;
+   tfeedback_decl *y = (tfeedback_decl *) y_generic;
+
+   if (x->get_buffer() != y->get_buffer())
+      return x->get_buffer() - y->get_buffer();
+   return x->get_offset() - y->get_offset();
+}
+
 /**
  * Store transform feedback location assignments into
  * prog->LinkedTransformFeedback based on the data stored in tfeedback_decls.
@@ -741,8 +932,13 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
-                     tfeedback_decl *tfeedback_decls)
+                     tfeedback_decl *tfeedback_decls, bool has_xfb_qualifiers)
 {
+   /* Make sure MaxTransformFeedbackBuffers is less than 32 so the bitmask for
+    * tracking the number of buffers doesn't overflow.
+    */
+   assert(ctx->Const.MaxTransformFeedbackBuffers < 32);
+
    bool separate_attribs_mode =
       prog->TransformFeedback.BufferMode == GL_SEPARATE_ATTRIBS;
 
@@ -752,14 +948,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
    memset(&prog->LinkedTransformFeedback, 0,
           sizeof(prog->LinkedTransformFeedback));
 
+   /* The xfb_offset qualifier does not have to be used in increasing order
+    * however some drivers expect to receive the list of transform feedback
+    * declarations in order so sort it now for convenience.
+    */
+   if (has_xfb_qualifiers)
+      qsort(tfeedback_decls, num_tfeedback_decls, sizeof(*tfeedback_decls),
+            cmp_xfb_offset);
+
    prog->LinkedTransformFeedback.Varyings =
       rzalloc_array(prog,
                     struct gl_transform_feedback_varying_info,
                     num_tfeedback_decls);
 
    unsigned num_outputs = 0;
-   for (unsigned i = 0; i < num_tfeedback_decls; ++i)
-      num_outputs += tfeedback_decls[i].get_num_outputs();
+   for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+      if (tfeedback_decls[i].is_varying_written())
+         num_outputs += tfeedback_decls[i].get_num_outputs();
+   }
 
    prog->LinkedTransformFeedback.Outputs =
       rzalloc_array(prog,
@@ -767,21 +973,47 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                     num_outputs);
 
    unsigned num_buffers = 0;
+   unsigned buffers = 0;
 
-   if (separate_attribs_mode) {
+   if (!has_xfb_qualifiers && separate_attribs_mode) {
       /* GL_SEPARATE_ATTRIBS */
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
          if (!tfeedback_decls[i].store(ctx, prog, &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs))
+                                       num_buffers, num_buffers, num_outputs,
+                                       NULL, has_xfb_qualifiers))
             return false;
 
+         buffers |= 1 << num_buffers;
          num_buffers++;
       }
    }
    else {
       /* GL_INVERLEAVED_ATTRIBS */
       int buffer_stream_id = -1;
+      unsigned buffer =
+         num_tfeedback_decls ? tfeedback_decls[0].get_buffer() : 0;
+      bool explicit_stride[MAX_FEEDBACK_BUFFERS] = { false };
+
+      /* Apply any xfb_stride global qualifiers */
+      if (has_xfb_qualifiers) {
+         for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+            if (prog->TransformFeedback.BufferStride[j]) {
+               buffers |= 1 << j;
+               explicit_stride[j] = true;
+               prog->LinkedTransformFeedback.Buffers[j].Stride =
+                  prog->TransformFeedback.BufferStride[j] / 4;
+            }
+         }
+      }
+
       for (unsigned i = 0; i < num_tfeedback_decls; ++i) {
+         if (has_xfb_qualifiers &&
+             buffer != tfeedback_decls[i].get_buffer()) {
+            /* we have moved to the next buffer so reset stream id */
+            buffer_stream_id = -1;
+            num_buffers++;
+         }
+
          if (tfeedback_decls[i].is_next_buffer_separator()) {
             num_buffers++;
             buffer_stream_id = -1;
@@ -803,17 +1035,24 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
             return false;
          }
 
+         if (has_xfb_qualifiers) {
+            buffer = tfeedback_decls[i].get_buffer();
+         } else {
+            buffer = num_buffers;
+         }
+         buffers |= 1 << buffer;
+
          if (!tfeedback_decls[i].store(ctx, prog,
                                        &prog->LinkedTransformFeedback,
-                                       num_buffers, num_outputs))
+                                       buffer, num_buffers, num_outputs,
+                                       explicit_stride, has_xfb_qualifiers))
             return false;
       }
-      num_buffers++;
    }
 
    assert(prog->LinkedTransformFeedback.NumOutputs == num_outputs);
 
-   prog->LinkedTransformFeedback.NumBuffers = num_buffers;
+   prog->LinkedTransformFeedback.ActiveBuffers = buffers;
    return true;
 }
 
@@ -1466,8 +1705,8 @@ populate_consumer_input_sets(void *mem_ctx, exec_list *ir,
          } else if (input_var->get_interface_type() != NULL) {
             char *const iface_field_name =
                ralloc_asprintf(mem_ctx, "%s.%s",
-                               input_var->get_interface_type()->name,
-                               input_var->name);
+                  input_var->get_interface_type()->without_array()->name,
+                  input_var->name);
             hash_table_insert(consumer_interface_inputs, input_var,
                               iface_field_name);
          } else {
@@ -1498,8 +1737,8 @@ get_matching_input(void *mem_ctx,
    } else if (output_var->get_interface_type() != NULL) {
       char *const iface_field_name =
          ralloc_asprintf(mem_ctx, "%s.%s",
-                         output_var->get_interface_type()->name,
-                         output_var->name);
+            output_var->get_interface_type()->without_array()->name,
+            output_var->name);
       input_var =
          (ir_variable *) hash_table_find(consumer_interface_inputs,
                                          iface_field_name);
diff --git a/src/compiler/glsl/link_varyings.h b/src/compiler/glsl/link_varyings.h
index b2812614ecc..543b80ff29b 100644
--- a/src/compiler/glsl/link_varyings.h
+++ b/src/compiler/glsl/link_varyings.h
@@ -98,7 +98,8 @@ public:
    unsigned get_num_outputs() const;
    bool store(struct gl_context *ctx, struct gl_shader_program *prog,
               struct gl_transform_feedback_info *info, unsigned buffer,
-              const unsigned max_outputs) const;
+              unsigned buffer_index, const unsigned max_outputs,
+              bool *explicit_stride, bool has_xfb_qualifiers) const;
    const tfeedback_candidate *find_candidate(gl_shader_program *prog,
                                              hash_table *tfeedback_candidates);
 
@@ -107,6 +108,14 @@ public:
       return this->next_buffer_separator;
    }
 
+   bool is_varying_written() const
+   {
+      if (this->next_buffer_separator || this->skip_components)
+         return false;
+
+      return this->matched_candidate->toplevel_var->data.assigned;
+   }
+
    bool is_varying() const
    {
       return !this->next_buffer_separator && !this->skip_components;
@@ -122,6 +131,16 @@ public:
       return this->stream_id;
    }
 
+   unsigned get_buffer() const
+   {
+      return this->buffer;
+   }
+
+   unsigned get_offset() const
+   {
+      return this->offset;
+   }
+
    /**
     * The total number of varying components taken up by this variable.  Only
     * valid if assign_location() has been called.
@@ -202,6 +221,16 @@ private:
    int location;
 
    /**
+    * Used to store the buffer assigned by xfb_buffer.
+    */
+   unsigned buffer;
+
+   /**
+    * Used to store the offset assigned by xfb_offset.
+    */
+   unsigned offset;
+
+   /**
     * If non-zero, then this variable may be packed along with other variables
     * into a single varying slot, so this offset should be applied when
     * accessing components.  For example, an offset of 1 means that the x
@@ -268,6 +297,11 @@ parse_tfeedback_decls(struct gl_context *ctx, struct gl_shader_program *prog,
                       const void *mem_ctx, unsigned num_names,
                       char **varying_names, tfeedback_decl *decls);
 
+bool
+process_xfb_layout_qualifiers(void *mem_ctx, const gl_shader *sh,
+                              unsigned *num_tfeedback_decls,
+                              char ***varying_names);
+
 void
 remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
                                         gl_shader *sh,
@@ -276,7 +310,8 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object,
 bool
 store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog,
                      unsigned num_tfeedback_decls,
-                     tfeedback_decl *tfeedback_decls);
+                     tfeedback_decl *tfeedback_decls,
+                     bool has_xfb_qualifiers);
 
 bool
 assign_varying_locations(struct gl_context *ctx,
diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp
index 76b700d3451..510a22e5bd3 100644
--- a/src/compiler/glsl/linker.cpp
+++ b/src/compiler/glsl/linker.cpp
@@ -1192,11 +1192,11 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
 	 int index = link_cross_validate_uniform_block(prog,
 						       &prog->BufferInterfaceBlocks,
 						       &prog->NumBufferInterfaceBlocks,
-						       &sh->BufferInterfaceBlocks[j]);
+						       sh->BufferInterfaceBlocks[j]);
 
 	 if (index == -1) {
 	    linker_error(prog, "uniform block `%s' has mismatching definitions\n",
-			 sh->BufferInterfaceBlocks[j].Name);
+			 sh->BufferInterfaceBlocks[j]->Name);
 	    return false;
 	 }
 
@@ -1204,6 +1204,23 @@ interstage_cross_validate_uniform_blocks(struct gl_shader_program *prog)
       }
    }
 
+   /* Update per stage block pointers to point to the program list.
+    * FIXME: We should be able to free the per stage blocks here.
+    */
+   for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
+      for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
+	 int stage_index =
+            prog->InterfaceBlockStageIndex[i][j];
+
+	 if (stage_index != -1) {
+	    struct gl_shader *sh = prog->_LinkedShaders[i];
+
+            sh->BufferInterfaceBlocks[stage_index] =
+               &prog->BufferInterfaceBlocks[j];
+	 }
+      }
+   }
+
    return true;
 }
 
@@ -1567,6 +1584,69 @@ private:
    hash_table *unnamed_interfaces;
 };
 
+/**
+ * Check for conflicting xfb_stride default qualifiers and store buffer stride
+ * for later use.
+ */
+static void
+link_xfb_stride_layout_qualifiers(struct gl_context *ctx,
+                                  struct gl_shader_program *prog,
+			          struct gl_shader *linked_shader,
+			          struct gl_shader **shader_list,
+			          unsigned num_shaders)
+{
+   for (unsigned i = 0; i < MAX_FEEDBACK_BUFFERS; i++) {
+      linked_shader->TransformFeedback.BufferStride[i] = 0;
+   }
+
+   for (unsigned i = 0; i < num_shaders; i++) {
+      struct gl_shader *shader = shader_list[i];
+
+      for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+         if (shader->TransformFeedback.BufferStride[j]) {
+	    if (linked_shader->TransformFeedback.BufferStride[j] != 0 &&
+                shader->TransformFeedback.BufferStride[j] != 0 &&
+	        linked_shader->TransformFeedback.BufferStride[j] !=
+                   shader->TransformFeedback.BufferStride[j]) {
+	       linker_error(prog,
+                            "intrastage shaders defined with conflicting "
+                            "xfb_stride for buffer %d (%d and %d)\n", j,
+                            linked_shader->TransformFeedback.BufferStride[j],
+			    shader->TransformFeedback.BufferStride[j]);
+	       return;
+	    }
+
+            if (shader->TransformFeedback.BufferStride[j])
+	       linked_shader->TransformFeedback.BufferStride[j] =
+                  shader->TransformFeedback.BufferStride[j];
+         }
+      }
+   }
+
+   for (unsigned j = 0; j < MAX_FEEDBACK_BUFFERS; j++) {
+      if (linked_shader->TransformFeedback.BufferStride[j]) {
+         prog->TransformFeedback.BufferStride[j] =
+            linked_shader->TransformFeedback.BufferStride[j];
+
+         /* We will validate doubles at a later stage */
+         if (prog->TransformFeedback.BufferStride[j] % 4) {
+            linker_error(prog, "invalid qualifier xfb_stride=%d must be a "
+                         "multiple of 4 or if its applied to a type that is "
+                         "or contains a double a multiple of 8.",
+                         prog->TransformFeedback.BufferStride[j]);
+            return;
+         }
+
+         if (prog->TransformFeedback.BufferStride[j] / 4 >
+             ctx->Const.MaxTransformFeedbackInterleavedComponents) {
+            linker_error(prog,
+                         "The MAX_TRANSFORM_FEEDBACK_INTERLEAVED_COMPONENTS "
+                         "limit has been exceeded.");
+                  return;
+         }
+      }
+   }
+}
 
 /**
  * Performs the cross-validation of tessellation control shader vertices and
@@ -2069,15 +2149,23 @@ link_intrastage_shaders(void *mem_ctx,
    linked->ir = new(linked) exec_list;
    clone_ir_list(mem_ctx, linked->ir, main->ir);
 
-   linked->BufferInterfaceBlocks = uniform_blocks;
+   linked->BufferInterfaceBlocks =
+      ralloc_array(linked, gl_uniform_block *, num_uniform_blocks);
+
+   ralloc_steal(linked, uniform_blocks);
+   for (unsigned i = 0; i < num_uniform_blocks; i++) {
+      linked->BufferInterfaceBlocks[i] = &uniform_blocks[i];
+   }
+
    linked->NumBufferInterfaceBlocks = num_uniform_blocks;
-   ralloc_steal(linked, linked->BufferInterfaceBlocks);
 
    link_fs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_tcs_out_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_tes_in_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_gs_inout_layout_qualifiers(prog, linked, shader_list, num_shaders);
    link_cs_input_layout_qualifiers(prog, linked, shader_list, num_shaders);
+   link_xfb_stride_layout_qualifiers(ctx, prog, linked, shader_list,
+                                     num_shaders);
 
    populate_symbol_table(linked);
 
@@ -2869,7 +2957,8 @@ check_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 	 if (prog->InterfaceBlockStageIndex[j][i] != -1) {
             struct gl_shader *sh = prog->_LinkedShaders[j];
             int stage_index = prog->InterfaceBlockStageIndex[j][i];
-            if (sh && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage) {
+            if (sh &&
+                sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage) {
                shader_blocks[j]++;
                total_shader_storage_blocks++;
             } else {
@@ -2986,7 +3075,8 @@ check_image_resources(struct gl_context *ctx, struct gl_shader_program *prog)
 
          for (unsigned j = 0; j < prog->NumBufferInterfaceBlocks; j++) {
             int stage_index = prog->InterfaceBlockStageIndex[i][j];
-            if (stage_index != -1 && sh->BufferInterfaceBlocks[stage_index].IsShaderStorage)
+            if (stage_index != -1 &&
+                sh->BufferInterfaceBlocks[stage_index]->IsShaderStorage)
                total_shader_storage_blocks++;
          }
 
@@ -3762,7 +3852,8 @@ write_top_level_array_size_and_stride:
  * resource data.
  */
 void
-build_program_resource_list(struct gl_shader_program *shProg)
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg)
 {
    /* Rebuild resource list. */
    if (shProg->ProgramResourceList) {
@@ -3820,6 +3911,17 @@ build_program_resource_list(struct gl_shader_program *shProg)
       }
    }
 
+   /* Add transform feedback buffers. */
+   for (unsigned i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((shProg->LinkedTransformFeedback.ActiveBuffers >> i) & 1) {
+         shProg->LinkedTransformFeedback.Buffers[i].Binding = i;
+         if (!add_program_resource(shProg, GL_TRANSFORM_FEEDBACK_BUFFER,
+                                   &shProg->LinkedTransformFeedback.Buffers[i],
+                                   0))
+         return;
+      }
+   }
+
    /* Add uniforms from uniform storage. */
    for (unsigned i = 0; i < shProg->NumUniformStorage; i++) {
       /* Do not add uniforms internally used by Mesa. */
@@ -4006,20 +4108,22 @@ link_assign_subroutine_types(struct gl_shader_program *prog)
 
 static void
 split_ubos_and_ssbos(void *mem_ctx,
-                     struct gl_uniform_block *blocks,
+                     struct gl_uniform_block **s_blks,
+                     struct gl_uniform_block *p_blks,
                      unsigned num_blocks,
                      struct gl_uniform_block ***ubos,
                      unsigned *num_ubos,
-                     unsigned **ubo_interface_block_indices,
                      struct gl_uniform_block ***ssbos,
-                     unsigned *num_ssbos,
-                     unsigned **ssbo_interface_block_indices)
+                     unsigned *num_ssbos)
 {
    unsigned num_ubo_blocks = 0;
    unsigned num_ssbo_blocks = 0;
 
+   /* Are we spliting the list of blocks for the shader or the program */
+   bool is_shader = p_blks == NULL;
+
    for (unsigned i = 0; i < num_blocks; i++) {
-      if (blocks[i].IsShaderStorage)
+      if (is_shader ? s_blks[i]->IsShaderStorage : p_blks[i].IsShaderStorage)
          num_ssbo_blocks++;
       else
          num_ubo_blocks++;
@@ -4031,24 +4135,13 @@ split_ubos_and_ssbos(void *mem_ctx,
    *ssbos = ralloc_array(mem_ctx, gl_uniform_block *, num_ssbo_blocks);
    *num_ssbos = 0;
 
-   if (ubo_interface_block_indices)
-      *ubo_interface_block_indices =
-         ralloc_array(mem_ctx, unsigned, num_ubo_blocks);
-
-   if (ssbo_interface_block_indices)
-      *ssbo_interface_block_indices =
-         ralloc_array(mem_ctx, unsigned, num_ssbo_blocks);
-
    for (unsigned i = 0; i < num_blocks; i++) {
-      if (blocks[i].IsShaderStorage) {
-         (*ssbos)[*num_ssbos] = &blocks[i];
-         if (ssbo_interface_block_indices)
-            (*ssbo_interface_block_indices)[*num_ssbos] = i;
+      struct gl_uniform_block *blk = is_shader ? s_blks[i] : &p_blks[i];
+      if (blk->IsShaderStorage) {
+         (*ssbos)[*num_ssbos] = blk;
          (*num_ssbos)++;
       } else {
-         (*ubos)[*num_ubos] = &blocks[i];
-         if (ubo_interface_block_indices)
-            (*ubo_interface_block_indices)[*num_ubos] = i;
+         (*ubos)[*num_ubos] = blk;
          (*num_ubos)++;
       }
    }
@@ -4153,9 +4246,11 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       return;
    }
 
-   tfeedback_decl *tfeedback_decls = NULL;
-   unsigned num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+   unsigned num_tfeedback_decls = 0;
    unsigned int num_explicit_uniform_locs = 0;
+   bool has_xfb_qualifiers = false;
+   char **varying_names = NULL;
+   tfeedback_decl *tfeedback_decls = NULL;
 
    void *mem_ctx = ralloc_context(NULL); // temporary linker context
 
@@ -4465,6 +4560,30 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       goto done;
    }
 
+   /* From the ARB_enhanced_layouts spec:
+    *
+    *    "If the shader used to record output variables for transform feedback
+    *    varyings uses the "xfb_buffer", "xfb_offset", or "xfb_stride" layout
+    *    qualifiers, the values specified by TransformFeedbackVaryings are
+    *    ignored, and the set of variables captured for transform feedback is
+    *    instead derived from the specified layout qualifiers."
+    */
+   for (int i = MESA_SHADER_FRAGMENT - 1; i >= 0; i--) {
+      /* Find last stage before fragment shader */
+      if (prog->_LinkedShaders[i]) {
+         has_xfb_qualifiers =
+            process_xfb_layout_qualifiers(mem_ctx, prog->_LinkedShaders[i],
+                                          &num_tfeedback_decls,
+                                          &varying_names);
+         break;
+      }
+   }
+
+   if (!has_xfb_qualifiers) {
+      num_tfeedback_decls = prog->TransformFeedback.NumVarying;
+      varying_names = prog->TransformFeedback.VaryingNames;
+   }
+
    if (num_tfeedback_decls != 0) {
       /* From GL_EXT_transform_feedback:
        *   A program will fail to link if:
@@ -4481,10 +4600,9 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
 
       tfeedback_decls = ralloc_array(mem_ctx, tfeedback_decl,
-                                     prog->TransformFeedback.NumVarying);
+                                     num_tfeedback_decls);
       if (!parse_tfeedback_decls(ctx, prog, mem_ctx, num_tfeedback_decls,
-                                 prog->TransformFeedback.VaryingNames,
-                                 tfeedback_decls))
+                                 varying_names, tfeedback_decls))
          goto done;
    }
 
@@ -4564,7 +4682,8 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
       }
    }
 
-   if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls))
+   if (!store_tfeedback_info(ctx, prog, num_tfeedback_decls, tfeedback_decls,
+                             has_xfb_qualifiers))
       goto done;
 
    update_array_sizes(prog);
@@ -4627,25 +4746,23 @@ link_shaders(struct gl_context *ctx, struct gl_shader_program *prog)
          gl_shader *sh = prog->_LinkedShaders[i];
          split_ubos_and_ssbos(sh,
                               sh->BufferInterfaceBlocks,
+                              NULL,
                               sh->NumBufferInterfaceBlocks,
                               &sh->UniformBlocks,
                               &sh->NumUniformBlocks,
-                              NULL,
                               &sh->ShaderStorageBlocks,
-                              &sh->NumShaderStorageBlocks,
-                              NULL);
+                              &sh->NumShaderStorageBlocks);
       }
    }
 
    split_ubos_and_ssbos(prog,
+                        NULL,
                         prog->BufferInterfaceBlocks,
                         prog->NumBufferInterfaceBlocks,
                         &prog->UniformBlocks,
                         &prog->NumUniformBlocks,
-                        &prog->UboInterfaceBlockIndex,
                         &prog->ShaderStorageBlocks,
-                        &prog->NumShaderStorageBlocks,
-                        &prog->SsboInterfaceBlockIndex);
+                        &prog->NumShaderStorageBlocks);
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       if (prog->_LinkedShaders[i] == NULL)
diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h
index 4311d1659ec..97144df8ff7 100644
--- a/src/compiler/glsl/linker.h
+++ b/src/compiler/glsl/linker.h
@@ -197,7 +197,8 @@ private:
    void recursion(const glsl_type *t, char **name, size_t name_length,
                   bool row_major, const glsl_type *record_type,
                   const unsigned packing,
-                  bool last_field, unsigned record_array_count);
+                  bool last_field, unsigned record_array_count,
+                  const glsl_struct_field *named_ifc_member);
 };
 
 void
diff --git a/src/compiler/glsl/lower_named_interface_blocks.cpp b/src/compiler/glsl/lower_named_interface_blocks.cpp
index f29eba4f75f..f780ecacbd2 100644
--- a/src/compiler/glsl/lower_named_interface_blocks.cpp
+++ b/src/compiler/glsl/lower_named_interface_blocks.cpp
@@ -169,7 +169,6 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                   new(mem_ctx) ir_variable(iface_t->fields.structure[i].type,
                                            var_name,
                                            (ir_variable_mode) var->data.mode);
-               new_var->data.from_named_ifc_block_nonarray = 1;
             } else {
                const glsl_type *new_array_type =
                   process_array_type(var->type, i);
@@ -177,10 +176,16 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
                   new(mem_ctx) ir_variable(new_array_type,
                                            var_name,
                                            (ir_variable_mode) var->data.mode);
-               new_var->data.from_named_ifc_block_array = 1;
             }
             new_var->data.location = iface_t->fields.structure[i].location;
             new_var->data.explicit_location = (new_var->data.location >= 0);
+            new_var->data.offset = iface_t->fields.structure[i].offset;
+            new_var->data.explicit_xfb_offset =
+               (iface_t->fields.structure[i].offset >= 0);
+            new_var->data.xfb_buffer =
+               iface_t->fields.structure[i].xfb_buffer;
+            new_var->data.explicit_xfb_buffer =
+               iface_t->fields.structure[i].explicit_xfb_buffer;
             new_var->data.interpolation =
                iface_t->fields.structure[i].interpolation;
             new_var->data.centroid = iface_t->fields.structure[i].centroid;
@@ -188,8 +193,9 @@ flatten_named_interface_blocks_declarations::run(exec_list *instructions)
             new_var->data.patch = iface_t->fields.structure[i].patch;
             new_var->data.stream = var->data.stream;
             new_var->data.how_declared = var->data.how_declared;
+            new_var->data.from_named_ifc_block = 1;
 
-            new_var->init_interface_type(iface_t);
+            new_var->init_interface_type(var->type);
             hash_table_insert(interface_namespace, new_var,
                               iface_field_name);
             insert_pos->insert_after(new_var);
@@ -211,12 +217,23 @@ ir_visitor_status
 flatten_named_interface_blocks_declarations::visit_leave(ir_assignment *ir)
 {
    ir_dereference_record *lhs_rec = ir->lhs->as_dereference_record();
+
+   ir_variable *lhs_var =  ir->lhs->variable_referenced();
+   if (lhs_var && lhs_var->get_interface_type()) {
+      lhs_var->data.assigned = 1;
+   }
+
    if (lhs_rec) {
       ir_rvalue *lhs_rec_tmp = lhs_rec;
       handle_rvalue(&lhs_rec_tmp);
       if (lhs_rec_tmp != lhs_rec) {
          ir->set_lhs(lhs_rec_tmp);
       }
+
+      ir_variable *lhs_var =  lhs_rec_tmp->variable_referenced();
+      if (lhs_var) {
+         lhs_var->data.assigned = 1;
+      }
    }
    return rvalue_visit(ir);
 }
diff --git a/src/compiler/glsl/program.h b/src/compiler/glsl/program.h
index 31bb9aa2435..8f5a31bd5ba 100644
--- a/src/compiler/glsl/program.h
+++ b/src/compiler/glsl/program.h
@@ -43,7 +43,8 @@ extern void
 link_shaders(struct gl_context *ctx, struct gl_shader_program *prog);
 
 extern void
-build_program_resource_list(struct gl_shader_program *shProg);
+build_program_resource_list(struct gl_context *ctx,
+                            struct gl_shader_program *shProg);
 
 extern void
 linker_error(struct gl_shader_program *prog, const char *fmt, ...)
diff --git a/src/compiler/glsl/standalone_scaffolding.cpp b/src/compiler/glsl/standalone_scaffolding.cpp
index 0f7a16a5e6f..5ce804eed20 100644
--- a/src/compiler/glsl/standalone_scaffolding.cpp
+++ b/src/compiler/glsl/standalone_scaffolding.cpp
@@ -130,11 +130,6 @@ _mesa_clear_shader_program_data(struct gl_shader_program *shProg)
       shProg->InterfaceBlockStageIndex[i] = NULL;
    }
 
-   ralloc_free(shProg->UboInterfaceBlockIndex);
-   shProg->UboInterfaceBlockIndex = NULL;
-   ralloc_free(shProg->SsboInterfaceBlockIndex);
-   shProg->SsboInterfaceBlockIndex = NULL;
-
    ralloc_free(shProg->AtomicBuffers);
    shProg->AtomicBuffers = NULL;
    shProg->NumAtomicBuffers = 0;
diff --git a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
index 0b1f66cb342..a36ffdc58be 100644
--- a/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
+++ b/src/compiler/glsl/tests/set_uniform_initializer_tests.cpp
@@ -115,7 +115,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
    prog->UniformStorage[index_to_set].name = (char *) name;
    prog->UniformStorage[index_to_set].type = type;
    prog->UniformStorage[index_to_set].array_elements = array_size;
-   prog->UniformStorage[index_to_set].initialized = false;
    for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
       prog->UniformStorage[index_to_set].opaque[sh].index = ~0;
       prog->UniformStorage[index_to_set].opaque[sh].active = false;
@@ -136,7 +135,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
       prog->UniformStorage[i].name = (char *) "invalid slot";
       prog->UniformStorage[i].type = glsl_type::void_type;
       prog->UniformStorage[i].array_elements = 0;
-      prog->UniformStorage[i].initialized = false;
       for (int sh = 0; sh < MESA_SHADER_STAGES; sh++) {
          prog->UniformStorage[i].opaque[sh].index = ~0;
          prog->UniformStorage[i].opaque[sh].active = false;
@@ -149,21 +147,6 @@ establish_uniform_storage(struct gl_shader_program *prog, unsigned num_storage,
    return red_zone_components;
 }
 
-/**
- * Verify that the correct uniform is marked as having been initialized.
- */
-static void
-verify_initialization(struct gl_shader_program *prog, unsigned actual_index)
-{
-   for (unsigned i = 0; i < prog->NumUniformStorage; i++) {
-      if (i == actual_index) {
-	 EXPECT_TRUE(prog->UniformStorage[actual_index].initialized);
-      } else {
-	 EXPECT_FALSE(prog->UniformStorage[i].initialized);
-      }
-   }
-}
-
 static void
 non_array_test(void *mem_ctx, struct gl_shader_program *prog,
 	       unsigned actual_index, const char *name,
@@ -181,7 +164,6 @@ non_array_test(void *mem_ctx, struct gl_shader_program *prog,
 
    linker::set_uniform_initializer(mem_ctx, prog, name, type, val, 0xF00F);
 
-   verify_initialization(prog, actual_index);
    verify_data(prog->UniformStorage[actual_index].storage, 0, val,
 	       red_zone_components, 0xF00F);
 }
@@ -338,7 +320,6 @@ array_test(void *mem_ctx, struct gl_shader_program *prog,
    linker::set_uniform_initializer(mem_ctx, prog, name, element_type, val,
                                    0xF00F);
 
-   verify_initialization(prog, actual_index);
    verify_data(prog->UniformStorage[actual_index].storage, array_size,
 	       val, red_zone_components, 0xF00F);
 }
diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp
index 2421bd61954..39585bff3b9 100644
--- a/src/compiler/glsl_types.cpp
+++ b/src/compiler/glsl_types.cpp
@@ -132,6 +132,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].image_volatile = fields[i].image_volatile;
       this->fields.structure[i].image_restrict = fields[i].image_restrict;
       this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].explicit_xfb_buffer =
+         fields[i].explicit_xfb_buffer;
+      this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+      this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -172,6 +176,10 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields,
       this->fields.structure[i].image_volatile = fields[i].image_volatile;
       this->fields.structure[i].image_restrict = fields[i].image_restrict;
       this->fields.structure[i].precision = fields[i].precision;
+      this->fields.structure[i].explicit_xfb_buffer =
+         fields[i].explicit_xfb_buffer;
+      this->fields.structure[i].xfb_buffer = fields[i].xfb_buffer;
+      this->fields.structure[i].xfb_stride = fields[i].xfb_stride;
    }
 
    mtx_unlock(&glsl_type::mutex);
@@ -915,6 +923,15 @@ glsl_type::record_compare(const glsl_type *b) const
       if (this->fields.structure[i].precision
           != b->fields.structure[i].precision)
          return false;
+      if (this->fields.structure[i].explicit_xfb_buffer
+          != b->fields.structure[i].explicit_xfb_buffer)
+         return false;
+      if (this->fields.structure[i].xfb_buffer
+          != b->fields.structure[i].xfb_buffer)
+         return false;
+      if (this->fields.structure[i].xfb_stride
+          != b->fields.structure[i].xfb_stride)
+         return false;
    }
 
    return true;
@@ -1333,6 +1350,38 @@ glsl_type::uniform_locations() const
    }
 }
 
+unsigned
+glsl_type::varying_count() const
+{
+   unsigned size = 0;
+
+   switch (this->base_type) {
+   case GLSL_TYPE_UINT:
+   case GLSL_TYPE_INT:
+   case GLSL_TYPE_FLOAT:
+   case GLSL_TYPE_DOUBLE:
+   case GLSL_TYPE_BOOL:
+      return 1;
+
+   case GLSL_TYPE_STRUCT:
+   case GLSL_TYPE_INTERFACE:
+      for (unsigned i = 0; i < this->length; i++)
+         size += this->fields.structure[i].type->varying_count();
+      return size;
+   case GLSL_TYPE_ARRAY:
+      /* Don't count innermost array elements */
+      if (this->without_array()->is_record() ||
+          this->without_array()->is_interface() ||
+          this->fields.array->is_array())
+         return this->length * this->fields.array->varying_count();
+      else
+         return this->fields.array->varying_count();
+   default:
+      assert(!"unsupported varying type");
+      return 0;
+   }
+}
+
 bool
 glsl_type::can_implicitly_convert_to(const glsl_type *desired,
                                      _mesa_glsl_parse_state *state) const
diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h
index b0e6f3f730f..dd46479755a 100644
--- a/src/compiler/glsl_types.h
+++ b/src/compiler/glsl_types.h
@@ -327,6 +327,12 @@ struct glsl_type {
    unsigned uniform_locations() const;
 
    /**
+    * Used to count the number of varyings contained in the type ignoring
+    * innermost array elements.
+    */
+   unsigned varying_count() const;
+
+   /**
     * Calculate the number of attribute slots required to hold this type
     *
     * This implements the language rules of GLSL 1.50 for counting the number
@@ -839,13 +845,25 @@ struct glsl_struct_field {
 
    /**
     * For interface blocks, members may have an explicit byte offset
-    * specified; -1 otherwise.
+    * specified; -1 otherwise. Also used for xfb_offset layout qualifier.
     *
-    * Ignored for structs.
+    * Unless used for xfb_offset this field is ignored for structs.
     */
    int offset;
 
    /**
+    * For interface blocks, members may define a transform feedback buffer;
+    * -1 otherwise.
+    */
+   int xfb_buffer;
+
+   /**
+    * For interface blocks, members may define a transform feedback stride;
+    * -1 otherwise.
+    */
+   int xfb_stride;
+
+   /**
     * For interface blocks, the interpolation mode (as in
     * ir_variable::interpolation).  0 otherwise.
     */
@@ -889,6 +907,13 @@ struct glsl_struct_field {
    unsigned image_volatile:1;
    unsigned image_restrict:1;
 
+   /**
+    * Any of the xfb_* qualifiers trigger the shader to be in transform
+    * feedback mode so we need to keep track of whether the buffer was
+    * explicitly set or if its just been assigned the default global value.
+    */
+   unsigned explicit_xfb_buffer:1;
+
 #ifdef __cplusplus
    glsl_struct_field(const struct glsl_type *_type, const char *_name)
       : type(_type), name(_name), location(-1), interpolation(0), centroid(0),
diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources
index a876eff289a..e6367d9c282 100644
--- a/src/compiler/nir/Makefile.sources
+++ b/src/compiler/nir/Makefile.sources
@@ -22,10 +22,10 @@ NIR_FILES = \
 	nir_gather_info.c \
 	nir_gs_count_vertices.c \
 	nir_inline_functions.c \
-	nir_intrinsics.c \
-	nir_intrinsics.h \
 	nir_instr_set.c \
 	nir_instr_set.h \
+	nir_intrinsics.c \
+	nir_intrinsics.h \
 	nir_liveness.c \
 	nir_lower_alu_to_scalar.c \
 	nir_lower_atomics.c \
diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp
index 2a469ec7355..14affeee8ac 100644
--- a/src/compiler/nir/glsl_to_nir.cpp
+++ b/src/compiler/nir/glsl_to_nir.cpp
@@ -143,16 +143,7 @@ glsl_to_nir(const struct gl_shader_program *shader_prog,
    v2.run(sh->ir);
    visit_exec_list(sh->ir, &v1);
 
-   nir_function *main = NULL;
-   nir_foreach_function(shader, func) {
-      if (strcmp(func->name, "main") == 0) {
-         main = func;
-         break;
-      }
-   }
-   assert(main);
-
-   nir_lower_outputs_to_temporaries(shader, main);
+   nir_lower_outputs_to_temporaries(shader, nir_shader_get_entrypoint(shader));
 
    shader->info.name = ralloc_asprintf(shader, "GLSL%d", shader_prog->Name);
    if (shader_prog->Label)
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index de6b93c955c..d9e0d679c66 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1822,6 +1822,8 @@ nir_shader_get_entrypoint(nir_shader *shader)
    assert(exec_list_length(&shader->functions) == 1);
    struct exec_node *func_node = exec_list_get_head(&shader->functions);
    nir_function *func = exec_node_data(nir_function, func_node, node);
+   assert(func->return_type == glsl_void_type());
+   assert(func->num_params == 0);
    return func;
 }
 
diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py
index 2e9cd5fcc94..ddfe94d9e73 100644
--- a/src/compiler/nir/nir_opt_algebraic.py
+++ b/src/compiler/nir/nir_opt_algebraic.py
@@ -127,6 +127,7 @@ optimizations = [
    (('bcsel', ('flt', a, b), b, a), ('fmax', a, b)),
    (('bcsel', ('inot', 'a@bool'), b, c), ('bcsel', a, c, b)),
    (('bcsel', a, ('bcsel', a, b, c), d), ('bcsel', a, b, d)),
+   (('bcsel', a, True, 'b@bool'), ('ior', a, b)),
    (('fmin', a, a), a),
    (('fmax', a, a), a),
    (('imin', a, a), a),
@@ -270,6 +271,10 @@ optimizations = [
    (('fabs', ('fsub', 0.0, a)), ('fabs', a)),
    (('iabs', ('isub', 0, a)), ('iabs', a)),
 
+   # Propagate negation up multiplication chains
+   (('fmul', ('fneg', a), b), ('fneg', ('fmul', a, b))),
+   (('imul', ('ineg', a), b), ('ineg', ('imul', a, b))),
+
    # Misc. lowering
    (('fmod', a, b), ('fsub', a, ('fmul', b, ('ffloor', ('fdiv', a, b)))), 'options->lower_fmod'),
    (('frem', a, b), ('fsub', a, ('fmul', b, ('ftrunc', ('fdiv', a, b)))), 'options->lower_fmod'),
diff --git a/src/compiler/shader_enums.h b/src/compiler/shader_enums.h
index d44aabf8f3c..0c2740866fd 100644
--- a/src/compiler/shader_enums.h
+++ b/src/compiler/shader_enums.h
@@ -31,7 +31,7 @@ extern "C" {
 #endif
 
 /**
- * Shader stages. Note that these will become 5 with tessellation.
+ * Shader stages.
  *
  * The order must match how shaders are ordered in the pipeline.
  * The GLSL linker assumes that if i<j, then the j-th shader is
diff --git a/src/egl/drivers/dri2/platform_android.c b/src/egl/drivers/dri2/platform_android.c
index 7d546650272..41840aa7b2a 100644
--- a/src/egl/drivers/dri2/platform_android.c
+++ b/src/egl/drivers/dri2/platform_android.c
@@ -537,6 +537,8 @@ droid_add_configs_for_visuals(_EGLDriver *drv, _EGLDisplay *dpy)
    EGLint config_attrs[] = {
      EGL_NATIVE_VISUAL_ID,   0,
      EGL_NATIVE_VISUAL_TYPE, 0,
+     EGL_FRAMEBUFFER_TARGET_ANDROID, EGL_TRUE,
+     EGL_RECORDABLE_ANDROID, EGL_TRUE,
      EGL_NONE
    };
    int count, i, j;
@@ -714,7 +716,9 @@ dri2_initialize_android(_EGLDriver *drv, _EGLDisplay *dpy)
       goto cleanup_screen;
    }
 
+   dpy->Extensions.ANDROID_framebuffer_target = EGL_TRUE;
    dpy->Extensions.ANDROID_image_native_buffer = EGL_TRUE;
+   dpy->Extensions.ANDROID_recordable = EGL_TRUE;
    dpy->Extensions.KHR_image_base = EGL_TRUE;
 
    /* Fill vtbl last to prevent accidentally calling virtual function during
diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c
index dd145a1195e..8886759011a 100644
--- a/src/egl/main/eglapi.c
+++ b/src/egl/main/eglapi.c
@@ -381,7 +381,9 @@ _eglCreateExtensionsString(_EGLDisplay *dpy)
    char *exts = dpy->ExtensionsString;
 
    /* Please keep these sorted alphabetically. */
+   _EGL_CHECK_EXTENSION(ANDROID_framebuffer_target);
    _EGL_CHECK_EXTENSION(ANDROID_image_native_buffer);
+   _EGL_CHECK_EXTENSION(ANDROID_recordable);
 
    _EGL_CHECK_EXTENSION(CHROMIUM_sync_control);
 
diff --git a/src/egl/main/eglconfig.c b/src/egl/main/eglconfig.c
index d79c0e15422..435d9245384 100644
--- a/src/egl/main/eglconfig.c
+++ b/src/egl/main/eglconfig.c
@@ -245,7 +245,13 @@ static const struct {
    /* extensions */
    { EGL_Y_INVERTED_NOK,            ATTRIB_TYPE_BOOLEAN,
                                     ATTRIB_CRITERION_EXACT,
-                                    EGL_DONT_CARE }
+                                    EGL_DONT_CARE },
+   { EGL_FRAMEBUFFER_TARGET_ANDROID, ATTRIB_TYPE_BOOLEAN,
+                                    ATTRIB_CRITERION_EXACT,
+                                    EGL_DONT_CARE },
+   { EGL_RECORDABLE_ANDROID,        ATTRIB_TYPE_BOOLEAN,
+                                    ATTRIB_CRITERION_EXACT,
+                                    EGL_DONT_CARE },
 };
 
 
@@ -488,6 +494,10 @@ _eglIsConfigAttribValid(_EGLConfig *conf, EGLint attr)
    switch (attr) {
    case EGL_Y_INVERTED_NOK:
       return conf->Display->Extensions.NOK_texture_from_pixmap;
+   case EGL_FRAMEBUFFER_TARGET_ANDROID:
+      return conf->Display->Extensions.ANDROID_framebuffer_target;
+   case EGL_RECORDABLE_ANDROID:
+      return conf->Display->Extensions.ANDROID_recordable;
    default:
       break;
    }
diff --git a/src/egl/main/eglconfig.h b/src/egl/main/eglconfig.h
index 84cb2276b70..22da697e83c 100644
--- a/src/egl/main/eglconfig.h
+++ b/src/egl/main/eglconfig.h
@@ -86,6 +86,8 @@ struct _egl_config
 
    /* extensions */
    EGLint YInvertedNOK;
+   EGLint FramebufferTargetAndroid;
+   EGLint RecordableAndroid;
 };
 
 
@@ -133,6 +135,8 @@ _eglOffsetOfConfig(EGLint attr)
    ATTRIB_MAP(EGL_CONFORMANT,                Conformant);
    /* extensions */
    ATTRIB_MAP(EGL_Y_INVERTED_NOK,            YInvertedNOK);
+   ATTRIB_MAP(EGL_FRAMEBUFFER_TARGET_ANDROID, FramebufferTargetAndroid);
+   ATTRIB_MAP(EGL_RECORDABLE_ANDROID,        RecordableAndroid);
 #undef ATTRIB_MAP
    default:
       return -1;
diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h
index cec6d59e6a4..6bfc8589a42 100644
--- a/src/egl/main/egldisplay.h
+++ b/src/egl/main/egldisplay.h
@@ -90,7 +90,9 @@ struct _egl_resource
 struct _egl_extensions
 {
    /* Please keep these sorted alphabetically. */
+   EGLBoolean ANDROID_framebuffer_target;
    EGLBoolean ANDROID_image_native_buffer;
+   EGLBoolean ANDROID_recordable;
 
    EGLBoolean CHROMIUM_sync_control;
 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 16a261c14cf..2ba9b099664 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -731,6 +731,24 @@ draw_texture_sampler(struct draw_context *draw,
    }
 }
 
+/**
+ * Provide TGSI image objects for vertex/geometry shaders that use
+ * texture fetches.  This state only needs to be set once per context.
+ * This might only be used by software drivers for the time being.
+ */
+void
+draw_image(struct draw_context *draw,
+           uint shader,
+           struct tgsi_image *image)
+{
+   if (shader == PIPE_SHADER_VERTEX) {
+      draw->vs.tgsi.image = image;
+   } else {
+      debug_assert(shader == PIPE_SHADER_GEOMETRY);
+      draw->gs.tgsi.image = image;
+   }
+}
+
 
 
 
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index a5a6df5b72e..5d9870b115c 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -48,6 +48,7 @@ struct draw_vertex_shader;
 struct draw_geometry_shader;
 struct draw_fragment_shader;
 struct tgsi_sampler;
+struct tgsi_image;
 
 /*
  * structure to contain driver internal information 
@@ -155,6 +156,11 @@ draw_texture_sampler(struct draw_context *draw,
                      struct tgsi_sampler *sampler);
 
 void
+draw_image(struct draw_context *draw,
+           uint shader_type,
+           struct tgsi_image *image);
+
+void
 draw_set_sampler_views(struct draw_context *draw,
                        unsigned shader_stage,
                        struct pipe_sampler_view **views,
diff --git a/src/gallium/auxiliary/draw/draw_gs.c b/src/gallium/auxiliary/draw/draw_gs.c
index fcef31b4ff5..14db2d6f39d 100644
--- a/src/gallium/auxiliary/draw/draw_gs.c
+++ b/src/gallium/auxiliary/draw/draw_gs.c
@@ -681,7 +681,7 @@ void draw_geometry_shader_prepare(struct draw_geometry_shader *shader,
    if (!use_llvm && shader && shader->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(shader->machine,
                                     shader->state.tokens,
-                                    draw->gs.tgsi.sampler);
+                                    draw->gs.tgsi.sampler, draw->gs.tgsi.image);
    }
 }
 
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 8774bebd5f9..211bd6f7e70 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -66,6 +66,7 @@ struct draw_stage;
 struct vbuf_render;
 struct tgsi_exec_machine;
 struct tgsi_sampler;
+struct tgsi_image;
 struct draw_pt_front_end;
 struct draw_assembler;
 struct draw_llvm;
@@ -267,6 +268,7 @@ struct draw_context
          struct tgsi_exec_machine *machine;
 
          struct tgsi_sampler *sampler;
+         struct tgsi_image *image;
       } tgsi;
 
       struct translate *fetch;
@@ -286,6 +288,7 @@ struct draw_context
          struct tgsi_exec_machine *machine;
 
          struct tgsi_sampler *sampler;
+         struct tgsi_image *image;
       } tgsi;
 
    } gs;
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 3fd8ef3cd2f..5b53cff29f0 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -70,7 +70,7 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
    if (evs->machine->Tokens != shader->state.tokens) {
       tgsi_exec_machine_bind_shader(evs->machine,
                                     shader->state.tokens,
-                                    draw->vs.tgsi.sampler);
+                                    draw->vs.tgsi.sampler, draw->vs.tgsi.image);
    }
 }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
index efaf2fa306a..11e9f92189f 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
+++ b/src/gallium/auxiliary/gallivm/lp_bld_debug.cpp
@@ -128,7 +128,7 @@ lp_debug_dump_value(LLVMValueRef value)
  * - http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html
  */
 static size_t
-disassemble(const void* func, std::stringstream &buffer)
+disassemble(const void* func, std::ostream &buffer)
 {
    const uint8_t *bytes = (const uint8_t *)func;
 
@@ -235,15 +235,16 @@ disassemble(const void* func, std::stringstream &buffer)
 
 
 extern "C" void
-lp_disassemble(LLVMValueRef func, const void *code) {
-   std::stringstream buffer;
+lp_disassemble(LLVMValueRef func, const void *code)
+{
+   std::ostringstream buffer;
    std::string s;
 
    buffer << LLVMGetValueName(func) << ":\n";
    disassemble(code, buffer);
    s = buffer.str();
-   _debug_printf("%s", s.c_str());
-   _debug_printf("\n");
+   os_log_message(s.c_str());
+   os_log_message("\n");
 }
 
 
@@ -259,7 +260,6 @@ extern "C" void
 lp_profile(LLVMValueRef func, const void *code)
 {
 #if defined(__linux__) && defined(PROFILE)
-   std::stringstream buffer;
    static std::ofstream perf_asm_file;
    static boolean first_time = TRUE;
    static FILE *perf_map_file = NULL;
@@ -283,9 +283,9 @@ lp_profile(LLVMValueRef func, const void *code)
    if (perf_map_file) {
       const char *symbol = LLVMGetValueName(func);
       unsigned long addr = (uintptr_t)code;
-      buffer << symbol << ":\n";
-      unsigned long size = disassemble(code, buffer);
-      perf_asm_file << buffer.rdbuf() << std::flush;
+      perf_asm_file << symbol << ":\n";
+      unsigned long size = disassemble(code, perf_asm_file);
+      perf_asm_file.flush();
       fprintf(perf_map_file, "%lx %lx %s\n", addr, size, symbol);
       fflush(perf_map_file);
    }
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_logic.c b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
index 19d30d0d63c..5b0b6c6b234 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_logic.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_logic.c
@@ -314,11 +314,13 @@ lp_build_select(struct lp_build_context *bld,
       mask = LLVMBuildTrunc(builder, mask, LLVMInt1TypeInContext(lc), "");
       res = LLVMBuildSelect(builder, mask, a, b, "");
    }
-   else if (0) {
+   else if (HAVE_LLVM >= 0x0303) {
       /* Generate a vector select.
        *
-       * XXX: Using vector selects would avoid emitting intrinsics, but they aren't
-       * properly supported yet.
+       * Using vector selects would avoid emitting intrinsics, but they weren't
+       * properly supported yet for a long time.
+       *
+       * LLVM 3.3 appears to reliably support it.
        *
        * LLVM 3.1 supports it, but it yields buggy code (e.g. lp_blend_test).
        *
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 2678268e923..fbbe8d11eb0 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -108,14 +108,14 @@ struct fenced_manager
  */
 struct fenced_buffer
 {
-   /*
+   /**
     * Immutable members.
     */
 
    struct pb_buffer base;
    struct fenced_manager *mgr;
 
-   /*
+   /**
     * Following members are mutable and protected by fenced_manager::mutex.
     */
 
@@ -205,7 +205,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 
    curr = fenced_mgr->unfenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->unfenced) {
+   while (curr != &fenced_mgr->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(!fenced_buf->fence);
       debug_printf("%10p %7u %8u %7s\n",
@@ -219,7 +219,7 @@ fenced_manager_dump_locked(struct fenced_manager *fenced_mgr)
 
    curr = fenced_mgr->fenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->fenced) {
+   while (curr != &fenced_mgr->fenced) {
       int signaled;
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
       assert(fenced_buf->buffer);
@@ -340,7 +340,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
    assert(pipe_is_referenced(&fenced_buf->base.reference));
    assert(fenced_buf->fence);
 
-   if(fenced_buf->fence) {
+   if (fenced_buf->fence) {
       struct pipe_fence_handle *fence = NULL;
       int finished;
       boolean proceed;
@@ -355,8 +355,7 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
 
       assert(pipe_is_referenced(&fenced_buf->base.reference));
 
-      /*
-       * Only proceed if the fence object didn't change in the meanwhile.
+      /* Only proceed if the fence object didn't change in the meanwhile.
        * Otherwise assume the work has been already carried out by another
        * thread that re-aquired the lock before us.
        */
@@ -364,14 +363,9 @@ fenced_buffer_finish_locked(struct fenced_manager *fenced_mgr,
 
       ops->fence_reference(ops, &fence, NULL);
 
-      if(proceed && finished == 0) {
-         /*
-          * Remove from the fenced list
-          */
-
-         boolean destroyed;
-
-         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+      if (proceed && finished == 0) {
+         /* Remove from the fenced list. */
+         boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
 
          /* TODO: remove consequents buffers with the same fence? */
 
@@ -405,36 +399,33 @@ fenced_manager_check_signalled_locked(struct fenced_manager *fenced_mgr,
 
    curr = fenced_mgr->fenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->fenced) {
+   while (curr != &fenced_mgr->fenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      if(fenced_buf->fence != prev_fence) {
-	 int signaled;
+      if (fenced_buf->fence != prev_fence) {
+         int signaled;
 
-	 if (wait) {
-	    signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
+         if (wait) {
+            signaled = ops->fence_finish(ops, fenced_buf->fence, 0);
 
-	    /*
-	     * Don't return just now. Instead preemptively check if the
-	     * following buffers' fences already expired, without further waits.
-	     */
-	    wait = FALSE;
-	 }
-	 else {
-	    signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
-	 }
+            /* Don't return just now. Instead preemptively check if the
+             * following buffers' fences already expired, without further waits.
+             */
+            wait = FALSE;
+         } else {
+            signaled = ops->fence_signalled(ops, fenced_buf->fence, 0);
+         }
 
-	 if (signaled != 0) {
-	    return ret;
+         if (signaled != 0) {
+            return ret;
          }
 
-	 prev_fence = fenced_buf->fence;
-      }
-      else {
+         prev_fence = fenced_buf->fence;
+      } else {
          /* This buffer's fence object is identical to the previous buffer's
           * fence object, so no need to check the fence again.
           */
-	 assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
+         assert(ops->fence_signalled(ops, fenced_buf->fence, 0) == 0);
       }
 
       fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
@@ -462,22 +453,21 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
 
    curr = fenced_mgr->unfenced.next;
    next = curr->next;
-   while(curr != &fenced_mgr->unfenced) {
+   while (curr != &fenced_mgr->unfenced) {
       fenced_buf = LIST_ENTRY(struct fenced_buffer, curr, head);
 
-      /*
-       * We can only move storage if the buffer is not mapped and not
+      /* We can only move storage if the buffer is not mapped and not
        * validated.
        */
-      if(fenced_buf->buffer &&
+      if (fenced_buf->buffer &&
          !fenced_buf->mapcount &&
          !fenced_buf->vl) {
          enum pipe_error ret;
 
          ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
-         if(ret == PIPE_OK) {
+         if (ret == PIPE_OK) {
             ret = fenced_buffer_copy_storage_to_cpu_locked(fenced_buf);
-            if(ret == PIPE_OK) {
+            if (ret == PIPE_OK) {
                fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
                return TRUE;
             }
@@ -499,7 +489,7 @@ fenced_manager_free_gpu_storage_locked(struct fenced_manager *fenced_mgr)
 static void
 fenced_buffer_destroy_cpu_storage_locked(struct fenced_buffer *fenced_buf)
 {
-   if(fenced_buf->data) {
+   if (fenced_buf->data) {
       align_free(fenced_buf->data);
       fenced_buf->data = NULL;
       assert(fenced_buf->mgr->cpu_total_size >= fenced_buf->size);
@@ -516,14 +506,14 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
                                         struct fenced_buffer *fenced_buf)
 {
    assert(!fenced_buf->data);
-   if(fenced_buf->data)
+   if (fenced_buf->data)
       return PIPE_OK;
 
    if (fenced_mgr->cpu_total_size + fenced_buf->size > fenced_mgr->max_cpu_total_size)
       return PIPE_ERROR_OUT_OF_MEMORY;
 
    fenced_buf->data = align_malloc(fenced_buf->size, fenced_buf->desc.alignment);
-   if(!fenced_buf->data)
+   if (!fenced_buf->data)
       return PIPE_ERROR_OUT_OF_MEMORY;
 
    fenced_mgr->cpu_total_size += fenced_buf->size;
@@ -538,7 +528,7 @@ fenced_buffer_create_cpu_storage_locked(struct fenced_manager *fenced_mgr,
 static void
 fenced_buffer_destroy_gpu_storage_locked(struct fenced_buffer *fenced_buf)
 {
-   if(fenced_buf->buffer) {
+   if (fenced_buf->buffer) {
       pb_reference(&fenced_buf->buffer, NULL);
    }
 }
@@ -575,41 +565,37 @@ fenced_buffer_create_gpu_storage_locked(struct fenced_manager *fenced_mgr,
 {
    assert(!fenced_buf->buffer);
 
-   /*
-    * Check for signaled buffers before trying to allocate.
-    */
+   /* Check for signaled buffers before trying to allocate. */
    fenced_manager_check_signalled_locked(fenced_mgr, FALSE);
 
    fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
 
-   /*
-    * Keep trying while there is some sort of progress:
+   /* Keep trying while there is some sort of progress:
     * - fences are expiring,
     * - or buffers are being being swapped out from GPU memory into CPU memory.
     */
-   while(!fenced_buf->buffer &&
+   while (!fenced_buf->buffer &&
          (fenced_manager_check_signalled_locked(fenced_mgr, FALSE) ||
           fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
       fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
    }
 
-   if(!fenced_buf->buffer && wait) {
-      /*
-       * Same as before, but this time around, wait to free buffers if
+   if (!fenced_buf->buffer && wait) {
+      /* Same as before, but this time around, wait to free buffers if
        * necessary.
        */
-      while(!fenced_buf->buffer &&
+      while (!fenced_buf->buffer &&
             (fenced_manager_check_signalled_locked(fenced_mgr, TRUE) ||
              fenced_manager_free_gpu_storage_locked(fenced_mgr))) {
          fenced_buffer_try_create_gpu_storage_locked(fenced_mgr, fenced_buf);
       }
    }
 
-   if(!fenced_buf->buffer) {
-      if(0)
+   if (!fenced_buf->buffer) {
+      if (0)
          fenced_manager_dump_locked(fenced_mgr);
 
-      /* give up */
+      /* Give up. */
       return PIPE_ERROR_OUT_OF_MEMORY;
    }
 
@@ -686,18 +672,16 @@ fenced_buffer_map(struct pb_buffer *buf,
 
    assert(!(flags & PB_USAGE_GPU_READ_WRITE));
 
-   /*
-    * Serialize writes.
-    */
-   while((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
+   /* Serialize writes. */
+   while ((fenced_buf->flags & PB_USAGE_GPU_WRITE) ||
          ((fenced_buf->flags & PB_USAGE_GPU_READ) &&
           (flags & PB_USAGE_CPU_WRITE))) {
 
-      /* 
-       * Don't wait for the GPU to finish accessing it, if blocking is forbidden.
+      /* Don't wait for the GPU to finish accessing it,
+       * if blocking is forbidden.
        */
-      if((flags & PB_USAGE_DONTBLOCK) &&
-          ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
+      if ((flags & PB_USAGE_DONTBLOCK) &&
+         ops->fence_signalled(ops, fenced_buf->fence, 0) != 0) {
          goto done;
       }
 
@@ -705,17 +689,15 @@ fenced_buffer_map(struct pb_buffer *buf,
          break;
       }
 
-      /*
-       * Wait for the GPU to finish accessing. This will release and re-acquire
+      /* Wait for the GPU to finish accessing. This will release and re-acquire
        * the mutex, so all copies of mutable state must be discarded.
        */
       fenced_buffer_finish_locked(fenced_mgr, fenced_buf);
    }
 
-   if(fenced_buf->buffer) {
+   if (fenced_buf->buffer) {
       map = pb_map(fenced_buf->buffer, flags, flush_ctx);
-   }
-   else {
+   } else {
       assert(fenced_buf->data);
       map = fenced_buf->data;
    }
@@ -725,7 +707,7 @@ fenced_buffer_map(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PB_USAGE_CPU_READ_WRITE;
    }
 
-done:
+ done:
    pipe_mutex_unlock(fenced_mgr->mutex);
 
    return map;
@@ -741,12 +723,12 @@ fenced_buffer_unmap(struct pb_buffer *buf)
    pipe_mutex_lock(fenced_mgr->mutex);
 
    assert(fenced_buf->mapcount);
-   if(fenced_buf->mapcount) {
+   if (fenced_buf->mapcount) {
       if (fenced_buf->buffer)
          pb_unmap(fenced_buf->buffer);
       --fenced_buf->mapcount;
-      if(!fenced_buf->mapcount)
-	 fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
+      if (!fenced_buf->mapcount)
+         fenced_buf->flags &= ~PB_USAGE_CPU_READ_WRITE;
    }
 
    pipe_mutex_unlock(fenced_mgr->mutex);
@@ -765,7 +747,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    pipe_mutex_lock(fenced_mgr->mutex);
 
    if (!vl) {
-      /* invalidate */
+      /* Invalidate. */
       fenced_buf->vl = NULL;
       fenced_buf->validation_flags = 0;
       ret = PIPE_OK;
@@ -776,40 +758,37 @@ fenced_buffer_validate(struct pb_buffer *buf,
    assert(!(flags & ~PB_USAGE_GPU_READ_WRITE));
    flags &= PB_USAGE_GPU_READ_WRITE;
 
-   /* Buffer cannot be validated in two different lists */
-   if(fenced_buf->vl && fenced_buf->vl != vl) {
+   /* Buffer cannot be validated in two different lists. */
+   if (fenced_buf->vl && fenced_buf->vl != vl) {
       ret = PIPE_ERROR_RETRY;
       goto done;
    }
 
-   if(fenced_buf->vl == vl &&
+   if (fenced_buf->vl == vl &&
       (fenced_buf->validation_flags & flags) == flags) {
-      /* Nothing to do -- buffer already validated */
+      /* Nothing to do -- buffer already validated. */
       ret = PIPE_OK;
       goto done;
    }
 
-   /*
-    * Create and update GPU storage.
-    */
-   if(!fenced_buf->buffer) {
+   /* Create and update GPU storage. */
+   if (!fenced_buf->buffer) {
       assert(!fenced_buf->mapcount);
 
       ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
-      if(ret != PIPE_OK) {
+      if (ret != PIPE_OK) {
          goto done;
       }
 
       ret = fenced_buffer_copy_storage_to_gpu_locked(fenced_buf);
-      if(ret != PIPE_OK) {
+      if (ret != PIPE_OK) {
          fenced_buffer_destroy_gpu_storage_locked(fenced_buf);
          goto done;
       }
 
-      if(fenced_buf->mapcount) {
+      if (fenced_buf->mapcount) {
          debug_printf("warning: validating a buffer while it is still mapped\n");
-      }
-      else {
+      } else {
          fenced_buffer_destroy_cpu_storage_locked(fenced_buf);
       }
    }
@@ -821,7 +800,7 @@ fenced_buffer_validate(struct pb_buffer *buf,
    fenced_buf->vl = vl;
    fenced_buf->validation_flags |= flags;
 
-done:
+ done:
    pipe_mutex_unlock(fenced_mgr->mutex);
 
    return ret;
@@ -841,13 +820,12 @@ fenced_buffer_fence(struct pb_buffer *buf,
    assert(pipe_is_referenced(&fenced_buf->base.reference));
    assert(fenced_buf->buffer);
 
-   if(fence != fenced_buf->fence) {
+   if (fence != fenced_buf->fence) {
       assert(fenced_buf->vl);
       assert(fenced_buf->validation_flags);
 
       if (fenced_buf->fence) {
-         boolean destroyed;
-         destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
+         boolean destroyed = fenced_buffer_remove_locked(fenced_mgr, fenced_buf);
          assert(!destroyed);
       }
       if (fence) {
@@ -876,16 +854,15 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   /*
-    * This should only be called when the buffer is validated. Typically
+   /* This should only be called when the buffer is validated. Typically
     * when processing relocations.
     */
    assert(fenced_buf->vl);
    assert(fenced_buf->buffer);
 
-   if(fenced_buf->buffer)
+   if (fenced_buf->buffer) {
       pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
-   else {
+   } else {
       *base_buf = buf;
       *offset = 0;
    }
@@ -896,12 +873,12 @@ fenced_buffer_get_base_buffer(struct pb_buffer *buf,
 
 static const struct pb_vtbl
 fenced_buffer_vtbl = {
-      fenced_buffer_destroy,
-      fenced_buffer_map,
-      fenced_buffer_unmap,
-      fenced_buffer_validate,
-      fenced_buffer_fence,
-      fenced_buffer_get_base_buffer
+   fenced_buffer_destroy,
+   fenced_buffer_map,
+   fenced_buffer_unmap,
+   fenced_buffer_validate,
+   fenced_buffer_fence,
+   fenced_buffer_get_base_buffer
 };
 
 
@@ -917,12 +894,11 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
    struct fenced_buffer *fenced_buf;
    enum pipe_error ret;
 
-   /*
-    * Don't stall the GPU, waste time evicting buffers, or waste memory
+   /* Don't stall the GPU, waste time evicting buffers, or waste memory
     * trying to create a buffer that will most likely never fit into the
     * graphics aperture.
     */
-   if(size > fenced_mgr->max_buffer_size) {
+   if (size > fenced_mgr->max_buffer_size) {
       goto no_buffer;
    }
 
@@ -942,29 +918,21 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   /*
-    * Try to create GPU storage without stalling,
-    */
+   /* Try to create GPU storage without stalling. */
    ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, FALSE);
 
-   /*
-    * Attempt to use CPU memory to avoid stalling the GPU.
-    */
-   if(ret != PIPE_OK) {
+   /* Attempt to use CPU memory to avoid stalling the GPU. */
+   if (ret != PIPE_OK) {
       ret = fenced_buffer_create_cpu_storage_locked(fenced_mgr, fenced_buf);
    }
 
-   /*
-    * Create GPU storage, waiting for some to be available.
-    */
-   if(ret != PIPE_OK) {
+   /* Create GPU storage, waiting for some to be available. */
+   if (ret != PIPE_OK) {
       ret = fenced_buffer_create_gpu_storage_locked(fenced_mgr, fenced_buf, TRUE);
    }
 
-   /*
-    * Give up.
-    */
-   if(ret != PIPE_OK) {
+   /* Give up. */
+   if (ret != PIPE_OK) {
       goto no_storage;
    }
 
@@ -976,10 +944,10 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
 
    return &fenced_buf->base;
 
-no_storage:
+ no_storage:
    pipe_mutex_unlock(fenced_mgr->mutex);
    FREE(fenced_buf);
-no_buffer:
+ no_buffer:
    return NULL;
 }
 
@@ -990,12 +958,12 @@ fenced_bufmgr_flush(struct pb_manager *mgr)
    struct fenced_manager *fenced_mgr = fenced_manager(mgr);
 
    pipe_mutex_lock(fenced_mgr->mutex);
-   while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+   while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
       ;
    pipe_mutex_unlock(fenced_mgr->mutex);
 
    assert(fenced_mgr->provider->flush);
-   if(fenced_mgr->provider->flush)
+   if (fenced_mgr->provider->flush)
       fenced_mgr->provider->flush(fenced_mgr->provider);
 }
 
@@ -1007,25 +975,25 @@ fenced_bufmgr_destroy(struct pb_manager *mgr)
 
    pipe_mutex_lock(fenced_mgr->mutex);
 
-   /* Wait on outstanding fences */
+   /* Wait on outstanding fences. */
    while (fenced_mgr->num_fenced) {
       pipe_mutex_unlock(fenced_mgr->mutex);
 #if defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD) || defined(PIPE_OS_SOLARIS)
       sched_yield();
 #endif
       pipe_mutex_lock(fenced_mgr->mutex);
-      while(fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
+      while (fenced_manager_check_signalled_locked(fenced_mgr, TRUE))
          ;
    }
 
 #ifdef DEBUG
-   /*assert(!fenced_mgr->num_unfenced);*/
+   /* assert(!fenced_mgr->num_unfenced); */
 #endif
 
    pipe_mutex_unlock(fenced_mgr->mutex);
    pipe_mutex_destroy(fenced_mgr->mutex);
 
-   if(fenced_mgr->provider)
+   if (fenced_mgr->provider)
       fenced_mgr->provider->destroy(fenced_mgr->provider);
 
    fenced_mgr->ops->destroy(fenced_mgr->ops);
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index 126259fc0f8..a595bbbc6d3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -853,7 +853,8 @@ void
 tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
-   struct tgsi_sampler *sampler)
+   struct tgsi_sampler *sampler,
+   struct tgsi_image *image)
 {
    uint k;
    struct tgsi_parse_context parse;
@@ -871,6 +872,7 @@ tgsi_exec_machine_bind_shader(
 
    mach->Tokens = tokens;
    mach->Sampler = sampler;
+   mach->Image = image;
 
    if (!tokens) {
       /* unbind and free all */
@@ -1994,12 +1996,12 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                    const struct tgsi_full_instruction *inst,
                    uint sampler)
 {
-   uint unit;
-
+   uint unit = 0;
+   int i;
    if (inst->Src[sampler].Register.Indirect) {
       const struct tgsi_full_src_register *reg = &inst->Src[sampler];
       union tgsi_exec_channel indir_index, index2;
-
+      const uint execmask = mach->ExecMask;
       index2.i[0] =
       index2.i[1] =
       index2.i[2] =
@@ -2012,7 +2014,13 @@ fetch_sampler_unit(struct tgsi_exec_machine *mach,
                              &index2,
                              &ZeroVec,
                              &indir_index);
-      unit = inst->Src[sampler].Register.Index + indir_index.i[0];
+      for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+         if (execmask & (1 << i)) {
+            unit = inst->Src[sampler].Register.Index + indir_index.i[i];
+            break;
+         }
+      }
+
    } else {
       unit = inst->Src[sampler].Register.Index;
    }
@@ -2046,7 +2054,8 @@ exec_tex(struct tgsi_exec_machine *mach,
    assert(modifier != TEX_MODIFIER_LEVEL_ZERO);
    assert(inst->Texture.Texture != TGSI_TEXTURE_BUFFER);
 
-   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, &shadow_ref);
+   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
+   shadow_ref = tgsi_util_get_shadow_ref_src_index(inst->Texture.Texture);
 
    assert(dim <= 4);
    if (shadow_ref >= 0)
@@ -2145,7 +2154,7 @@ exec_lodq(struct tgsi_exec_machine *mach,
    union tgsi_exec_channel r[2];
 
    unit = fetch_sampler_unit(mach, inst, 1);
-   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture, NULL);
+   dim = tgsi_util_get_texture_coord_dim(inst->Texture.Texture);
    assert(dim <= Elements(coords));
    /* fetch coordinates */
    for (i = 0; i < dim; i++) {
@@ -3700,6 +3709,247 @@ exec_dfracexp(struct tgsi_exec_machine *mach,
    }
 }
 
+static int
+get_image_coord_dim(unsigned tgsi_tex)
+{
+   int dim;
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_BUFFER:
+   case TGSI_TEXTURE_1D:
+      dim = 1;
+      break;
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+   case TGSI_TEXTURE_1D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+      dim = 2;
+      break;
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_2D_ARRAY:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      dim = 3;
+      break;
+   default:
+      assert(!"unknown texture target");
+      dim = 0;
+      break;
+   }
+
+   return dim;
+}
+
+static int
+get_image_coord_sample(unsigned tgsi_tex)
+{
+   int sample = 0;
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_2D_MSAA:
+      sample = 3;
+      break;
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      sample = 4;
+      break;
+   default:
+      break;
+   }
+   return sample;
+}
+
+static void
+exec_load(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[4], sample_r;
+   uint unit;
+   int sample;
+   int i, j;
+   int dim;
+   uint chan;
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+   dim = get_image_coord_dim(inst->Memory.Texture);
+   sample = get_image_coord_sample(inst->Memory.Texture);
+   assert(dim <= 3);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   for (i = 0; i < dim; i++) {
+      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
+   }
+
+   if (sample)
+      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
+
+   mach->Image->load(mach->Image, &params,
+                     r[0].i, r[1].i, r[2].i, sample_r.i,
+                     rgba);
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      r[0].f[j] = rgba[0][j];
+      r[1].f[j] = rgba[1][j];
+      r[2].f[j] = rgba[2][j];
+      r[3].f[j] = rgba[3][j];
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_store(struct tgsi_exec_machine *mach,
+           const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[3], sample_r;
+   union tgsi_exec_channel value[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int dim;
+   int sample;
+   int i, j;
+   uint unit;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   unit = inst->Dst[0].Register.Index;
+   dim = get_image_coord_dim(inst->Memory.Texture);
+   sample = get_image_coord_sample(inst->Memory.Texture);
+   assert(dim <= 3);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   for (i = 0; i < dim; i++) {
+      IFETCH(&r[i], 0, TGSI_CHAN_X + i);
+   }
+
+   for (i = 0; i < 4; i++) {
+      FETCH(&value[i], 1, TGSI_CHAN_X + i);
+   }
+   if (sample)
+      IFETCH(&sample_r, 0, TGSI_CHAN_X + sample);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      rgba[0][j] = value[0].f[j];
+      rgba[1][j] = value[1].f[j];
+      rgba[2][j] = value[2].f[j];
+      rgba[3][j] = value[3].f[j];
+   }
+
+   mach->Image->store(mach->Image, &params,
+                      r[0].i, r[1].i, r[2].i, sample_r.i,
+                      rgba);
+}
+
+static void
+exec_atomop(struct tgsi_exec_machine *mach,
+            const struct tgsi_full_instruction *inst)
+{
+   union tgsi_exec_channel r[4], sample_r;
+   union tgsi_exec_channel value[4], value2[4];
+   float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE];
+   struct tgsi_image_params params;
+   int dim;
+   int sample;
+   int i, j;
+   uint unit, chan;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+   unit = fetch_sampler_unit(mach, inst, 0);
+   dim = get_image_coord_dim(inst->Memory.Texture);
+   sample = get_image_coord_sample(inst->Memory.Texture);
+   assert(dim <= 3);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   for (i = 0; i < dim; i++) {
+      IFETCH(&r[i], 1, TGSI_CHAN_X + i);
+   }
+
+   for (i = 0; i < 4; i++) {
+      FETCH(&value[i], 2, TGSI_CHAN_X + i);
+      if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS)
+         FETCH(&value2[i], 3, TGSI_CHAN_X + i);
+   }
+   if (sample)
+      IFETCH(&sample_r, 1, TGSI_CHAN_X + sample);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      rgba[0][j] = value[0].f[j];
+      rgba[1][j] = value[1].f[j];
+      rgba[2][j] = value[2].f[j];
+      rgba[3][j] = value[3].f[j];
+   }
+   if (inst->Instruction.Opcode == TGSI_OPCODE_ATOMCAS) {
+      for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+         rgba2[0][j] = value2[0].f[j];
+         rgba2[1][j] = value2[1].f[j];
+         rgba2[2][j] = value2[2].f[j];
+         rgba2[3][j] = value2[3].f[j];
+      }
+   }
+
+   mach->Image->op(mach->Image, &params, inst->Instruction.Opcode,
+                   r[0].i, r[1].i, r[2].i, sample_r.i,
+                   rgba, rgba2);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      r[0].f[j] = rgba[0][j];
+      r[1].f[j] = rgba[1][j];
+      r[2].f[j] = rgba[2][j];
+      r[3].f[j] = rgba[3][j];
+   }
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan, TGSI_EXEC_DATA_FLOAT);
+      }
+   }
+}
+
+static void
+exec_resq(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   int result[4];
+   union tgsi_exec_channel r[4];
+   uint unit;
+   int i, chan, j;
+   struct tgsi_image_params params;
+   int kilmask = mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+
+   unit = fetch_sampler_unit(mach, inst, 0);
+
+   params.execmask = mach->ExecMask & mach->NonHelperMask & ~kilmask;
+   params.unit = unit;
+   params.tgsi_tex_instr = inst->Memory.Texture;
+   params.format = inst->Memory.Format;
+
+   mach->Image->get_dims(mach->Image, &params, result);
+
+   for (i = 0; i < TGSI_QUAD_SIZE; i++) {
+      for (j = 0; j < 4; j++) {
+         r[j].i[i] = result[j];
+      }
+   }
+
+   for (chan = 0; chan < TGSI_NUM_CHANNELS; chan++) {
+      if (inst->Dst[0].Register.WriteMask & (1 << chan)) {
+         store_dest(mach, &r[chan], &inst->Dst[0], inst, chan,
+                    TGSI_EXEC_DATA_INT);
+      }
+   }
+}
 
 static void
 micro_i2f(union tgsi_exec_channel *dst,
@@ -5166,6 +5416,34 @@ exec_instruction(
    case TGSI_OPCODE_D2U:
       exec_d2u(mach, inst);
       break;
+
+   case TGSI_OPCODE_LOAD:
+      exec_load(mach, inst);
+      break;
+
+   case TGSI_OPCODE_STORE:
+      exec_store(mach, inst);
+      break;
+
+   case TGSI_OPCODE_ATOMUADD:
+   case TGSI_OPCODE_ATOMXCHG:
+   case TGSI_OPCODE_ATOMCAS:
+   case TGSI_OPCODE_ATOMAND:
+   case TGSI_OPCODE_ATOMOR:
+   case TGSI_OPCODE_ATOMXOR:
+   case TGSI_OPCODE_ATOMUMIN:
+   case TGSI_OPCODE_ATOMUMAX:
+   case TGSI_OPCODE_ATOMIMIN:
+   case TGSI_OPCODE_ATOMIMAX:
+      exec_atomop(mach, inst);
+      break;
+
+   case TGSI_OPCODE_RESQ:
+      exec_resq(mach, inst);
+      break;
+   case TGSI_OPCODE_BARRIER:
+   case TGSI_OPCODE_MEMBAR:
+      break;
    default:
       assert( 0 );
    }
@@ -5193,6 +5471,8 @@ tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
       default_mask = 0x1;
    }
 
+   if (mach->NonHelperMask == 0)
+      mach->NonHelperMask = default_mask;
    mach->CondMask = default_mask;
    mach->LoopMask = default_mask;
    mach->ContMask = default_mask;
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.h b/src/gallium/auxiliary/tgsi/tgsi_exec.h
index 991c3bfc5db..45fb8d43c88 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.h
@@ -98,6 +98,46 @@ enum tgsi_sampler_control
    TGSI_SAMPLER_GATHER,
 };
 
+struct tgsi_image_params {
+   unsigned unit;
+   unsigned tgsi_tex_instr;
+   enum pipe_format format;
+   unsigned execmask;
+};
+
+struct tgsi_image {
+   /* image interfaces */
+   void (*load)(const struct tgsi_image *image,
+                const struct tgsi_image_params *params,
+                const int s[TGSI_QUAD_SIZE],
+                const int t[TGSI_QUAD_SIZE],
+                const int r[TGSI_QUAD_SIZE],
+                const int sample[TGSI_QUAD_SIZE],
+                float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   void (*store)(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 const int s[TGSI_QUAD_SIZE],
+                 const int t[TGSI_QUAD_SIZE],
+                 const int r[TGSI_QUAD_SIZE],
+                 const int sample[TGSI_QUAD_SIZE],
+                 float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   void (*op)(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              unsigned opcode,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE]);
+
+   void (*get_dims)(const struct tgsi_image *image,
+                    const struct tgsi_image_params *params,
+                    int dims[4]);
+};
+
 /**
  * Information for sampling textures, which must be implemented
  * by code outside the TGSI executor.
@@ -201,12 +241,13 @@ struct tgsi_sampler
 #define TGSI_EXEC_NUM_TEMP_R        4
 
 #define TGSI_EXEC_TEMP_ADDR         (TGSI_EXEC_NUM_TEMPS + 8)
+#define TGSI_EXEC_NUM_ADDRS         3
 
 /* predicate register */
-#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 9)
+#define TGSI_EXEC_TEMP_P0           (TGSI_EXEC_NUM_TEMPS + 11)
 #define TGSI_EXEC_NUM_PREDS         1
 
-#define TGSI_EXEC_NUM_TEMP_EXTRAS   10
+#define TGSI_EXEC_NUM_TEMP_EXTRAS   12
 
 
 
@@ -292,6 +333,7 @@ struct tgsi_exec_machine
 
    struct tgsi_sampler           *Sampler;
 
+   struct tgsi_image             *Image;
    unsigned                      ImmLimit;
 
    const void *Consts[PIPE_MAX_CONSTANT_BUFFERS];
@@ -311,6 +353,9 @@ struct tgsi_exec_machine
    struct tgsi_exec_vector       QuadPos;
    float                         Face;    /**< +1 if front facing, -1 if back facing */
    bool                          flatshade_color;
+
+   /* See GLSL 4.50 specification for definition of helper invocations */
+   uint NonHelperMask;  /**< non-helpers */
    /* Conditional execution masks */
    uint CondMask;  /**< For IF/ELSE/ENDIF */
    uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
@@ -378,7 +423,8 @@ void
 tgsi_exec_machine_bind_shader(
    struct tgsi_exec_machine *mach,
    const struct tgsi_token *tokens,
-   struct tgsi_sampler *sampler);
+   struct tgsi_sampler *sampler,
+   struct tgsi_image *image);
 
 uint
 tgsi_exec_machine_run(
@@ -451,8 +497,10 @@ tgsi_exec_get_shader_param(enum pipe_shader_cap param)
    case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED:
    case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED:
    case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS:
-   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
       return 0;
+   case PIPE_SHADER_CAP_MAX_SHADER_IMAGES:
+      return PIPE_MAX_SHADER_IMAGES;
+
    case PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT:
       return 32;
    }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.c b/src/gallium/auxiliary/tgsi/tgsi_scan.c
index d32c3a14344..d90fb1d68df 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.c
@@ -54,6 +54,20 @@ is_memory_file(unsigned file)
 }
 
 
+/**
+ * Is the opcode a "true" texture instruction which samples from a
+ * texture map?
+ */
+static bool
+is_texture_inst(unsigned opcode)
+{
+   return (opcode != TGSI_OPCODE_TXQ &&
+           opcode != TGSI_OPCODE_TXQS &&
+           opcode != TGSI_OPCODE_TXQ_LZ &&
+           opcode != TGSI_OPCODE_LODQ &&
+           tgsi_get_opcode_info(opcode)->is_tex);
+}
+
 static void
 scan_instruction(struct tgsi_shader_info *info,
                  const struct tgsi_full_instruction *fullinst,
@@ -181,15 +195,35 @@ scan_instruction(struct tgsi_shader_info *info,
          info->indirect_files_read |= (1 << src->Register.File);
       }
 
-      /* MSAA samplers */
+      /* Texture samplers */
       if (src->Register.File == TGSI_FILE_SAMPLER) {
-         assert(fullinst->Instruction.Texture);
-         assert(src->Register.Index < Elements(info->is_msaa_sampler));
+         const unsigned index = src->Register.Index;
 
-         if (fullinst->Instruction.Texture &&
-             (fullinst->Texture.Texture == TGSI_TEXTURE_2D_MSAA ||
-              fullinst->Texture.Texture == TGSI_TEXTURE_2D_ARRAY_MSAA)) {
-            info->is_msaa_sampler[src->Register.Index] = TRUE;
+         assert(fullinst->Instruction.Texture);
+         assert(index < Elements(info->is_msaa_sampler));
+         assert(index < PIPE_MAX_SAMPLERS);
+
+         if (is_texture_inst(fullinst->Instruction.Opcode)) {
+            const unsigned target = fullinst->Texture.Texture;
+            assert(target < TGSI_TEXTURE_UNKNOWN);
+            /* for texture instructions, check that the texture instruction
+             * target matches the previous sampler view declaration (if there
+             * was one.)
+             */
+            if (info->sampler_targets[index] == TGSI_TEXTURE_UNKNOWN) {
+               /* probably no sampler view declaration */
+               info->sampler_targets[index] = target;
+            } else {
+               /* Make sure the texture instruction's sampler/target info
+                * agrees with the sampler view declaration.
+                */
+               assert(info->sampler_targets[index] == target);
+            }
+            /* MSAA samplers */
+            if (target == TGSI_TEXTURE_2D_MSAA ||
+                target == TGSI_TEXTURE_2D_ARRAY_MSAA) {
+               info->is_msaa_sampler[src->Register.Index] = TRUE;
+            }
          }
       }
 
@@ -431,6 +465,16 @@ scan_declaration(struct tgsi_shader_info *info,
          }
       } else if (file == TGSI_FILE_SAMPLER) {
          info->samplers_declared |= 1 << reg;
+      } else if (file == TGSI_FILE_SAMPLER_VIEW) {
+         unsigned target = fulldecl->SamplerView.Resource;
+         assert(target < TGSI_TEXTURE_UNKNOWN);
+         if (info->sampler_targets[reg] == TGSI_TEXTURE_UNKNOWN) {
+            /* Save sampler target for this sampler index */
+            info->sampler_targets[reg] = target;
+         } else {
+            /* if previously declared, make sure targets agree */
+            assert(info->sampler_targets[reg] == target);
+         }
       } else if (file == TGSI_FILE_IMAGE) {
          if (fulldecl->Image.Resource == TGSI_TEXTURE_BUFFER)
             info->images_buffers |= 1 << reg;
@@ -493,6 +537,8 @@ tgsi_scan_shader(const struct tgsi_token *tokens,
    for (i = 0; i < Elements(info->const_file_max); i++)
       info->const_file_max[i] = -1;
    info->properties[TGSI_PROPERTY_GS_INVOCATIONS] = 1;
+   for (i = 0; i < Elements(info->sampler_targets); i++)
+      info->sampler_targets[i] = TGSI_TEXTURE_UNKNOWN;
 
    /**
     ** Setup to begin parsing input shader
diff --git a/src/gallium/auxiliary/tgsi/tgsi_scan.h b/src/gallium/auxiliary/tgsi/tgsi_scan.h
index 76d8925119e..31adce7a603 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_scan.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_scan.h
@@ -65,6 +65,7 @@ struct tgsi_shader_info
    int file_max[TGSI_FILE_COUNT];  /**< highest index of declared registers */
    int const_file_max[PIPE_MAX_CONSTANT_BUFFERS];
    unsigned samplers_declared; /**< bitmask of declared samplers */
+   ubyte sampler_targets[PIPE_MAX_SHADER_SAMPLER_VIEWS];  /**< TGSI_TEXTURE_x values */
 
    ubyte input_array_first[PIPE_MAX_SHADER_INPUTS];
    ubyte input_array_last[PIPE_MAX_SHADER_INPUTS];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.c b/src/gallium/auxiliary/tgsi/tgsi_util.c
index 5fff3f0787f..fbe29626a7f 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.c
@@ -375,10 +375,8 @@ tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg)
  * sample index.
  */
 int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex)
 {
-   int dim;
-
    /*
     * Depending on the texture target, (src0.xyzw, src1.x) is interpreted
     * differently:
@@ -407,8 +405,7 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
    case TGSI_TEXTURE_BUFFER:
    case TGSI_TEXTURE_1D:
    case TGSI_TEXTURE_SHADOW1D:
-      dim = 1;
-      break;
+      return 1;
    case TGSI_TEXTURE_2D:
    case TGSI_TEXTURE_RECT:
    case TGSI_TEXTURE_1D_ARRAY:
@@ -416,52 +413,48 @@ tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample)
    case TGSI_TEXTURE_SHADOWRECT:
    case TGSI_TEXTURE_SHADOW1D_ARRAY:
    case TGSI_TEXTURE_2D_MSAA:
-      dim = 2;
-      break;
+      return 2;
    case TGSI_TEXTURE_3D:
    case TGSI_TEXTURE_CUBE:
    case TGSI_TEXTURE_2D_ARRAY:
    case TGSI_TEXTURE_SHADOWCUBE:
    case TGSI_TEXTURE_SHADOW2D_ARRAY:
    case TGSI_TEXTURE_2D_ARRAY_MSAA:
-      dim = 3;
-      break;
+      return 3;
    case TGSI_TEXTURE_CUBE_ARRAY:
    case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-      dim = 4;
-      break;
+      return 4;
    default:
       assert(!"unknown texture target");
-      dim = 0;
-      break;
+      return 0;
    }
+}
 
-   if (shadow_or_sample) {
-      switch (tgsi_tex) {
-      case TGSI_TEXTURE_SHADOW1D:
-         /* there is a gap */
-         *shadow_or_sample = 2;
-         break;
-      case TGSI_TEXTURE_SHADOW2D:
-      case TGSI_TEXTURE_SHADOWRECT:
-      case TGSI_TEXTURE_SHADOWCUBE:
-      case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
-         *shadow_or_sample = dim;
-         break;
-      case TGSI_TEXTURE_2D_MSAA:
-      case TGSI_TEXTURE_2D_ARRAY_MSAA:
-         *shadow_or_sample = 3;
-         break;
-      default:
-         /* no shadow nor sample */
-         *shadow_or_sample = -1;
-         break;
-      }
-   }
 
-   return dim;
+/**
+ * Given a TGSI_TEXTURE_x target, return the src register index for the
+ * shadow reference coordinate.
+ */
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex)
+{
+   switch (tgsi_tex) {
+   case TGSI_TEXTURE_SHADOW1D:
+   case TGSI_TEXTURE_SHADOW2D:
+   case TGSI_TEXTURE_SHADOWRECT:
+   case TGSI_TEXTURE_SHADOW1D_ARRAY:
+      return 2;
+   case TGSI_TEXTURE_SHADOWCUBE:
+   case TGSI_TEXTURE_SHADOW2D_ARRAY:
+   case TGSI_TEXTURE_2D_MSAA:
+   case TGSI_TEXTURE_2D_ARRAY_MSAA:
+      return 3;
+   case TGSI_TEXTURE_SHADOWCUBE_ARRAY:
+      return 4;
+   default:
+      /* no shadow nor sample */
+      return -1;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_util.h b/src/gallium/auxiliary/tgsi/tgsi_util.h
index 6175d95fcd6..3a049ee5667 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_util.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_util.h
@@ -80,7 +80,10 @@ struct tgsi_src_register
 tgsi_util_get_src_from_ind(const struct tgsi_ind_register *reg);
 
 int
-tgsi_util_get_texture_coord_dim(int tgsi_tex, int *shadow_or_sample);
+tgsi_util_get_texture_coord_dim(unsigned tgsi_tex);
+
+int
+tgsi_util_get_shadow_ref_src_index(unsigned tgsi_tex);
 
 boolean
 tgsi_is_shadow_target(unsigned target);
diff --git a/src/gallium/auxiliary/util/u_framebuffer.c b/src/gallium/auxiliary/util/u_framebuffer.c
index 2e0ef749e82..49b391d8162 100644
--- a/src/gallium/auxiliary/util/u_framebuffer.c
+++ b/src/gallium/auxiliary/util/u_framebuffer.c
@@ -55,16 +55,16 @@ util_framebuffer_state_equal(const struct pipe_framebuffer_state *dst,
        dst->height != src->height)
       return FALSE;
 
-   for (i = 0; i < Elements(src->cbufs); i++) {
+   if (dst->nr_cbufs != src->nr_cbufs) {
+      return FALSE;
+   }
+
+   for (i = 0; i < src->nr_cbufs; i++) {
       if (dst->cbufs[i] != src->cbufs[i]) {
          return FALSE;
       }
    }
 
-   if (dst->nr_cbufs != src->nr_cbufs) {
-      return FALSE;
-   }
-
    if (dst->zsbuf != src->zsbuf) {
       return FALSE;
    }
diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst
index 6366f7e802d..3ac6ba3c25a 100644
--- a/src/gallium/docs/source/tgsi.rst
+++ b/src/gallium/docs/source/tgsi.rst
@@ -2095,7 +2095,7 @@ after lookup.
 .. opcode:: SAMPLE
 
   Using provided address, sample data from the specified texture using the
-  filtering mode identified by the gven sampler. The source data may come from
+  filtering mode identified by the given sampler. The source data may come from
   any resource type other than buffers.
 
   Syntax: ``SAMPLE dst, address, sampler_view, sampler``
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
index 54315d2f592..3d656d4a34d 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler_nir.c
@@ -1109,7 +1109,7 @@ emit_intrinsic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 	default:
 		compile_error(ctx, "Unhandled store deref type: %u\n",
 				darr->deref_array_type);
-		break;
+		return;
 	}
 
 	for (int i = 0; i < intr->num_components; i++) {
@@ -1258,7 +1258,14 @@ emit_intrinsic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
 			ctx->frag_face = create_input(b, 0);
 			ctx->frag_face->regs[0]->flags |= IR3_REG_HALF;
 		}
-		dst[0] = ir3_ADD_S(b, ctx->frag_face, 0, create_immed(b, 1), 0);
+		/* for fragface, we always get -1 or 0, but that is inverse
+		 * of what nir expects (where ~0 is true).  Unfortunately
+		 * trying to widen from half to full in add.s seems to do a
+		 * non-sign-extending widen (resulting in something that
+		 * gets interpreted as float Inf??)
+		 */
+		dst[0] = ir3_COV(b, ctx->frag_face, TYPE_S16, TYPE_S32);
+		dst[0] = ir3_ADD_S(b, dst[0], 0, create_immed(b, 1), 0);
 		break;
 	case nir_intrinsic_discard_if:
 	case nir_intrinsic_discard: {
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
index f46126e8427..6c8f1b5222e 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_fs.c
@@ -740,7 +740,9 @@ fs_prepare_tgsi_sampling(struct fs_compile_context *fcc,
       break;
    }
 
-   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+   ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
+
    tsrc_transpose(inst->src[0], coords);
    bias_or_lod = tsrc_null();
    ref_or_si = tsrc_null();
diff --git a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
index 0df0afc706b..2b46d44f5be 100644
--- a/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
+++ b/src/gallium/drivers/ilo/shader/ilo_shader_vs.c
@@ -407,7 +407,8 @@ vs_prepare_tgsi_sampling(struct vs_compile_context *vcc,
    num_derivs = 0;
    sampler_src = 1;
 
-   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target, &ref_pos);
+   num_coords = tgsi_util_get_texture_coord_dim(inst->tex.target);
+   ref_pos = tgsi_util_get_shadow_ref_src_index(inst->tex.target);
 
    /* extract the parameters */
    switch (inst->opcode) {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index 21523a27761..c7f8567cadb 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -177,9 +177,11 @@ struct nv50_ir_prog_info
       bool nv50styleSurfaces;    /* generate gX[] access for raw buffers */
       uint16_t texBindBase;      /* base address for tex handles (nve4) */
       uint16_t suInfoBase;       /* base address for surface info (nve4) */
+      uint16_t bufInfoBase;      /* base address for buffer info */
       uint16_t sampleInfoBase;   /* base address for sample positions */
       uint8_t msInfoCBSlot;      /* cX[] used for multisample info */
       uint16_t msInfoBase;       /* base address for multisample info */
+      uint16_t uboInfoBase;      /* base address for compute UBOs (gk104+) */
    } io;
 
    /* driver callback to assign input/output locations */
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
index 8b9328b6296..d61109f0040 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp
@@ -1858,7 +1858,10 @@ CodeEmitterNVC0::emitLOAD(const Instruction *i)
    if (i->src(0).getFile() == FILE_MEMORY_SHARED) {
       if (i->subOp == NV50_IR_SUBOP_LOAD_LOCKED) {
          assert(i->defExists(1));
-         defId(i->def(1), 32 + 18);
+         if (targ->getChipset() >= NVISA_GK104_CHIPSET)
+            defId(i->def(1), 8);
+         else
+            defId(i->def(1), 32 + 18);
       }
    }
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
index 611d5f9c3ed..4f012cd3b91 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp
@@ -3536,8 +3536,11 @@ Converter::exportOutputs()
          Symbol *sym = mkSymbol(FILE_SHADER_OUTPUT, 0, TYPE_F32,
                                 info->out[i].slot[c] * 4);
          Value *val = oData.load(sub.cur->values, i, c, NULL);
-         if (val)
+         if (val) {
+            if (info->out[i].sn == TGSI_SEMANTIC_POSITION)
+               mkOp1(OP_SAT, TYPE_F32, val, val);
             mkStore(OP_EXPORT, TYPE_F32, sym, NULL, val);
+         }
       }
    }
 }
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
index e8f8e30918b..ce83618d681 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp
@@ -874,7 +874,17 @@ NVC0LoweringPass::handleManualTXD(TexInstruction *i)
    Value *zero = bld.loadImm(bld.getSSA(), 0);
    int l, c;
    const int dim = i->tex.target.getDim() + i->tex.target.isCube();
-   const int array = i->tex.target.isArray();
+
+   // This function is invoked after handleTEX lowering, so we have to expect
+   // the arguments in the order that the hw wants them. For Fermi, array and
+   // indirect are both in the leading arg, while for Kepler, array and
+   // indirect are separate (and both precede the coordinates). Maxwell is
+   // handled in a separate function.
+   unsigned array;
+   if (targ->getChipset() < NVISA_GK104_CHIPSET)
+      array = i->tex.target.isArray() || i->tex.rIndirectSrc >= 0;
+   else
+      array = i->tex.target.isArray() + (i->tex.rIndirectSrc >= 0);
 
    i->op = OP_TEX; // no need to clone dPdx/dPdy later
 
@@ -1063,7 +1073,7 @@ bool
 NVC0LoweringPass::handleSUQ(Instruction *suq)
 {
    suq->op = OP_MOV;
-   suq->setSrc(0, loadResLength32(suq->getIndirect(0, 1),
+   suq->setSrc(0, loadBufLength32(suq->getIndirect(0, 1),
                                   suq->getSrc(0)->reg.fileIndex * 16));
    suq->setIndirect(0, 0, NULL);
    suq->setIndirect(0, 1, NULL);
@@ -1071,6 +1081,108 @@ NVC0LoweringPass::handleSUQ(Instruction *suq)
 }
 
 void
+NVC0LoweringPass::handleSharedATOMNVE4(Instruction *atom)
+{
+   assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
+
+   BasicBlock *currBB = atom->bb;
+   BasicBlock *tryLockBB = atom->bb->splitBefore(atom, false);
+   BasicBlock *joinBB = atom->bb->splitAfter(atom);
+   BasicBlock *setAndUnlockBB = new BasicBlock(func);
+   BasicBlock *failLockBB = new BasicBlock(func);
+
+   bld.setPosition(currBB, true);
+   assert(!currBB->joinAt);
+   currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
+
+   CmpInstruction *pred =
+      bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(1, FILE_PREDICATE),
+                TYPE_U32, bld.mkImm(0), bld.mkImm(1));
+
+   bld.mkFlow(OP_BRA, tryLockBB, CC_ALWAYS, NULL);
+   currBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::TREE);
+
+   bld.setPosition(tryLockBB, true);
+
+   Instruction *ld =
+      bld.mkLoad(TYPE_U32, atom->getDef(0),
+                 bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0), NULL);
+   ld->setDef(1, bld.getSSA(1, FILE_PREDICATE));
+   ld->subOp = NV50_IR_SUBOP_LOAD_LOCKED;
+
+   bld.mkFlow(OP_BRA, setAndUnlockBB, CC_P, ld->getDef(1));
+   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+   tryLockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::CROSS);
+   tryLockBB->cfg.attach(&setAndUnlockBB->cfg, Graph::Edge::TREE);
+
+   tryLockBB->cfg.detach(&joinBB->cfg);
+   bld.remove(atom);
+
+   bld.setPosition(setAndUnlockBB, true);
+   Value *stVal;
+   if (atom->subOp == NV50_IR_SUBOP_ATOM_EXCH) {
+      // Read the old value, and write the new one.
+      stVal = atom->getSrc(1);
+   } else if (atom->subOp == NV50_IR_SUBOP_ATOM_CAS) {
+      CmpInstruction *set =
+         bld.mkCmp(OP_SET, CC_EQ, TYPE_U32, bld.getSSA(),
+                   TYPE_U32, ld->getDef(0), atom->getSrc(1));
+
+      bld.mkCmp(OP_SLCT, CC_NE, TYPE_U32, (stVal = bld.getSSA()),
+                TYPE_U32, atom->getSrc(2), ld->getDef(0), set->getDef(0));
+   } else {
+      operation op;
+
+      switch (atom->subOp) {
+      case NV50_IR_SUBOP_ATOM_ADD:
+         op = OP_ADD;
+         break;
+      case NV50_IR_SUBOP_ATOM_AND:
+         op = OP_AND;
+         break;
+      case NV50_IR_SUBOP_ATOM_OR:
+         op = OP_OR;
+         break;
+      case NV50_IR_SUBOP_ATOM_XOR:
+         op = OP_XOR;
+         break;
+      case NV50_IR_SUBOP_ATOM_MIN:
+         op = OP_MIN;
+         break;
+      case NV50_IR_SUBOP_ATOM_MAX:
+         op = OP_MAX;
+         break;
+      default:
+         assert(0);
+         return;
+      }
+
+      stVal = bld.mkOp2v(op, atom->dType, bld.getSSA(), ld->getDef(0),
+                         atom->getSrc(1));
+   }
+
+   Instruction *st =
+      bld.mkStore(OP_STORE, TYPE_U32,
+                  bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U32, 0),
+                  NULL, stVal);
+   st->setDef(0, pred->getDef(0));
+   st->subOp = NV50_IR_SUBOP_STORE_UNLOCKED;
+
+   bld.mkFlow(OP_BRA, failLockBB, CC_ALWAYS, NULL);
+   setAndUnlockBB->cfg.attach(&failLockBB->cfg, Graph::Edge::TREE);
+
+   // Lock until the store has not been performed.
+   bld.setPosition(failLockBB, true);
+   bld.mkFlow(OP_BRA, tryLockBB, CC_NOT_P, pred->getDef(0));
+   bld.mkFlow(OP_BRA, joinBB, CC_ALWAYS, NULL);
+   failLockBB->cfg.attach(&tryLockBB->cfg, Graph::Edge::BACK);
+   failLockBB->cfg.attach(&joinBB->cfg, Graph::Edge::TREE);
+
+   bld.setPosition(joinBB, false);
+   bld.mkFlow(OP_JOIN, NULL, CC_ALWAYS, NULL)->fixed = 1;
+}
+
+void
 NVC0LoweringPass::handleSharedATOM(Instruction *atom)
 {
    assert(atom->src(0).getFile() == FILE_MEMORY_SHARED);
@@ -1176,11 +1288,16 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
       sv = SV_LBASE;
       break;
    case FILE_MEMORY_SHARED:
-      handleSharedATOM(atom);
+      // For Fermi/Kepler, we have to use ld lock/st unlock to perform atomic
+      // operations on shared memory. For Maxwell, ATOMS is enough.
+      if (targ->getChipset() < NVISA_GK104_CHIPSET)
+         handleSharedATOM(atom);
+      else if (targ->getChipset() < NVISA_GM107_CHIPSET)
+         handleSharedATOMNVE4(atom);
       return true;
    default:
       assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
-      base = loadResInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
+      base = loadBufInfo64(ind, atom->getSrc(0)->reg.fileIndex * 16);
       assert(base->reg.size == 8);
       if (ptr)
          base = bld.mkOp2v(OP_ADD, TYPE_U64, base, base, ptr);
@@ -1204,9 +1321,11 @@ NVC0LoweringPass::handleATOM(Instruction *atom)
 bool
 NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 {
-   if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
-      // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
-      return false;
+   if (targ->getChipset() < NVISA_GM107_CHIPSET) {
+      if (cas->src(0).getFile() == FILE_MEMORY_SHARED) {
+         // ATOM_CAS and ATOM_EXCH are handled in handleSharedATOM().
+         return false;
+      }
    }
 
    if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
@@ -1240,19 +1359,20 @@ NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
 }
 
 inline Value *
-NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off, uint16_t base)
 {
    uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
+
    return bld.
       mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
 }
 
 inline Value *
-NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off, uint16_t base)
 {
    uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
 
    if (ptr)
       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1262,10 +1382,10 @@ NVC0LoweringPass::loadResInfo64(Value *ptr, uint32_t off)
 }
 
 inline Value *
-NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
+NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off, uint16_t base)
 {
    uint8_t b = prog->driver->io.auxCBSlot;
-   off += prog->driver->io.suInfoBase;
+   off += base;
 
    if (ptr)
       ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getScratch(), ptr, bld.mkImm(4));
@@ -1275,6 +1395,60 @@ NVC0LoweringPass::loadResLength32(Value *ptr, uint32_t off)
 }
 
 inline Value *
+NVC0LoweringPass::loadSuInfo32(Value *ptr, uint32_t off)
+{
+   return loadResInfo32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuInfo64(Value *ptr, uint32_t off)
+{
+   return loadResInfo64(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadSuLength32(Value *ptr, uint32_t off)
+{
+   return loadResLength32(ptr, off, prog->driver->io.suInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo32(Value *ptr, uint32_t off)
+{
+   return loadResInfo32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufInfo64(Value *ptr, uint32_t off)
+{
+   return loadResInfo64(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadBufLength32(Value *ptr, uint32_t off)
+{
+   return loadResLength32(ptr, off, prog->driver->io.bufInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo32(Value *ptr, uint32_t off)
+{
+   return loadResInfo32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboInfo64(Value *ptr, uint32_t off)
+{
+   return loadResInfo64(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
+NVC0LoweringPass::loadUboLength32(Value *ptr, uint32_t off)
+{
+   return loadResLength32(ptr, off, prog->driver->io.uboInfoBase);
+}
+
+inline Value *
 NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
 {
    uint8_t b = prog->driver->io.msInfoCBSlot;
@@ -1354,8 +1528,8 @@ NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
 
    Value *tx = bld.getSSA(), *ty = bld.getSSA(), *ts = bld.getSSA();
 
-   Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
-   Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
+   Value *ms_x = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(0));
+   Value *ms_y = loadSuInfo32(NULL, base + NVE4_SU_INFO_MS(1));
 
    bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
    bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
@@ -1408,9 +1582,9 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
    for (c = 0; c < arg; ++c) {
       src[c] = bld.getScratch();
       if (c == 0 && raw)
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
       else
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
       bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
          ->subOp = getSuClampSubOp(su, c);
    }
@@ -1432,16 +1606,16 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
          bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
    } else
    if (dim == 3) {
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
          ->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
 
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
       bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
          ->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
    } else {
       assert(dim == 2);
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_PITCH);
       bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
          ->subOp = su->tex.target.isArray() ?
          NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
@@ -1452,7 +1626,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
       if (raw) {
          bf = src[0];
       } else {
-         v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+         v = loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
          bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
             ->subOp = NV50_IR_SUBOP_V1(7,6,8|2);
       }
@@ -1469,7 +1643,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
       case 2:
          z = off;
          if (!su->tex.target.isArray()) {
-            z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
+            z = loadSuInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
             subOp = NV50_IR_SUBOP_SUBFM_3D;
          }
          break;
@@ -1484,7 +1658,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
    }
 
    // part 2
-   v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
+   v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ADDR);
 
    if (su->tex.target == TEX_TARGET_BUFFER) {
       eau = v;
@@ -1493,7 +1667,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
    }
    // add array layer offset
    if (su->tex.target.isArray()) {
-      v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
+      v = loadSuInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
       if (dim == 1)
          bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
             ->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
@@ -1533,7 +1707,7 @@ NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
 
    // let's just set it 0 for raw access and hope it works
    v = raw ?
-      bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
+      bld.mkImm(0) : loadSuInfo32(NULL, base + NVE4_SU_INFO_FMT);
 
    // get rid of old coordinate sources, make space for fmt info and predicate
    su->moveSources(arg, 3 - arg);
@@ -1645,6 +1819,100 @@ NVC0LoweringPass::handleWRSV(Instruction *i)
 }
 
 void
+NVC0LoweringPass::handleLDST(Instruction *i)
+{
+   if (i->src(0).getFile() == FILE_SHADER_INPUT) {
+      if (prog->getType() == Program::TYPE_COMPUTE) {
+         i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
+         i->getSrc(0)->reg.fileIndex = 0;
+      } else
+      if (prog->getType() == Program::TYPE_GEOMETRY &&
+          i->src(0).isIndirect(0)) {
+         // XXX: this assumes vec4 units
+         Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                                 i->getIndirect(0, 0), bld.mkImm(4));
+         i->setIndirect(0, 0, ptr);
+         i->op = OP_VFETCH;
+      } else {
+         i->op = OP_VFETCH;
+         assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
+      }
+   } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
+      if (targ->getChipset() >= NVISA_GK104_CHIPSET &&
+          prog->getType() == Program::TYPE_COMPUTE) {
+         // The launch descriptor only allows to set up 8 CBs, but OpenGL
+         // requires at least 12 UBOs. To bypass this limitation, we store the
+         // addrs into the driver constbuf and we directly load from the global
+         // memory.
+         int8_t fileIndex = i->getSrc(0)->reg.fileIndex - 1;
+         Value *ind = i->getIndirect(0, 1);
+         Value *ptr = loadUboInfo64(ind, fileIndex * 16);
+
+         // TODO: clamp the offset to the maximum number of const buf.
+         if (i->src(0).isIndirect(1)) {
+            Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+            Value *length = loadUboLength32(ind, fileIndex * 16);
+            Value *pred = new_LValue(func, FILE_PREDICATE);
+            if (i->src(0).isIndirect(0)) {
+               bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+               bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+            }
+            i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+            bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+            i->setPredicate(CC_NOT_P, pred);
+            if (i->defExists(0)) {
+               bld.mkMov(i->getDef(0), bld.mkImm(0));
+            }
+         } else if (fileIndex >= 0) {
+            if (i->src(0).isIndirect(0)) {
+               bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+            }
+            i->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
+            i->setIndirect(0, 1, NULL);
+            i->setIndirect(0, 0, ptr);
+         }
+      } else if (i->src(0).isIndirect(1)) {
+         Value *ptr;
+         if (i->src(0).isIndirect(0))
+            ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
+                             i->getIndirect(0, 1), bld.mkImm(0x1010),
+                             i->getIndirect(0, 0));
+         else
+            ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
+                             i->getIndirect(0, 1), bld.mkImm(16));
+         i->setIndirect(0, 1, NULL);
+         i->setIndirect(0, 0, ptr);
+         i->subOp = NV50_IR_SUBOP_LDC_IS;
+      }
+   } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
+      assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
+      i->op = OP_VFETCH;
+   } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
+      Value *ind = i->getIndirect(0, 1);
+      Value *ptr = loadBufInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
+      // XXX come up with a way not to do this for EVERY little access but
+      // rather to batch these up somehow. Unfortunately we've lost the
+      // information about the field width by the time we get here.
+      Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
+      Value *length = loadBufLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
+      Value *pred = new_LValue(func, FILE_PREDICATE);
+      if (i->src(0).isIndirect(0)) {
+         bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
+         bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
+      }
+      i->setIndirect(0, 1, NULL);
+      i->setIndirect(0, 0, ptr);
+      bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
+      i->setPredicate(CC_NOT_P, pred);
+      if (i->defExists(0)) {
+         bld.mkMov(i->getDef(0), bld.mkImm(0));
+      }
+   }
+}
+
+void
 NVC0LoweringPass::readTessCoord(LValue *dst, int c)
 {
    Value *laneid = bld.getSSA();
@@ -1969,60 +2237,7 @@ NVC0LoweringPass::visit(Instruction *i)
       return handleWRSV(i);
    case OP_STORE:
    case OP_LOAD:
-      if (i->src(0).getFile() == FILE_SHADER_INPUT) {
-         if (prog->getType() == Program::TYPE_COMPUTE) {
-            i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
-            i->getSrc(0)->reg.fileIndex = 0;
-         } else
-         if (prog->getType() == Program::TYPE_GEOMETRY &&
-             i->src(0).isIndirect(0)) {
-            // XXX: this assumes vec4 units
-            Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                    i->getIndirect(0, 0), bld.mkImm(4));
-            i->setIndirect(0, 0, ptr);
-            i->op = OP_VFETCH;
-         } else {
-            i->op = OP_VFETCH;
-            assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
-         }
-      } else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
-         if (i->src(0).isIndirect(1)) {
-            Value *ptr;
-            if (i->src(0).isIndirect(0))
-               ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(0x1010),
-                                i->getIndirect(0, 0));
-            else
-               ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
-                                i->getIndirect(0, 1), bld.mkImm(16));
-            i->setIndirect(0, 1, NULL);
-            i->setIndirect(0, 0, ptr);
-            i->subOp = NV50_IR_SUBOP_LDC_IS;
-         }
-      } else if (i->src(0).getFile() == FILE_SHADER_OUTPUT) {
-         assert(prog->getType() == Program::TYPE_TESSELLATION_CONTROL);
-         i->op = OP_VFETCH;
-      } else if (i->src(0).getFile() == FILE_MEMORY_GLOBAL) {
-         Value *ind = i->getIndirect(0, 1);
-         Value *ptr = loadResInfo64(ind, i->getSrc(0)->reg.fileIndex * 16);
-         // XXX come up with a way not to do this for EVERY little access but
-         // rather to batch these up somehow. Unfortunately we've lost the
-         // information about the field width by the time we get here.
-         Value *offset = bld.loadImm(NULL, i->getSrc(0)->reg.data.offset + typeSizeof(i->sType));
-         Value *length = loadResLength32(ind, i->getSrc(0)->reg.fileIndex * 16);
-         Value *pred = new_LValue(func, FILE_PREDICATE);
-         if (i->src(0).isIndirect(0)) {
-            bld.mkOp2(OP_ADD, TYPE_U64, ptr, ptr, i->getIndirect(0, 0));
-            bld.mkOp2(OP_ADD, TYPE_U32, offset, offset, i->getIndirect(0, 0));
-         }
-         i->setIndirect(0, 1, NULL);
-         i->setIndirect(0, 0, ptr);
-         bld.mkCmp(OP_SET, CC_GT, TYPE_U32, pred, TYPE_U32, offset, length);
-         i->setPredicate(CC_NOT_P, pred);
-         if (i->defExists(0)) {
-            bld.mkMov(i->getDef(0), bld.mkImm(0));
-         }
-      }
+      handleLDST(i);
       break;
    case OP_ATOM:
    {
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
index 6eb8aff3036..d5c2cb5e7e1 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.h
@@ -106,6 +106,8 @@ protected:
    bool handleCasExch(Instruction *, bool needCctl);
    void handleSurfaceOpNVE4(TexInstruction *);
    void handleSharedATOM(Instruction *);
+   void handleSharedATOMNVE4(Instruction *);
+   void handleLDST(Instruction *);
 
    void checkPredicate(Instruction *);
 
@@ -117,9 +119,18 @@ private:
 
    void readTessCoord(LValue *dst, int c);
 
-   Value *loadResInfo32(Value *ptr, uint32_t off);
-   Value *loadResInfo64(Value *ptr, uint32_t off);
-   Value *loadResLength32(Value *ptr, uint32_t off);
+   Value *loadResInfo32(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadResInfo64(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadResLength32(Value *ptr, uint32_t off, uint16_t base);
+   Value *loadSuInfo32(Value *ptr, uint32_t off);
+   Value *loadSuInfo64(Value *ptr, uint32_t off);
+   Value *loadSuLength32(Value *ptr, uint32_t off);
+   Value *loadBufInfo32(Value *ptr, uint32_t off);
+   Value *loadBufInfo64(Value *ptr, uint32_t off);
+   Value *loadBufLength32(Value *ptr, uint32_t off);
+   Value *loadUboInfo32(Value *ptr, uint32_t off);
+   Value *loadUboInfo64(Value *ptr, uint32_t off);
+   Value *loadUboLength32(Value *ptr, uint32_t off);
    Value *loadMsInfo32(Value *ptr, uint32_t off);
    Value *loadTexHandle(Value *ptr, unsigned int slot);
 
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
index d877c253a17..500ab8915de 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_ra.cpp
@@ -853,7 +853,7 @@ isShortRegOp(Instruction *insn)
 static bool
 isShortRegVal(LValue *lval)
 {
-   if (lval->defs.size() == 0)
+   if (lval->getInsn() == NULL)
       return false;
    for (Value::DefCIterator def = lval->defs.begin();
         def != lval->defs.end(); ++def)
@@ -1467,7 +1467,7 @@ GCRA::allocateRegisters(ArrayList& insns)
          nodes[i].init(regs, lval);
          RIG.insert(&nodes[i]);
 
-         if (lval->inFile(FILE_GPR) && lval->defs.size() > 0 &&
+         if (lval->inFile(FILE_GPR) && lval->getInsn() != NULL &&
              prog->getTarget()->getChipset() < 0xc0) {
             Instruction *insn = lval->getInsn();
             if (insn->op == OP_MAD || insn->op == OP_SAD)
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index 5836bb23764..57e28992727 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -67,9 +67,18 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
       break;
    }
 
+   if (bindings & PIPE_BIND_LINEAR)
+      if (util_format_is_depth_or_stencil(format) ||
+          (target != PIPE_TEXTURE_1D &&
+           target != PIPE_TEXTURE_2D &&
+           target != PIPE_TEXTURE_RECT) ||
+          sample_count > 1)
+         return false;
+
    /* transfers & shared are always supported */
    bindings &= ~(PIPE_BIND_TRANSFER_READ |
                  PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_LINEAR |
                  PIPE_BIND_SHARED);
 
    return (( nv50_format_table[format].usage |
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
index 31e1272aeed..91dffa116e1 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h
@@ -110,9 +110,18 @@
 /* 32 textures handles, at 1 32-bits integer each */
 #define NVC0_CB_AUX_TEX_INFO(i)     0x020 + (i) * 4
 #define NVC0_CB_AUX_TEX_SIZE        (32 * 4)
+/* 8 sets of 32-bits coordinate offsets */
+#define NVC0_CB_AUX_MS_INFO         0x0a0 /* CP */
+#define NVC0_CB_AUX_MS_SIZE         (8 * 2 * 4)
+/* block/grid size, at 3 32-bits integers each and gridid */
+#define NVC0_CB_AUX_GRID_INFO       0x0e0 /* CP */
+#define NVC0_CB_AUX_GRID_SIZE       (7 * 4)
 /* 8 user clip planes, at 4 32-bits floats each */
 #define NVC0_CB_AUX_UCP_INFO        0x100
 #define NVC0_CB_AUX_UCP_SIZE        (PIPE_MAX_CLIP_PLANES * 4 * 4)
+/* 13 ubos, at 4 32-bits integer each */
+#define NVC0_CB_AUX_UBO_INFO(i)     0x100 + (i) * 4 * 4 /* CP */
+#define NVC0_CB_AUX_UBO_SIZE        ((NVC0_MAX_PIPE_CONSTBUFS - 1) * 4 * 4)
 /* 8 sets of 32-bits integer pairs sample offsets */
 #define NVC0_CB_AUX_SAMPLE_INFO     0x180 /* FP */
 #define NVC0_CB_AUX_SAMPLE_SIZE     (8 * 4 * 2)
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
index b7c6faf9cde..db02fa2df5c 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_program.c
@@ -540,24 +540,24 @@ nvc0_program_translate(struct nvc0_program *prog, uint16_t chipset,
 
    if (prog->type == PIPE_SHADER_COMPUTE) {
       if (chipset >= NVISA_GK104_CHIPSET) {
-         info->io.auxCBSlot = 0;
-         info->io.texBindBase = NVE4_CP_INPUT_TEX(0);
-         info->io.suInfoBase = NVE4_CP_INPUT_SUF(0);
-         info->prop.cp.gridInfoBase = NVE4_CP_INPUT_GRID_INFO(0);
-      } else {
-         info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+         info->io.auxCBSlot = 7;
+         info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
+         info->prop.cp.gridInfoBase = NVC0_CB_AUX_GRID_INFO;
+         info->io.uboInfoBase = NVC0_CB_AUX_UBO_INFO(0);
       }
       info->io.msInfoCBSlot = 0;
-      info->io.msInfoBase = NVE4_CP_INPUT_MS_OFFSETS;
+      info->io.msInfoBase = NVC0_CB_AUX_MS_INFO;
+      info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+      info->io.suInfoBase = 0; /* TODO */
    } else {
       if (chipset >= NVISA_GK104_CHIPSET) {
          info->io.texBindBase = NVC0_CB_AUX_TEX_INFO(0);
-         info->io.suInfoBase = 0; /* TODO */
       }
       info->io.sampleInfoBase = NVC0_CB_AUX_SAMPLE_INFO;
-      info->io.suInfoBase = NVC0_CB_AUX_BUF_INFO(0);
+      info->io.bufInfoBase = NVC0_CB_AUX_BUF_INFO(0);
       info->io.msInfoCBSlot = 15;
       info->io.msInfoBase = 0; /* TODO */
+      info->io.suInfoBase = 0; /* TODO */
    }
 
    info->assignSlots = nvc0_program_assign_varying_slots;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
index 553c001cd2b..590dac972a7 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c
@@ -57,9 +57,18 @@ nvc0_screen_is_format_supported(struct pipe_screen *pscreen,
       if (util_format_get_blocksizebits(format) == 3 * 32)
          return false;
 
+   if (bindings & PIPE_BIND_LINEAR)
+      if (util_format_is_depth_or_stencil(format) ||
+          (target != PIPE_TEXTURE_1D &&
+           target != PIPE_TEXTURE_2D &&
+           target != PIPE_TEXTURE_RECT) ||
+          sample_count > 1)
+         return false;
+
    /* transfers & shared are always supported */
    bindings &= ~(PIPE_BIND_TRANSFER_READ |
                  PIPE_BIND_TRANSFER_WRITE |
+                 PIPE_BIND_LINEAR |
                  PIPE_BIND_SHARED);
 
    return (( nvc0_format_table[format].usage |
@@ -282,7 +291,8 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_PREFERRED_IR:
       return PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_SUPPORTED_IRS:
-      if (class_3d >= NVE4_3D_CLASS)
+      if (class_3d == NVF0_3D_CLASS &&
+          !debug_get_bool_option("NVF0_COMPUTE", false))
          return 0;
       return 1 << PIPE_SHADER_IR_TGSI;
    case PIPE_SHADER_CAP_MAX_INSTRUCTIONS:
@@ -311,8 +321,6 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE:
       return 65536;
    case PIPE_SHADER_CAP_MAX_CONST_BUFFERS:
-      if (shader == PIPE_SHADER_COMPUTE && class_3d >= NVE4_3D_CLASS)
-         return NVE4_MAX_PIPE_CONSTBUFS_COMPUTE;
       return NVC0_MAX_PIPE_CONSTBUFS;
    case PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR:
       return shader != PIPE_SHADER_FRAGMENT;
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
index 46b692df2e3..0f782207f13 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.h
@@ -16,7 +16,6 @@
 
 /* doesn't count reserved slots (for auxiliary constants, immediates, etc.) */
 #define NVC0_MAX_PIPE_CONSTBUFS         14
-#define NVE4_MAX_PIPE_CONSTBUFS_COMPUTE  7
 
 #define NVC0_MAX_SURFACE_SLOTS 16
 
diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
index e8b3a4d549a..e657204128e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
+++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c
@@ -1295,6 +1295,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
                       NVC0_3D_VERTEX_ATTRIB_FORMAT_SIZE_32 |
                       NVC0_3D_VERTEX_ATTRIB_FORMAT_CONST);
    }
+   for (i = 1; i < n; ++i)
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ARRAY_FETCH(i)), 0);
    if (nvc0->state.instance_elts) {
       nvc0->state.instance_elts = 0;
       BEGIN_NVC0(push, NVC0_3D(MACRO_VERTEX_ARRAY_PER_INSTANCE), 2);
@@ -1303,6 +1305,17 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info)
    }
    nvc0->state.num_vtxelts = 2;
 
+   if (nvc0->state.prim_restart) {
+      IMMED_NVC0(push, NVC0_3D(PRIM_RESTART_ENABLE), 0);
+      nvc0->state.prim_restart = 0;
+   }
+
+   if (nvc0->state.index_bias) {
+      IMMED_NVC0(push, NVC0_3D(VB_ELEMENT_BASE), 0);
+      IMMED_NVC0(push, NVC0_3D(VERTEX_ID_BASE), 0);
+      nvc0->state.index_bias = 0;
+   }
+
    for (i = 0; i < info->dst.box.depth; ++i, z += dz) {
       if (info->dst.box.z + i) {
          BEGIN_NVC0(push, NVC0_3D(LAYER), 1);
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
index b3d841461d6..4d069df983e 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c
@@ -41,6 +41,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    int i;
    int ret;
    uint32_t obj_class;
+   uint64_t address;
 
    switch (dev->chipset & ~0xf) {
    case 0x100:
@@ -65,7 +66,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
       return ret;
    }
 
-   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, NVE4_CP_PARAM_SIZE, NULL,
+   ret = nouveau_bo_new(dev, NV_VRAM_DOMAIN(&screen->base), 0, 1 << 12, NULL,
                         &screen->parm);
    if (ret)
       return ret;
@@ -95,9 +96,9 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
     *  accessible. We cannot prevent that at the moment, so expect failure.
     */
    BEGIN_NVC0(push, NVE4_CP(LOCAL_BASE), 1);
-   PUSH_DATA (push, 1 << 24);
+   PUSH_DATA (push, 0xff << 24);
    BEGIN_NVC0(push, NVE4_CP(SHARED_BASE), 1);
-   PUSH_DATA (push, 2 << 24);
+   PUSH_DATA (push, 0xfe << 24);
 
    BEGIN_NVC0(push, NVE4_CP(CODE_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->text->offset);
@@ -128,15 +129,17 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    }
 
    BEGIN_NVC0(push, NVE4_CP(TEX_CB_INDEX), 1);
-   PUSH_DATA (push, 0); /* does not interefere with 3D */
+   PUSH_DATA (push, 7); /* does not interfere with 3D */
 
    if (obj_class == NVF0_COMPUTE_CLASS)
       IMMED_NVC0(push, SUBC_CP(0x02c4), 1);
 
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
+
    /* MS sample coordinate offsets: these do not work with _ALT modes ! */
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_MS_OFFSETS);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_MS_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_MS_INFO);
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 64);
    PUSH_DATA (push, 1);
@@ -159,7 +162,7 @@ nve4_screen_compute_setup(struct nvc0_screen *screen,
    PUSH_DATA (push, 3); /* 7 */
    PUSH_DATA (push, 1);
 
-#ifdef DEBUG
+#ifdef NOUVEAU_NVE4_MP_TRAP_HANDLER
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
    PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
    PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_TRAP_INFO_PTR);
@@ -194,6 +197,9 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
    uint32_t mask;
    unsigned i;
    const unsigned t = 1;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 
    mask = nvc0->surfaces_dirty[t];
    while (mask) {
@@ -205,8 +211,8 @@ nve4_compute_validate_surfaces(struct nvc0_context *nvc0)
        * directly instead of via binding points, so we have to supply them.
        */
       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-      PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
-      PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_SUF(i));
+      PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(i));
+      PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(i));
       BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
       PUSH_DATA (push, 64);
       PUSH_DATA (push, 1);
@@ -271,6 +277,7 @@ static void
 nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
 {
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   struct nvc0_screen *screen = nvc0->screen;
    uint64_t address;
    const unsigned s = nvc0_shader_stage(PIPE_SHADER_COMPUTE);
    unsigned i, n;
@@ -282,11 +289,11 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
    n = util_logbase2(dirty) + 1 - i;
    assert(n);
 
-   address = nvc0->screen->parm->offset + NVE4_CP_INPUT_TEX(i);
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
 
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, address);
-   PUSH_DATA (push, address);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_TEX_INFO(i));
+   PUSH_DATA (push, address + NVC0_CB_AUX_TEX_INFO(i));
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, n * 4);
    PUSH_DATA (push, 0x1);
@@ -301,6 +308,103 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0)
    nvc0->samplers_dirty[s] = 0;
 }
 
+static void
+nve4_compute_validate_constbufs(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   const int s = 5;
+
+   while (nvc0->constbuf_dirty[s]) {
+      int i = ffs(nvc0->constbuf_dirty[s]) - 1;
+      nvc0->constbuf_dirty[s] &= ~(1 << i);
+
+      if (nvc0->constbuf[s][i].user) {
+         struct nouveau_bo *bo = nvc0->screen->uniform_bo;
+         const unsigned base = NVC0_CB_USR_INFO(s);
+         const unsigned size = nvc0->constbuf[s][0].size;
+         assert(i == 0); /* we really only want OpenGL uniforms here */
+         assert(nvc0->constbuf[s][0].u.data);
+
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+         PUSH_DATAh(push, bo->offset + base);
+         PUSH_DATA (push, bo->offset + base);
+         BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+         PUSH_DATA (push, size);
+         PUSH_DATA (push, 0x1);
+         BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (size / 4));
+         PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+         PUSH_DATAp(push, nvc0->constbuf[s][0].u.data, size / 4);
+      }
+      else {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->constbuf[s][i].u.buf);
+         if (res) {
+            uint64_t address
+               = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+            assert(i > 0); /* we really only want uniform buffer objects */
+
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+            PUSH_DATAh(push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            PUSH_DATA (push, address + NVC0_CB_AUX_UBO_INFO(i - 1));
+            BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+            PUSH_DATA (push, 4 * 4);
+            PUSH_DATA (push, 0x1);
+            BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4);
+            PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+            PUSH_DATA (push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATAh(push, res->address + nvc0->constbuf[s][i].offset);
+            PUSH_DATA (push, nvc0->constbuf[5][i].size);
+            PUSH_DATA (push, 0);
+            BCTX_REFN(nvc0->bufctx_cp, CP_CB(i), res, RD);
+
+            res->cb_bindings[s] |= 1 << i;
+         }
+      }
+   }
+
+   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
+   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB);
+}
+
+static void
+nve4_compute_validate_buffers(struct nvc0_context *nvc0)
+{
+   struct nouveau_pushbuf *push = nvc0->base.pushbuf;
+   uint64_t address;
+   const int s = 5;
+   int i;
+
+   address = nvc0->screen->uniform_bo->offset + NVC0_CB_AUX_INFO(s);
+
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, address + NVC0_CB_AUX_BUF_INFO(0));
+   PUSH_DATA (push, address + NVC0_CB_AUX_BUF_INFO(0));
+   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+   PUSH_DATA (push, 4 * NVC0_MAX_BUFFERS * 4);
+   PUSH_DATA (push, 0x1);
+   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 4 * NVC0_MAX_BUFFERS);
+   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+
+   for (i = 0; i < NVC0_MAX_BUFFERS; i++) {
+      if (nvc0->buffers[s][i].buffer) {
+         struct nv04_resource *res =
+            nv04_resource(nvc0->buffers[s][i].buffer);
+         PUSH_DATA (push, res->address + nvc0->buffers[s][i].buffer_offset);
+         PUSH_DATAh(push, res->address + nvc0->buffers[s][i].buffer_offset);
+         PUSH_DATA (push, nvc0->buffers[s][i].buffer_size);
+         PUSH_DATA (push, 0);
+         BCTX_REFN(nvc0->bufctx_cp, CP_BUF, res, RDWR);
+      } else {
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+         PUSH_DATA (push, 0);
+      }
+   }
+}
+
 static struct nvc0_state_validate
 validate_list_cp[] = {
    { nvc0_compprog_validate,              NVC0_NEW_CP_PROGRAM     },
@@ -310,6 +414,8 @@ validate_list_cp[] = {
                                           NVC0_NEW_CP_SAMPLERS    },
    { nve4_compute_validate_surfaces,      NVC0_NEW_CP_SURFACES    },
    { nvc0_compute_validate_globals,       NVC0_NEW_CP_GLOBALS     },
+   { nve4_compute_validate_buffers,       NVC0_NEW_CP_BUFFERS     },
+   { nve4_compute_validate_constbufs,     NVC0_NEW_CP_CONSTBUF    },
 };
 
 static bool
@@ -327,13 +433,16 @@ nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask)
 }
 
 static void
-nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
-                          const uint *block_layout,
-                          const uint *grid_layout)
+nve4_compute_upload_input(struct nvc0_context *nvc0,
+                          struct nve4_cp_launch_desc *desc,
+                          const struct pipe_grid_info *info)
 {
    struct nvc0_screen *screen = nvc0->screen;
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    struct nvc0_program *cp = nvc0->compprog;
+   uint64_t address;
+
+   address = screen->uniform_bo->offset + NVC0_CB_AUX_INFO(5);
 
    if (cp->parm_size) {
       BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
@@ -344,18 +453,38 @@ nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input,
       PUSH_DATA (push, 0x1);
       BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (cp->parm_size / 4));
       PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-      PUSH_DATAp(push, input, cp->parm_size / 4);
+      PUSH_DATAp(push, info->input, cp->parm_size / 4);
+
+      /* Bind user parameters coming from clover. */
+      /* TODO: This should be harmonized with uniform_bo. */
+      assert(!(desc->cb_mask & (1 << 0)));
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, 1 << 12);
    }
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
-   PUSH_DATA (push, screen->parm->offset + NVE4_CP_INPUT_GRID_INFO(0));
+   PUSH_DATAh(push, address + NVC0_CB_AUX_GRID_INFO);
+   PUSH_DATA (push, address + NVC0_CB_AUX_GRID_INFO);
    BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
    PUSH_DATA (push, 7 * 4);
    PUSH_DATA (push, 0x1);
-   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
-   PUSH_DATAp(push, block_layout, 3);
-   PUSH_DATAp(push, grid_layout, 3);
+
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 3 * 4);
+   } else {
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + 7);
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x20 << 1));
+      PUSH_DATAp(push, info->block, 3);
+      PUSH_DATAp(push, info->grid, 3);
+   }
    PUSH_DATA (push, 0);
 
    BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
@@ -375,24 +504,21 @@ nve4_compute_derive_cache_split(struct nvc0_context *nvc0, uint32_t shared_size)
 static void
 nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
                                struct nve4_cp_launch_desc *desc,
-                               uint32_t label,
-                               const uint *block_layout,
-                               const uint *grid_layout)
+                               const struct pipe_grid_info *info)
 {
    const struct nvc0_screen *screen = nvc0->screen;
    const struct nvc0_program *cp = nvc0->compprog;
-   unsigned i;
 
    nve4_cp_launch_desc_init_default(desc);
 
-   desc->entry = nvc0_program_symbol_offset(cp, label);
+   desc->entry = nvc0_program_symbol_offset(cp, info->pc);
 
-   desc->griddim_x = grid_layout[0];
-   desc->griddim_y = grid_layout[1];
-   desc->griddim_z = grid_layout[2];
-   desc->blockdim_x = block_layout[0];
-   desc->blockdim_y = block_layout[1];
-   desc->blockdim_z = block_layout[2];
+   desc->griddim_x = info->grid[0];
+   desc->griddim_y = info->grid[1];
+   desc->griddim_z = info->grid[2];
+   desc->blockdim_x = info->block[0];
+   desc->blockdim_y = info->block[1];
+   desc->blockdim_z = info->block[2];
 
    desc->shared_size = align(cp->cp.smem_size, 0x100);
    desc->local_size_p = align(cp->cp.lmem_size, 0x10);
@@ -403,12 +529,15 @@ nve4_compute_setup_launch_desc(struct nvc0_context *nvc0,
    desc->gpr_alloc = cp->num_gprs;
    desc->bar_alloc = cp->num_barriers;
 
-   for (i = 0; i < 7; ++i) {
-      const unsigned s = 5;
-      if (nvc0->constbuf[s][i].u.buf)
-         nve4_cp_launch_desc_set_ctx_cb(desc, i + 1, &nvc0->constbuf[s][i]);
+   // Only bind OpenGL uniforms and the driver constant buffer through the
+   // launch descriptor because UBOs are sticked to the driver cb to avoid the
+   // limitation of 8 CBs.
+   if (nvc0->constbuf[5][0].user) {
+      nve4_cp_launch_desc_set_cb(desc, 0, screen->uniform_bo,
+                                 NVC0_CB_USR_INFO(5), 1 << 16);
    }
-   nve4_cp_launch_desc_set_cb(desc, 0, screen->parm, 0, NVE4_CP_INPUT_SIZE);
+   nve4_cp_launch_desc_set_cb(desc, 7, screen->uniform_bo,
+                              NVC0_CB_AUX_INFO(5), 1 << 10);
 }
 
 static inline struct nve4_cp_launch_desc *
@@ -448,29 +577,62 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
    if (ret)
       goto out;
 
-   nve4_compute_setup_launch_desc(nvc0, desc, info->pc,
-                                  info->block, info->grid);
+   nve4_compute_setup_launch_desc(nvc0, desc, info);
+
+   nve4_compute_upload_input(nvc0, desc, info);
+
 #ifdef DEBUG
    if (debug_get_num_option("NV50_PROG_DEBUG", 0))
       nve4_compute_dump_launch_desc(desc);
 #endif
 
-   nve4_compute_upload_input(nvc0, info->input, info->block, info->grid);
+   if (unlikely(info->indirect)) {
+      struct nv04_resource *res = nv04_resource(info->indirect);
+      uint32_t offset = res->offset + info->indirect_offset;
+
+      /* upload the descriptor */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr);
+      PUSH_DATA (push, desc_gpuaddr);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 256);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
+
+      /* overwrite griddim_x and griddim_y as two 32-bits integers even
+       * if griddim_y must be a 16-bits integer */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 48);
+      PUSH_DATA (push, desc_gpuaddr + 48);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 8);
+      PUSH_DATA (push, 1);
+
+      nouveau_pushbuf_space(push, 16, 0, 1);
+      PUSH_REFN(push, res->bo, NOUVEAU_BO_RD | res->domain);
+
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (8 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 2 * 4);
+
+      /* overwrite the 16 high bits of griddim_y with griddim_z because
+       * we need (z << 16) | x */
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
+      PUSH_DATAh(push, desc_gpuaddr + 54);
+      PUSH_DATA (push, desc_gpuaddr + 54);
+      BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
+      PUSH_DATA (push, 4);
+      PUSH_DATA (push, 1);
+      BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (4 / 4));
+      PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
+      nouveau_pushbuf_data(push, res->bo, offset + 8,
+                           NVC0_IB_ENTRY_1_NO_PREFETCH | 1 * 4);
+   }
 
    /* upload descriptor and flush */
-#if 0
-   BEGIN_NVC0(push, NVE4_CP(UPLOAD_DST_ADDRESS_HIGH), 2);
-   PUSH_DATAh(push, desc_gpuaddr);
-   PUSH_DATA (push, desc_gpuaddr);
-   BEGIN_NVC0(push, NVE4_CP(UPLOAD_LINE_LENGTH_IN), 2);
-   PUSH_DATA (push, 256);
-   PUSH_DATA (push, 1);
-   BEGIN_1IC0(push, NVE4_CP(UPLOAD_EXEC), 1 + (256 / 4));
-   PUSH_DATA (push, NVE4_COMPUTE_UPLOAD_EXEC_LINEAR | (0x08 << 1));
-   PUSH_DATAp(push, (const uint32_t *)desc, 256 / 4);
-   BEGIN_NVC0(push, NVE4_CP(FLUSH), 1);
-   PUSH_DATA (push, NVE4_COMPUTE_FLUSH_CB | NVE4_COMPUTE_FLUSH_CODE);
-#endif
    BEGIN_NVC0(push, NVE4_CP(LAUNCH_DESC_ADDRESS), 1);
    PUSH_DATA (push, desc_gpuaddr >> 8);
    BEGIN_NVC0(push, NVE4_CP(LAUNCH), 1);
@@ -495,7 +657,7 @@ nve4_compute_validate_textures(struct nvc0_context *nvc0)
    struct nouveau_pushbuf *push = nvc0->base.pushbuf;
    const unsigned s = 5;
    unsigned i;
-   uint32_t commands[2][NVE4_CP_INPUT_TEX_MAX];
+   uint32_t commands[2][32];
    unsigned n[2] = { 0, 0 };
 
    for (i = 0; i < nvc0->num_textures[s]; ++i) {
diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
index 84f8593b9b6..b98c65d4a09 100644
--- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
+++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.h
@@ -4,31 +4,6 @@
 
 #include "nvc0/nve4_compute.xml.h"
 
-/* Input space is implemented as c0[], to which we bind the screen->parm bo.
- */
-#define NVE4_CP_INPUT_USER           0x0000
-#define NVE4_CP_INPUT_USER_LIMIT     0x1000
-#define NVE4_CP_INPUT_GRID_INFO(i)  (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NTID(i)       (0x1000 + (i) * 4)
-#define NVE4_CP_INPUT_NCTAID(i)     (0x100c + (i) * 4)
-#define NVE4_CP_INPUT_GRIDID         0x1018
-#define NVE4_CP_INPUT_TEX(i)        (0x1040 + (i) * 4)
-#define NVE4_CP_INPUT_TEX_STRIDE     4
-#define NVE4_CP_INPUT_TEX_MAX        32
-#define NVE4_CP_INPUT_MS_OFFSETS     0x10c0
-#define NVE4_CP_INPUT_SUF_STRIDE     64
-#define NVE4_CP_INPUT_SUF(i)        (0x1100 + (i) * NVE4_CP_INPUT_SUF_STRIDE)
-#define NVE4_CP_INPUT_SUF_MAX        32
-#define NVE4_CP_INPUT_TRAP_INFO_PTR  0x1900
-#define NVE4_CP_INPUT_TEMP_PTR       0x1908
-#define NVE4_CP_INPUT_MP_TEMP_SIZE   0x1910
-#define NVE4_CP_INPUT_WARP_TEMP_SIZE 0x1914
-#define NVE4_CP_INPUT_CSTACK_SIZE    0x1918
-#define NVE4_CP_INPUT_SIZE           0x1a00
-#define NVE4_CP_PARAM_TRAP_INFO      0x2000
-#define NVE4_CP_PARAM_TRAP_INFO_SZ  (1 << 16)
-#define NVE4_CP_PARAM_SIZE          (NVE4_CP_PARAM_TRAP_INFO + (1 << 16))
-
 struct nve4_cp_launch_desc
 {
    u32 unk0[8];
@@ -81,7 +56,7 @@ static inline void
 nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
                            unsigned index,
                            struct nouveau_bo *bo,
-                           uint32_t base, uint16_t size)
+                           uint32_t base, uint32_t size)
 {
    uint64_t address = bo->offset + base;
 
@@ -95,23 +70,6 @@ nve4_cp_launch_desc_set_cb(struct nve4_cp_launch_desc *desc,
    desc->cb_mask |= 1 << index;
 }
 
-static inline void
-nve4_cp_launch_desc_set_ctx_cb(struct nve4_cp_launch_desc *desc,
-                               unsigned index,
-                               const struct nvc0_constbuf *cb)
-{
-   assert(index < 8);
-
-   if (!cb->u.buf) {
-      desc->cb_mask &= ~(1 << index);
-   } else {
-      const struct nv04_resource *buf = nv04_resource(cb->u.buf);
-      assert(!cb->user);
-      nve4_cp_launch_desc_set_cb(desc, index,
-                                 buf->bo, buf->offset + cb->offset, cb->size);
-   }
-}
-
 struct nve4_mp_trap_info {
    u32 lock;
    u32 pc;
diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c
index 83313cb28cf..65952676987 100644
--- a/src/gallium/drivers/r600/evergreen_state.c
+++ b/src/gallium/drivers/r600/evergreen_state.c
@@ -299,6 +299,11 @@ boolean evergreen_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;
 
+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }
 
diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c
index f9026197b26..3189a1360b1 100644
--- a/src/gallium/drivers/r600/r600_state.c
+++ b/src/gallium/drivers/r600/r600_state.c
@@ -239,6 +239,11 @@ boolean r600_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;
 
+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }
 
diff --git a/src/gallium/drivers/radeon/r600_pipe_common.c b/src/gallium/drivers/radeon/r600_pipe_common.c
index eed9d83ee49..720fc06ece2 100644
--- a/src/gallium/drivers/radeon/r600_pipe_common.c
+++ b/src/gallium/drivers/radeon/r600_pipe_common.c
@@ -467,6 +467,8 @@ static const char* r600_get_chip_name(struct r600_common_screen *rscreen)
 	case CHIP_ICELAND: return "AMD ICELAND";
 	case CHIP_CARRIZO: return "AMD CARRIZO";
 	case CHIP_FIJI: return "AMD FIJI";
+	case CHIP_POLARIS10: return "AMD POLARIS10";
+	case CHIP_POLARIS11: return "AMD POLARIS11";
 	case CHIP_STONEY: return "AMD STONEY";
 	default: return "AMD unknown";
 	}
@@ -598,6 +600,13 @@ const char *r600_get_llvm_processor_name(enum radeon_family family)
 	case CHIP_FIJI: return "fiji";
 	case CHIP_STONEY: return "stoney";
 #endif
+#if HAVE_LLVM <= 0x0308
+	case CHIP_POLARIS10: return "tonga";
+	case CHIP_POLARIS11: return "tonga";
+#else
+	case CHIP_POLARIS10: return "polaris10";
+	case CHIP_POLARIS11: return "polaris11";
+#endif
 	default: return "";
 	}
 }
diff --git a/src/gallium/drivers/radeon/r600_query.c b/src/gallium/drivers/radeon/r600_query.c
index f8b62411722..f9a5721fb97 100644
--- a/src/gallium/drivers/radeon/r600_query.c
+++ b/src/gallium/drivers/radeon/r600_query.c
@@ -1066,7 +1066,7 @@ void r600_query_init_backend_mask(struct r600_common_context *ctx)
 			item_mask = 0x3;
 		}
 
-		while(num_tile_pipes--) {
+		while (num_tile_pipes--) {
 			i = backend_map & item_mask;
 			mask |= (1<<i);
 			backend_map >>= item_width;
diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c
index 7322f3ee985..83fc0021227 100644
--- a/src/gallium/drivers/radeon/r600_texture.c
+++ b/src/gallium/drivers/radeon/r600_texture.c
@@ -335,7 +335,7 @@ static boolean r600_texture_get_handle(struct pipe_screen* screen,
 	 */
 	if (resource->target != PIPE_BUFFER &&
 	    (resource->nr_samples > 1 || rtex->is_depth))
-		return NULL;
+		return false;
 
 	if (!res->is_shared) {
 		res->is_shared = true;
diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c
index 2ab74e9eb6c..99b82ca9409 100644
--- a/src/gallium/drivers/radeon/radeon_vce.c
+++ b/src/gallium/drivers/radeon/radeon_vce.c
@@ -50,6 +50,7 @@
 #define FW_50_10_2 ((50 << 24) | (10 << 16) | (2 << 8))
 #define FW_50_17_3 ((50 << 24) | (17 << 16) | (3 << 8))
 #define FW_52_0_3 ((52 << 24) | (0 << 16) | (3 << 8))
+#define FW_52_4_3 ((52 << 24) | (4 << 16) | (3 << 8))
 
 /**
  * flush commands to the hardware
@@ -408,7 +409,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
             rscreen->info.drm_major == 3)
 		enc->use_vui = true;
 	if (rscreen->info.family >= CHIP_TONGA &&
-             rscreen->info.family != CHIP_STONEY)
+	    rscreen->info.family != CHIP_STONEY &&
+	    rscreen->info.family != CHIP_POLARIS11)
 		enc->dual_pipe = true;
 	/* TODO enable B frame with dual instance */
 	if ((rscreen->info.family >= CHIP_TONGA) &&
@@ -482,6 +484,7 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context,
 		break;
 
 	case FW_52_0_3:
+	case FW_52_4_3:
 		radeon_vce_52_init(enc);
 		break;
 
@@ -514,6 +517,7 @@ bool rvce_is_fw_version_supported(struct r600_common_screen *rscreen)
 	case FW_50_10_2:
 	case FW_50_17_3:
 	case FW_52_0_3:
+	case FW_52_4_3:
 		return true;
 	default:
 		return false;
diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h
index d35e963133e..baecca72383 100644
--- a/src/gallium/drivers/radeon/radeon_winsys.h
+++ b/src/gallium/drivers/radeon/radeon_winsys.h
@@ -124,6 +124,8 @@ enum radeon_family {
     CHIP_CARRIZO,
     CHIP_FIJI,
     CHIP_STONEY,
+    CHIP_POLARIS10,
+    CHIP_POLARIS11,
     CHIP_LAST,
 };
 
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index dd1103eed06..ed84dc224ff 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -598,6 +598,8 @@ static bool si_init_gs_info(struct si_screen *sscreen)
 	case CHIP_HAWAII:
 	case CHIP_TONGA:
 	case CHIP_FIJI:
+	case CHIP_POLARIS10:
+	case CHIP_POLARIS11:
 		sscreen->gs_table_depth = 32;
 		return true;
 	default:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 9eb531f8d80..56c575948ab 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -39,6 +39,7 @@
 #include "radeon/radeon_llvm_emit.h"
 #include "util/u_memory.h"
 #include "util/u_pstipple.h"
+#include "util/u_string.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_build.h"
 #include "tgsi/tgsi_util.h"
@@ -2874,8 +2875,7 @@ static LLVMValueRef image_fetch_coords(
 	struct gallivm_state *gallivm = bld_base->base.gallivm;
 	LLVMBuilderRef builder = gallivm->builder;
 	unsigned target = inst->Memory.Texture;
-	int sample;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &sample);
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
 	LLVMValueRef coords[4];
 	LLVMValueRef tmp;
 	int chan;
@@ -3387,8 +3387,8 @@ static void tex_fetch_args(
 	unsigned target = inst->Texture.Texture;
 	LLVMValueRef coords[5], derivs[6];
 	LLVMValueRef address[16];
-	int ref_pos;
-	unsigned num_coords = tgsi_util_get_texture_coord_dim(target, &ref_pos);
+	unsigned num_coords = tgsi_util_get_texture_coord_dim(target);
+	int ref_pos = tgsi_util_get_shadow_ref_src_index(target);
 	unsigned count = 0;
 	unsigned chan;
 	unsigned num_deriv_channels = 0;
@@ -4996,7 +4996,7 @@ static void si_shader_dump_disassembly(const struct radeon_shader_binary *binary
 
 			line = binary->disasm_string;
 			while (*line) {
-				p = strchrnul(line, '\n');
+				p = util_strchrnul(line, '\n');
 				count = p - line;
 
 				if (count) {
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1245f56c08a..10d691a92f1 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -2046,6 +2046,11 @@ boolean si_is_format_supported(struct pipe_screen *screen,
 	if (usage & PIPE_BIND_TRANSFER_WRITE)
 		retval |= PIPE_BIND_TRANSFER_WRITE;
 
+	if ((usage & PIPE_BIND_LINEAR) &&
+	    !util_format_is_compressed(format) &&
+	    !(usage & PIPE_BIND_DEPTH_STENCIL))
+		retval |= PIPE_BIND_LINEAR;
+
 	return retval == usage;
 }
 
@@ -3946,6 +3951,14 @@ static void si_init_config(struct si_context *sctx)
 			raster_config_1 = 0x0000002e;
 		}
 		break;
+	case CHIP_POLARIS10:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x0000002a;
+		break;
+	case CHIP_POLARIS11:
+		raster_config = 0x16000012;
+		raster_config_1 = 0x00000000;
+		break;
 	case CHIP_TONGA:
 		raster_config = 0x16000012;
 		raster_config_1 = 0x0000002a;
diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources
index 2af3d6af21a..efe88468e3f 100644
--- a/src/gallium/drivers/softpipe/Makefile.sources
+++ b/src/gallium/drivers/softpipe/Makefile.sources
@@ -10,6 +10,7 @@ C_SOURCES := \
 	sp_flush.h \
 	sp_fs_exec.c \
 	sp_fs.h \
+	sp_image.c \
 	sp_limits.h \
 	sp_prim_vbuf.c \
 	sp_prim_vbuf.h \
@@ -31,6 +32,7 @@ C_SOURCES := \
 	sp_state_blend.c \
 	sp_state_clip.c \
 	sp_state_derived.c \
+	sp_state_image.c \
 	sp_state.h \
 	sp_state_rasterizer.c \
 	sp_state_sampler.c \
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index d2a32200e47..30b0276cfe0 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -50,7 +50,7 @@
 #include "sp_query.h"
 #include "sp_screen.h"
 #include "sp_tex_sample.h"
-
+#include "sp_image.h"
 
 static void
 softpipe_destroy( struct pipe_context *pipe )
@@ -199,6 +199,10 @@ softpipe_create_context(struct pipe_screen *screen,
       softpipe->tgsi.sampler[i] = sp_create_tgsi_sampler();
    }
 
+   for (i = 0; i < PIPE_SHADER_TYPES; i++) {
+      softpipe->tgsi.image[i] = sp_create_tgsi_image();
+   }
+
    softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
    softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
 
@@ -216,6 +220,7 @@ softpipe_create_context(struct pipe_screen *screen,
    softpipe_init_streamout_funcs(&softpipe->pipe);
    softpipe_init_texture_funcs( &softpipe->pipe );
    softpipe_init_vertex_funcs(&softpipe->pipe);
+   softpipe_init_image_funcs(&softpipe->pipe);
 
    softpipe->pipe.set_framebuffer_state = softpipe_set_framebuffer_state;
 
@@ -223,7 +228,8 @@ softpipe_create_context(struct pipe_screen *screen,
 
    softpipe->pipe.clear = softpipe_clear;
    softpipe->pipe.flush = softpipe_flush_wrapped;
-
+   softpipe->pipe.texture_barrier = softpipe_texture_barrier;
+   softpipe->pipe.memory_barrier = softpipe_memory_barrier;
    softpipe->pipe.render_condition = softpipe_render_condition;
    
    /*
@@ -272,6 +278,16 @@ softpipe_create_context(struct pipe_screen *screen,
                         (struct tgsi_sampler *)
                            softpipe->tgsi.sampler[PIPE_SHADER_GEOMETRY]);
 
+   draw_image(softpipe->draw,
+              PIPE_SHADER_VERTEX,
+              (struct tgsi_image *)
+              softpipe->tgsi.image[PIPE_SHADER_VERTEX]);
+
+   draw_image(softpipe->draw,
+              PIPE_SHADER_GEOMETRY,
+              (struct tgsi_image *)
+              softpipe->tgsi.image[PIPE_SHADER_GEOMETRY]);
+
    if (debug_get_bool_option( "SOFTPIPE_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index d5c4aaae638..20a12353b38 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -83,6 +83,7 @@ struct softpipe_context {
    struct pipe_scissor_state scissors[PIPE_MAX_VIEWPORTS];
    struct pipe_sampler_view *sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
 
+   struct pipe_image_view images[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_IMAGES];
    struct pipe_viewport_state viewports[PIPE_MAX_VIEWPORTS];
    struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS];
    struct pipe_index_buffer index_buffer;
@@ -172,9 +173,12 @@ struct softpipe_context {
    /** TGSI exec things */
    struct {
       struct sp_tgsi_sampler *sampler[PIPE_SHADER_TYPES];
+      struct sp_tgsi_image *image[PIPE_SHADER_TYPES];
    } tgsi;
 
    struct tgsi_exec_machine *fs_machine;
+   /** whether early depth testing is enabled */
+   bool early_depth;
 
    /** The primitive drawing context */
    struct draw_context *draw;
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 5a29e26517d..59b8ad696ec 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -168,3 +168,29 @@ softpipe_flush_resource(struct pipe_context *pipe,
 
    return TRUE;
 }
+
+void softpipe_texture_barrier(struct pipe_context *pipe)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   uint i, sh;
+
+   for (sh = 0; sh < Elements(softpipe->tex_cache); sh++) {
+      for (i = 0; i < softpipe->num_sampler_views[sh]; i++) {
+         sp_flush_tex_tile_cache(softpipe->tex_cache[sh][i]);
+      }
+   }
+
+   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
+      if (softpipe->cbuf_cache[i])
+         sp_flush_tile_cache(softpipe->cbuf_cache[i]);
+
+   if (softpipe->zsbuf_cache)
+      sp_flush_tile_cache(softpipe->zsbuf_cache);
+
+   softpipe->dirty_render_cache = FALSE;
+}
+
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags)
+{
+   softpipe_texture_barrier(pipe);
+}
diff --git a/src/gallium/drivers/softpipe/sp_flush.h b/src/gallium/drivers/softpipe/sp_flush.h
index ab5f77be264..0674b4a7e48 100644
--- a/src/gallium/drivers/softpipe/sp_flush.h
+++ b/src/gallium/drivers/softpipe/sp_flush.h
@@ -55,4 +55,6 @@ softpipe_flush_resource(struct pipe_context *pipe,
                         boolean cpu_access,
                         boolean do_not_block);
 
+void softpipe_texture_barrier(struct pipe_context *pipe);
+void softpipe_memory_barrier(struct pipe_context *pipe, unsigned flags);
 #endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 89411777ec9..bfd9a4b7496 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -62,14 +62,15 @@ sp_exec_fragment_shader(const struct sp_fragment_shader_variant *var)
 static void
 exec_prepare( const struct sp_fragment_shader_variant *var,
               struct tgsi_exec_machine *machine,
-              struct tgsi_sampler *sampler )
+              struct tgsi_sampler *sampler,
+              struct tgsi_image *image )
 {
    /*
     * Bind tokens/shader to the interpreter's machine state.
     */
    tgsi_exec_machine_bind_shader(machine,
                                  var->tokens,
-                                 sampler);
+                                 sampler, image);
 }
 
 
@@ -116,7 +117,8 @@ setup_pos_vector(const struct tgsi_interp_coef *coef,
 static unsigned 
 exec_run( const struct sp_fragment_shader_variant *var,
 	  struct tgsi_exec_machine *machine,
-	  struct quad_header *quad )
+	  struct quad_header *quad,
+	  bool early_depth_test )
 {
    /* Compute X, Y, Z, W vals for this quad */
    setup_pos_vector(quad->posCoef, 
@@ -126,6 +128,7 @@ exec_run( const struct sp_fragment_shader_variant *var,
    /* convert 0 to 1.0 and 1 to -1.0 */
    machine->Face = (float) (quad->input.facing * -2 + 1);
 
+   machine->NonHelperMask = quad->inout.mask;
    quad->inout.mask &= tgsi_exec_machine_run( machine );
    if (quad->inout.mask == 0)
       return FALSE;
@@ -155,16 +158,19 @@ exec_run( const struct sp_fragment_shader_variant *var,
             {
                uint j;
 
-               for (j = 0; j < 4; j++)
-                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               if (!early_depth_test) {
+                  for (j = 0; j < 4; j++)
+                     quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
             }
             break;
          case TGSI_SEMANTIC_STENCIL:
             {
                uint j;
-
-               for (j = 0; j < 4; j++)
-                  quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+               if (!early_depth_test) {
+                  for (j = 0; j < 4; j++)
+                     quad->output.stencil[j] = (unsigned)machine->Outputs[i].xyzw[1].u[j];
+               }
             }
             break;
          }
@@ -180,7 +186,7 @@ exec_delete(struct sp_fragment_shader_variant *var,
             struct tgsi_exec_machine *machine)
 {
    if (machine->Tokens == var->tokens) {
-      tgsi_exec_machine_bind_shader(machine, NULL, NULL);
+      tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL);
    }
 
    FREE( (void *) var->tokens );
diff --git a/src/gallium/drivers/softpipe/sp_image.c b/src/gallium/drivers/softpipe/sp_image.c
new file mode 100644
index 00000000000..3488fa83185
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_image.c
@@ -0,0 +1,762 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_image.h"
+#include "sp_texture.h"
+
+#include "util/u_format.h"
+
+/*
+ * Get the offset into the base image
+ * first element for a buffer or layer/level for texture.
+ */
+static uint32_t
+get_image_offset(const struct softpipe_resource *spr,
+                 const struct pipe_image_view *iview,
+                 enum pipe_format format, unsigned r_coord)
+{
+   int base_layer = 0;
+
+   if (spr->base.target == PIPE_BUFFER)
+      return iview->u.buf.first_element * util_format_get_blocksize(format);
+
+   if (spr->base.target == PIPE_TEXTURE_1D_ARRAY ||
+       spr->base.target == PIPE_TEXTURE_2D_ARRAY ||
+       spr->base.target == PIPE_TEXTURE_CUBE_ARRAY ||
+       spr->base.target == PIPE_TEXTURE_CUBE ||
+       spr->base.target == PIPE_TEXTURE_3D)
+      base_layer = r_coord + iview->u.tex.first_layer;
+   return softpipe_get_tex_image_offset(spr, iview->u.tex.level, base_layer);
+}
+
+/*
+ * Does this texture instruction have a layer or depth parameter.
+ */
+static inline bool
+has_layer_or_depth(unsigned tgsi_tex_instr)
+{
+   return (tgsi_tex_instr == TGSI_TEXTURE_3D ||
+           tgsi_tex_instr == TGSI_TEXTURE_CUBE ||
+           tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY ||
+           tgsi_tex_instr == TGSI_TEXTURE_2D_ARRAY ||
+           tgsi_tex_instr == TGSI_TEXTURE_CUBE_ARRAY ||
+           tgsi_tex_instr == TGSI_TEXTURE_2D_ARRAY_MSAA);
+}
+
+/*
+ * Is this texture instruction a single non-array coordinate.
+ */
+static inline bool
+has_1coord(unsigned tgsi_tex_instr)
+{
+   return (tgsi_tex_instr == TGSI_TEXTURE_BUFFER ||
+           tgsi_tex_instr == TGSI_TEXTURE_1D ||
+           tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY);
+}
+
+/*
+ * check the bounds vs w/h/d
+ */
+static inline bool
+bounds_check(int width, int height, int depth,
+             int s, int t, int r)
+{
+   if (s < 0 || s >= width)
+      return false;
+   if (t < 0 || t >= height)
+      return false;
+   if (r < 0 || r >= depth)
+      return false;
+   return true;
+}
+
+/*
+ * Checks if the texture target compatible with the image resource
+ * pipe target.
+ */
+static inline bool
+has_compat_target(unsigned pipe_target, unsigned tgsi_target)
+{
+   switch (pipe_target) {
+   case PIPE_TEXTURE_1D:
+      if (tgsi_target == TGSI_TEXTURE_1D)
+         return true;
+      break;
+   case PIPE_TEXTURE_2D:
+      if (tgsi_target == TGSI_TEXTURE_2D)
+         return true;
+      break;
+   case PIPE_TEXTURE_RECT:
+      if (tgsi_target == TGSI_TEXTURE_RECT)
+         return true;
+      break;
+   case PIPE_TEXTURE_3D:
+      if (tgsi_target == TGSI_TEXTURE_3D ||
+          tgsi_target == TGSI_TEXTURE_2D)
+         return true;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      if (tgsi_target == TGSI_TEXTURE_CUBE ||
+          tgsi_target == TGSI_TEXTURE_2D)
+         return true;
+      break;
+   case PIPE_TEXTURE_1D_ARRAY:
+      if (tgsi_target == TGSI_TEXTURE_1D ||
+          tgsi_target == TGSI_TEXTURE_1D_ARRAY)
+         return true;
+      break;
+   case PIPE_TEXTURE_2D_ARRAY:
+      if (tgsi_target == TGSI_TEXTURE_2D ||
+          tgsi_target == TGSI_TEXTURE_2D_ARRAY)
+         return true;
+      break;
+   case PIPE_TEXTURE_CUBE_ARRAY:
+      if (tgsi_target == TGSI_TEXTURE_CUBE ||
+          tgsi_target == TGSI_TEXTURE_CUBE_ARRAY ||
+          tgsi_target == TGSI_TEXTURE_2D)
+         return true;
+      break;
+   case PIPE_BUFFER:
+      return (tgsi_target == TGSI_TEXTURE_BUFFER);
+   }
+   return false;
+}
+
+static bool
+get_dimensions(const struct pipe_image_view *iview,
+               const struct softpipe_resource *spr,
+               unsigned tgsi_tex_instr,
+               enum pipe_format pformat,
+               unsigned *width,
+               unsigned *height,
+               unsigned *depth)
+{
+   if (tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      *width = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      *height = 1;
+      *depth = 1;
+      /*
+       * Bounds check the buffer size from the view
+       * and the buffer size from the underlying buffer.
+       */
+      if (util_format_get_stride(pformat, *width) >
+          util_format_get_stride(spr->base.format, spr->base.width0))
+         return false;
+   } else {
+      unsigned level;
+
+      level = spr->base.target == PIPE_BUFFER ? 0 : iview->u.tex.level;
+      *width = u_minify(spr->base.width0, level);
+      *height = u_minify(spr->base.height0, level);
+
+      if (spr->base.target == TGSI_TEXTURE_3D)
+         *depth = u_minify(spr->base.depth0, level);
+      else
+         *depth = spr->base.array_size;
+
+      /* Make sure the resource and view have compatiable formats */
+      if (util_format_get_blocksize(pformat) >
+          util_format_get_blocksize(spr->base.format))
+         return false;
+   }
+   return true;
+}
+
+static void
+fill_coords(const struct tgsi_image_params *params,
+            unsigned index,
+            const int s[TGSI_QUAD_SIZE],
+            const int t[TGSI_QUAD_SIZE],
+            const int r[TGSI_QUAD_SIZE],
+            int *s_coord, int *t_coord, int *r_coord)
+{
+   *s_coord = s[index];
+   *t_coord = has_1coord(params->tgsi_tex_instr) ? 0 : t[index];
+   *r_coord = has_layer_or_depth(params->tgsi_tex_instr) ?
+      (params->tgsi_tex_instr == TGSI_TEXTURE_1D_ARRAY ? t[index] : r[index]) : 0;
+}
+/*
+ * Implement the image LOAD operation.
+ */
+static void
+sp_tgsi_load(const struct tgsi_image *image,
+             const struct tgsi_image_params *params,
+             const int s[TGSI_QUAD_SIZE],
+             const int t[TGSI_QUAD_SIZE],
+             const int r[TGSI_QUAD_SIZE],
+             const int sample[TGSI_QUAD_SIZE],
+             float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int c, j;
+   char *data_ptr;
+   unsigned offset = 0;
+
+   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+      goto fail_write_all_zero;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      return;
+
+   stride = util_format_get_stride(params->format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool fill_zero = false;
+
+      if (!(params->execmask & (1 << j)))
+         fill_zero = true;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         fill_zero = true;
+
+      if (fill_zero) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(params->format)) {
+         int32_t sdata[4];
+
+         util_format_read_4i(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((int32_t *)rgba[c])[j] = sdata[c];
+      } else if (util_format_is_pure_uint(params->format)) {
+         uint32_t sdata[4];
+         util_format_read_4ui(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            ((uint32_t *)rgba[c])[j] = sdata[c];
+      } else {
+         float sdata[4];
+         util_format_read_4f(params->format,
+                             sdata, 0,
+                             data_ptr, stride,
+                             s_coord, t_coord, 1, 1);
+         for (c = 0; c < 4; c++)
+            rgba[c][j] = sdata[c];
+      }
+   }
+   return;
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+/*
+ * Implement the image STORE operation.
+ */
+static void
+sp_tgsi_store(const struct tgsi_image *image,
+              const struct tgsi_image_params *params,
+              const int s[TGSI_QUAD_SIZE],
+              const int t[TGSI_QUAD_SIZE],
+              const int r[TGSI_QUAD_SIZE],
+              const int sample[TGSI_QUAD_SIZE],
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   char *data_ptr;
+   int j, c;
+   unsigned offset = 0;
+   unsigned pformat = params->format;
+
+   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      return;
+
+   if (params->format == PIPE_FORMAT_NONE)
+      pformat = spr->base.format;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       pformat, &width, &height, &depth))
+      return;
+
+   stride = util_format_get_stride(pformat, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+
+      if (!(params->execmask & (1 << j)))
+         continue;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord))
+         continue;
+
+      offset = get_image_offset(spr, iview, pformat, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      if (util_format_is_pure_sint(pformat)) {
+         int32_t sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((int32_t *)rgba[c])[j];
+         util_format_write_4i(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      } else if (util_format_is_pure_uint(pformat)) {
+         uint32_t sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = ((uint32_t *)rgba[c])[j];
+         util_format_write_4ui(pformat, sdata, 0, data_ptr, stride,
+                               s_coord, t_coord, 1, 1);
+      } else {
+         float sdata[4];
+         for (c = 0; c < 4; c++)
+            sdata[c] = rgba[c][j];
+         util_format_write_4f(pformat, sdata, 0, data_ptr, stride,
+                              s_coord, t_coord, 1, 1);
+      }
+   }
+}
+
+/*
+ * Implement atomic operations on unsigned integers.
+ */
+static void
+handle_op_uint(const struct pipe_image_view *iview,
+               const struct tgsi_image_params *params,
+               bool just_read,
+               char *data_ptr,
+               uint qi,
+               unsigned stride,
+               unsigned opcode,
+               int s,
+               int t,
+               float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+               float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   int nc = util_format_get_nr_components(params->format);
+   unsigned sdata[4];
+
+   util_format_read_4ui(params->format,
+                        sdata, 0,
+                        data_ptr, stride,
+                        s, t, 1, 1);
+
+   if (just_read) {
+      for (c = 0; c < nc; c++) {
+         ((uint32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] += ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] = ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned cmp_x = ((uint32_t *)rgba[c])[qi];
+         unsigned src_x = ((uint32_t *)rgba2[c])[qi];
+         unsigned temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] &= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] |= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < nc; c++) {
+         unsigned temp = sdata[c];
+         sdata[c] ^= ((uint32_t *)rgba[c])[qi];
+         ((uint32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      for (c = 0; c < nc; c++) {
+         unsigned dst_x = sdata[c];
+         unsigned src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((uint32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((uint32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;
+   }
+   util_format_write_4ui(params->format, sdata, 0, data_ptr, stride,
+                         s, t, 1, 1);
+}
+
+/*
+ * Implement atomic operations on signed integers.
+ */
+static void
+handle_op_int(const struct pipe_image_view *iview,
+              const struct tgsi_image_params *params,
+              bool just_read,
+              char *data_ptr,
+              uint qi,
+              unsigned stride,
+              unsigned opcode,
+              int s,
+              int t,
+              float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+              float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   uint c;
+   int nc = util_format_get_nr_components(params->format);
+   int sdata[4];
+   util_format_read_4i(params->format,
+                       sdata, 0,
+                       data_ptr, stride,
+                       s, t, 1, 1);
+
+   if (just_read) {
+      for (c = 0; c < nc; c++) {
+         ((int32_t *)rgba[c])[qi] = sdata[c];
+      }
+      return;
+   }
+   switch (opcode) {
+   case TGSI_OPCODE_ATOMUADD:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] += ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXCHG:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] = ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMCAS:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int cmp_x = ((int32_t *)rgba[c])[qi];
+         int src_x = ((int32_t *)rgba2[c])[qi];
+         int temp = sdata[c];
+         sdata[c] = (dst_x == cmp_x) ? src_x : dst_x;
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMAND:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] &= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMOR:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] |= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMXOR:
+      for (c = 0; c < nc; c++) {
+         int temp = sdata[c];
+         sdata[c] ^= ((int32_t *)rgba[c])[qi];
+         ((int32_t *)rgba[c])[qi] = temp;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMUMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMIN:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MIN2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   case TGSI_OPCODE_ATOMIMAX:
+      for (c = 0; c < nc; c++) {
+         int dst_x = sdata[c];
+         int src_x = ((int32_t *)rgba[c])[qi];
+         sdata[c] = MAX2(dst_x, src_x);
+         ((int32_t *)rgba[c])[qi] = dst_x;
+      }
+      break;
+   default:
+      assert(!"Unexpected TGSI opcode in sp_tgsi_op");
+      break;
+   }
+   util_format_write_4i(params->format, sdata, 0, data_ptr, stride,
+                        s, t, 1, 1);
+}
+
+/*
+ * Implement atomic image operations.
+ */
+static void
+sp_tgsi_op(const struct tgsi_image *image,
+           const struct tgsi_image_params *params,
+           unsigned opcode,
+           const int s[TGSI_QUAD_SIZE],
+           const int t[TGSI_QUAD_SIZE],
+           const int r[TGSI_QUAD_SIZE],
+           const int sample[TGSI_QUAD_SIZE],
+           float rgba[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE],
+           float rgba2[TGSI_NUM_CHANNELS][TGSI_QUAD_SIZE])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   unsigned width, height, depth;
+   unsigned stride;
+   int j, c;
+   unsigned offset;
+   char *data_ptr;
+
+   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      goto fail_write_all_zero;
+   if (!has_compat_target(spr->base.target, params->tgsi_tex_instr))
+      goto fail_write_all_zero;
+
+   if (!get_dimensions(iview, spr, params->tgsi_tex_instr,
+                       params->format, &width, &height, &depth))
+      goto fail_write_all_zero;
+
+   stride = util_format_get_stride(spr->base.format, width);
+
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      int s_coord, t_coord, r_coord;
+      bool just_read = false;
+
+      fill_coords(params, j, s, t, r, &s_coord, &t_coord, &r_coord);
+      if (!bounds_check(width, height, depth,
+                        s_coord, t_coord, r_coord)) {
+         int nc = util_format_get_nr_components(params->format);
+         int ival = util_format_is_pure_integer(params->format);
+         int c;
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = 0;
+            if (c == 3 && nc < 4) {
+               if (ival)
+                  ((int32_t *)rgba[c])[j] = 1;
+               else
+                  rgba[c][j] = 1.0;
+            }
+         }
+         continue;
+      }
+
+      /* just readback the value for atomic if execmask isn't set */
+      if (!(params->execmask & (1 << j))) {
+         just_read = true;
+      }
+
+      offset = get_image_offset(spr, iview, params->format, r_coord);
+      data_ptr = (char *)spr->data + offset;
+
+      /* we should see atomic operations on r32 formats */
+      if (util_format_is_pure_uint(params->format))
+         handle_op_uint(iview, params, just_read, data_ptr, j, stride,
+                        opcode, s_coord, t_coord, rgba, rgba2);
+      else if (util_format_is_pure_sint(params->format))
+         handle_op_int(iview, params, just_read, data_ptr, j, stride,
+                       opcode, s_coord, t_coord, rgba, rgba2);
+      else
+         assert(0);
+   }
+   return;
+fail_write_all_zero:
+   for (j = 0; j < TGSI_QUAD_SIZE; j++) {
+      for (c = 0; c < 4; c++)
+         rgba[c][j] = 0;
+   }
+   return;
+}
+
+static void
+sp_tgsi_get_dims(const struct tgsi_image *image,
+                 const struct tgsi_image_params *params,
+                 int dims[4])
+{
+   struct sp_tgsi_image *sp_img = (struct sp_tgsi_image *)image;
+   struct pipe_image_view *iview;
+   struct softpipe_resource *spr;
+   int level;
+
+   if (params->unit > PIPE_MAX_SHADER_IMAGES)
+      return;
+   iview = &sp_img->sp_iview[params->unit];
+   spr = (struct softpipe_resource *)iview->resource;
+   if (!spr)
+      return;
+
+   if (params->tgsi_tex_instr == TGSI_TEXTURE_BUFFER) {
+      dims[0] = iview->u.buf.last_element - iview->u.buf.first_element + 1;
+      dims[1] = dims[2] = dims[3] = 0;
+      return;
+   }
+
+   level = iview->u.tex.level;
+   dims[0] = u_minify(spr->base.width0, level);
+   switch (params->tgsi_tex_instr) {
+   case TGSI_TEXTURE_1D_ARRAY:
+      dims[1] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_1D:
+      return;
+   case TGSI_TEXTURE_2D_ARRAY:
+      dims[2] = iview->u.tex.last_layer - iview->u.tex.first_layer + 1;
+      /* fallthrough */
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_CUBE:
+   case TGSI_TEXTURE_RECT:
+      dims[1] = u_minify(spr->base.height0, level);
+      return;
+   case TGSI_TEXTURE_3D:
+      dims[1] = u_minify(spr->base.height0, level);
+      dims[2] = u_minify(spr->base.depth0, level);
+      return;
+   case TGSI_TEXTURE_CUBE_ARRAY:
+      dims[1] = u_minify(spr->base.height0, level);
+      dims[2] = (iview->u.tex.last_layer - iview->u.tex.first_layer + 1) / 6;
+      break;
+   default:
+      assert(!"unexpected texture target in sp_get_dims()");
+      return;
+   }
+}
+
+struct sp_tgsi_image *
+sp_create_tgsi_image(void)
+{
+   struct sp_tgsi_image *img = CALLOC_STRUCT(sp_tgsi_image);
+   if (!img)
+      return NULL;
+
+   img->base.load = sp_tgsi_load;
+   img->base.store = sp_tgsi_store;
+   img->base.op = sp_tgsi_op;
+   img->base.get_dims = sp_tgsi_get_dims;
+   return img;
+};
diff --git a/src/gallium/drivers/softpipe/sp_image.h b/src/gallium/drivers/softpipe/sp_image.h
new file mode 100644
index 00000000000..3c73f838efe
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_image.h
@@ -0,0 +1,37 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef SP_IMAGE_H
+#define SP_IMAGE_H
+#include "tgsi/tgsi_exec.h"
+
+struct sp_tgsi_image
+{
+   struct tgsi_image base;
+   struct pipe_image_view sp_iview[PIPE_MAX_SHADER_IMAGES];
+};
+
+struct sp_tgsi_image *
+sp_create_tgsi_image(void);
+
+#endif
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 4cce9e9bc12..847a616f491 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -782,7 +782,7 @@ depth_test_quads_fallback(struct quad_stage *qs,
 {
    unsigned i, pass = 0;
    const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
-   boolean interp_depth = !fsInfo->writes_z;
+   boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
    boolean shader_stencil_ref = fsInfo->writes_stencil;
    struct depth_data data;
    unsigned vp_idx = quads[0]->input.viewport_index;
@@ -902,7 +902,7 @@ choose_depth_test(struct quad_stage *qs,
 {
    const struct tgsi_shader_info *fsInfo = &qs->softpipe->fs_variant->info;
 
-   boolean interp_depth = !fsInfo->writes_z;
+   boolean interp_depth = !fsInfo->writes_z || qs->softpipe->early_depth;
 
    boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 395bc70f2cf..8fb632d9dcf 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -80,7 +80,7 @@ shade_quad(struct quad_stage *qs, struct quad_header *quad)
 
    /* run shader */
    machine->flatshade_color = softpipe->rasterizer->flatshade ? TRUE : FALSE;
-   return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad );
+   return softpipe->fs_variant->run( softpipe->fs_variant, machine, quad, softpipe->early_depth );
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index 7131512daee..dbe4c0eb67e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -43,15 +43,17 @@ void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
    boolean early_depth_test =
-      sp->depth_stencil->depth.enabled &&
+      (sp->depth_stencil->depth.enabled &&
       sp->framebuffer.zsbuf &&
       !sp->depth_stencil->alpha.enabled &&
       !sp->fs_variant->info.uses_kill &&
       !sp->fs_variant->info.writes_z &&
-      !sp->fs_variant->info.writes_stencil;
+       !sp->fs_variant->info.writes_stencil) ||
+      sp->fs_variant->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL];
 
    sp->quad.first = sp->quad.blend;
 
+   sp->early_depth = early_depth_test;
    if (early_depth_test) {
       insert_stage_at_head( sp, sp->quad.shade );
       insert_stage_at_head( sp, sp->quad.depth_test );
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 16a2897f526..570bc549cc4 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -56,6 +56,7 @@
 
 
 struct tgsi_sampler;
+struct tgsi_image;
 struct tgsi_exec_machine;
 struct vertex_info;
 
@@ -81,11 +82,13 @@ struct sp_fragment_shader_variant
 
    void (*prepare)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
-		   struct tgsi_sampler *sampler);
+		   struct tgsi_sampler *sampler,
+		   struct tgsi_image *image);
 
    unsigned (*run)(const struct sp_fragment_shader_variant *shader,
 		   struct tgsi_exec_machine *machine,
-		   struct quad_header *quad);
+		   struct quad_header *quad,
+		   bool early_depth_test);
 
    /* Deletes this instance of the object */
    void (*delete)(struct sp_fragment_shader_variant *shader,
@@ -149,6 +152,9 @@ void
 softpipe_init_vertex_funcs(struct pipe_context *pipe);
 
 void
+softpipe_init_image_funcs(struct pipe_context *pipe);
+
+void
 softpipe_set_framebuffer_state(struct pipe_context *,
                                const struct pipe_framebuffer_state *);
 
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index d4d03f1be50..65679e73515 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -343,7 +343,8 @@ update_fragment_shader(struct softpipe_context *softpipe, unsigned prim)
       softpipe->fs_variant->prepare(softpipe->fs_variant, 
                                     softpipe->fs_machine,
                                     (struct tgsi_sampler *) softpipe->
-                                    tgsi.sampler[PIPE_SHADER_FRAGMENT]);
+                                    tgsi.sampler[PIPE_SHADER_FRAGMENT],
+                                    (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_FRAGMENT]);
    }
    else {
       softpipe->fs_variant = NULL;
diff --git a/src/gallium/drivers/softpipe/sp_state_image.c b/src/gallium/drivers/softpipe/sp_state_image.c
new file mode 100644
index 00000000000..8909fa26864
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_state_image.c
@@ -0,0 +1,57 @@
+/*
+ * Copyright 2016 Red Hat.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "sp_context.h"
+#include "sp_state.h"
+#include "sp_image.h"
+
+static void softpipe_set_shader_images(struct pipe_context *pipe,
+                                       unsigned shader,
+                                       unsigned start,
+                                       unsigned num,
+                                       struct pipe_image_view *images)
+{
+   struct softpipe_context *softpipe = softpipe_context(pipe);
+   unsigned i;
+   assert(shader < PIPE_SHADER_TYPES);
+   assert(start + num <= Elements(softpipe->sampler_views[shader]));
+
+   /* set the new images */
+   for (i = 0; i < num; i++) {
+      int idx = start + i;
+
+      if (images) {
+         pipe_resource_reference(&softpipe->tgsi.image[shader]->sp_iview[idx].resource, images[i].resource);
+         softpipe->tgsi.image[shader]->sp_iview[idx] = images[i];
+      }
+      else {
+         pipe_resource_reference(&softpipe->tgsi.image[shader]->sp_iview[idx].resource, NULL);
+         memset(&softpipe->tgsi.image[shader]->sp_iview[idx], 0, sizeof(struct pipe_image_view));
+      }
+   }
+}
+
+void softpipe_init_image_funcs(struct pipe_context *pipe)
+{
+   pipe->set_shader_images = softpipe_set_shader_images;
+}
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 52ec373f8f2..64666fee03f 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -270,9 +270,9 @@ softpipe_resource_get_handle(struct pipe_screen *screen,
  * Helper function to compute offset (in bytes) for a particular
  * texture level/face/slice from the start of the buffer.
  */
-static unsigned
-sp_get_tex_image_offset(const struct softpipe_resource *spr,
-                        unsigned level, unsigned layer)
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+                              unsigned level, unsigned layer)
 {
    unsigned offset = spr->level_offset[level];
 
@@ -422,7 +422,7 @@ softpipe_transfer_map(struct pipe_context *pipe,
    pt->stride = spr->stride[level];
    pt->layer_stride = spr->img_stride[level];
 
-   spt->offset = sp_get_tex_image_offset(spr, level, box->z);
+   spt->offset = softpipe_get_tex_image_offset(spr, level, box->z);
 
    spt->offset +=
          box->y / util_format_get_blockheight(format) * spt->base.stride +
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index fbf741a9c72..450c4b1cefc 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -116,5 +116,7 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
 extern void
 softpipe_init_texture_funcs(struct pipe_context *pipe);
 
-
+unsigned
+softpipe_get_tex_image_offset(const struct softpipe_resource *spr,
+                              unsigned level, unsigned layer);
 #endif /* SP_TEXTURE */
diff --git a/src/gallium/drivers/svga/svga_tgsi.c b/src/gallium/drivers/svga/svga_tgsi.c
index c62d4d671ef..7396ad08e27 100644
--- a/src/gallium/drivers/svga/svga_tgsi.c
+++ b/src/gallium/drivers/svga/svga_tgsi.c
@@ -50,15 +50,6 @@
  */
 static char err_buf[128];
 
-#if 0
-static void
-svga_destroy_shader_emitter(struct svga_shader_emitter *emit)
-{
-   if (emit->buf != err_buf)
-      FREE(emit->buf);
-}
-#endif
-
 
 static boolean
 svga_shader_expand(struct svga_shader_emitter *emit)
@@ -265,6 +256,7 @@ svga_tgsi_vgpu9_translate(struct svga_context *svga,
 
  fail:
    FREE(variant);
-   FREE(emit.buf);
+   if (emit.buf != err_buf)
+      FREE(emit.buf);
    return NULL;
 }
diff --git a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
index 204b814a964..418f898e0e3 100644
--- a/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
+++ b/src/gallium/drivers/svga/svga_tgsi_decl_sm30.c
@@ -535,7 +535,6 @@ svga_tgsi_sampler_type(const struct svga_shader_emitter *emit, int idx)
 
 static boolean
 ps30_sampler( struct svga_shader_emitter *emit,
-              struct tgsi_declaration_semantic semantic,
               unsigned idx )
 {
    SVGA3DOpDclArgs dcl;
@@ -553,6 +552,17 @@ ps30_sampler( struct svga_shader_emitter *emit,
            svga_shader_emit_dwords( emit, dcl.values, Elements(dcl.values)));
 }
 
+boolean
+svga_shader_emit_samplers_decl( struct svga_shader_emitter *emit )
+{
+   unsigned i;
+
+   for (i = 0; i < emit->num_samplers; i++) {
+      if (!ps30_sampler(emit, i))
+         return FALSE;
+   }
+   return TRUE;
+}
 
 boolean
 svga_translate_decl_sm30( struct svga_shader_emitter *emit,
@@ -563,12 +573,15 @@ svga_translate_decl_sm30( struct svga_shader_emitter *emit,
    unsigned idx;
 
    for( idx = first; idx <= last; idx++ ) {
-      boolean ok;
+      boolean ok = TRUE;
 
       switch (decl->Declaration.File) {
       case TGSI_FILE_SAMPLER:
          assert (emit->unit == PIPE_SHADER_FRAGMENT);
-         ok = ps30_sampler( emit, decl->Semantic, idx );
+         /* just keep track of the number of samplers here.
+          * Will emit the declaration in the helpers function.
+          */
+         emit->num_samplers = MAX2(emit->num_samplers, decl->Range.Last + 1);
          break;
 
       case TGSI_FILE_INPUT:
diff --git a/src/gallium/drivers/svga/svga_tgsi_emit.h b/src/gallium/drivers/svga/svga_tgsi_emit.h
index 7a593ba6e9d..114c9563e2b 100644
--- a/src/gallium/drivers/svga/svga_tgsi_emit.h
+++ b/src/gallium/drivers/svga/svga_tgsi_emit.h
@@ -137,6 +137,7 @@ struct svga_shader_emitter
 
    unsigned pstipple_sampler_unit;
 
+   int num_samplers;
    uint8_t sampler_target[PIPE_MAX_SAMPLERS];
 };
 
@@ -157,6 +158,9 @@ svga_shader_emit_instructions(struct svga_shader_emitter *emit,
                               const struct tgsi_token *tokens);
 
 boolean
+svga_shader_emit_samplers_decl(struct svga_shader_emitter *emit);
+
+boolean
 svga_translate_decl_sm30(struct svga_shader_emitter *emit,
                          const struct tgsi_full_declaration *decl);
 
diff --git a/src/gallium/drivers/svga/svga_tgsi_insn.c b/src/gallium/drivers/svga/svga_tgsi_insn.c
index 3188c411863..bedda2ecf71 100644
--- a/src/gallium/drivers/svga/svga_tgsi_insn.c
+++ b/src/gallium/drivers/svga/svga_tgsi_insn.c
@@ -3797,6 +3797,9 @@ svga_shader_emit_helpers(struct svga_shader_emitter *emit)
    }
 
    if (emit->unit == PIPE_SHADER_FRAGMENT) {
+      if (!svga_shader_emit_samplers_decl( emit ))
+         return FALSE;
+
       if (!emit_ps_preamble( emit ))
          return FALSE;
 
diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 6c105f46199..a04b1203c7c 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -59,7 +59,6 @@ COMMON_CXX_SOURCES := \
 CORE_CXX_SOURCES := \
 	rasterizer/core/api.cpp \
 	rasterizer/core/api.h \
-	rasterizer/core/arena.cpp \
 	rasterizer/core/arena.h \
 	rasterizer/core/backend.cpp \
 	rasterizer/core/backend.h \
@@ -83,6 +82,7 @@ CORE_CXX_SOURCES := \
 	rasterizer/core/rasterizer.h \
 	rasterizer/core/rdtsc_core.cpp \
 	rasterizer/core/rdtsc_core.h \
+	rasterizer/core/ringbuffer.h \
 	rasterizer/core/state.h \
 	rasterizer/core/threads.cpp \
 	rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
index bc96c5f62fd..f3c05979144 100644
--- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
@@ -33,137 +33,137 @@ namespace SWRL
 template <typename T, int NUM_ELEMENTS>
 struct UncheckedFixedVector
 {
-	UncheckedFixedVector() : mSize(0)
-	{
-	}
-
-	UncheckedFixedVector(std::size_t size, T const& exemplar)
-	{
-		this->mSize = 0;
-		for (std::size_t i = 0; i < size; ++i)
-			this->push_back(exemplar);
-	}
-
-	template <typename Iter>
-	UncheckedFixedVector(Iter fst, Iter lst)
-	{
-		this->mSize = 0;
-		for ( ; fst != lst; ++fst)
-			this->push_back(*fst);
-	}
-
-	UncheckedFixedVector(UncheckedFixedVector const& UFV)
-	{
-		this->mSize = 0;
-		for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
-			(*this)[i] = UFV[i];
-		this->mSize = UFV.size();
-	}
-
-	UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
-	{
-		for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
-			(*this)[i] = UFV[i];
-		this->mSize = UFV.size();
-		return *this;
-	}
-
-	T* begin()	{ return &this->mElements[0]; }
-	T* end()	{ return &this->mElements[0] + this->mSize; }
-	T const* begin() const	{ return &this->mElements[0]; }
-	T const* end() const	{ return &this->mElements[0] + this->mSize; }
-
-	friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
-	{
-		if (L.size() != R.size()) return false;
-		for (std::size_t i = 0, N = L.size(); i < N; ++i)
-		{
-			if (L[i] != R[i]) return false;
-		}
-		return true;
-	}
-
-	friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
-	{
-		if (L.size() != R.size()) return true;
-		for (std::size_t i = 0, N = L.size(); i < N; ++i)
-		{
-			if (L[i] != R[i]) return true;
-		}
-		return false;
-	}
-
-	T& operator[](std::size_t idx)
-	{
-		return this->mElements[idx];
-	}
-	T const& operator[](std::size_t idx) const
-	{
-		return this->mElements[idx];
-	}
-	void push_back(T const& t)
-	{
-		this->mElements[this->mSize]	= t;
-		++this->mSize;
-	}
-	void pop_back()
-	{
-		SWR_ASSERT(this->mSize > 0);
-		--this->mSize;
-	}
-	T& back()
-	{
-		return this->mElements[this->mSize-1];
-	}
-	T const& back() const
-	{
-		return this->mElements[this->mSize-1];
-	}
-	bool empty() const
-	{
-		return this->mSize == 0;
-	}
-	std::size_t size() const
-	{
-		return this->mSize;
-	}
-	void resize(std::size_t sz)
-	{
-		this->mSize = sz;
-	}
-	void clear()
-	{
-		this->resize(0);
-	}
+    UncheckedFixedVector() : mSize(0)
+    {
+    }
+
+    UncheckedFixedVector(std::size_t size, T const& exemplar)
+    {
+        this->mSize = 0;
+        for (std::size_t i = 0; i < size; ++i)
+            this->push_back(exemplar);
+    }
+
+    template <typename Iter>
+    UncheckedFixedVector(Iter fst, Iter lst)
+    {
+        this->mSize = 0;
+        for ( ; fst != lst; ++fst)
+            this->push_back(*fst);
+    }
+
+    UncheckedFixedVector(UncheckedFixedVector const& UFV)
+    {
+        this->mSize = 0;
+        for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+            (*this)[i] = UFV[i];
+        this->mSize = UFV.size();
+    }
+
+    UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
+    {
+        for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+            (*this)[i] = UFV[i];
+        this->mSize = UFV.size();
+        return *this;
+    }
+
+    T* begin()  { return &this->mElements[0]; }
+    T* end()    { return &this->mElements[0] + this->mSize; }
+    T const* begin() const  { return &this->mElements[0]; }
+    T const* end() const    { return &this->mElements[0] + this->mSize; }
+
+    friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+    {
+        if (L.size() != R.size()) return false;
+        for (std::size_t i = 0, N = L.size(); i < N; ++i)
+        {
+            if (L[i] != R[i]) return false;
+        }
+        return true;
+    }
+
+    friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+    {
+        if (L.size() != R.size()) return true;
+        for (std::size_t i = 0, N = L.size(); i < N; ++i)
+        {
+            if (L[i] != R[i]) return true;
+        }
+        return false;
+    }
+
+    T& operator[](std::size_t idx)
+    {
+        return this->mElements[idx];
+    }
+    T const& operator[](std::size_t idx) const
+    {
+        return this->mElements[idx];
+    }
+    void push_back(T const& t)
+    {
+        this->mElements[this->mSize]    = t;
+        ++this->mSize;
+    }
+    void pop_back()
+    {
+        SWR_ASSERT(this->mSize > 0);
+        --this->mSize;
+    }
+    T& back()
+    {
+        return this->mElements[this->mSize-1];
+    }
+    T const& back() const
+    {
+        return this->mElements[this->mSize-1];
+    }
+    bool empty() const
+    {
+        return this->mSize == 0;
+    }
+    std::size_t size() const
+    {
+        return this->mSize;
+    }
+    void resize(std::size_t sz)
+    {
+        this->mSize = sz;
+    }
+    void clear()
+    {
+        this->resize(0);
+    }
 private:
-	std::size_t	mSize;
-	T			mElements[NUM_ELEMENTS];
+    std::size_t    mSize{ 0 };
+    T mElements[NUM_ELEMENTS];
 };
 
 template <typename T, int NUM_ELEMENTS>
 struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS>
 {
-	FixedStack() {}
-
-	void push(T const& t)
-	{
-		this->push_back(t);
-	}
-
-	void pop()
-	{
-		this->pop_back();
-	}
-
-	T& top()
-	{
-		return this->back();
-	}
-
-	T const& top() const
-	{
-		return this->back();
-	}
+    FixedStack() {}
+
+    void push(T const& t)
+    {
+        this->push_back(t);
+    }
+
+    void pop()
+    {
+        this->pop_back();
+    }
+
+    T& top()
+    {
+        return this->back();
+    }
+
+    T const& top() const
+    {
+        return this->back();
+    }
 };
 
 template <typename T>
@@ -190,16 +190,16 @@ namespace std
 template <typename T, int N>
 struct hash<SWRL::UncheckedFixedVector<T, N>>
 {
-	size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
-	{
-		if (v.size() == 0) return 0;
-		std::hash<T> H;
-		size_t x = H(v[0]);
-		if (v.size() == 1) return x;
-		for (size_t i = 1; i < v.size(); ++i)
-			x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
-		return x;
-	}
+    size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
+    {
+        if (v.size() == 0) return 0;
+        std::hash<T> H;
+        size_t x = H(v[0]);
+        if (v.size() == 1) return x;
+        for (size_t i = 1; i < v.size(); ++i)
+            x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
+        return x;
+    }
 };
 
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 522ae0dd65f..5794f3f625a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -47,16 +47,18 @@
 #define DEBUGBREAK __debugbreak()
 
 #define PRAGMA_WARNING_PUSH_DISABLE(...) \
-	__pragma(warning(push));\
-	__pragma(warning(disable:__VA_ARGS__));
+    __pragma(warning(push));\
+    __pragma(warning(disable:__VA_ARGS__));
 
 #define PRAGMA_WARNING_POP() __pragma(warning(pop))
 
 #if defined(_WIN32)
 #if defined(_WIN64)
+#define BitScanReverseSizeT BitScanReverse64
 #define BitScanForwardSizeT BitScanForward64
 #define _mm_popcount_sizeT _mm_popcnt_u64
 #else
+#define BitScanReverseSizeT BitScanReverse
 #define BitScanForwardSizeT BitScanForward
 #define _mm_popcount_sizeT _mm_popcnt_u32
 #endif
@@ -68,29 +70,20 @@
 
 #include <stdlib.h>
 #include <string.h>
-#include <X11/Xmd.h>
 #include <x86intrin.h>
 #include <stdint.h>
 #include <sys/types.h>
 #include <unistd.h>
 #include <sys/stat.h>
+#include <stdio.h>
 
-typedef void			VOID;
+typedef void            VOID;
 typedef void*           LPVOID;
-typedef CARD8			BOOL;
-typedef wchar_t			WCHAR;
-typedef uint16_t		UINT16;
-typedef int				INT;
-typedef unsigned int	UINT;
-typedef uint32_t		UINT32;
-typedef uint64_t		UINT64;
-typedef int64_t		    INT64;
-typedef void*			HANDLE;
-typedef float			FLOAT;
-typedef int			    LONG;
-typedef CARD8		    BYTE;
-typedef unsigned char   UCHAR;
-typedef unsigned int	DWORD;
+typedef int             INT;
+typedef unsigned int    UINT;
+typedef void*           HANDLE;
+typedef int             LONG;
+typedef unsigned int    DWORD;
 
 #undef FALSE
 #define FALSE 0
@@ -104,8 +97,11 @@ typedef unsigned int	DWORD;
 #define INLINE __inline
 #endif
 #define DEBUGBREAK asm ("int $3")
+#if !defined(__CYGWIN__)
 #define __cdecl
+#define __stdcall
 #define __declspec(X)
+#endif
 
 #define GCC_VERSION (__GNUC__ * 10000 \
                      + __GNUC_MINOR__ * 100 \
@@ -180,21 +176,13 @@ unsigned char _bittest(const LONG *a, LONG b)
 
 #define CreateDirectory(name, pSecurity) mkdir(name, 0777)
 
-#if defined(_WIN32)
-static inline
-unsigned int _mm_popcnt_u32(unsigned int v)
-{
-    return __builtin_popcount(v);
-}
-#endif
-
 #define _aligned_free free
 #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
 #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
 #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
 #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
 #define _ReadWriteBarrier() asm volatile("" ::: "memory")
-#define __stdcall
 
 #define PRAGMA_WARNING_PUSH_DISABLE(...)
 #define PRAGMA_WARNING_POP()
@@ -206,7 +194,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
 #endif
 
 // Universal types
-typedef BYTE        KILOBYTE[1024];
+typedef uint8_t     KILOBYTE[1024];
 typedef KILOBYTE    MEGABYTE[1024];
 typedef MEGABYTE    GIGABYTE[1024];
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index 454641b2751..c6768b4c566 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -64,12 +64,14 @@ void BucketManager::RegisterThread(const std::string& name)
 
 UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
 {
+    mThreadMutex.lock();
     size_t id = mBuckets.size();
     mBuckets.push_back(desc);
+    mThreadMutex.unlock();
     return (UINT)id;
 }
 
-void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket)
+void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
 {
     const char *arrows[] = {
         "",
@@ -88,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64
     float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
 
     // compute average cycle count per invocation
-    UINT64 CPE = bucket.elapsed / bucket.count;
+    uint64_t CPE = bucket.elapsed / bucket.count;
 
     BUCKET_DESC &desc = mBuckets[bucket.id];
 
@@ -127,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
 
     // compute thread level total cycle counts across all buckets from root
     const BUCKET& root = thread.root;
-    UINT64 totalCycles = 0;
+    uint64_t totalCycles = 0;
     for (const BUCKET& child : root.children)
     {
         totalCycles += child.elapsed;
@@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename)
         fclose(f);
     }
 }
+
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+    pBucketMgr->StartBucket(id);
+}
+
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+    pBucketMgr->StopBucket(id);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
index 99cb10ec6e8..9dfa7f694d0 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -70,7 +70,9 @@ public:
     // removes all registered buckets
     void ClearBuckets()
     {
+        mThreadMutex.lock();
         mBuckets.clear();
+        mThreadMutex.unlock();
     }
 
     /// Registers a new thread with the manager.
@@ -209,7 +211,7 @@ public:
     }
 
 private:
-    void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
+    void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
     void PrintThread(FILE* f, const BUCKET_THREAD& thread);
 
     // list of active threads that have registered with this manager
@@ -227,3 +229,8 @@ private:
     bool mThreadViz{ false };
     std::string mThreadVizDir;
 };
+
+
+// C helpers for jitter
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
index 41c6d5dec79..34c322e5a85 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
@@ -64,13 +64,13 @@ struct BUCKET_THREAD
     std::string name;
 
     // id for this thread, assigned by the thread manager
-    uint32_t id;
+    uint32_t id{ 0 };
 
     // root of the bucket hierarchy for this thread
     BUCKET root;
 
     // currently executing bucket somewhere in the hierarchy
-    BUCKET* pCurrent;
+    BUCKET* pCurrent{ nullptr };
 
     // currently executing hierarchy level
     uint32_t level{ 0 };
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 8fa6d9ef408..fa792b42e1a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -43,14 +43,14 @@ typedef uint8_t simdmask;
 // simd vector
 OSALIGNSIMD(union) simdvector
 {
-	simdscalar	v[4];
-	struct
-	{
-		simdscalar x, y, z, w;
-	};
-
-	simdscalar& operator[] (const int i) { return v[i]; }
-	const simdscalar& operator[] (const int i) const { return v[i]; }
+    simdscalar  v[4];
+    struct
+    {
+        simdscalar x, y, z, w;
+    };
+
+    simdscalar& operator[] (const int i) { return v[i]; }
+    const simdscalar& operator[] (const int i) const { return v[i]; }
 };
 
 #if KNOB_SIMD_WIDTH == 8
@@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector
 #define _simd_load1_ps _mm256_broadcast_ss
 #define _simd_loadu_ps _mm256_loadu_ps
 #define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps	_mm256_set1_ps
-#define _simd_blend_ps	_mm256_blend_ps
+#define _simd_set1_ps   _mm256_set1_ps
+#define _simd_blend_ps  _mm256_blend_ps
 #define _simd_blendv_ps _mm256_blendv_ps
 #define _simd_store_ps _mm256_store_ps
 #define _simd_mul_ps _mm256_mul_ps
@@ -100,21 +100,156 @@ OSALIGNSIMD(union) simdvector
 INLINE \
 __m256i func(__m256i a, __m256i b)\
 {\
-	__m128i aHi = _mm256_extractf128_si256(a, 1);\
-	__m128i bHi = _mm256_extractf128_si256(b, 1);\
-	__m128i aLo = _mm256_castsi256_si128(a);\
-	__m128i bLo = _mm256_castsi256_si128(b);\
+    __m128i aHi = _mm256_extractf128_si256(a, 1);\
+    __m128i bHi = _mm256_extractf128_si256(b, 1);\
+    __m128i aLo = _mm256_castsi256_si128(a);\
+    __m128i bLo = _mm256_castsi256_si128(b);\
 \
-	__m128i subLo = intrin(aLo, bLo);\
-	__m128i subHi = intrin(aHi, bHi);\
+    __m128i subLo = intrin(aLo, bLo);\
+    __m128i subHi = intrin(aHi, bHi);\
 \
-	__m256i result = _mm256_castsi128_si256(subLo);\
-	        result = _mm256_insertf128_si256(result, subHi, 1);\
+    __m256i result = _mm256_castsi128_si256(subLo);\
+            result = _mm256_insertf128_si256(result, subHi, 1);\
 \
-	return result;\
+    return result;\
 }
 
 #if (KNOB_ARCH == KNOB_ARCH_AVX)
+INLINE
+__m256 _simdemu_permute_ps(__m256 a, __m256i b)
+{
+    __m128 aHi = _mm256_extractf128_ps(a, 1);
+    __m128i bHi = _mm256_extractf128_si256(b, 1);
+    __m128 aLo = _mm256_castps256_ps128(a);
+    __m128i bLo = _mm256_castsi256_si128(b);
+
+    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
+    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
+    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
+    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
+
+    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
+    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
+    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
+    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
+
+    __m256 result = _mm256_castps128_ps256(blendLowRes);
+    result = _mm256_insertf128_ps(result, blendHiRes, 1);
+
+    return result;
+}
+
+INLINE
+__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
+{
+    int32_t aHi, aLow, countHi, countLow;
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+
+INLINE
+__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
+{
+    int32_t aHi, aLow, countHi, countLow;
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
 #define _simd_mul_epi32 _simdemu_mul_epi32
 #define _simd_mullo_epi32 _simdemu_mullo_epi32
 #define _simd_sub_epi32 _simdemu_sub_epi32
@@ -136,7 +271,14 @@ __m256i func(__m256i a, __m256i b)\
 #define _simd_add_epi8 _simdemu_add_epi8
 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
+#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
+#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
+#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
+#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
 #define _simd_movemask_epi8 _simdemu_movemask_epi8
+#define _simd_permute_ps _simdemu_permute_ps
+#define _simd_srlv_epi32 _simdemu_srlv_epi32
+#define _simd_sllv_epi32 _simdemu_sllv_epi32
 
 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
 SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
@@ -158,6 +300,10 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
 SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
 SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
 SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
 
 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
 #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
@@ -176,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
 INLINE
 __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
 {
-	__m128 res = _mm_mul_ps(a, b);
-	res = _mm_add_ps(res, c);
-	return res;
+    __m128 res = _mm_mul_ps(a, b);
+    res = _mm_add_ps(res, c);
+    return res;
 }
 
 INLINE
 __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
 {
-	__m256 res = _mm256_mul_ps(a, b);
-	res = _mm256_add_ps(res, c);
-	return res;
+    __m256 res = _mm256_mul_ps(a, b);
+    res = _mm256_add_ps(res, c);
+    return res;
 }
 
 INLINE
 __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
 {
-	__m256 res = _mm256_mul_ps(a, b);
-	res = _mm256_sub_ps(res, c);
-	return res;
+    __m256 res = _mm256_mul_ps(a, b);
+    res = _mm256_sub_ps(res, c);
+    return res;
 }
 
 INLINE
@@ -295,7 +441,14 @@ int _simdemu_movemask_epi8(__m256i a)
 
 #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
+#define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
+#define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
+#define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
+#define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
 #define _simd_movemask_epi8 _mm256_movemask_epi8
+#define _simd_permute_ps _mm256_permutevar8x32_ps
+#define _simd_srlv_epi32 _mm256_srlv_epi32
+#define _simd_sllv_epi32 _mm256_sllv_epi32
 #endif
 
 #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
@@ -343,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl
 
 INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
 {
-	__m128i aHi = _mm256_extractf128_si256(a, 1);
-	__m128i aLo = _mm256_castsi256_si128(a);
+    __m128i aHi = _mm256_extractf128_si256(a, 1);
+    __m128i aLo = _mm256_castsi256_si128(a);
 
-	__m128i resHi = _mm_slli_epi32(aHi, i);
-	__m128i resLo = _mm_slli_epi32(aLo, i);
+    __m128i resHi = _mm_slli_epi32(aHi, i);
+    __m128i resLo = _mm_slli_epi32(aLo, i);
 
-	__m256i result = _mm256_castsi128_si256(resLo);
-		    result = _mm256_insertf128_si256(result, resHi, 1);
+    __m256i result = _mm256_castsi128_si256(resLo);
+            result = _mm256_insertf128_si256(result, resHi, 1);
 
-	return result;
+    return result;
 }
 
 INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
 {
-	__m128i aHi = _mm256_extractf128_si256(a, 1);
-	__m128i aLo = _mm256_castsi256_si128(a);
+    __m128i aHi = _mm256_extractf128_si256(a, 1);
+    __m128i aLo = _mm256_castsi256_si128(a);
 
-	__m128i resHi = _mm_srai_epi32(aHi, i);
-	__m128i resLo = _mm_srai_epi32(aLo, i);
+    __m128i resHi = _mm_srai_epi32(aHi, i);
+    __m128i resLo = _mm_srai_epi32(aLo, i);
 
-	__m256i result = _mm256_castsi128_si256(resLo);
-		    result = _mm256_insertf128_si256(result, resHi, 1);
+    __m256i result = _mm256_castsi128_si256(resLo);
+            result = _mm256_insertf128_si256(result, resHi, 1);
 
-	return result;
+    return result;
 }
 
 INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
@@ -386,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
 INLINE
 void _simdvec_transpose(simdvector &v)
 {
-	SWR_ASSERT(false, "Need to implement 8 wide version");
+    SWR_ASSERT(false, "Need to implement 8 wide version");
 }
 
 #else
@@ -397,132 +550,132 @@ void _simdvec_transpose(simdvector &v)
 INLINE
 void _simdvec_load_ps(simdvector& r, const float *p)
 {
-	r[0] = _simd_set1_ps(p[0]);
-	r[1] = _simd_set1_ps(p[1]);
-	r[2] = _simd_set1_ps(p[2]);
-	r[3] = _simd_set1_ps(p[3]);
+    r[0] = _simd_set1_ps(p[0]);
+    r[1] = _simd_set1_ps(p[1]);
+    r[2] = _simd_set1_ps(p[2]);
+    r[3] = _simd_set1_ps(p[3]);
 }
 
 INLINE
 void _simdvec_mov(simdvector& r, const simdscalar& s)
 {
-	r[0] = s;
-	r[1] = s;
-	r[2] = s;
-	r[3] = s;
+    r[0] = s;
+    r[1] = s;
+    r[2] = s;
+    r[3] = s;
 }
 
 INLINE
 void _simdvec_mov(simdvector& r, const simdvector& v)
 {
-	r[0] = v[0];
-	r[1] = v[1];
-	r[2] = v[2];
-	r[3] = v[3];
+    r[0] = v[0];
+    r[1] = v[1];
+    r[2] = v[2];
+    r[3] = v[3];
 }
 
 // just move a lane from the source simdvector to dest simdvector
 INLINE
 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
 {
-	_simd_mov(r[0], rlane, s[0], slane);
-	_simd_mov(r[1], rlane, s[1], slane);
-	_simd_mov(r[2], rlane, s[2], slane);
-	_simd_mov(r[3], rlane, s[3], slane);
+    _simd_mov(r[0], rlane, s[0], slane);
+    _simd_mov(r[1], rlane, s[1], slane);
+    _simd_mov(r[2], rlane, s[2], slane);
+    _simd_mov(r[3], rlane, s[3], slane);
 }
 
 INLINE
 void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
 {
-	simdscalar tmp;
-	r	= _simd_mul_ps(v0[0], v1[0]);	// (v0.x*v1.x)
+    simdscalar tmp;
+    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
 
-	tmp	= _simd_mul_ps(v0[1], v1[1]);		// (v0.y*v1.y)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y)
+    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
 
-	tmp	= _simd_mul_ps(v0[2], v1[2]);	// (v0.z*v1.z)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 }
 
 INLINE
 void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
 {
-	simdscalar tmp;
-	r	= _simd_mul_ps(v0[0], v1[0]);	// (v0.x*v1.x)
+    simdscalar tmp;
+    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
 
-	tmp	= _simd_mul_ps(v0[1], v1[1]);		// (v0.y*v1.y)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y)
+    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
 
-	tmp	= _simd_mul_ps(v0[2], v1[2]);	// (v0.z*v1.z)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 
-	tmp	= _simd_mul_ps(v0[3], v1[3]);	// (v0.w*v1.w)
-	r	= _simd_add_ps(r, tmp);			// (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+    tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
+    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
 }
 
 INLINE
 simdscalar _simdvec_rcp_length_ps(const simdvector& v)
 {
-	simdscalar length;
-	_simdvec_dp4_ps(length, v, v);
-	return _simd_rsqrt_ps(length);
+    simdscalar length;
+    _simdvec_dp4_ps(length, v, v);
+    return _simd_rsqrt_ps(length);
 }
 
 INLINE
 void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
 {
-	simdscalar vecLength;
-	vecLength = _simdvec_rcp_length_ps(v);
+    simdscalar vecLength;
+    vecLength = _simdvec_rcp_length_ps(v);
 
-	r[0] = _simd_mul_ps(v[0], vecLength);
-	r[1] = _simd_mul_ps(v[1], vecLength);
-	r[2] = _simd_mul_ps(v[2], vecLength);
-	r[3] = _simd_mul_ps(v[3], vecLength);
+    r[0] = _simd_mul_ps(v[0], vecLength);
+    r[1] = _simd_mul_ps(v[1], vecLength);
+    r[2] = _simd_mul_ps(v[2], vecLength);
+    r[3] = _simd_mul_ps(v[3], vecLength);
 }
 
 INLINE
 void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
 {
-	r[0] = _simd_mul_ps(v[0], s);
-	r[1] = _simd_mul_ps(v[1], s);
-	r[2] = _simd_mul_ps(v[2], s);
-	r[3] = _simd_mul_ps(v[3], s);
+    r[0] = _simd_mul_ps(v[0], s);
+    r[1] = _simd_mul_ps(v[1], s);
+    r[2] = _simd_mul_ps(v[2], s);
+    r[3] = _simd_mul_ps(v[3], s);
 }
 
 INLINE
 void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
 {
-	r[0] = _simd_mul_ps(v0[0], v1[0]);
-	r[1] = _simd_mul_ps(v0[1], v1[1]);
-	r[2] = _simd_mul_ps(v0[2], v1[2]);
-	r[3] = _simd_mul_ps(v0[3], v1[3]);
+    r[0] = _simd_mul_ps(v0[0], v1[0]);
+    r[1] = _simd_mul_ps(v0[1], v1[1]);
+    r[2] = _simd_mul_ps(v0[2], v1[2]);
+    r[3] = _simd_mul_ps(v0[3], v1[3]);
 }
 
 INLINE
 void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
 {
-	r[0] = _simd_add_ps(v0[0], v1[0]);
-	r[1] = _simd_add_ps(v0[1], v1[1]);
-	r[2] = _simd_add_ps(v0[2], v1[2]);
-	r[3] = _simd_add_ps(v0[3], v1[3]);
+    r[0] = _simd_add_ps(v0[0], v1[0]);
+    r[1] = _simd_add_ps(v0[1], v1[1]);
+    r[2] = _simd_add_ps(v0[2], v1[2]);
+    r[3] = _simd_add_ps(v0[3], v1[3]);
 }
 
 INLINE
 void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
 {
-	r[0] = _simd_min_ps(v0[0], s);
-	r[1] = _simd_min_ps(v0[1], s);
-	r[2] = _simd_min_ps(v0[2], s);
-	r[3] = _simd_min_ps(v0[3], s);
+    r[0] = _simd_min_ps(v0[0], s);
+    r[1] = _simd_min_ps(v0[1], s);
+    r[2] = _simd_min_ps(v0[2], s);
+    r[3] = _simd_min_ps(v0[3], s);
 }
 
 INLINE
 void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
 {
-	r[0] = _simd_max_ps(v0[0], s);
-	r[1] = _simd_max_ps(v0[1], s);
-	r[2] = _simd_max_ps(v0[2], s);
-	r[3] = _simd_max_ps(v0[3], s);
+    r[0] = _simd_max_ps(v0[0], s);
+    r[1] = _simd_max_ps(v0[1], s);
+    r[2] = _simd_max_ps(v0[2], s);
+    r[3] = _simd_max_ps(v0[3], s);
 }
 
 // Matrix4x4 * Vector4
@@ -532,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
 INLINE
 void _simd_mat4x4_vec4_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
-{
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
-
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[0] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[1] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[2] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 3*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 3);	// m[row][3]
-	r1	= _simd_mul_ps(m, v[3]);				// (m3 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-	result[3] = r0;
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[0] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[1] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[2] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
+    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+    result[3] = r0;
 }
 
 // Matrix4x4 * Vector3 - Direction Vector where w = 0.
@@ -600,45 +753,45 @@ void _simd_mat4x4_vec4_multiply(
 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
 INLINE
 void _simd_mat3x3_vec3_w0_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
-{
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
-
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	result[0] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	result[1] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	result[2] = r0;
-
-	result[3] = _simd_setzero_ps();
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[0] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[1] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[2] = r0;
+
+    result[3] = _simd_setzero_ps();
 }
 
 // Matrix4x4 * Vector3 - Position vector where w = 1.
@@ -648,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply(
 //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
 INLINE
 void _simd_mat4x4_vec3_w1_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
-{
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
-
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[0] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[1] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[2] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 3*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 3*4 + 3);	// m[row][3]
-	result[3]	= _simd_add_ps(r0, m);			// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[0] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[1] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[2] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
+    result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
 }
 
 INLINE
 void _simd_mat4x3_vec3_w1_multiply(
-	simdvector& result,
-	const float *pMatrix,
-	const simdvector& v)
-{
-	simdscalar m;
-	simdscalar r0;
-	simdscalar r1;
-
-	m	= _simd_load1_ps(pMatrix + 0*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 0*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[0] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 1*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 1*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[1] = r0;
-
-	m	= _simd_load1_ps(pMatrix + 2*4 + 0);	// m[row][0]
-	r0	= _simd_mul_ps(m, v[0]);				// (m00 * v.x)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 1);	// m[row][1]
-	r1	= _simd_mul_ps(m, v[1]);				// (m1 * v.y)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 2);	// m[row][2]
-	r1	= _simd_mul_ps(m, v[2]);				// (m2 * v.z)
-	r0	= _simd_add_ps(r0, r1);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-	m	= _simd_load1_ps(pMatrix + 2*4 + 3);	// m[row][3]
-	r0	= _simd_add_ps(r0, m);					// (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-	result[2] = r0;
-	result[3] = _simd_set1_ps(1.0f);
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[0] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[1] = r0;
+
+    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
+    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
+    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
+    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
+    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
+    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    result[2] = r0;
+    result[3] = _simd_set1_ps(1.0f);
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -783,5 +936,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
     return vplaneps(vA, vB, vC, vI, vJ);
 }
 
+INLINE
+UINT pdep_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+    return _pdep_u32(a, mask);
+#else
+    UINT result = 0;
+
+    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html 
+    // using bsf instead of funky loop
+    DWORD maskIndex;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask
+        const UINT lowest = 1 << maskIndex;
+
+        // 2. populate LSB from src
+        const UINT LSB = (UINT)((int)(a << 31) >> 31);
+
+        // 3. copy bit from mask
+        result |= LSB & lowest;
+
+        // 4. clear lowest bit
+        mask &= ~lowest;
+
+        // 5. prepare for next iteration
+        a >>= 1;
+    }
+
+    return result;
+#endif
+}
+
+INLINE
+UINT pext_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+    return _pext_u32(a, mask);
+#else
+    UINT result = 0;
+    DWORD maskIndex;
+    uint32_t currentBit = 0;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask
+        const UINT lowest = 1 << maskIndex;
+
+        // 2. copy bit from mask
+        result |= ((a & lowest) > 0) << currentBit++;
+
+        // 3. clear lowest bit
+        mask &= ~lowest;
+    }
+    return result;
+#endif
+}
 
 #endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index fccccab503c..f0f7956b590 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext);
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
 HANDLE SwrCreateContext(
-    const SWR_CREATECONTEXT_INFO* pCreateInfo)
+    SWR_CREATECONTEXT_INFO* pCreateInfo)
 {
     RDTSC_RESET();
     RDTSC_INIT(0);
@@ -61,27 +61,16 @@ HANDLE SwrCreateContext(
     pContext->driverType = pCreateInfo->driver;
     pContext->privateStateSize = pCreateInfo->privateStateSize;
 
-    pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
-    pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
-    memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
-    pContext->numSubContexts = pCreateInfo->maxSubContexts;
-    if (pContext->numSubContexts > 1)
-    {
-        pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64);
-        memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts);
-    }
+    pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+    pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
-        pContext->dcRing[dc].pArena = new Arena();
-        pContext->dcRing[dc].inUse = false;
+        pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
         pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
         pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
 
-        pContext->dsRing[dc].pArena = new Arena();
+        pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
 
     if (!KNOB_SINGLE_THREADED)
@@ -108,9 +97,6 @@ HANDLE SwrCreateContext(
         pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
     }
 
-    pContext->nextDrawId = 1;
-    pContext->DrawEnqueued = 1;
-
     // State setup AFTER context is fully initialized
     SetupDefaultState(pContext);
 
@@ -125,6 +111,13 @@ HANDLE SwrCreateContext(
     pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
     pContext->pfnClearTile = pCreateInfo->pfnClearTile;
 
+    // pass pointer to bucket manager back to caller
+#ifdef KNOB_ENABLE_RDTSC
+    pCreateInfo->pBucketMgr = &gBucketMgr;
+#endif
+
+    pCreateInfo->contextSaveSize = sizeof(API_STATE);
+
     return (HANDLE)pContext;
 }
 
@@ -148,10 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
         _aligned_free(pContext->pScratch[i]);
     }
 
-    _aligned_free(pContext->dcRing);
-    _aligned_free(pContext->dsRing);
-    _aligned_free(pContext->subCtxSave);
-
     delete(pContext->pHotTileMgr);
 
     pContext->~SWR_CONTEXT();
@@ -168,49 +157,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
     pContext->FifosNotEmpty.notify_all();
 }
 
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
-{
-    // For single thread nothing should still be drawing.
-    if (KNOB_SINGLE_THREADED) { return false; }
-
-    if (pDC->isCompute)
-    {
-        if (pDC->doneCompute)
-        {
-            pDC->inUse = false;
-            return false;
-        }
-    }
-
-    // Check if backend work is done. First make sure all triangles have been binned.
-    if (pDC->doneFE == true)
-    {
-        // ensure workers have all moved passed this draw
-        if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
-        {
-            return true;
-        }
-
-        if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
-        {
-            return true;
-        }
-
-        pDC->inUse = false;    // all work is done.
-    }
-
-    return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
 {
-    SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
-    pContext->pCurDrawContext->inUse = true;
+    // Each worker thread looks at a DC for both FE and BE work at different times and so we
+    // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
+    // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+    // then moved on if all work is done.)
+    pContext->pCurDrawContext->threadsDone =
+        pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
 
     _ReadWriteBarrier();
     {
         std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->DrawEnqueued++;
+        pContext->dcRing.Enqueue();
     }
 
     if (KNOB_SINGLE_THREADED)
@@ -219,10 +179,21 @@ void QueueDraw(SWR_CONTEXT *pContext)
         uint32_t mxcsr = _mm_getcsr();
         _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
 
-        std::unordered_set<uint32_t> lockedTiles;
-        uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
-        WorkOnFifoFE(pContext, 0, curDraw[0], 0);
-        WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+        if (IsDraw)
+        {
+            static TileSet lockedTiles;
+            uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+            WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
+        }
+        else
+        {
+            uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+            WorkOnCompute(pContext, 0, curDispatch);
+        }
+
+        // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+        while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {}
 
         // restore csr
         _mm_setcsr(mxcsr);
@@ -239,40 +210,14 @@ void QueueDraw(SWR_CONTEXT *pContext)
     pContext->pCurDrawContext = nullptr;
 }
 
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
 {
-    SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
-    pContext->pCurDrawContext->inUse = true;
-
-    _ReadWriteBarrier();
-    {
-        std::unique_lock<std::mutex> lock(pContext->WaitLock);
-        pContext->DrawEnqueued++;
-    }
-
-    if (KNOB_SINGLE_THREADED)
-    {
-        // flush denormals to 0
-        uint32_t mxcsr = _mm_getcsr();
-        _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
-        uint64_t curDispatch = pContext->pCurDrawContext->drawId;
-        WorkOnCompute(pContext, 0, curDispatch);
-
-        // restore csr
-        _mm_setcsr(mxcsr);
-    }
-    else
-    {
-        RDTSC_START(APIDrawWakeAllThreads);
-        WakeAllThreads(pContext);
-        RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
-    }
+    QueueWork<true>(pContext);
+}
 
-    // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
-    pContext->pPrevDrawContext = pContext->pCurDrawContext;
-    pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+    QueueWork<false>(pContext);
 }
 
 DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
@@ -281,23 +226,21 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
     // If current draw context is null then need to obtain a new draw context to use from ring.
     if (pContext->pCurDrawContext == nullptr)
     {
-        uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
-        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
-        pContext->pCurDrawContext = pCurDrawContext;
-
-        // Need to wait until this draw context is available to use.
-        while (StillDrawing(pContext, pCurDrawContext))
+        // Need to wait for a free entry.
+        while (pContext->dcRing.IsFull())
         {
             _mm_pause();
         }
 
+        uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+        DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+        pContext->pCurDrawContext = pCurDrawContext;
+
         // Assign next available entry in DS ring to this DC.
         uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
         pCurDrawContext->pState = &pContext->dsRing[dsIndex];
 
-        Arena& stateArena = *(pCurDrawContext->pState->pArena);
-
         // Copy previous state to current state.
         if (pContext->pPrevDrawContext)
         {
@@ -310,7 +253,9 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
             {
                 CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState);
 
-                stateArena.Reset(true);    // Reset memory.
+                // Should have been cleaned up previously
+                SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
+
                 pCurDrawContext->pState->pPrivateState = nullptr;
 
                 pContext->curStateId++;  // Progress state ring index forward.
@@ -320,30 +265,31 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
                 // If its a split draw then just copy the state pointer over
                 // since its the same draw.
                 pCurDrawContext->pState = pPrevDrawContext->pState;
+                SWR_ASSERT(pPrevDrawContext->cleanupState == false);
             }
         }
         else
         {
-            stateArena.Reset();    // Reset memory.
+            SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true);
             pContext->curStateId++;  // Progress state ring index forward.
         }
 
+        SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true);
+
         pCurDrawContext->dependency = 0;
-        pCurDrawContext->pArena->Reset();
         pCurDrawContext->pContext = pContext;
         pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
-        pCurDrawContext->inUse = false;
 
-        pCurDrawContext->doneCompute = false;
         pCurDrawContext->doneFE = false;
         pCurDrawContext->FeLock = 0;
-        pCurDrawContext->threadsDoneFE = 0;
-        pCurDrawContext->threadsDoneBE = 0;
+        pCurDrawContext->threadsDone = 0;
 
         pCurDrawContext->pTileMgr->initialize();
 
         // Assign unique drawId for this DC
-        pCurDrawContext->drawId = pContext->nextDrawId++;
+        pCurDrawContext->drawId = pContext->dcRing.GetHead();
+
+        pCurDrawContext->cleanupState = true;
     }
     else
     {
@@ -354,38 +300,36 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
     return pContext->pCurDrawContext;
 }
 
-void SWR_API SwrSetActiveSubContext(
-    HANDLE hContext,
-    uint32_t subContextIndex)
+API_STATE* GetDrawState(SWR_CONTEXT *pContext)
 {
-    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
-    if (subContextIndex >= pContext->numSubContexts)
-    {
-        return;
-    }
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    SWR_ASSERT(pDC->pState != nullptr);
 
-    if (subContextIndex != pContext->curSubCtxId)
-    {
-        // Save and restore draw state
-        DRAW_CONTEXT* pDC = GetDrawContext(pContext);
-        CopyState(
-            pContext->subCtxSave[pContext->curSubCtxId],
-            *(pDC->pState));
+    return &pDC->pState->state;
+}
 
-        CopyState(
-            *(pDC->pState),
-            pContext->subCtxSave[subContextIndex]);
+void SWR_API SwrSaveState(
+    HANDLE hContext,
+    void* pOutputStateBlock,
+    size_t memSize)
+{
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+    auto pSrc = GetDrawState(pContext);
+    SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
 
-        pContext->curSubCtxId = subContextIndex;
-    }
+    memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
 }
 
-API_STATE* GetDrawState(SWR_CONTEXT *pContext)
+void SWR_API SwrRestoreState(
+    HANDLE hContext,
+    const void* pStateBlock,
+    size_t memSize)
 {
-    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
-    SWR_ASSERT(pDC->pState != nullptr);
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+    auto pDst = GetDrawState(pContext);
+    SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
 
-    return &pDC->pState->state;
+    memcpy(pDst, pStateBlock, sizeof(*pDst));
 }
 
 void SetupDefaultState(SWR_CONTEXT *pContext)
@@ -431,16 +375,12 @@ void SwrWaitForIdle(HANDLE hContext)
     SWR_CONTEXT *pContext = GetContext(hContext);
 
     RDTSC_START(APIWaitForIdle);
-    // Wait for all work to complete.
-    for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
-    {
-        DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
 
-        while (StillDrawing(pContext, pDC))
-        {
-            _mm_pause();
-        }
+    while (!pContext->dcRing.IsEmpty())
+    {
+        _mm_pause();
     }
+
     RDTSC_STOP(APIWaitForIdle, 1, 0);
 }
 
@@ -770,16 +710,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
         pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
     }
 }
-
+// templated backend function tables
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
+extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
+extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
 void SetupPipeline(DRAW_CONTEXT *pDC)
 {
     DRAW_STATE* pState = pDC->pState;
     const SWR_RASTSTATE &rastState = pState->state.rastState;
+    const SWR_PS_STATE &psState = pState->state.psState;
     BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
     const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
 
     // setup backend
-    if (pState->state.psState.pfnPixelShader == nullptr)
+    if (psState.pfnPixelShader == nullptr)
     {
         backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
         // always need to generate I & J per sample for Z interpolation
@@ -788,41 +737,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
     else
     {
         const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
-        const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+        const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
 
         // currently only support 'normal' input coverage
-        SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
-                   pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
+        SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
+                   psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
      
-        SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask;
+        SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
         
         // select backend function
-        switch(pState->state.psState.shadingRate)
+        switch(psState.shadingRate)
         {
         case SWR_SHADING_RATE_PIXEL:
             if(bMultisampleEnable)
             {
                 // always need to generate I & J per sample for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount];
-                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+                backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
+                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
             }
             else
             {
                 // always need to generate I & J per pixel for Z interpolation
                 barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
-                backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid];
-                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+                backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
+                backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
             }
             break;
         case SWR_SHADING_RATE_SAMPLE:
             SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
             // always need to generate I & J per sample for Z interpolation
             barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
-            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid];
-            backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+            backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
+            backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
             break;
-        case SWR_SHADING_RATE_COARSE:
         default:
             SWR_ASSERT(0 && "Invalid shading rate");
             break;
@@ -913,7 +861,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
 
     uint32_t numRTs = pState->state.psState.numRenderTargets;
     pState->state.colorHottileEnable = 0;
-    if(pState->state.psState.pfnPixelShader != nullptr)
+    if (psState.pfnPixelShader != nullptr)
     {
         for (uint32_t rt = 0; rt < numRTs; ++rt)
         {
@@ -1005,6 +953,11 @@ uint32_t MaxVertsPerDraw(
         }
         break;
 
+    // The Primitive Assembly code can only handle 1 RECT at a time.
+    case TOP_RECT_LIST:
+        vertsPerDraw = 3;
+        break;
+
     default:
         // We are not splitting up draws for other topologies.
         break;
@@ -1116,6 +1069,8 @@ void DrawInstanced(
         pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
         pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw;
 
+        pDC->cleanupState = (remainingVerts == numVertsForDraw);
+
         //enqueue DC
         QueueDraw(pContext);
 
@@ -1250,6 +1205,8 @@ void DrawIndexedInstance(
         pDC->FeWork.desc.draw.baseVertex = baseVertex;
         pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw;
 
+        pDC->cleanupState = (remainingIndices == numIndicesForDraw);
+
         //enqueue DC
         QueueDraw(pContext);
 
@@ -1305,7 +1262,10 @@ void SwrDrawIndexedInstanced(
     DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
 }
 
-// Attach surfaces to pipeline
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
 void SwrInvalidateTiles(
     HANDLE hContext,
     uint32_t attachmentMask)
@@ -1313,10 +1273,39 @@ void SwrInvalidateTiles(
     SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
+    pDC->FeWork.type = DISCARDINVALIDATETILES;
+    pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+    pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+    memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
+    pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
+    pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
+    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
+
+    //enqueue
+    QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SwrDiscardRect(
+    HANDLE hContext,
+    uint32_t attachmentMask,
+    SWR_RECT rect)
+{
+    SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
     // Queue a load to the hottile
-    pDC->FeWork.type = INVALIDATETILES;
-    pDC->FeWork.pfnWork = ProcessInvalidateTiles;
-    pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask;
+    pDC->FeWork.type = DISCARDINVALIDATETILES;
+    pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+    pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+    pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
+    pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
+    pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
+    pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
 
     //enqueue
     QueueDraw(pContext);
@@ -1391,7 +1380,7 @@ void SwrClearRenderTarget(
     uint32_t clearMask,
     const float clearColor[4],
     float z,
-    BYTE stencil)
+    uint8_t stencil)
 {
     RDTSC_START(APIClearRenderTarget);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 72fae8b2c21..90c2f038c46 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t
 /// @param pDstHotTile - pointer to the hot tile surface
 typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile);
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Function signature for store hot tiles
@@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma
 /// @param pSrcHotTile - pointer to the hot tile surface
 typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
     SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
-    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile);
+    uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile);
 
 /// @brief Function signature for clearing from the hot tiles clear value
 /// @param hPrivateContext - handle to private data
@@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
     SWR_RENDERTARGET_ATTACHMENT rtIndex,
     uint32_t x, uint32_t y, const float* pClearColor);
 
+class BucketManager;
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_CREATECONTEXT_INFO
 /////////////////////////////////////////////////////////////////////////
@@ -88,13 +90,17 @@ struct SWR_CREATECONTEXT_INFO
     // Use SwrGetPrivateContextState() to access private state.
     uint32_t privateStateSize;
 
-    // Each SWR context can have multiple sets of active state
-    uint32_t maxSubContexts;
-
-    // tile manipulation functions
+    // Tile manipulation functions
     PFN_LOAD_TILE pfnLoadTile;
     PFN_STORE_TILE pfnStoreTile;
     PFN_CLEAR_TILE pfnClearTile;
+
+    // Pointer to rdtsc buckets mgr returned to the caller.
+    // Only populated when KNOB_ENABLE_RDTSC is set
+    BucketManager* pBucketMgr;
+
+    // Output: size required memory passed to for SwrSaveState / SwrRestoreState
+    size_t  contextSaveSize;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -112,7 +118,7 @@ struct SWR_RECT
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
 HANDLE SWR_API SwrCreateContext(
-    const SWR_CREATECONTEXT_INFO* pCreateInfo);
+    SWR_CREATECONTEXT_INFO* pCreateInfo);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Destroys SWR Context.
@@ -121,12 +127,24 @@ void SWR_API SwrDestroyContext(
     HANDLE hContext);
 
 //////////////////////////////////////////////////////////////////////////
-/// @brief Set currently active state context
-/// @param subContextIndex - value from 0 to
-///     SWR_CREATECONTEXT_INFO.maxSubContexts.  Defaults to 0.
-void SWR_API SwrSetActiveSubContext(
+/// @brief Saves API state associated with hContext
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pOutputStateBlock - Memory block to receive API state data
+/// @param memSize - Size of memory pointed to by pOutputStateBlock
+void SWR_API SwrSaveState(
     HANDLE hContext,
-    uint32_t subContextIndex);
+    void* pOutputStateBlock,
+    size_t memSize);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores API state to hContext previously saved with SwrSaveState
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStateBlock - Memory block to read API state data from
+/// @param memSize - Size of memory pointed to by pStateBlock
+void SWR_API SwrRestoreState(
+    HANDLE hContext,
+    const void* pStateBlock,
+    size_t memSize);
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Sync cmd. Executes the callback func when all rendering up to this sync
@@ -391,6 +409,16 @@ void SWR_API SwrInvalidateTiles(
     uint32_t attachmentMask);
 
 //////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SWR_API SwrDiscardRect(
+    HANDLE hContext,
+    uint32_t attachmentMask,
+    SWR_RECT rect);
+
+//////////////////////////////////////////////////////////////////////////
 /// @brief SwrDispatch
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
@@ -419,9 +447,9 @@ void SWR_API SwrStoreTiles(
 void SWR_API SwrClearRenderTarget(
     HANDLE hContext,
     uint32_t clearMask,
-    const FLOAT clearColor[4],
+    const float clearColor[4],
     float z,
-    BYTE stencil);
+    uint8_t stencil);
 
 void SWR_API SwrSetRastState(
     HANDLE hContext,
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
deleted file mode 100644
index 8184c8d3f4c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.cpp
-*
-* @brief Arena memory manager
-*        The arena is convenient and fast for managing allocations for any of
-*        our allocations that are associated with operations and can all be freed
-*        once when their operation has completed. Allocations are cheap since
-*        most of the time its simply an increment of an offset. Also, no need to
-*        free individual allocations. All of the arena memory can be freed at once.
-*
-******************************************************************************/
-
-#include "context.h"
-#include "arena.h"
-
-#include <cmath>
-
-Arena::Arena()
-    : m_pCurBlock(nullptr), m_size(0)
-{
-    m_pMutex = new std::mutex();
-}
-
-Arena::~Arena()
-{
-    Reset();        // Reset just in case to avoid leaking memory.
-
-    if (m_pCurBlock)
-    {
-        _aligned_free(m_pCurBlock->pMem);
-        delete m_pCurBlock;
-    }
-
-    delete m_pMutex;
-}
-
-///@todo Remove this when all users have stopped using this.
-void Arena::Init()
-{
-    m_size = 0;
-    m_pCurBlock = nullptr;
-
-    m_pMutex = new std::mutex();
-}
-
-void* Arena::AllocAligned(size_t size, size_t align)
-{
-    if (m_pCurBlock)
-    {
-        ArenaBlock* pCurBlock = m_pCurBlock;
-        pCurBlock->offset = AlignUp(pCurBlock->offset, align);
-
-        if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
-        {
-            void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
-            pCurBlock->offset += size;
-            m_size += size;
-            return pMem;
-        }
-
-        // Not enough memory in this block, fall through to allocate
-        // a new block
-    }
-
-    static const size_t ArenaBlockSize = 1024*1024;
-    size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
-    blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4);
-
-    void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4);    // Arena blocks are always simd byte aligned.
-    SWR_ASSERT(pMem != nullptr);
-
-    ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock();
-    SWR_ASSERT(pNewBlock != nullptr);
-
-    if (pNewBlock != nullptr)
-    {
-        pNewBlock->pNext        = m_pCurBlock;
-
-        m_pCurBlock             = pNewBlock;
-        m_pCurBlock->pMem       = pMem;
-        m_pCurBlock->blockSize  = blockSize;
-
-    }
-
-    return AllocAligned(size, align);
-}
-
-void* Arena::Alloc(size_t size)
-{
-    return AllocAligned(size, 1);
-}
-
-void* Arena::AllocAlignedSync(size_t size, size_t align)
-{
-    void* pAlloc = nullptr;
-
-    SWR_ASSERT(m_pMutex != nullptr);
-
-    m_pMutex->lock();
-    pAlloc = AllocAligned(size, align);
-    m_pMutex->unlock();
-
-    return pAlloc;
-}
-
-void* Arena::AllocSync(size_t size)
-{
-    void* pAlloc = nullptr;
-
-    SWR_ASSERT(m_pMutex != nullptr);
-
-    m_pMutex->lock();
-    pAlloc = Alloc(size);
-    m_pMutex->unlock();
-
-    return pAlloc;
-}
-
-void Arena::Reset(bool removeAll)
-{
-    if (m_pCurBlock)
-    {
-        m_pCurBlock->offset = 0;
-
-        ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
-        m_pCurBlock->pNext = nullptr;
-        while(pUsedBlocks)
-        {
-            ArenaBlock* pBlock = pUsedBlocks;
-            pUsedBlocks = pBlock->pNext;
-
-            _aligned_free(pBlock->pMem);
-            delete pBlock;
-        }
-
-        if (removeAll)
-        {
-            _aligned_free(m_pCurBlock->pMem);
-            delete m_pCurBlock;
-            m_pCurBlock = nullptr;
-        }
-    }
-
-    m_size = 0;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 76eee11fb08..67d81a44347 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -33,37 +33,308 @@
 #pragma once
 
 #include <mutex>
+#include <algorithm>
+#include <atomic>
+#include "core/utils.h"
 
-class Arena
+class DefaultAllocator
 {
 public:
-    Arena();
-   ~Arena();
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        void* p = _aligned_malloc(size, align);
+        return p;
+    }
+    void  Free(void* pMem)
+    {
+        _aligned_free(pMem);
+    }
+};
 
-    void        Init();
+static const size_t ARENA_BLOCK_ALIGN = 64;
 
-    void*       AllocAligned(size_t size, size_t  align);
-    void*       Alloc(size_t  size);
+struct ArenaBlock
+{
+    size_t      blockSize = 0;
+    ArenaBlock* pNext = nullptr;
+};
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
+              "Increase BLOCK_ALIGN size");
 
-    void*       AllocAlignedSync(size_t size, size_t align);
-    void*       AllocSync(size_t size);
+// Caching Allocator for Arena
+template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
+struct CachingAllocatorT : DefaultAllocator
+{
+    static uint32_t GetBucketId(size_t blockSize)
+    {
+        uint32_t bucketId = 0;
 
-    void        Reset(bool removeAll = false);
-    size_t      Size() { return m_size; }
+#if defined(BitScanReverseSizeT)
+        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
 
-private:
+        return bucketId;
+    }
+
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        SWR_ASSERT(size >= sizeof(ArenaBlock));
+        SWR_ASSERT(size <= uint32_t(-1));
+
+        size_t blockSize = size - ARENA_BLOCK_ALIGN;
+
+        {
+            // search cached blocks
+            std::lock_guard<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
+            ArenaBlock* pBlock = pPrevBlock->pNext;
+            ArenaBlock* pPotentialBlock = nullptr;
+            ArenaBlock* pPotentialPrev = nullptr;
+
+            while (pBlock)
+            {
+                if (pBlock->blockSize >= blockSize)
+                {
+                    if (pBlock == AlignUp(pBlock, align))
+                    {
+                        if (pBlock->blockSize == blockSize)
+                        {
+                            // Won't find a better match
+                            break;
+                        }
+
+                        // We could use this as it is larger than we wanted, but
+                        // continue to search for a better match
+                        pPotentialBlock = pBlock;
+                        pPotentialPrev = pPrevBlock;
+                    }
+                }
+                else
+                {
+                    // Blocks are sorted by size (biggest first)
+                    // So, if we get here, there are no blocks 
+                    // large enough, fall through to allocation.
+                    pBlock = nullptr;
+                    break;
+                }
+
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            if (!pBlock)
+            {
+                // Couldn't find an exact match, use next biggest size
+                pBlock = pPotentialBlock;
+                pPrevBlock = pPotentialPrev;
+            }
+
+            if (pBlock)
+            {
+                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+                pPrevBlock->pNext = pBlock->pNext;
+                pBlock->pNext = nullptr;
+
+                return pBlock;
+            }
+
+            m_totalAllocated += size;
+
+#if 0
+            {
+                static uint32_t count = 0;
+                char buf[128];
+                sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
+                OutputDebugStringA(buf);
+            }
+#endif
+        }
+
+        return this->DefaultAllocator::AllocateAligned(size, align);
+    }
+
+    void  Free(void* pMem)
+    {
+        if (pMem)
+        {
+            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+            SWR_ASSERT(pNewBlock->blockSize >= 0);
+
+            std::unique_lock<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
+            ArenaBlock* pBlock = pPrevBlock->pNext;
+
+            while (pBlock)
+            {
+                if (pNewBlock->blockSize >= pBlock->blockSize)
+                {
+                    // Insert here
+                    break;
+                }
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            // Insert into list
+            SWR_ASSERT(pPrevBlock);
+            pPrevBlock->pNext = pNewBlock;
+            pNewBlock->pNext = pBlock;
+        }
+    }
+
+    ~CachingAllocatorT()
+    {
+        // Free all cached blocks
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
+            while (pBlock)
+            {
+                ArenaBlock* pNext = pBlock->pNext;
+                this->DefaultAllocator::Free(pBlock);
+                pBlock = pNext;
+            }
+        }
+    }
+
+    // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
+    static const uint32_t   CACHE_NUM_BUCKETS       = NumBucketsT;
+    static const uint32_t   CACHE_START_BUCKET_BIT  = StartBucketBitT;
+
+    ArenaBlock              m_cachedBlocks[CACHE_NUM_BUCKETS];
+    std::mutex              m_mutex;
+
+    size_t                  m_totalAllocated = 0;
+};
+typedef CachingAllocatorT<> CachingAllocator;
+
+template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
+class TArena
+{
+public:
+    TArena(T& in_allocator)  : m_allocator(in_allocator) {}
+    TArena()                 : m_allocator(m_defAllocator) {}
+    ~TArena()
+    {
+        Reset(true);
+    }
+
+    void* AllocAligned(size_t size, size_t  align)
+    {
+        if (0 == size)
+        {
+            return nullptr;
+        }
+
+        SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
+
+        if (m_pCurBlock)
+        {
+            ArenaBlock* pCurBlock = m_pCurBlock;
+            size_t offset = AlignUp(m_offset, align);
+
+            if ((offset + size) <= pCurBlock->blockSize)
+            {
+                void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN);
+                m_offset = offset + size;
+                return pMem;
+            }
+
+            // Not enough memory in this block, fall through to allocate
+            // a new block
+        }
+
+        static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN;
+        size_t blockSize = std::max(size, ArenaBlockSize);
+
+        // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
+        blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
+
+        void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);    // Arena blocks are always simd byte aligned.
+        SWR_ASSERT(pMem != nullptr);
+
+        ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
+
+        if (pNewBlock != nullptr)
+        {
+            m_offset = 0;
+            pNewBlock->pNext = m_pCurBlock;
+
+            m_pCurBlock = pNewBlock;
+            m_pCurBlock->blockSize = blockSize;
+        }
+
+        return AllocAligned(size, align);
+    }
+
+    void* Alloc(size_t  size)
+    {
+        return AllocAligned(size, 1);
+    }
 
-    struct ArenaBlock
+    void* AllocAlignedSync(size_t size, size_t align)
     {
-        void*       pMem        = nullptr;
-        size_t      blockSize   = 0;
-        size_t      offset      = 0;
-        ArenaBlock* pNext       = nullptr;
-    };
+        void* pAlloc = nullptr;
 
-    ArenaBlock*     m_pCurBlock = nullptr;
-    size_t          m_size      = 0;
+        m_mutex.lock();
+        pAlloc = AllocAligned(size, align);
+        m_mutex.unlock();
+
+        return pAlloc;
+    }
+
+    void* AllocSync(size_t size)
+    {
+        void* pAlloc = nullptr;
+
+        m_mutex.lock();
+        pAlloc = Alloc(size);
+        m_mutex.unlock();
+
+        return pAlloc;
+    }
+
+    void Reset(bool removeAll = false)
+    {
+        m_offset = 0;
+
+        if (m_pCurBlock)
+        {
+            ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
+            m_pCurBlock->pNext = nullptr;
+            while (pUsedBlocks)
+            {
+                ArenaBlock* pBlock = pUsedBlocks;
+                pUsedBlocks = pBlock->pNext;
+
+                m_allocator.Free(pBlock);
+            }
+
+            if (removeAll)
+            {
+                m_allocator.Free(m_pCurBlock);
+                m_pCurBlock = nullptr;
+            }
+        }
+    }
+
+    bool IsEmpty()
+    {
+        return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr);
+    }
+
+private:
+
+    ArenaBlock*         m_pCurBlock = nullptr;
+    size_t              m_offset    = 0;
 
     /// @note Mutex is only used by sync allocation functions.
-    std::mutex*     m_pMutex;
+    std::mutex          m_mutex;
+
+    DefaultAllocator    m_defAllocator;
+    T&                  m_allocator;
 };
+
+using StdArena      = TArena<DefaultAllocator>;
+using CachingArena  = TArena<CachingAllocator>;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 4a472bc9e5c..7fb83edf169 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil
 }
 
 template<SWR_FORMAT format>
-void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
 {
     auto lambda = [&](int comp)
     {
@@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
             /// @todo clear data should come in as RGBA32_FLOAT
             DWORD clearData[4];
             float clearFloat[4];
-            clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
-            clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
-            clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
-            clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
+            clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
+            clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
+            clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
+            clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
             clearData[0] = *(DWORD*)&clearFloat[0];
             clearData[1] = *(DWORD*)&clearFloat[1];
             clearData[2] = *(DWORD*)&clearFloat[2];
@@ -399,30 +399,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
 }
 
 
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
 {
-    INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
+    DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
     SWR_CONTEXT *pContext = pDC->pContext;
 
+    const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+
     for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
     {
         if (pDesc->attachmentMask & (1 << i))
         {
-            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false);
+            HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
+                pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
             if (pHotTile)
             {
-                pHotTile->state = HOTTILE_INVALID;
+                pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
             }
         }
     }
 }
 
 #if KNOB_SIMD_WIDTH == 8
-const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
-const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
-const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#define MASK 0xff
+const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
 #else
 #error Unsupported vector width
 #endif
@@ -457,155 +459,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
     return _simd_movemask_ps(vClipMask);
 }
 
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-{
-
-    // will need to update for avx512
-    assert(KNOB_SIMD_WIDTH == 8);
-
-    __m256i mask[2];
-    __m256i sampleCoverage[2];
-    if(bIsStandardPattern)
-    {
-        __m256i src = _mm256_set1_epi32(0);
-        __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
-        {
-            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
-        {
-            mask[0] = _mm256_set1_epi32(-1);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
-        {
-            mask[0] = _mm256_set1_epi32(-1);
-            mask[1] = _mm256_set1_epi32(-1);
-            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
-        }
-
-        // gather coverage for samples 0-7
-        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
-        if(MultisampleTraits<sampleCountT>::numSamples > 8)
-        {
-            // gather coverage for samples 8-15
-            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
-        }
-    }
-    else
-    {
-        // center coverage is the same for all samples; just broadcast to the sample slots
-        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
-        if(MultisampleTraits<sampleCountT>::numSamples == 1)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
-        {
-            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
-        {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-        }
-        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
-        {
-            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
-        }
-    }
-
-    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
-                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
-    // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
-    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
-    __m256i packedCoverage1;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
-        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
-    }
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
-    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
-    __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
-        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
-        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
-        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
-        packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
-    }
-    else
-    {
-        packedSampleCoverage = packedCoverage0;
-    }
-#else
-    __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
-    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
-    packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
-    __m256i packedSampleCoverage;
-    if(MultisampleTraits<sampleCountT>::numSamples > 8)
-    {
-        permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
-        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
-        packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
-        // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
-        packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
-    }
-    else
-    {
-        packedSampleCoverage = packedCoverage0;
-    }
-#endif
-
-    for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
-    {
-        // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
-        inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
-        if(!bForcedSampleCount)
-        {
-            // input coverage has to be anded with sample mask if MSAA isn't forced on
-            inputMask[i] &= sampleMask;
-        }
-
-        // shift to the next pixel in the 4x2
-        packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
-    }
-}
-
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
-    uint32_t inputMask[KNOB_SIMD_WIDTH]; 
-    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
-    inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
-
 template<bool perspMask>
 INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
 {
@@ -766,6 +619,8 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
     // type safety guaranteed from template instantiation in BEChooser<>::GetFunc
     static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
     uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
+    simdvector blendOut;
+
     for(uint32_t rt = 0; rt < NumRT; ++rt)
     {
         uint8_t *pColorSample;
@@ -779,6 +634,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
         }
 
         const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+        // pfnBlendFunc may not update all channels.  Initialize with PS output.
+        /// TODO: move this into the blend JIT.
+        blendOut = psContext.shaded[rt];
 
         // Blend outputs and update coverage mask for alpha test
         if(pfnBlendFunc[rt] != nullptr)
@@ -789,7 +647,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
                 psContext.shaded[1],
                 sample,
                 pColorSample,
-                psContext.shaded[rt],
+                blendOut,
                 &psContext.oMask,
                 (simdscalari*)&coverageMask);
         }
@@ -805,19 +663,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
         // store with color mask
         if(!pRTBlend->writeDisableRed)
         {
-            _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x);
+            _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
         }
         if(!pRTBlend->writeDisableGreen)
         {
-            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y);
+            _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
         }
         if(!pRTBlend->writeDisableBlue)
         {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z);
+            _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
         }
         if(!pRTBlend->writeDisableAlpha)
         {
-            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w);
+            _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
         }
     }
 }
@@ -884,9 +742,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         // pixel center
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
 
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
@@ -898,9 +756,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
             if(coverageMask & MASK)
             {
                 RDTSC_START(BEBarycentric);
-                psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+                psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
                 // pixel center
-                psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+                psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
 
@@ -1077,15 +935,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
         // pixel center
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // pixel center
-            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
             RDTSC_START(BEBarycentric);
             backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
@@ -1313,14 +1171,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
     
     for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
-        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
-        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
         for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
-            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
-            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
+            psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
             // set pixel center positions
-            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+            psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
 
             if (bInputCoverage)
             {
@@ -1353,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             }
             else
             {
-				psContext.activeMask = _simd_set1_epi32(-1);
+                psContext.activeMask = _simd_set1_epi32(-1);
             }
 
             // need to declare enough space for all samples
@@ -1552,9 +1410,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     RDTSC_START(BESetup);
 
     static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
+
     SWR_CONTEXT *pContext = pDC->pContext;
     const API_STATE& state = GetApiState(pDC);
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+    const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
 
     // broadcast scalars
     BarycentricCoeffs coeffs;
@@ -1572,7 +1432,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
     coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
 
-    BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
 
     RDTSC_STOP(BESetup, 0, 0);
 
@@ -1580,12 +1440,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
     for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
     {
         // UL pixel corner
-        simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
 
         for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
         {
             // UL pixel corners
-            simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
 
             // iterate over active samples
             unsigned long sample = 0;
@@ -1593,7 +1453,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
             while (_BitScanForward(&sample, sampleMask))
             {
                 sampleMask &= ~(1 << sample);
-                if (work.coverageMask[sample] & MASK)
+                simdmask coverageMask = work.coverageMask[sample] & MASK;
+                if (coverageMask)
                 {
                     RDTSC_START(BEBarycentric);
                     // calculate per sample positions
@@ -1607,7 +1468,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
-                    simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
+                    // interpolate user clip distance if available
+                    if (rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                            psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    simdscalar vCoverageMask = vMask(coverageMask);
                     simdscalar stencilPassMask = vCoverageMask;
 
                     // offset depth/stencil buffers current sample
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 53089e5047b..2fa18953cad 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -29,16 +29,20 @@
 #pragma once
 
 #include "common/os.h"
-#include "core/context.h" 
+#include "core/context.h"
+#include "core/multisample.h"
 
 void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
 void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
 void InitClearTilesTable();
+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
+void InitBackendFuncTables();
+void InitCPSFuncTables();
 
 enum SWR_BACKEND_FUNCS
 {
@@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS
     SWR_BACKEND_MSAA_SAMPLE_RATE,
     SWR_BACKEND_FUNCS_MAX,
 };
-void InitBackendFuncTables();
 
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
-extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
+#if KNOB_SIMD_WIDTH == 8
+extern const __m256 vCenterOffsetsX;
+extern const __m256 vCenterOffsetsY;
+extern const __m256 vULOffsetsX;
+extern const __m256 vULOffsetsY;
+#define MASK 0xff
+#endif
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+{
+
+    // will need to update for avx512
+    assert(KNOB_SIMD_WIDTH == 8);
+
+    __m256i mask[2];
+    __m256i sampleCoverage[2];
+    if(bIsStandardPattern)
+    {
+        __m256i src = _mm256_set1_epi32(0);
+        __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+            mask[1] = _mm256_set1_epi32(-1);
+            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+        }
+
+        // gather coverage for samples 0-7
+        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+        if(MultisampleTraits<sampleCountT>::numSamples > 8)
+        {
+            // gather coverage for samples 8-15
+            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+        }
+    }
+    else
+    {
+        // center coverage is the same for all samples; just broadcast to the sample slots
+        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+        }
+    }
+
+    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+    // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+    __m256i packedCoverage1;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+    }
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+    __m256i packedSampleCoverage;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+        packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+    }
+    else
+    {
+        packedSampleCoverage = packedCoverage0;
+    }
+#else
+    __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
+    packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+    __m256i packedSampleCoverage;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+        packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+        // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+        packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+    }
+    else
+    {
+        packedSampleCoverage = packedCoverage0;
+    }
+#endif
+
+    for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+    {
+        // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+        inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+        if(!bForcedSampleCount)
+        {
+            // input coverage has to be anded with sample mask if MSAA isn't forced on
+            inputMask[i] &= sampleMask;
+        }
+
+        // shift to the next pixel in the 4x2
+        packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+    }
+}
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+{
+    uint32_t inputMask[KNOB_SIMD_WIDTH]; 
+    generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+    inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index ce27bf71d3c..3a2a8b35be8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -31,6 +31,9 @@
 #include "common/os.h"
 #include "core/clip.h"
 
+// Temp storage used by the clipper
+THREAD simdvertex tlsTempVertices[7];
+
 float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
 {
     return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 49494a4e374..ba5870a92bb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -32,6 +32,9 @@
 #include "core/pa.h"
 #include "rdtsc_core.h"
 
+// Temp storage used by the clipper
+extern THREAD simdvertex tlsTempVertices[7];
+
 enum SWR_CLIPCODES
 {
     // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -354,6 +357,25 @@ public:
             }
         }
 
+        // assemble user clip distances if enabled
+        if (this->state.rastState.clipDistanceMask & 0xf)
+        {
+            pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            {
+                vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
+            }
+        }
+
+        if (this->state.rastState.clipDistanceMask & 0xf0)
+        {
+            pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+            for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+            {
+                vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
+            }
+        }
+
         uint32_t numAttribs = maxSlot + 1;
 
         simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
@@ -436,6 +458,27 @@ public:
                 }
             }
 
+            // transpose user clip distances if enabled
+            if (this->state.rastState.clipDistanceMask & 0xf)
+            {
+                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+                for (uint32_t c = 0; c < 4; ++c)
+                {
+                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+                    pBase += sizeof(simdscalar);
+                }
+            }
+
+            if (this->state.rastState.clipDistanceMask & 0xf0)
+            {
+                pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+                for (uint32_t c = 0; c < 4; ++c)
+                {
+                    transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+                    pBase += sizeof(simdscalar);
+                }
+            }
+
             PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
 
             while (clipPa.GetNextStreamOutput())
@@ -630,6 +673,31 @@ private:
                 ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
             }
         }
+
+        // interpolate clip distance if enabled
+        if (this->state.rastState.clipDistanceMask & 0xf)
+        {
+            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+            }
+        }
+
+        if (this->state.rastState.clipDistanceMask & 0xf0)
+        {
+            uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+            for (uint32_t c = 0; c < 4; ++c)
+            {
+                simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+                simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+                simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+                ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+            }
+        }
     }
 
     template<SWR_CLIPCODES ClippingPlane>
@@ -700,6 +768,27 @@ private:
                     }
                 }
 
+                // store clip distance if enabled
+                if (this->state.rastState.clipDistanceMask & 0xf)
+                {
+                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+                    for (uint32_t c = 0; c < 4; ++c)
+                    {
+                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+                    }
+                }
+
+                if (this->state.rastState.clipDistanceMask & 0xf0)
+                {
+                    uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+                    for (uint32_t c = 0; c < 4; ++c)
+                    {
+                        simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+                        ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+                    }
+                }
+
                 // increment outIndex
                 vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
             }
@@ -818,8 +907,7 @@ private:
     simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
     {
         // temp storage
-        simdvertex tempVertices[7];
-        float* pTempVerts = (float*)&tempVertices[0];
+        float* pTempVerts = (float*)&tlsTempVertices[0];
 
         // zero out num input verts for non-active lanes
         simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
@@ -854,9 +942,9 @@ private:
         return vNumOutPts;
     }
 
-    const uint32_t workerId;
-    const DRIVER_TYPE driverType;
-    DRAW_CONTEXT* pDC;
+    const uint32_t workerId{ 0 };
+    const DRIVER_TYPE driverType{ DX };
+    DRAW_CONTEXT* pDC{ nullptr };
     const API_STATE& state;
     simdscalar clipCodes[NumVertsPerPrim];
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 4a214aff1c8..39f23372a18 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -41,6 +41,7 @@
 #include "core/knobs.h"
 #include "common/simdintrin.h"
 #include "core/threads.h"
+#include "ringbuffer.h"
 
 // x.8 fixed point precision values
 #define FIXED_POINT_SHIFT 8
@@ -82,6 +83,7 @@ struct SWR_TRIANGLE_DESC
     float *pUserClipBuffer;
 
     uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
+    uint64_t anyCoveredSamples;
 
     TRI_FLAGS triFlags;
 };
@@ -109,12 +111,16 @@ struct CLEAR_DESC
     CLEAR_FLAGS flags;
     float clearRTColor[4];  // RGBA_32F
     float clearDepth;   // [0..1]
-    BYTE clearStencil;
+    uint8_t clearStencil;
 };
 
-struct INVALIDATE_TILES_DESC
+struct DISCARD_INVALIDATE_TILES_DESC
 {
     uint32_t attachmentMask;
+    SWR_RECT rect;
+    SWR_TILE_STATE newTileState;
+    bool createNewTiles;
+    bool fullTilesOnly;
 };
 
 struct SYNC_DESC
@@ -150,7 +156,7 @@ enum WORK_TYPE
     SYNC,
     DRAW,
     CLEAR,
-    INVALIDATETILES,
+    DISCARDINVALIDATETILES,
     STORETILES,
     QUERYSTATS,
 };
@@ -164,7 +170,7 @@ struct BE_WORK
         SYNC_DESC sync;
         TRIANGLE_WORK_DESC tri;
         CLEAR_DESC clear;
-        INVALIDATE_TILES_DESC invalidateTiles;
+        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
         STORE_TILES_DESC storeTiles;
         QUERY_DESC queryStats;
     } desc;
@@ -201,7 +207,7 @@ struct FE_WORK
         SYNC_DESC sync;
         DRAW_WORK draw;
         CLEAR_DESC clear;
-        INVALIDATE_TILES_DESC invalidateTiles;
+        DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
         STORE_TILES_DESC storeTiles;
         QUERY_DESC queryStats;
     } desc;
@@ -354,6 +360,7 @@ struct BACKEND_FUNCS
     PFN_OUTPUT_MERGER pfnOutputMerger;
 };
 
+
 // Draw State
 struct DRAW_STATE
 {
@@ -365,7 +372,7 @@ struct DRAW_STATE
     BACKEND_FUNCS backendFuncs;
     PFN_PROCESS_PRIMS pfnProcessPrims;
 
-    Arena*    pArena;     // This should only be used by API thread.
+    CachingArena* pArena;     // This should only be used by API thread.
 };
 
 // Draw Context
@@ -381,25 +388,22 @@ struct DRAW_CONTEXT
 
     FE_WORK FeWork;
     volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(bool) inUse;
     volatile OSALIGNLINE(bool) doneFE;    // Is FE work done for this draw?
-
-    // Have all worker threads moved past draw in DC ring?
-    volatile OSALIGNLINE(uint32_t) threadsDoneFE;
-    volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+    volatile OSALIGNLINE(int64_t) threadsDone;
 
     uint64_t dependency;
 
     MacroTileMgr* pTileMgr;
 
     // The following fields are valid if isCompute is true.
-    volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done?   (isCompute)
     DispatchQueue* pDispatch;               // Queue for thread groups. (isCompute)
 
     DRAW_STATE* pState;
-    Arena*    pArena;
+    CachingArena* pArena;
 
     uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
+
+    bool  cleanupState; // True if this is the last draw using an entry in the state ring.
 };
 
 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
@@ -438,7 +442,7 @@ struct SWR_CONTEXT
     //  3. State - When an applications sets state after draw
     //     a. Same as step 1.
     //     b. State is copied from prev draw context to current.
-    DRAW_CONTEXT* dcRing;
+    RingBuffer<DRAW_CONTEXT> dcRing;
 
     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
@@ -448,14 +452,10 @@ struct SWR_CONTEXT
     //  These split draws all have identical state. So instead of storing the state directly
     //  in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
     //  to reference a single entry in the DS ring.
-    DRAW_STATE*   dsRing;
+    RingBuffer<DRAW_STATE> dsRing;
 
     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
 
-    DRAW_STATE*   subCtxSave;          // Save area for inactive contexts.
-    uint32_t      curSubCtxId;         // Current index for active state subcontext.
-    uint32_t      numSubContexts;      // Number of available subcontexts
-
     uint32_t NumWorkerThreads;
 
     THREAD_POOL threadPool; // Thread pool associated with this context
@@ -463,13 +463,6 @@ struct SWR_CONTEXT
     std::condition_variable FifosNotEmpty;
     std::mutex WaitLock;
 
-    // Draw Contexts will get a unique drawId generated from this
-    uint64_t nextDrawId;
-
-    // most recent draw id enqueued by the API thread
-    // written by api thread, read by multiple workers
-    OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
     DRIVER_TYPE driverType;
 
     uint32_t privateStateSize;
@@ -486,6 +479,8 @@ struct SWR_CONTEXT
 
     // Scratch space for workers.
     uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+
+    CachingAllocator cachingArenaAllocator;
 };
 
 void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 4f245c8c53e..2cc9d4054ac 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
 
 INLINE
 simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
-                 bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase,
+                 bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
                  simdscalar* pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
@@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC
 
 INLINE
 void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
-        bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, 
-        BYTE *pStencilBase, const simdscalar& stencilMask)
+        bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, 
+        uint8_t *pStencilBase, const simdscalar& stencilMask)
 {
     if (pDSState->depthWriteEnable)
     {
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 7e556012e6b..ccf0b70544f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -49,7 +49,8 @@ struct QUEUE
     static const uint32_t mBlockSizeShift = 6;
     static const uint32_t mBlockSize = 1 << mBlockSizeShift;
 
-    void clear(Arena& arena)
+    template <typename ArenaT>
+    void clear(ArenaT& arena)
     {
         mHead = 0;
         mTail = 0;
@@ -102,7 +103,8 @@ struct QUEUE
         mNumEntries --;
     }
 
-    bool enqueue_try_nosync(Arena& arena, const T* entry)
+    template <typename ArenaT>
+    bool enqueue_try_nosync(ArenaT& arena, const T* entry)
     {
         memcpy(&mCurBlock[mTail], entry, sizeof(T));
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 83d85fc86d8..344758eefe5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -34,7 +34,7 @@
 /// @param pSrc - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT SrcFormat>
-INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst)
 {
     // fast path for float32
     if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@@ -141,7 +141,7 @@ INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
 /// @param src - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT DstFormat>
-INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
+INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
 {
     // fast path for float32
     if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index aa350259a15..9acf846a7f0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false>
 struct PackTraits
 {
     static const uint32_t MyNumBits = NumBits;
-    static simdscalar loadSOA(const BYTE *pSrc) = delete;
-    static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+    static simdscalar loadSOA(const uint8_t *pSrc) = delete;
+    static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
     static simdscalar unpack(simdscalar &in) = delete;
     static simdscalar pack(simdscalar &in) = delete;
 };
@@ -48,8 +48,8 @@ struct PackTraits<0, false>
 {
     static const uint32_t MyNumBits = 0;
 
-    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
-    static void storeSOA(BYTE *pDst, simdscalar src) { return; }
+    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
+    static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
     static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
 };
@@ -63,7 +63,7 @@ struct PackTraits<8, false>
 {
     static const uint32_t MyNumBits = 8;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -74,7 +74,7 @@ struct PackTraits<8, false>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -125,7 +125,7 @@ struct PackTraits<8, true>
 {
     static const uint32_t MyNumBits = 8;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -136,7 +136,7 @@ struct PackTraits<8, true>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
         // store simd bytes
 #if KNOB_SIMD_WIDTH == 8
@@ -188,7 +188,7 @@ struct PackTraits<16, false>
 {
     static const uint32_t MyNumBits = 16;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -199,7 +199,7 @@ struct PackTraits<16, false>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -249,7 +249,7 @@ struct PackTraits<16, true>
 {
     static const uint32_t MyNumBits = 16;
 
-    static simdscalar loadSOA(const BYTE *pSrc)
+    static simdscalar loadSOA(const uint8_t *pSrc)
     {
 #if KNOB_SIMD_WIDTH == 8
         __m256 result = _mm256_setzero_ps();
@@ -260,7 +260,7 @@ struct PackTraits<16, true>
 #endif
     }
 
-    static void storeSOA(BYTE *pDst, simdscalar src)
+    static void storeSOA(uint8_t *pDst, simdscalar src)
     {
 #if KNOB_SIMD_WIDTH == 8
         // store 16B (2B * 8)
@@ -311,8 +311,8 @@ struct PackTraits<32, false>
 {
     static const uint32_t MyNumBits = 32;
 
-    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
-    static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+    static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
+    static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
     static simdscalar unpack(simdscalar &in) { return in; }
     static simdscalar pack(simdscalar &in) { return in; }
 };
@@ -984,7 +984,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::fromFloat();
     }
 
-    INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+    INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
     {
         switch (comp)
         {
@@ -1001,7 +1001,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
     }
 
-    INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+    INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
     {
         switch (comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f43a672bd82..36721e00beb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -193,35 +193,71 @@ void ProcessStoreTiles(
 /// @param workerId - thread's worker id. Even thread has a unique id.
 /// @param pUserData - Pointer to user data passed back to callback.
 /// @todo This should go away when we switch this to use compute threading.
-void ProcessInvalidateTiles(
+void ProcessDiscardInvalidateTiles(
     SWR_CONTEXT *pContext,
     DRAW_CONTEXT *pDC,
     uint32_t workerId,
     void *pUserData)
 {
     RDTSC_START(FEProcessInvalidateTiles);
-    INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData;
+    DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
     MacroTileMgr *pTileMgr = pDC->pTileMgr;
 
-    const API_STATE& state = GetApiState(pDC);
+    SWR_RECT rect;
+
+    if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
+    {
+        // Valid rect
+        rect = pInv->rect;
+    }
+    else
+    {
+        // Use viewport dimensions
+        const API_STATE& state = GetApiState(pDC);
+
+        rect.left   = (uint32_t)state.vp[0].x;
+        rect.right  = (uint32_t)(state.vp[0].x + state.vp[0].width);
+        rect.top    = (uint32_t)state.vp[0].y;
+        rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
+    }
 
     // queue a store to each macro tile
     // compute macro tile bounds for the current render target
     uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
     uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
 
-    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
-    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+    // Setup region assuming full tiles
+    uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
+    uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
+
+    uint32_t macroTileEndX = rect.right / macroWidth;
+    uint32_t macroTileEndY = rect.bottom / macroHeight;
+
+    if (pInv->fullTilesOnly == false)
+    {
+        // include partial tiles
+        macroTileStartX = rect.left / macroWidth;
+        macroTileStartY = rect.top / macroHeight;
+
+        macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
+        macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+    }
+
+    SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
+
+    macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
+    macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
 
     // load tiles
     BE_WORK work;
-    work.type = INVALIDATETILES;
-    work.pfnWork = ProcessInvalidateTilesBE;
-    work.desc.invalidateTiles = *pInv;
+    work.type = DISCARDINVALIDATETILES;
+    work.pfnWork = ProcessDiscardInvalidateTilesBE;
+    work.desc.discardInvalidateTiles = *pInv;
 
-    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
     {
-        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
         {
             pTileMgr->enqueue(x, y, &work);
         }
@@ -630,6 +666,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
     }
 }
 
+THREAD SWR_GS_CONTEXT tlsGsContext;
+
 //////////////////////////////////////////////////////////////////////////
 /// @brief Implements GS stage.
 /// @param pDC - pointer to draw context.
@@ -651,7 +689,6 @@ static void GeometryShaderStage(
 {
     RDTSC_START(FEGeometryShader);
 
-    SWR_GS_CONTEXT gsContext;
     SWR_CONTEXT* pContext = pDC->pContext;
 
     const API_STATE& state = GetApiState(pDC);
@@ -660,9 +697,9 @@ static void GeometryShaderStage(
     SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
     SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
 
-    gsContext.pStream = (uint8_t*)pGsOut;
-    gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
-    gsContext.PrimitiveID = primID;
+    tlsGsContext.pStream = (uint8_t*)pGsOut;
+    tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+    tlsGsContext.PrimitiveID = primID;
 
     uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
     simdvector attrib[MAX_ATTRIBUTES];
@@ -675,7 +712,7 @@ static void GeometryShaderStage(
 
         for (uint32_t i = 0; i < numVertsPerPrim; ++i)
         {
-            gsContext.vert[i].attrib[attribSlot] = attrib[i];
+            tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
         }
     }
     
@@ -683,7 +720,7 @@ static void GeometryShaderStage(
     pa.Assemble(VERTEX_POSITION_SLOT, attrib);
     for (uint32_t i = 0; i < numVertsPerPrim; ++i)
     {
-        gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+        tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
     }
 
     const uint32_t vertexStride = sizeof(simdvertex);
@@ -710,14 +747,14 @@ static void GeometryShaderStage(
 
     for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
     {
-        gsContext.InstanceID = instance;
-        gsContext.mask = GenerateMask(numInputPrims);
+        tlsGsContext.InstanceID = instance;
+        tlsGsContext.mask = GenerateMask(numInputPrims);
 
         // execute the geometry shader
-        state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+        state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
 
-        gsContext.pStream += instanceStride;
-        gsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+        tlsGsContext.pStream += instanceStride;
+        tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
     }
 
     // set up new binner and state for the GS output topology
@@ -736,7 +773,7 @@ static void GeometryShaderStage(
     // foreach input prim:
     // - setup a new PA based on the emitted verts for that prim
     // - loop over the new verts, calling PA to assemble each prim
-    uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount;
+    uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
     uint32_t* pPrimitiveId = (uint32_t*)&primID;
 
     uint32_t totalPrimsGenerated = 0;
@@ -844,7 +881,7 @@ static void GeometryShaderStage(
 static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
     void **ppStreamCutBuffer)
 {
-    Arena* pArena = pDC->pArena;
+    auto pArena = pDC->pArena;
     SWR_ASSERT(pArena != nullptr);
     SWR_ASSERT(state.gsState.gsEnable);
     // allocate arena space to hold GS output verts
@@ -1186,7 +1223,7 @@ void ProcessDraw(
 
         // if the entire index buffer isn't being consumed, set the last index
         // so that fetches < a SIMD wide will be masked off
-        fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+        fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
         if (pLastRequestedIndex < fetchInfo.pLastIndex)
         {
             fetchInfo.pLastIndex = pLastRequestedIndex;
@@ -1362,7 +1399,7 @@ void ProcessDraw(
             i += KNOB_SIMD_WIDTH;
             if (IsIndexedT)
             {
-                fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+                fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
             }
             else
             {
@@ -1776,7 +1813,7 @@ void BinTriangles(
             work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
         }
 
-        Arena* pArena = pDC->pArena;
+        auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
@@ -1948,7 +1985,7 @@ void BinPoints(
 
             work.pfnWork = RasterizeSimplePoint;
 
-            Arena* pArena = pDC->pArena;
+            auto pArena = pDC->pArena;
             SWR_ASSERT(pArena != nullptr);
 
             // store attributes
@@ -2082,7 +2119,7 @@ void BinPoints(
 
             work.pfnWork = RasterizeTriPoint;
 
-            Arena* pArena = pDC->pArena;
+            auto pArena = pDC->pArena;
             SWR_ASSERT(pArena != nullptr);
 
             // store active attribs
@@ -2299,7 +2336,7 @@ void BinLines(
 
         work.pfnWork = RasterizeLine;
 
-        Arena* pArena = pDC->pArena;
+        auto pArena = pDC->pArena;
         SWR_ASSERT(pArena != nullptr);
 
         // store active attribs
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index acb935fc251..f92f88c3226 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB)
     //vMul = [A1*B2 - B1*A2]
     vMul = _mm_sub_epi64(vMul, vMul2);
 
-	// According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned
-    OSALIGN(int64_t, 16) result;
-    _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
+    int64_t result;
+    _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
 
-    double fResult = (double)result;
-    fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
+    double dResult = (double)result;
+    dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
 
-    return (float)fResult;
+    return (float)dResult;
 }
 
 INLINE
@@ -316,7 +315,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo
 
 void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
index 3f19555557f..adf738c1bed 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
     }
 }
 
+static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue)
+{
+    knobValue = pOverride;
+}
+
 template <typename T>
 static inline void InitKnob(T& knob)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index 2028d9fbcfe..f8f1a33b7e3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -34,12 +34,12 @@
 
 struct PA_STATE
 {
-    DRAW_CONTEXT *pDC;              // draw context
-    uint8_t* pStreamBase;           // vertex stream
-    uint32_t streamSizeInVerts;     // total size of the input stream in verts
+    DRAW_CONTEXT *pDC{ nullptr };              // draw context
+    uint8_t* pStreamBase{ nullptr };           // vertex stream
+    uint32_t streamSizeInVerts{ 0 };     // total size of the input stream in verts
 
     // The topology the binner will use. In some cases the FE changes the topology from the api state.
-    PRIMITIVE_TOPOLOGY binTopology;
+    PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
 
     PA_STATE() {}
     PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
@@ -76,37 +76,37 @@ struct PA_STATE
 // cuts
 struct PA_STATE_OPT : public PA_STATE
 {
-    simdvertex leadingVertex;           // For tri-fan
-    uint32_t numPrims;              // Total number of primitives for draw.
-    uint32_t numPrimsComplete;      // Total number of complete primitives.
+    simdvertex leadingVertex;            // For tri-fan
+    uint32_t numPrims{ 0 };              // Total number of primitives for draw.
+    uint32_t numPrimsComplete{ 0 };      // Total number of complete primitives.
 
-    uint32_t numSimdPrims;          // Number of prims in current simd.
+    uint32_t numSimdPrims{ 0 };          // Number of prims in current simd.
 
-    uint32_t cur;                   // index to current VS output.
-    uint32_t prev;                  // index to prev VS output. Not really needed in the state.
-    uint32_t first;                 // index to first VS output. Used for trifan.
+    uint32_t cur{ 0 };                   // index to current VS output.
+    uint32_t prev{ 0 };                  // index to prev VS output. Not really needed in the state.
+    uint32_t first{ 0 };                 // index to first VS output. Used for trifan.
 
-    uint32_t counter;               // state counter
-    bool reset;                     // reset state
+    uint32_t counter{ 0 };               // state counter
+    bool reset{ false };                 // reset state
 
-    uint32_t primIDIncr;            // how much to increment for each vector (typically vector / {1, 2})
+    uint32_t primIDIncr{ 0 };            // how much to increment for each vector (typically vector / {1, 2})
     simdscalari primID;
 
     typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
     typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
 
-    PFN_PA_FUNC        pfnPaFunc;        // PA state machine function for assembling 4 triangles.
-    PFN_PA_SINGLE_FUNC pfnPaSingleFunc;  // PA state machine function for assembling single triangle.
-    PFN_PA_FUNC        pfnPaFuncReset;   // initial state to set on reset
+    PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
+    PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr };  // PA state machine function for assembling single triangle.
+    PFN_PA_FUNC        pfnPaFuncReset{ nullptr };   // initial state to set on reset
 
     // state used to advance the PA when Next is called
-    PFN_PA_FUNC        pfnPaNextFunc;
-    uint32_t           nextNumSimdPrims;
-    uint32_t           nextNumPrimsIncrement;
-    bool               nextReset;
-    bool               isStreaming;
+    PFN_PA_FUNC        pfnPaNextFunc{ nullptr };
+    uint32_t           nextNumSimdPrims{ 0 };
+    uint32_t           nextNumPrimsIncrement{ 0 };
+    bool               nextReset{ false };
+    bool               isStreaming{ false };
 
-    simdmask tmpIndices;             // temporary index store for unused virtual function
+    simdmask tmpIndices{ 0 };            // temporary index store for unused virtual function
     
     PA_STATE_OPT() {}
     PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
@@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
 // Cut-aware primitive assembler.
 struct PA_STATE_CUT : public PA_STATE
 {
-    simdmask* pCutIndices;          // cut indices buffer, 1 bit per vertex
-    uint32_t numVerts;              // number of vertices available in buffer store
-    uint32_t numAttribs;            // number of attributes
-    int32_t numRemainingVerts;      // number of verts remaining to be assembled
-    uint32_t numVertsToAssemble;    // total number of verts to assemble for the draw
+    simdmask* pCutIndices{ nullptr };    // cut indices buffer, 1 bit per vertex
+    uint32_t numVerts{ 0 };              // number of vertices available in buffer store
+    uint32_t numAttribs{ 0 };            // number of attributes
+    int32_t numRemainingVerts{ 0 };      // number of verts remaining to be assembled
+    uint32_t numVertsToAssemble{ 0 };    // total number of verts to assemble for the draw
     OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH];    // current index buffer for gather
     simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM];           // byte offsets for currently assembling simd
-    uint32_t numPrimsAssembled;     // number of primitives that are fully assembled
-    uint32_t headVertex;            // current unused vertex slot in vertex buffer store
-    uint32_t tailVertex;            // beginning vertex currently assembling
-    uint32_t curVertex;             // current unprocessed vertex
-    uint32_t startPrimId;           // starting prim id
-    simdscalari vPrimId;            // vector of prim ID
-    bool needOffsets;               // need to compute gather offsets for current SIMD
-    uint32_t vertsPerPrim;
-    simdvertex tmpVertex;               // temporary simdvertex for unimplemented API
-    bool processCutVerts;           // vertex indices with cuts should be processed as normal, otherwise they
-                                    // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
-                                    // while the GS sends valid verts for every index 
+    uint32_t numPrimsAssembled{ 0 };     // number of primitives that are fully assembled
+    uint32_t headVertex{ 0 };            // current unused vertex slot in vertex buffer store
+    uint32_t tailVertex{ 0 };            // beginning vertex currently assembling
+    uint32_t curVertex{ 0 };             // current unprocessed vertex
+    uint32_t startPrimId{ 0 };           // starting prim id
+    simdscalari vPrimId;                 // vector of prim ID
+    bool needOffsets{ false };           // need to compute gather offsets for current SIMD
+    uint32_t vertsPerPrim{ 0 };
+    simdvertex tmpVertex;                // temporary simdvertex for unimplemented API
+    bool processCutVerts{ false };       // vertex indices with cuts should be processed as normal, otherwise they
+                                         // are ignored.  Fetch shader sends invalid verts on cuts that should be ignored
+                                         // while the GS sends valid verts for every index 
     // Topology state tracking
     uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
-    uint32_t curIndex;
-    bool reverseWinding;            // indicates reverse winding for strips
-    int32_t adjExtraVert;           // extra vert uses for tristrip w/ adj
+    uint32_t curIndex{ 0 };
+    bool reverseWinding{ false };        // indicates reverse winding for strips
+    int32_t adjExtraVert{ 0 };           // extra vert uses for tristrip w/ adj
 
     typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
-    PFN_PA_FUNC pfnPa;              // per-topology function that processes a single vert
+    PFN_PA_FUNC pfnPa{ nullptr };        // per-topology function that processes a single vert
 
     PA_STATE_CUT() {}
     PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, 
@@ -1199,9 +1199,9 @@ struct PA_FACTORY
 
     PA_STATE_OPT paOpt;
     PA_STATE_CUT paCut;
-    bool cutPA;
+    bool cutPA{ false };
 
-    PRIMITIVE_TOPOLOGY topo;
+    PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
 
     simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
     simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 587e336d87d..52fb7c88cdd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
     // used to for testing if entire raster tile is inside a triangle
-    vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets);
-    vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets);
-    vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
+    for (uint32_t e = 0; e < numEdges; ++e)
+    {
+        vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+    }
 
     // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
     // step sample positions to the raster tile bbox of multisample points
@@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     //                             |      |
     //                             |      |
     // min(xSamples),max(ySamples)  ------  max(xSamples),max(ySamples)
-    __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox;
+    __m256d vEdgeTileBbox[3];
     if (sampleCount > SWR_MULTISAMPLE_1X)
     {
         __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX();
@@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
         // step edge equation tests from Tile
         // used to for testing if entire raster tile is inside a triangle
-        __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8);
-        __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8);
-        vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
-        vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8);
-        vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8);
-        vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
-        vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8);
-        vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8);
-        vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+        for (uint32_t e = 0; e < 3; ++e)
+        {
+            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+            vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+        }
     }
 
     RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
@@ -756,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
         for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
         {
-            uint64_t anyCoveredSamples = 0;
+            triDesc.anyCoveredSamples = 0;
 
             // is the corner of the edge outside of the raster tile? (vEdge < 0)
             int mask0, mask1, mask2;
@@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
             {
                 __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
                 // evaluate edge equations at the tile multisample bounding box
-                vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]);
-                vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]);
-                vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]);
+                vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+                vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+                vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
                 mask0 = _mm256_movemask_pd(vSampleBboxTest0);
                 mask1 = _mm256_movemask_pd(vSampleBboxTest1);
                 mask2 = _mm256_movemask_pd(vSampleBboxTest2);
@@ -789,20 +785,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                     triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
                     if ((mask0 & mask1 & mask2) == 0xf)
                     {
-                        anyCoveredSamples = triDesc.coverageMask[sampleNum];
+                        triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
                         // trivial accept, all 4 corners of all 3 edges are negative 
                         // i.e. raster tile completely inside triangle
                         RDTSC_EVENT(BETrivialAccept, 1, 0);
                     }
                     else
                     {
-                        __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; 
+                        __m256d vEdgeAtSample[numEdges];
                         if(sampleCount == SWR_MULTISAMPLE_1X)
                         {
                             // should get optimized out for single sample case (global value numbering or copy propagation)
-                            vEdge0AtSample = vEdgeFix16[0];
-                            vEdge1AtSample = vEdgeFix16[1];
-                            vEdge2AtSample = vEdgeFix16[2];
+                            for (uint32_t e = 0; e < numEdges; ++e)
+                            {
+                                vEdgeAtSample[e] = vEdgeFix16[e];
+                            }
                         }
                         else
                         {
@@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                             // for each edge and broadcasts it before offsetting to individual pixel quads
 
                             // step edge equation tests from UL tile corner to pixel sample position
-                            __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX);
-                            __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY);
-                            vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                            vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample);
-
-                            vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX);
-                            vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY);
-                            vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                            vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample);
-
-                            vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX);
-                            vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY);
-                            vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-                            vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample);
+                            for (uint32_t e = 0; e < numEdges; ++e)
+                            {
+                                __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+                                __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+                                vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+                                vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+                            }
                         }
 
                         double startQuadEdges[numEdges];
                         const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
-                        _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample);
-                        _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample);
-                        _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample);
-
-                        for (uint32_t e = 3; e < numEdges; ++e)
+                        for (uint32_t e = 0; e < numEdges; ++e)
                         {
-                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]);
+                            _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
                         }
 
                         // not trivial accept or reject, must rasterize full tile
@@ -854,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
                         }
                         RDTSC_STOP(BERasterizePartial, 0, 0);
 
-                        anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
+                        triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; 
                     }
                 }
                 else
@@ -875,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
             }
             else
 #endif
-            if(anyCoveredSamples)
+            if(triDesc.anyCoveredSamples)
             {
                 RDTSC_START(BEPixelBackend);
                 backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
new file mode 100644
index 00000000000..7ff109d4fe8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.h
+*
+* @brief RingBuffer
+*        The RingBuffer class manages all aspects of the ring buffer including
+*        the head/tail indices, etc.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer
+{
+public:
+    RingBuffer()
+        : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+    {
+    }
+
+    ~RingBuffer()
+    {
+        Destroy();
+    }
+
+    void Init(uint32_t numEntries)
+    {
+        SWR_ASSERT(numEntries > 0);
+        mNumEntries = numEntries;
+        mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64);
+        SWR_ASSERT(mpRingBuffer != nullptr);
+        memset(mpRingBuffer, 0, sizeof(T)*numEntries);
+    }
+
+    void Destroy()
+    {
+        _aligned_free(mpRingBuffer);
+        mpRingBuffer = nullptr;
+    }
+
+    T& operator[](const uint32_t index)
+    {
+        SWR_ASSERT(index < mNumEntries);
+        return mpRingBuffer[index];
+    }
+
+    INLINE void Enqueue()
+    {
+        mRingHead++; // There's only one producer.
+    }
+
+    INLINE void Dequeue()
+    {
+        InterlockedIncrement(&mRingTail); // There are multiple consumers.
+    }
+
+    INLINE bool IsEmpty()
+    {
+        return (GetHead() == GetTail());
+    }
+
+    INLINE bool IsFull()
+    {
+        ///@note We don't handle wrap case due to using 64-bit indices.
+        ///      It would take 11 million years to wrap at 50,000 DCs per sec.
+        ///      If we used 32-bit indices then its about 23 hours to wrap.
+        uint64_t numEnqueued = GetHead() - GetTail();
+        SWR_ASSERT(numEnqueued <= mNumEntries);
+
+        return (numEnqueued == mNumEntries);
+    }
+
+    INLINE volatile uint64_t GetTail() { return mRingTail; }
+    INLINE volatile uint64_t GetHead() { return mRingHead; }
+
+protected:
+    T* mpRingBuffer;
+    uint32_t mNumEntries;
+
+    OSALIGNLINE(volatile uint64_t) mRingHead;  // Consumer Counter
+    OSALIGNLINE(volatile uint64_t) mRingTail;  // Producer Counter
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 2758555fd4b..5752094ca10 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -307,6 +307,8 @@ struct PixelPositions
     simdscalar centroid;
 };
 
+#define SWR_MAX_NUM_MULTISAMPLES 16
+
 //////////////////////////////////////////////////////////////////////////
 /// SWR_PS_CONTEXT
 /// @brief Input to pixel shader.
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT
     uint32_t frontFace;         // IN: front- 1, back- 0
     uint32_t primID;            // IN: primitive ID
     uint32_t sampleIndex;       // IN: sampleIndex
+
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE
 };
 static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
 
-#define SWR_MAX_NUM_MULTISAMPLES 16
 enum SWR_MULTISAMPLE_COUNT
 {
     SWR_MULTISAMPLE_1X = 0,
@@ -786,7 +788,8 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte
 typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
 typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
-typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
 
 //////////////////////////////////////////////////////////////////////////
 /// FRONTEND_STATE
@@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE
     uint8_t numComponents[KNOB_NUM_ATTRIBUTES];
 };
 
+
 union SWR_DEPTH_STENCIL_STATE
 {
     struct
@@ -980,7 +984,6 @@ enum SWR_SHADING_RATE
 {
     SWR_SHADING_RATE_PIXEL,
     SWR_SHADING_RATE_SAMPLE,
-    SWR_SHADING_RATE_COARSE,
     SWR_SHADING_RATE_MAX,
 };
 
@@ -1024,4 +1027,5 @@ struct SWR_PS_STATE
     uint32_t barycentricsMask   : 3;    // which type(s) of barycentric coords does the PS interpolate attributes with
     uint32_t usesUAV            : 1;    // pixel shader accesses UAV 
     uint32_t forceEarlyZ        : 1;    // force execution of early depth/stencil test
+
 };
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 24c5588bfec..07bc94a1a54 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -24,7 +24,6 @@
 #include <stdio.h>
 #include <thread>
 #include <algorithm>
-#include <unordered_set>
 #include <float.h>
 #include <vector>
 #include <utility>
@@ -44,7 +43,6 @@
 #include "rasterizer.h"
 #include "rdtsc_core.h"
 #include "tilemgr.h"
-#include "core/multisample.h"
 
 
 
@@ -265,9 +263,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
 INLINE
 uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
 {
-    //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
-    //return result;
-    return pContext->DrawEnqueued;
+    return pContext->dcRing.GetHead();
 }
 
 INLINE
@@ -283,170 +279,27 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti
     return (pDC->dependency > lastRetiredDraw);
 }
 
-void ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
-    // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
-    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
-    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
-    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
+    int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+    SWR_ASSERT(result >= 0);
 
-    float *pfBuf = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+    if (result == 0)
     {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        // Cleanup memory allocations
+        pDC->pArena->Reset(true);
+        pDC->pTileMgr->initialize();
+        if (pDC->cleanupState)
         {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
-            {
-                _simd_store_ps(pfBuf, valR);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valG);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valB);
-                pfBuf += KNOB_SIMD_WIDTH;
-                _simd_store_ps(pfBuf, valA);
-                pfBuf += KNOB_SIMD_WIDTH;
-            }
+            pDC->pState->pArena->Reset(true);
         }
-    }
-}
-
-void ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
-{
-    // Load clear color into SIMD register...
-    float *pClearData = (float*)(pHotTile->clearData);
-    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
 
-    float *pfBuf = (float*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
-            {
-                _simd_store_ps(pfBuf, valZ);
-                pfBuf += KNOB_SIMD_WIDTH;
-            }
-        }
-    }
-}
-
-void ClearStencilHotTile(const HOTTILE* pHotTile)
-{
-    // convert from F32 to U8.
-    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
-    //broadcast 32x into __m256i...
-    simdscalari valS = _simd_set1_epi8(clearVal);
-
-    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
-    uint32_t numSamples = pHotTile->numSamples;
-
-    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
-    {
-        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
-        {
-            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
-            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
-            {
-                _simd_store_si(pBuf, valS);
-                pBuf += 1;
-            }
-        }
-    }
-}
-
-// for draw calls, we initialize the active hot tiles and perform deferred
-// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside
-// the draw routine itself mainly for performance, to avoid unnecessary setup
-// every triangle
-// @todo support deferred clear
-INLINE
-void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
-{
-    const API_STATE& state = GetApiState(pDC);
-    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
-
-    uint32_t x, y;
-    MacroTileMgr::getTileIndices(macroID, x, y);
-    x *= KNOB_MACROTILE_X_DIM;
-    y *= KNOB_MACROTILE_Y_DIM;
-
-    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
-
-    // check RT if enabled
-    unsigned long rtSlot = 0;
-    uint32_t colorHottileEnableMask = state.colorHottileEnable;
-    while(_BitScanForward(&rtSlot, colorHottileEnableMask))
-    {
-        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
-
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_START(BELoadTiles);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_START(BELoadTiles);
-            // Clear the tile.
-            ClearColorHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        colorHottileEnableMask &= ~(1 << rtSlot);
-    }
+        _ReadWriteBarrier();
 
-    // check depth if enabled
-    if (state.depthHottileEnable)
-    {
-        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_START(BELoadTiles);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_START(BELoadTiles);
-            // Clear the tile.
-            ClearDepthHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
+        pContext->dcRing.Dequeue();  // Remove from tail
     }
 
-    // check stencil if enabled
-    if (state.stencilHottileEnable)
-    {
-        HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
-        if (pHotTile->state == HOTTILE_INVALID)
-        {
-            RDTSC_START(BELoadTiles);
-            // invalid hottile before draw requires a load from surface before we can draw to it
-            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-        else if (pHotTile->state == HOTTILE_CLEAR)
-        {
-            RDTSC_START(BELoadTiles);
-            // Clear the tile.
-            ClearStencilHotTile(pHotTile);
-            pHotTile->state = HOTTILE_DIRTY;
-            RDTSC_STOP(BELoadTiles, 0, 0);
-        }
-    }
+    return result;
 }
 
 INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
@@ -466,7 +319,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
         if (isWorkComplete)
         {
             curDrawBE++;
-            InterlockedIncrement(&pDC->threadsDoneBE);
+            CompleteDrawContext(pContext, pDC);
         }
         else
         {
@@ -496,7 +349,9 @@ void WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
     uint64_t &curDrawBE,
-    std::unordered_set<uint32_t>& lockedTiles)
+    TileSet& lockedTiles,
+    uint32_t numaNode,
+    uint32_t numaMask)
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -537,68 +392,78 @@ void WorkOnFifoBE(
 
         for (uint32_t tileID : macroTiles)
         {
+            // Only work on tiles for for this numa node
+            uint32_t x, y;
+            pDC->pTileMgr->getTileIndices(tileID, x, y);
+            if (((x ^ y) & numaMask) != numaNode)
+            {
+                continue;
+            }
+
             MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
             
+            if (!tile.getNumQueued())
+            {
+                continue;
+            }
+
             // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) == lockedTiles.end())
+            if (lockedTiles.find(tileID) != lockedTiles.end())
             {
-                if (tile.getNumQueued())
+                continue;
+            }
+
+            if (tile.tryLock())
+            {
+                BE_WORK *pWork;
+
+                RDTSC_START(WorkerFoundWork);
+
+                uint32_t numWorkItems = tile.getNumQueued();
+                SWR_ASSERT(numWorkItems);
+
+                pWork = tile.peek();
+                SWR_ASSERT(pWork);
+                if (pWork->type == DRAW)
                 {
-                    if (tile.tryLock())
-                    {
-                        BE_WORK *pWork;
-
-                        RDTSC_START(WorkerFoundWork);
-
-                        uint32_t numWorkItems = tile.getNumQueued();
-
-                        if (numWorkItems != 0)
-                        {
-                            pWork = tile.peek();
-                            SWR_ASSERT(pWork);
-                            if (pWork->type == DRAW)
-                            {
-                                InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
-                            }
-                        }
-
-                        while ((pWork = tile.peek()) != nullptr)
-                        {
-                            pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
-                            tile.dequeue();
-                        }
-                        RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
-
-                        _ReadWriteBarrier();
-
-                        pDC->pTileMgr->markTileComplete(tileID);
-
-                        // Optimization: If the draw is complete and we're the last one to have worked on it then
-                        // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
-                        if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
-                        {
-                            // We can increment the current BE and safely move to next draw since we know this draw is complete.
-                            curDrawBE++;
-                            InterlockedIncrement(&pDC->threadsDoneBE);
-
-                            lastRetiredDraw++;
-
-                            lockedTiles.clear();
-                            break;
-                        }
-                    }
-                    else
-                    {
-                        // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                        lockedTiles.insert(tileID);
-                    }
+                    pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
+                }
+
+                while ((pWork = tile.peek()) != nullptr)
+                {
+                    pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+                    tile.dequeue();
                 }
+                RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+                _ReadWriteBarrier();
+
+                pDC->pTileMgr->markTileComplete(tileID);
+
+                // Optimization: If the draw is complete and we're the last one to have worked on it then
+                // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
+                if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                {
+                    // We can increment the current BE and safely move to next draw since we know this draw is complete.
+                    curDrawBE++;
+                    CompleteDrawContext(pContext, pDC);
+
+                    lastRetiredDraw++;
+
+                    lockedTiles.clear();
+                    break;
+                }
+            }
+            else
+            {
+                // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
+                lockedTiles.insert(tileID);
             }
         }
     }
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
 {
     // Try to grab the next DC from the ring
     uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -608,8 +473,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE,
         DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
         if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
         {
+            CompleteDrawContext(pContext, pDC);
             curDrawFE++;
-            InterlockedIncrement(&pDC->threadsDoneFE);
         }
         else
         {
@@ -673,22 +538,12 @@ void WorkOnCompute(
     // Is there any work remaining?
     if (queue.getNumQueued() > 0)
     {
-        bool lastToComplete = false;
-
         uint32_t threadGroupId = 0;
         while (queue.getWork(threadGroupId))
         {
             ProcessComputeBE(pDC, workerId, threadGroupId);
 
-            lastToComplete = queue.finishedWork();
-        }
-
-        _ReadWriteBarrier();
-
-        if (lastToComplete)
-        {
-            SWR_ASSERT(queue.isWorkComplete() == true);
-            pDC->doneCompute = true;
+            queue.finishedWork();
         }
     }
 }
@@ -704,14 +559,15 @@ DWORD workerThreadMain(LPVOID pData)
 
     RDTSC_INIT(threadId);
 
-    int numaNode = (int)pThreadData->numaId;
+    uint32_t numaNode = pThreadData->numaId;
+    uint32_t numaMask = pContext->threadPool.numaMask;
 
     // flush denormals to 0
     _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
 
     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
     // locked then we'll add it to this list so that we don't try and lock it again.
-    std::unordered_set<uint32_t> lockedTiles;
+    TileSet lockedTiles;
 
     // each worker has the ability to work on any of the queued draws as long as certain
     // conditions are met. the data associated
@@ -732,10 +588,10 @@ DWORD workerThreadMain(LPVOID pData)
     //    the worker can safely increment its oldestDraw counter and move on to the next draw.
     std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
 
-    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+    auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
 
-    uint64_t curDrawBE = 1;
-    uint64_t curDrawFE = 1;
+    uint64_t curDrawBE = 0;
+    uint64_t curDrawFE = 0;
 
     while (pContext->threadPool.inThreadShutdown == false)
     {
@@ -776,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData)
         }
 
         RDTSC_START(WorkerWorkOnFifoBE);
-        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles);
+        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
         RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
 
         WorkOnCompute(pContext, workerId, curDrawBE);
@@ -853,9 +709,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             numThreads, KNOB_MAX_NUM_THREADS);
     }
 
+    uint32_t numAPIReservedThreads = 1;
+
+
     if (numThreads == 1)
     {
-        // If only 1 worker thread, try to move it to an available
+        // If only 1 worker threads, try to move it to an available
         // HW thread.  If that fails, use the API thread.
         if (numCoresPerNode < numHWCoresPerNode)
         {
@@ -878,8 +737,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
     else
     {
-        // Save a HW thread for the API thread.
-        numThreads--;
+        // Save HW threads for the API if we can
+        if (numThreads > numAPIReservedThreads)
+        {
+            numThreads -= numAPIReservedThreads;
+        }
+        else
+        {
+            numAPIReservedThreads = 0;
+        }
     }
 
     pPool->numThreads = numThreads;
@@ -887,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 
     pPool->inThreadShutdown = false;
     pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+    pPool->numaMask = 0;
 
     if (KNOB_MAX_WORKER_THREADS)
     {
@@ -907,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     }
     else
     {
+        pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
+
         uint32_t workerId = 0;
         for (uint32_t n = 0; n < numNodes; ++n)
         {
@@ -918,9 +787,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                 auto& core = node.cores[c];
                 for (uint32_t t = 0; t < numHyperThreads; ++t)
                 {
-                    if (c == 0 && n == 0 && t == 0)
+                    if (numAPIReservedThreads)
                     {
-                        // Skip core 0, thread0  on node 0 to reserve for API thread
+                        --numAPIReservedThreads;
                         continue;
                     }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 0fa7196f5ac..821d7dcb16e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -34,6 +34,7 @@
 typedef std::thread* THREAD_PTR;
 
 struct SWR_CONTEXT;
+struct DRAW_CONTEXT;
 
 struct THREAD_DATA
 {
@@ -50,14 +51,18 @@ struct THREAD_POOL
 {
     THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
     uint32_t numThreads;
+    uint32_t numaMask;
     volatile bool inThreadShutdown;
     THREAD_DATA *pThreadData;
 };
 
+typedef std::unordered_set<uint32_t> TileSet;
+
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
+int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
+\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 860393661e2..794577270cf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -29,7 +29,9 @@
 #include <unordered_map>
 
 #include "fifo.hpp"
-#include "tilemgr.h"
+#include "core/tilemgr.h"
+#include "core/multisample.h"
+#include "rdtsc_core.h"
 
 #define TILE_ID(x,y) ((x << 16 | y))
 
@@ -54,24 +56,21 @@ void DispatchQueue::operator delete(void *p)
     _aligned_free(p);
 }
 
-MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
 
-void MacroTileMgr::initialize()
-{
-    mWorkItemsProduced = 0;
-    mWorkItemsConsumed = 0;
-
-    mDirtyTiles.clear();
-}
-
 void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
 {
     // Should not enqueue more then what we have backing for in the hot tile manager.
     SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
     SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
 
+    if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
+    {
+        return;
+    }
+
     uint32_t id = TILE_ID(x, y);
 
     MacroTileQueue &tile = mTiles[id];
@@ -103,3 +102,284 @@ void MacroTileMgr::markTileComplete(uint32_t id)
     tile.mWorkItemsFE = 0;
     tile.mWorkItemsBE = 0;
 }
+
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
+    uint32_t renderTargetArrayIndex)
+{
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    HotTileSet &tile = mHotTiles[x][y];
+    HOTTILE& hotTile = tile.Attachment[attachment];
+    if (hotTile.pBuffer == NULL)
+    {
+        if (create)
+        {
+            uint32_t size = numSamples * mHotTileSize[attachment];
+            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+            hotTile.state = HOTTILE_INVALID;
+            hotTile.numSamples = numSamples;
+            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+        }
+        else
+        {
+            return NULL;
+        }
+    }
+    else
+    {
+        // free the old tile and create a new one with enough space to hold all samples
+        if (numSamples > hotTile.numSamples)
+        {
+            // tile should be either uninitialized or resolved if we're deleting and switching to a 
+            // new sample count
+            SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
+                (hotTile.state == HOTTILE_RESOLVED) ||
+                (hotTile.state == HOTTILE_CLEAR));
+            FreeHotTileMem(hotTile.pBuffer);
+
+            uint32_t size = numSamples * mHotTileSize[attachment];
+            uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
+            hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode);
+            hotTile.state = HOTTILE_INVALID;
+            hotTile.numSamples = numSamples;
+        }
+
+        // if requested render target array index isn't currently loaded, need to store out the current hottile 
+        // and load the requested array slice
+        if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
+        {
+            SWR_FORMAT format;
+            switch (attachment)
+            {
+            case SWR_ATTACHMENT_COLOR0:
+            case SWR_ATTACHMENT_COLOR1:
+            case SWR_ATTACHMENT_COLOR2:
+            case SWR_ATTACHMENT_COLOR3:
+            case SWR_ATTACHMENT_COLOR4:
+            case SWR_ATTACHMENT_COLOR5:
+            case SWR_ATTACHMENT_COLOR6:
+            case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+            case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
+            case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
+            default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+            }
+
+            if (hotTile.state == HOTTILE_DIRTY)
+            {
+                pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
+                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+            }
+
+            pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
+                x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+
+            hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+            hotTile.state = HOTTILE_DIRTY;
+        }
+    }
+    return &tile.Attachment[attachment];
+}
+
+HOTTILE* HotTileMgr::GetHotTileNoLoad(
+    SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
+    SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
+{
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    HotTileSet &tile = mHotTiles[x][y];
+    HOTTILE& hotTile = tile.Attachment[attachment];
+    if (hotTile.pBuffer == NULL)
+    {
+        if (create)
+        {
+            uint32_t size = numSamples * mHotTileSize[attachment];
+            hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+            hotTile.state = HOTTILE_INVALID;
+            hotTile.numSamples = numSamples;
+            hotTile.renderTargetArrayIndex = 0;
+        }
+        else
+        {
+            return NULL;
+        }
+    }
+
+    return &hotTile;
+}
+
+void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+{
+    // Load clear color into SIMD register...
+    float *pClearData = (float*)(pHotTile->clearData);
+    simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
+    simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
+    simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
+    simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
+
+    float *pfBuf = (float*)pHotTile->pBuffer;
+    uint32_t numSamples = pHotTile->numSamples;
+
+    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
+            {
+                _simd_store_ps(pfBuf, valR);
+                pfBuf += KNOB_SIMD_WIDTH;
+                _simd_store_ps(pfBuf, valG);
+                pfBuf += KNOB_SIMD_WIDTH;
+                _simd_store_ps(pfBuf, valB);
+                pfBuf += KNOB_SIMD_WIDTH;
+                _simd_store_ps(pfBuf, valA);
+                pfBuf += KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile)  // clear a macro tile from float4 clear data.
+{
+    // Load clear color into SIMD register...
+    float *pClearData = (float*)(pHotTile->clearData);
+    simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
+
+    float *pfBuf = (float*)pHotTile->pBuffer;
+    uint32_t numSamples = pHotTile->numSamples;
+
+    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        {
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
+            {
+                _simd_store_ps(pfBuf, valZ);
+                pfBuf += KNOB_SIMD_WIDTH;
+            }
+        }
+    }
+}
+
+void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
+{
+    // convert from F32 to U8.
+    uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
+    //broadcast 32x into __m256i...
+    simdscalari valS = _simd_set1_epi8(clearVal);
+
+    simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
+    uint32_t numSamples = pHotTile->numSamples;
+
+    for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+    {
+        for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+        {
+            // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
+            for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
+            {
+                _simd_store_si(pBuf, valS);
+                pBuf += 1;
+            }
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief InitializeHotTiles
+/// for draw calls, we initialize the active hot tiles and perform deferred
+/// load on them if tile is in invalid state. we do this in the outer thread
+/// loop instead of inside the draw routine itself mainly for performance,
+/// to avoid unnecessary setup every triangle
+/// @todo support deferred clear
+/// @param pCreateInfo - pointer to creation info.
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
+{
+    const API_STATE& state = GetApiState(pDC);
+    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
+
+    uint32_t x, y;
+    MacroTileMgr::getTileIndices(macroID, x, y);
+    x *= KNOB_MACROTILE_X_DIM;
+    y *= KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
+
+    // check RT if enabled
+    unsigned long rtSlot = 0;
+    uint32_t colorHottileEnableMask = state.colorHottileEnable;
+    while (_BitScanForward(&rtSlot, colorHottileEnableMask))
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearColorHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        colorHottileEnableMask &= ~(1 << rtSlot);
+    }
+
+    // check depth if enabled
+    if (state.depthHottileEnable)
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearDepthHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+
+    // check stencil if enabled
+    if (state.stencilHottileEnable)
+    {
+        HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+        if (pHotTile->state == HOTTILE_INVALID)
+        {
+            RDTSC_START(BELoadTiles);
+            // invalid hottile before draw requires a load from surface before we can draw to it
+            pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+        else if (pHotTile->state == HOTTILE_CLEAR)
+        {
+            RDTSC_START(BELoadTiles);
+            // Clear the tile.
+            ClearStencilHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 9137941bad4..aa561badc1c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -59,7 +59,8 @@ struct MacroTileQueue
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Clear fifo and unlock it.
-    void clear(Arena& arena)
+    template <typename ArenaT>
+    void clear(ArenaT& arena)
     {
         mFifo.clear(arena);
     }
@@ -71,7 +72,8 @@ struct MacroTileQueue
         return mFifo.peek();
     }
 
-    bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
+    template <typename ArenaT>
+    bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
     {
         return mFifo.enqueue_try_nosync(arena, entry);
     }
@@ -104,7 +106,7 @@ private:
 class MacroTileMgr
 {
 public:
-    MacroTileMgr(Arena& arena);
+    MacroTileMgr(CachingArena& arena);
     ~MacroTileMgr()
     {
         for (auto &tile : mTiles)
@@ -113,7 +115,14 @@ public:
         }
     }
 
-    void initialize();
+    INLINE void initialize()
+    {
+        mWorkItemsProduced = 0;
+        mWorkItemsConsumed = 0;
+
+        mDirtyTiles.clear();
+    }
+
     INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
     INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
     void markTileComplete(uint32_t id);
@@ -135,15 +144,14 @@ public:
     void operator delete (void *p);
 
 private:
-    Arena& mArena;
-    SWR_FORMAT mFormat;
+    CachingArena& mArena;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
 
     // Any tile that has work queued to it is a dirty tile.
     std::vector<uint32_t> mDirtyTiles;
 
-    OSALIGNLINE(LONG) mWorkItemsProduced;
-    OSALIGNLINE(volatile LONG) mWorkItemsConsumed;
+    OSALIGNLINE(LONG) mWorkItemsProduced { 0 };
+    OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 };
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -224,7 +232,7 @@ public:
     void *operator new(size_t size);
     void operator delete (void *p);
 
-    void* mpTaskData;        // The API thread will set this up and the callback task function will interpet this.
+    void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
 
     OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
     OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
@@ -241,7 +249,7 @@ enum HOTTILE_STATE
 
 struct HOTTILE
 {
-    BYTE *pBuffer;
+    uint8_t *pBuffer;
     HOTTILE_STATE state;
     DWORD clearData[4];                 // May need to change based on pfnClearTile implementation.  Reorder for alignment?
     uint32_t numSamples;
@@ -283,108 +291,50 @@ public:
             {
                 for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a)
                 {
-                    if (mHotTiles[x][y].Attachment[a].pBuffer != NULL)
-                    {
-                        _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer);
-                        mHotTiles[x][y].Attachment[a].pBuffer = NULL;
-                    }
+                    FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer);
                 }
             }
         }
     }
 
-    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, 
-        uint32_t renderTargetArrayIndex = 0)
-    {
-        uint32_t x, y;
-        MacroTileMgr::getTileIndices(macroID, x, y);
+    void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID);
 
-        assert(x < KNOB_NUM_HOT_TILES_X);
-        assert(y < KNOB_NUM_HOT_TILES_Y);
+    HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
+        uint32_t renderTargetArrayIndex = 0);
 
-        HotTileSet &tile = mHotTiles[x][y];
-        HOTTILE& hotTile = tile.Attachment[attachment];
-        if (hotTile.pBuffer == NULL)
-        {
-            if (create)
-            {
-                uint32_t size = numSamples * mHotTileSize[attachment];
-                hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
-                hotTile.state = HOTTILE_INVALID;
-                hotTile.numSamples = numSamples;
-                hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-            }
-            else
-            {
-                return NULL;
-            }
-        }
-        else
-        {
-            // free the old tile and create a new one with enough space to hold all samples
-            if (numSamples > hotTile.numSamples)
-            {
-                // tile should be either uninitialized or resolved if we're deleting and switching to a 
-                // new sample count
-                assert((hotTile.state == HOTTILE_INVALID) ||
-                       (hotTile.state == HOTTILE_RESOLVED) || 
-                       (hotTile.state == HOTTILE_CLEAR));
-                _aligned_free(hotTile.pBuffer);
-
-                uint32_t size = numSamples * mHotTileSize[attachment];
-                hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
-                hotTile.state = HOTTILE_INVALID;
-                hotTile.numSamples = numSamples;
-            }
+    HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1);
 
-            // if requested render target array index isn't currently loaded, need to store out the current hottile 
-            // and load the requested array slice
-            if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
-            {
-                SWR_FORMAT format;
-                switch (attachment)
-                {
-                case SWR_ATTACHMENT_COLOR0:
-                case SWR_ATTACHMENT_COLOR1:
-                case SWR_ATTACHMENT_COLOR2:
-                case SWR_ATTACHMENT_COLOR3:
-                case SWR_ATTACHMENT_COLOR4:
-                case SWR_ATTACHMENT_COLOR5:
-                case SWR_ATTACHMENT_COLOR6:
-                case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
-                case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
-                case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
-                default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
-                }
+    static void ClearColorHotTile(const HOTTILE* pHotTile);
+    static void ClearDepthHotTile(const HOTTILE* pHotTile);
+    static void ClearStencilHotTile(const HOTTILE* pHotTile);
 
-                if (hotTile.state == HOTTILE_DIRTY)
-                {
-                    pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
-                        x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
-                }
-
-                pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
-                    x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+private:
+    HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
+    uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
 
-                hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
-                hotTile.state = HOTTILE_DIRTY;
-            }
-        }
-        return &tile.Attachment[attachment];
+    void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode)
+    {
+        void* p = nullptr;
+#if defined(_WIN32)
+        HANDLE hProcess = GetCurrentProcess();
+        p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode);
+#else
+        p = _aligned_malloc(size, align);
+#endif
+
+        return p;
     }
 
-    HotTileSet &GetHotTile(uint32_t macroID)
+    void FreeHotTileMem(void* pBuffer)
     {
-        uint32_t x, y;
-        MacroTileMgr::getTileIndices(macroID, x, y);
-        assert(x < KNOB_NUM_HOT_TILES_X);
-        assert(y < KNOB_NUM_HOT_TILES_Y);
-
-        return mHotTiles[x][y];
+        if (pBuffer)
+        {
+#if defined(_WIN32)
+            VirtualFree(pBuffer, 0, MEM_RELEASE);
+#else
+            _aligned_free(pBuffer);
+#endif
+        }
     }
-
-private:
-    HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
-    uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS];
 };
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
index f36452f2cec..a1d665e77cc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
@@ -27,6 +27,11 @@
 ******************************************************************************/
 #if defined(_WIN32)
 
+#if defined(NOMINMAX)
+// GDI Plus requires non-std min / max macros be defined :(
+#undef NOMINMAX
+#endif
+
 #include<Windows.h>
 #include <Gdiplus.h>
 #include <Gdiplusheaders.h>
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index b9dc48c4fd7..60a3a6af19e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -46,8 +46,7 @@ void OpenBitmapFromFile(
     uint32_t *height);
 #endif
 
-/// @todo assume linux is always 64 bit
-#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
+#if defined(_WIN64) || defined(__x86_64__)
 #define _MM_INSERT_EPI64 _mm_insert_epi64
 #define _MM_EXTRACT_EPI64 _mm_extract_epi64
 #else
@@ -89,7 +88,10 @@ INLINE __m128i  _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
 
 OSALIGNLINE(struct) BBOX
 {
-    int top, bottom, left, right;
+    int top{ 0 };
+    int bottom{ 0 };
+    int left{ 0 };
+    int right{ 0 };
 
     BBOX() {}
     BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
@@ -110,7 +112,10 @@ OSALIGNLINE(struct) BBOX
 
 struct simdBBox
 {
-    simdscalari top, bottom, left, right;
+    simdscalari top;
+    simdscalari bottom;
+    simdscalari left;
+    simdscalari right;
 };
 
 INLINE
@@ -271,7 +276,7 @@ struct TransposeSingleComponent
     /// @brief Pass-thru for single component.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
     }
@@ -286,7 +291,7 @@ struct Transpose8_8_8_8
     /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
 #if KNOB_SIMD_WIDTH == 8
@@ -325,7 +330,7 @@ struct Transpose8_8_8
     /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -337,7 +342,7 @@ struct Transpose8_8
     /// @brief Performs an SOA to AOS conversion for packed 8_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
 
@@ -361,7 +366,7 @@ struct Transpose32_32_32_32
     /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -394,7 +399,7 @@ struct Transpose32_32_32
     /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -426,7 +431,7 @@ struct Transpose32_32
     /// @brief Performs an SOA to AOS conversion for packed 32_32 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         const float* pfSrc = (const float*)pSrc;
         __m128 src_r0 = _mm_load_ps(pfSrc + 0);
@@ -456,7 +461,7 @@ struct Transpose16_16_16_16
     /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -496,7 +501,7 @@ struct Transpose16_16_16
     /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -535,7 +540,7 @@ struct Transpose16_16
     /// @brief Performs an SOA to AOS conversion for packed 16_16 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
     {
         simdscalar src = _simd_load_ps((const float*)pSrc);
 
@@ -566,7 +571,7 @@ struct Transpose24_8
     /// @brief Performs an SOA to AOS conversion for packed 24_8 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -578,7 +583,7 @@ struct Transpose32_8_24
     /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 
@@ -592,7 +597,7 @@ struct Transpose4_4_4_4
     /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -604,7 +609,7 @@ struct Transpose5_6_5
     /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -616,7 +621,7 @@ struct Transpose9_9_9_5
     /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -628,7 +633,7 @@ struct Transpose5_5_5_1
     /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -640,7 +645,7 @@ struct Transpose10_10_10_2
     /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 //////////////////////////////////////////////////////////////////////////
@@ -652,7 +657,7 @@ struct Transpose11_11_10
     /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
     /// @param pSrc - source data in SOA form
     /// @param pDst - output data in AOS form
-    static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+    static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
 };
 
 // helper function to unroll loops
@@ -694,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
     }
 #endif
 
-    BYTE* pRemainderBytes = (BYTE*)pDataWords;
+    uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
     for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
     {
         crc = _mm_crc32_u8(crc, *pRemainderBytes++);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 734c89792f0..de856c4a095 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -47,6 +47,10 @@
 #include "llvm/Analysis/CFGPrinter.h"
 #include "llvm/IRReader/IRReader.h"
 
+#if LLVM_USE_INTEL_JITEVENTS
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#endif
+
 #include "core/state.h"
 #include "common/containers.hpp"
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c974a611224..4ffb0fbee01 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -53,6 +53,10 @@
 #include "llvm/Config/config.h"
 #endif
 
+#ifndef HAVE_LLVM
+#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR
+#endif
+
 #include "llvm/IR/Verifier.h"
 #include "llvm/ExecutionEngine/MCJIT.h"
 #include "llvm/Support/FileSystem.h"
@@ -60,11 +64,10 @@
 
 #include "llvm/Analysis/Passes.h"
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 #include "llvm/PassManager.h"
 #else
 #include "llvm/IR/LegacyPassManager.h"
-using namespace llvm::legacy;
 #endif
 
 #include "llvm/CodeGen/Passes.h"
@@ -166,7 +169,6 @@ struct JitManager
     FunctionType* mTrinaryFPTy;
     FunctionType* mUnaryIntTy;
     FunctionType* mBinaryIntTy;
-    FunctionType* mTrinaryIntTy;
 
     Type* mSimtFP32Ty;
     Type* mSimtInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 954524afd3a..a64f86006f4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -576,9 +576,12 @@ struct BlendJit : public Builder
             src1[i] = LOAD(pSrc1, { i });
         }
         Value* currentMask = VIMMED1(-1);
-        if(state.desc.alphaToCoverageEnable)
+        if (state.desc.alphaToCoverageEnable)
         {
-            currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty);
+            Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f);
+            uint32_t bits = (1 << state.desc.numSamples) - 1;
+            currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits)));
+            currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty);
         }
 
         // alpha test
@@ -702,6 +705,12 @@ struct BlendJit : public Builder
             currentMask = AND(sampleMask, currentMask);
         }
 
+        if (state.desc.alphaToCoverageEnable)
+        {
+            Value* sampleMasked = SHL(C(1), sampleNum);
+            currentMask = AND(currentMask, VBROADCAST(sampleMasked));
+        }
+
         if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
            state.desc.oMaskEnable)
         {
@@ -717,7 +726,13 @@ struct BlendJit : public Builder
 
         JitManager::DumpToFile(blendFunc, "");
 
-        FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            passes(JM()->mpCurrentModule);
+
         passes.add(createBreakCriticalEdgesPass());
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index c15bdf1e756..757ea3fe39c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -38,6 +38,8 @@ using namespace llvm;
 Builder::Builder(JitManager *pJitMgr)
     : mpJitMgr(pJitMgr)
 {
+    mVWidth = pJitMgr->mVWidth;
+
     mpIRBuilder = &pJitMgr->mBuilder;
 
     mVoidTy = Type::getVoidTy(pJitMgr->mContext);
@@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr)
     mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
     mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
     mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+    mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+    mInt16PtrTy = PointerType::get(mInt16Ty, 0);
+    mInt32PtrTy = PointerType::get(mInt32Ty, 0);
     mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
     mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
     mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
-    mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
-    mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
-    mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
-    mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
-    mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+    mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+    mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+    mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+    mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+    mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+    mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
 
     if (sizeof(uint32_t*) == 4)
     {
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 49216612cc9..239ef2ab49f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -43,6 +43,8 @@ struct Builder
     JitManager* mpJitMgr;
     IRBuilder<>* mpIRBuilder;
 
+    uint32_t             mVWidth;
+
     // Built in types.
     Type*                mVoidTy;
     Type*                mInt1Ty;
@@ -54,12 +56,16 @@ struct Builder
     Type*                mFP16Ty;
     Type*                mFP32Ty;
     Type*                mDoubleTy;
+    Type*                mInt8PtrTy;
+    Type*                mInt16PtrTy;
+    Type*                mInt32PtrTy;
     Type*                mSimdFP16Ty;
     Type*                mSimdFP32Ty;
     Type*                mSimdInt16Ty;
     Type*                mSimdInt32Ty;
     Type*                mSimdInt64Ty;
     Type*                mSimdIntPtrTy;
+    Type*                mSimdVectorTy;
     StructType*          mV4FP32Ty;
     StructType*          mV4Int32Ty;
 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 5394fc7bf5a..486dad8f04c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -28,6 +28,8 @@
 * 
 ******************************************************************************/
 #include "builder.h"
+#include "common/rdtsc_buckets.h"
+
 #include "llvm/Support/DynamicLibrary.h"
 
 void __cdecl CallPrint(const char* fmt, ...);
@@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred)
 
 Value *Builder::VIMMED1(int i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VIMMED1(uint32_t i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VIMMED1(float i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
 }
 
 Value *Builder::VIMMED1(bool i)
 {
-    return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+    return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
 }
 
 Value *Builder::VUNDEF_IPTR()
 {
-    return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
 }
 
 Value *Builder::VUNDEF_I()
 {
-    return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
 }
 
 Value *Builder::VUNDEF(Type *ty, uint32_t size)
@@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size)
 
 Value *Builder::VUNDEF_F()
 {
-    return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
 }
 
 Value *Builder::VUNDEF(Type* t)
 {
-    return UndefValue::get(VectorType::get(t, JM()->mVWidth));
+    return UndefValue::get(VectorType::get(t, mVWidth));
 }
 
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
 {
     return VINSERT(vec, val, C((int64_t)index));
@@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src)
         return src;
     }
 
-    return VECTOR_SPLAT(JM()->mVWidth, src);
+    return VECTOR_SPLAT(mVWidth, src);
 }
 
 uint32_t Builder::IMMED(Value* v)
@@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v)
     return pValConst->getZExtValue();
 }
 
+int32_t Builder::S_IMMED(Value* v)
+{
+    SWR_ASSERT(isa<ConstantInt>(v));
+    ConstantInt *pValConst = cast<ConstantInt>(v);
+    return pValConst->getSExtValue();
+}
+
 Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
 {
     std::vector<Value*> indices;
@@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask)
     else
     {
         Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
-        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
-        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+        Value* fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth));
+        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
     }
     return vResult;
 }
@@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list
 
     // get a pointer to the first character in the constant string array
     std::vector<Constant*> geplist{C(0),C(0)};
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
 #else
     Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
@@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
         Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
         Value *vOffsets = MUL(vIndices,vScaleVec);
         Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for(uint32_t i = 0; i < mVWidth; ++i)
         {
             // single component byte index
             Value *offset = VEXTRACT(vOffsets,C(i));
@@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
         Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
         Value *vOffsets = MUL(vIndices, vScaleVec);
         Value *mask = MASK(vMask);
-        for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for(uint32_t i = 0; i < mVWidth; ++i)
         {
             // single component byte index
             Value *offset = VEXTRACT(vOffsets, C(i));
@@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx)
     }
     else
     {
-        res = VSHUFFLE(a, a, idx);
+        if (isa<Constant>(idx))
+        {
+            res = VSHUFFLE(a, a, idx);
+        }
+        else
+        {
+            res = VUNDEF_I();
+            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            {
+                Value* pIndex = VEXTRACT(idx, C(l));
+                Value* pVal = VEXTRACT(a, pIndex);
+                res = VINSERT(res, pVal, C(l));
+            }
+        }
     }
     return res;
 }
 
 //////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
+/// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
+/// platform, emulate it
+/// @param a - 256bit SIMD lane(8x32bit) of float values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+Value *Builder::PERMPS(Value* a, Value* idx)
+{
+    Value* res;
+    // use avx2 permute instruction if available
+    if (JM()->mArch.AVX2())
+    {
+        // llvm 3.6.0 swapped the order of the args to vpermd
+        res = VPERMPS(idx, a);
+    }
+    else
+    {
+        if (isa<Constant>(idx))
+        {
+            res = VSHUFFLE(a, a, idx);
+        }
+        else
+        {
+            res = VUNDEF_F();
+            for (uint32_t l = 0; l < JM()->mVWidth; ++l)
+            {
+                Value* pIndex = VEXTRACT(idx, C(l));
+                Value* pVal = VEXTRACT(a, pIndex);
+                res = VINSERT(res, pVal, C(l));
+            }
+        }
+    }
+
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
 /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
 /// in LLVM IR.  If not supported on the underlying platform, emulate it
 /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
@@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a)
         }
 
         Value* pResult = UndefValue::get(mSimdFP32Ty);
-        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for (uint32_t i = 0; i < mVWidth; ++i)
         {
             Value* pSrc = VEXTRACT(a, C(i));
             Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
@@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding)
         }
 
         Value* pResult = UndefValue::get(mSimdInt16Ty);
-        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        for (uint32_t i = 0; i < mVWidth; ++i)
         {
             Value* pSrc = VEXTRACT(a, C(i));
             Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
@@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
 void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
 {
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
     // input could either be float or int vector; do shuffle work in int
     vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
@@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
 
     if(bPackedOutput) 
     {
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask
         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
@@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
 void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
 {
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
     if(bPackedOutput)
     {
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
         // shuffle mask
         Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
                                      0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
@@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
 {
     Value* pStack = STACKSAVE();
 
+    Type* pSrcTy = vSrc->getType()->getVectorElementType();
+
     // allocate tmp stack for masked off lanes
-    Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType());
+    Value* vTmpPtr = ALLOCA(pSrcTy);
 
     Value *mask = MASK(vMask);
-    for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+    for (uint32_t i = 0; i < mVWidth; ++i)
     {
         Value *offset = VEXTRACT(vOffsets, C(i));
         // byte pointer to component
         Value *storeAddress = GEP(pDst, offset);
-        storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0));
+        storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
         Value *selMask = VEXTRACT(mask, C(i));
         Value *srcElem = VEXTRACT(vSrc, C(i));
         // switch in a safe address to load if we're trying to access a vertex 
@@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high)
 Value* Builder::STACKSAVE()
 {
     Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     return CALL(pfnStackSave);
 #else
     return CALLA(pfnStackSave);
@@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...)
     vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
     OutputDebugString(strBuf);
 #endif
+
+    va_end(args);
 }
 
 Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Function *func =
         Intrinsic::getDeclaration(JM()->mpCurrentModule,
                                   Intrinsic::x86_avx_vextractf128_si_256);
@@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 #else
     bool flag = !imm8->isZeroValue();
     SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
-        idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+    for (unsigned i = 0; i < mVWidth / 2; i++) {
+        idx.push_back(C(flag ? i + mVWidth / 2 : i));
     }
     return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
 #endif
@@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
 
 Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 {
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
     Function *func =
         Intrinsic::getDeclaration(JM()->mpCurrentModule,
                                   Intrinsic::x86_avx_vinsertf128_si_256);
@@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
 #else
     bool flag = !imm8->isZeroValue();
     SmallVector<Constant*,8> idx;
-    for (unsigned i = 0; i < JM()->mVWidth; i++) {
+    for (unsigned i = 0; i < mVWidth; i++) {
         idx.push_back(C(i));
     }
     Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
 
     SmallVector<Constant*,8> idx2;
-    for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
-        idx2.push_back(C(flag ? i : i + JM()->mVWidth));
+    for (unsigned i = 0; i < mVWidth / 2; i++) {
+        idx2.push_back(C(flag ? i : i + mVWidth));
     }
-    for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
-        idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+    for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+        idx2.push_back(C(flag ? i + mVWidth / 2 : i));
     }
     return VSHUFFLE(a, inter, ConstantVector::get(idx2));
 #endif
 }
+
+// rdtsc buckets macros
+void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
+{
+    std::vector<Type*> args{
+        PointerType::get(mInt32Ty, 0),   // pBucketMgr
+        mInt32Ty                        // id
+    };
+
+    FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy));
+    if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket);
+    }
+
+    CALL(pFunc, { pBucketMgr, pId });
+}
+
+void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
+{
+    std::vector<Type*> args{
+        PointerType::get(mInt32Ty, 0),   // pBucketMgr
+        mInt32Ty                        // id
+    };
+
+    FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false);
+    Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy));
+    if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr)
+    {
+        sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket);
+    }
+
+    CALL(pFunc, { pBucketMgr, pId });
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 48e0558c4dd..f43ef69d1ed 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -59,7 +59,7 @@ Value *VUNDEF_F();
 Value *VUNDEF_I();
 Value *VUNDEF(Type* ty, uint32_t size);
 Value *VUNDEF_IPTR();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
 Value *VINSERT(Value *vec, Value *val, uint64_t index);
 #endif
 Value *VBROADCAST(Value *src);
@@ -67,6 +67,7 @@ Value *VRCP(Value *va);
 Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
 
 uint32_t IMMED(Value* i);
+int32_t S_IMMED(Value* i);
 
 Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
 Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
@@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b);
 Value *PMOVSXBD(Value* a);
 Value *PMOVSXWD(Value* a);
 Value *PERMD(Value* a, Value* idx);
+Value *PERMPS(Value* a, Value* idx);
 Value *CVTPH2PS(Value* a);
 Value *CVTPS2PH(Value* a, Value* rounding);
 Value *PMAXSD(Value* a, Value* b);
@@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
 
 Value *VEXTRACTI128(Value* a, Constant* imm8);
 Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
+
+// rdtsc buckets macros
+void RDTSC_START(Value* pBucketMgr, Value* pId);
+void RDTSC_STOP(Value* pBucketMgr, Value* pId);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c5a180e27cb..2c2c56bd151 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
     std::vector<Value*>    vtxInputIndices(2, C(0));
     // GEP
     pVtxOut = GEP(pVtxOut, C(0));
-    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
 
     // SWR_FETCH_CONTEXT::pStreams
     Value*    streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 
     verifyFunction(*fetch);
 
-    FunctionPassManager setupPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            setupPasses(JM()->mpCurrentModule);
 
     ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
     setupPasses.add(createBreakCriticalEdgesPass());
@@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
 
     JitManager::DumpToFile(fetch, "se");
 
-    FunctionPassManager optPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            optPasses(JM()->mpCurrentModule);
 
     ///@todo Haven't touched these either. Need to remove some of these and add others.
     optPasses.add(createCFGSimplificationPass());
@@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
 
     SWRL::UncheckedFixedVector<Value*, 16>    vectors;
 
-    std::vector<Constant*>    pMask(JM()->mVWidth);
-    for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+    std::vector<Constant*>    pMask(mVWidth);
+    for(uint32_t i = 0; i < mVWidth; ++i)
     {
         pMask[i] = (C(i < 4 ? i : 4));
     }
@@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
         Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
 
         // Load from the stream.
-        for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+        for(uint32_t lane = 0; lane < mVWidth; ++lane)
         {
             // Get index
             Value* index = VEXTRACT(vIndices, C(lane));
@@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
             vectors.push_back(wvec);
         }
 
-        std::vector<Constant*>        v01Mask(JM()->mVWidth);
-        std::vector<Constant*>        v23Mask(JM()->mVWidth);
-        std::vector<Constant*>        v02Mask(JM()->mVWidth);
-        std::vector<Constant*>        v13Mask(JM()->mVWidth);
+        std::vector<Constant*>        v01Mask(mVWidth);
+        std::vector<Constant*>        v23Mask(mVWidth);
+        std::vector<Constant*>        v02Mask(mVWidth);
+        std::vector<Constant*>        v13Mask(mVWidth);
 
         // Concatenate the vectors together.
         elements[0] = VUNDEF_F(); 
         elements[1] = VUNDEF_F(); 
         elements[2] = VUNDEF_F(); 
         elements[3] = VUNDEF_F(); 
-        for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b)
+        for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
         {
             v01Mask[4 * b + 0] = C(0 + 4 * b);
             v01Mask[4 * b + 1] = C(1 + 4 * b);
-            v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
-            v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth);
+            v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+            v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
 
             v23Mask[4 * b + 0] = C(2 + 4 * b);
             v23Mask[4 * b + 1] = C(3 + 4 * b);
-            v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth);
-            v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+            v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
+            v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 
             v02Mask[4 * b + 0] = C(0 + 4 * b);
             v02Mask[4 * b + 1] = C(2 + 4 * b);
-            v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
-            v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth);
+            v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+            v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
 
             v13Mask[4 * b + 0] = C(1 + 4 * b);
             v13Mask[4 * b + 1] = C(3 + 4 * b);
-            v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth);
-            v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+            v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
+            v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
 
-            std::vector<Constant*>    iMask(JM()->mVWidth);
-            for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+            std::vector<Constant*>    iMask(mVWidth);
+            for(uint32_t i = 0; i < mVWidth; ++i)
             {
                 if(((4 * b) <= i) && (i < (4 * (b + 1))))
                 {
-                    iMask[i] = C(i % 4 + JM()->mVWidth);
+                    iMask[i] = C(i % 4 + mVWidth);
                 }
                 else
                 {
@@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint8_t)0), pZeroIndex);
 
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint16_t)0), pZeroIndex);
 
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
     const uint32_t (&swizzle)[4] = std::get<9>(args);
 
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty =  VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = mSimdInt32Ty;
+    Type* v32x8Ty =  VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
-        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask, including any swizzling
         const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
@@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     Value* (&vVertexElements)[4] = std::get<8>(args);
 
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
 
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
         bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
 
         Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
 
         // shuffle mask
         Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index 1814b7c8d5f..e73b232757b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -27,7 +27,7 @@ import json as JSON
 import operator
 
 header = r"""/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
@@ -84,16 +84,16 @@ inst_aliases = {
 }
 
 intrinsics = [
-	    ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
+        ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
         ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
-	    ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
-	    ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
-	    ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
-	    ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
-	    ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
-	    ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
-	    ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
-	    ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
+        ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
+        ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
+        ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
+        ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
+        ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
+        ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
+        ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
+        ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
         ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
         ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
         ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
@@ -103,6 +103,7 @@ intrinsics = [
         ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]],  # sign extend packed 8bit components
         ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]],  # sign extend packed 16bit components
         ["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+        ["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
         ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
         ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
         ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435467b..0b53a929e6c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
 
 header = r"""
 /****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22bc47c..36baa8d794b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
 
         JitManager::DumpToFile(soFunc, "SoFunc");
 
-        FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+        FunctionPassManager
+#else
+        llvm::legacy::FunctionPassManager
+#endif
+            passes(JM()->mpCurrentModule);
+
         passes.add(createBreakCriticalEdgesPass());
         passes.add(createCFGSimplificationPass());
         passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ad73cd840a7..d001cb6b5cb 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -33,7 +33,7 @@
 #include "memory/tilingtraits.h"
 #include "memory/Convert.h"
 
-typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT);
 
 //////////////////////////////////////////////////////////////////////////
 /// Clear Raster Tile Function Tables.
@@ -54,17 +54,17 @@ struct StoreRasterTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to raster tile.
     INLINE static void StoreClear(
-        const BYTE* dstFormattedColor,
+        const uint8_t* dstFormattedColor,
         UINT dstBytesPerPixel,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
     {
         // Compute destination address for raster tile.
-        BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+        uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress +
             (y * pDstSurface->pitch) + (x * dstBytesPerPixel);
 
         // start of first row
-        BYTE* pDst = pDstTile;
+        uint8_t* pDst = pDstTile;
         UINT dstBytesPerRow = 0;
 
         // For each raster tile pixel in row 0 (rx, 0)
@@ -104,15 +104,15 @@ struct StoreMacroTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to macro tile
     static void StoreClear(
-        const FLOAT *pColor,
+        const float *pColor,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y)
     {
         UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
 
-        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+        uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
 
-        FLOAT srcColor[4];
+        float srcColor[4];
 
         for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index 0f9e0ad4bd8..7c185e5e454 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -227,10 +227,10 @@ static uint16_t Convert32To16Float(float val)
 /// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
 template<SWR_FORMAT DstFormat>
 static void ConvertPixelFromFloat(
-    BYTE* pDstPixel,
+    uint8_t* pDstPixel,
     const float srcPixel[4])
 {
-    UINT outColor[4];  // typeless bits
+    uint32_t outColor[4] = { 0 };  // typeless bits
 
     // Store component
     for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
@@ -390,9 +390,9 @@ static void ConvertPixelFromFloat(
 template<SWR_FORMAT SrcFormat>
 INLINE static void ConvertPixelToFloat(
     float dstPixel[4],
-    const BYTE* pSrc)
+    const uint8_t* pSrc)
 {
-    UINT srcColor[4];  // typeless bits
+    uint32_t srcColor[4];  // typeless bits
 
     // unpack src pixel
     typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
@@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat(
     }
 
     // Convert components
-    for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+    for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
     {
         SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
 
-        UINT src = srcColor[comp];
+        uint32_t src = srcColor[comp];
 
         switch (type)
         {
@@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat(
         }
         case SWR_TYPE_UINT:
         {
-            UINT dst = (UINT)src;
+            uint32_t dst = (uint32_t)src;
             dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
             break;
         }
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 50f8e57c22a..381ac89a7b8 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,6 +28,7 @@
 #pragma once
 
 #include "core/state.h"
+#include "common/simdintrin.h"
 
 template<SWR_TILE_MODE mode, int>
 struct TilingTraits
@@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
     static UINT GetPdepY() { return 0x1ea; }
 };
 
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pdep_u32(a, mask);
-#else
-    UINT result = 0;
-
-    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html 
-    // using bsf instead of funky loop
-    DWORD maskIndex;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. populate LSB from src
-        const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
-        // 3. copy bit from mask
-        result |= LSB & lowest;
-
-        // 4. clear lowest bit
-        mask &= ~lowest;
-
-        // 5. prepare for next iteration
-        a >>= 1;
-    }
-
-    return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pext_u32(a, mask);
-#else
-    UINT result = 0;
-    DWORD maskIndex;
-    uint32_t currentBit = 0;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. copy bit from mask
-        result |= ((a & lowest) > 0) << currentBit++;
-
-        // 3. clear lowest bit
-        mask &= ~lowest;
-    }
-    return result;
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes the tileID for 2D tiled surfaces
 /// @param pitch - surface pitch in bytes
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
index 44ab69815b1..3d003fb4a33 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 8c51e1e8e73..0f3ded68544 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation.   All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -21,24 +21,20 @@
 
 # Python source
 KNOBS = [
-    ['ENABLE_ASSERT_DIALOGS', {
-        'type'      : 'bool',
-        'default'   : 'true',
-        'desc'      : ['Use dialogs when asserts fire.',
-                       'Asserts are only enabled in debug builds'],
-    }],
 
     ['SINGLE_THREADED', {
         'type'      : 'bool',
         'default'   : 'false',
         'desc'      : ['If enabled will perform all rendering on the API thread.',
                        'This is useful mainly for debugging purposes.'],
+        'category'  : 'debug',
     }],
 
     ['DUMP_SHADER_IR', {
-       'type'       : 'bool',
-       'default'    : 'false',
-       'desc'       : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'type'      : 'bool',
+        'default'   : 'false',
+        'desc'      : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'category'  : 'debug',
     }],
 
     ['USE_GENERIC_STORETILE', {
@@ -46,6 +42,7 @@ KNOBS = [
         'default'   : 'false',
         'desc'      : ['Always use generic function for performing StoreTile.',
                        'Will be slightly slower than using optimized (jitted) path'],
+        'category'  : 'debug',
     }],
 
     ['FAST_CLEAR', {
@@ -53,6 +50,7 @@ KNOBS = [
         'default'   : 'true',
         'desc'      : ['Replace 3D primitive execute with a SWRClearRT operation and',
                        'defer clear execution to first backend op on hottile, or hottile store'],
+        'category'  : 'perf',
     }],
 
     ['MAX_NUMA_NODES', {
@@ -61,6 +59,7 @@ KNOBS = [
         'desc'      : ['Maximum # of NUMA-nodes per system used for worker threads',
                        '  0 == ALL NUMA-nodes in the system',
                        '  N == Use at most N NUMA-nodes for rendering'],
+        'category'  : 'perf',
     }],
 
     ['MAX_CORES_PER_NUMA_NODE', {
@@ -69,6 +68,7 @@ KNOBS = [
         'desc'      : ['Maximum # of cores per NUMA-node used for worker threads.',
                        '  0 == ALL non-API thread cores per NUMA-node',
                        '  N == Use at most N cores per NUMA-node'],
+        'category'  : 'perf',
     }],
 
     ['MAX_THREADS_PER_CORE', {
@@ -77,6 +77,7 @@ KNOBS = [
         'desc'      : ['Maximum # of (hyper)threads per physical core used for worker threads.',
                        '  0 == ALL hyper-threads per core',
                        '  N == Use at most N hyper-threads per physical core'],
+        'category'  : 'perf',
     }],
 
     ['MAX_WORKER_THREADS', {
@@ -87,6 +88,7 @@ KNOBS = [
                        'IMPORTANT: If this is non-zero, no worker threads will be bound to',
                        'specific HW threads.  They will all be "floating" SW threads.',
                        'In this case, the above 3 KNOBS will be ignored.'],
+        'category'  : 'perf',
     }],
 
     ['BUCKETS_START_FRAME', {
@@ -96,6 +98,7 @@ KNOBS = [
                        '',
                        'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                        'for this to have an effect.'],
+        'category'  : 'perf',
     }],
 
     ['BUCKETS_END_FRAME', {
@@ -105,6 +108,7 @@ KNOBS = [
                        '',
                        'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                        'for this to have an effect.'],
+        'category'  : 'perf',
     }],
 
     ['WORKER_SPIN_LOOP_COUNT', {
@@ -112,46 +116,32 @@ KNOBS = [
         'default'   : '5000',
         'desc'      : ['Number of spin-loop iterations worker threads will perform',
                        'before going to sleep when waiting for work'],
+        'category'  : 'perf',
     }],
 
     ['MAX_DRAWS_IN_FLIGHT', {
         'type'      : 'uint32_t',
-        'default'   : '160',
+        'default'   : '96',
         'desc'      : ['Maximum number of draws outstanding before API thread blocks.'],
+        'category'  : 'perf',
     }],
 
     ['MAX_PRIMS_PER_DRAW', {
-       'type'       : 'uint32_t',
-       'default'    : '2040',
-       'desc'       : ['Maximum primitives in a single Draw().',
+        'type'      : 'uint32_t',
+        'default'   : '2040',
+        'desc'      : ['Maximum primitives in a single Draw().',
                        'Larger primitives are split into smaller Draw calls.',
                        'Should be a multiple of (3 * vectorWidth).'],
+        'category'  : 'perf',
     }],
 
     ['MAX_TESS_PRIMS_PER_DRAW', {
-       'type'       : 'uint32_t',
-       'default'    : '16',
-       'desc'       : ['Maximum primitives in a single Draw() with tessellation enabled.',
+        'type'      : 'uint32_t',
+        'default'   : '16',
+        'desc'      : ['Maximum primitives in a single Draw() with tessellation enabled.',
                        'Larger primitives are split into smaller Draw calls.',
                        'Should be a multiple of (vectorWidth).'],
-    }],
-
-    ['MAX_FRAC_ODD_TESS_FACTOR', {
-        'type'      : 'float',
-        'default'   : '63.0f',
-        'desc'      : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
-    }],
-
-    ['MAX_FRAC_EVEN_TESS_FACTOR', {
-        'type'      : 'float',
-        'default'   : '64.0f',
-        'desc'      : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
-    }],
-
-    ['MAX_INTEGER_TESS_FACTOR', {
-        'type'      : 'uint32_t',
-        'default'   : '64',
-        'desc'      : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+        'category'  : 'perf',
     }],
 
 
@@ -159,12 +149,14 @@ KNOBS = [
         'type'      : 'bool',
         'default'   : 'false',
         'desc'      : ['Enable threadviz output.'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_DRAW', {
         'type'      : 'bool',
         'default'   : 'false',
         'desc'      : ['Disable per-draw/dispatch execution'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_QUEUE_FE', {
@@ -173,6 +165,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at worker FE',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_FETCH', {
@@ -181,6 +174,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at vertex fetch',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_IA', {
@@ -189,6 +183,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at input assembler',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_VS', {
@@ -197,6 +192,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at vertex shader',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_SETUP_TRIS', {
@@ -205,6 +201,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at primitive setup',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_BIN_TRIS', {
@@ -213,6 +210,7 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at primitive binning',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category'  : 'perf',
     }],
 
     ['TOSS_RS', {
@@ -221,6 +219,5 @@ KNOBS = [
         'desc'      : ['Stop per-draw execution at rasterizer',
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-    }],
-
-]
+        'category'  : 'perf',
+    }],]
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 922117e7e16..521346ca833 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -10,7 +10,7 @@
         return ' '*(max_len - knob_len)
 %>/******************************************************************************
 *
-* Copyright 2015
+* Copyright 2015-2016
 * Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,11 @@ struct GlobalKnobs
     % for line in knob[1]['desc']:
     // ${line}
     % endfor
+    % if knob[1]['type'] == 'std::string':
+    DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}");
+    % else:
     DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+    % endif
 
     % endfor
     GlobalKnobs();
@@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
     str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
     % if knob[1]['type'] == 'bool':
     str << (KNOB_${knob[0]} ? "+\n" : "-\n");
-    % elif knob[1]['type'] != 'float':
+    % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
     str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
     str << std::dec << KNOB_${knob[0]} << "\n";
     % else:
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index 78b8fdf619b..46c79a14b2f 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -338,7 +338,6 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
    SWR_CREATECONTEXT_INFO createInfo;
    createInfo.driver = GL;
    createInfo.privateStateSize = sizeof(swr_draw_context);
-   createInfo.maxSubContexts = 0;
    createInfo.pfnLoadTile = swr_LoadHotTile;
    createInfo.pfnStoreTile = swr_StoreHotTile;
    createInfo.pfnClearTile = swr_StoreHotTileClear;
diff --git a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
index a2d89ef3349..8b65cac5084 100644
--- a/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
+++ b/src/gallium/drivers/vc4/vc4_nir_lower_txf_ms.c
@@ -23,7 +23,6 @@
 
 #include "vc4_qir.h"
 #include "kernel/vc4_packet.h"
-#include "tgsi/tgsi_info.h"
 #include "compiler/nir/nir_builder.h"
 
 /** @file vc4_nir_lower_txf_ms.c
diff --git a/src/gallium/drivers/virgl/virgl_tgsi.c b/src/gallium/drivers/virgl/virgl_tgsi.c
index 641b0b3e3b5..4a2271f24f4 100644
--- a/src/gallium/drivers/virgl/virgl_tgsi.c
+++ b/src/gallium/drivers/virgl/virgl_tgsi.c
@@ -40,6 +40,7 @@ virgl_tgsi_transform_property(struct tgsi_transform_context *ctx,
    switch (prop->Property.PropertyName) {
    case TGSI_PROPERTY_NUM_CLIPDIST_ENABLED:
    case TGSI_PROPERTY_NUM_CULLDIST_ENABLED:
+   case TGSI_PROPERTY_NEXT_SHADER:
       break;
    default:
       ctx->emit_property(ctx, prop);
diff --git a/src/gallium/include/pipe/p_context.h b/src/gallium/include/pipe/p_context.h
index ee68fdd6f6f..1c97e82ece5 100644
--- a/src/gallium/include/pipe/p_context.h
+++ b/src/gallium/include/pipe/p_context.h
@@ -162,7 +162,7 @@ struct pipe_context {
     *               item of that data to store (e.g. for
     *               PIPE_QUERY_PIPELINE_STATISTICS).
     *               When the index is -1, instead of the value of the query
-    *               the driver should instead write a 1/0 to the appropriate
+    *               the driver should instead write a 1 or 0 to the appropriate
     *               location with 1 meaning that the query result is available.
     */
    void (*get_query_result_resource)(struct pipe_context *pipe,
diff --git a/src/gallium/include/state_tracker/vdpau_dmabuf.h b/src/gallium/include/state_tracker/vdpau_dmabuf.h
new file mode 100644
index 00000000000..886c3445d81
--- /dev/null
+++ b/src/gallium/include/state_tracker/vdpau_dmabuf.h
@@ -0,0 +1,94 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ *      Christian König <[email protected]>
+ *
+ */
+
+#ifndef _VDPAU_DMABUF_H_
+#define _VDPAU_DMABUF_H_
+
+#include <vdpau/vdpau.h>
+
+/* driver specific functions for NV_vdpau_interop */
+#ifndef VDP_FUNC_ID_BASE_DRIVER
+#define VDP_FUNC_ID_BASE_DRIVER 0x2000
+#endif
+
+/* New DMA-buf based implementation */
+#define VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 2)
+#define VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF (VDP_FUNC_ID_BASE_DRIVER + 3)
+
+/* Define some more internal RGBA formats for more
+ * robust handling of Video Surfaces
+ */
+#define VDP_RGBA_FORMAT_R8          (-1)
+#define VDP_RGBA_FORMAT_R8G8        (-2)
+
+struct VdpSurfaceDMABufDesc {
+   /* DMA-buf file descriptor */
+   uint32_t handle;
+   /* Width in pixel */
+   uint32_t width;
+   /* Height in pixel */
+   uint32_t height;
+   /* Offset in bytes */
+   uint32_t offset;
+   /* Stride in bytes */
+   uint32_t stride;
+   /* VDP_RGBA_FORMAT_* as defined in the VDPAU spec and above. */
+   uint32_t format;
+};
+
+/**
+ * \brief Video surface planes
+ */
+typedef uint32_t VdpVideoSurfacePlane;
+
+/** \hideinitializer \brief Luma top field */
+#define VDP_VIDEO_SURFACE_PLANE_LUMA_TOP      ((VdpVideoSurfacePlane)0)
+/** \hideinitializer \brief Luma bottom field */
+#define VDP_VIDEO_SURFACE_PLANE_LUMA_BOTTOM   ((VdpVideoSurfacePlane)1)
+/** \hideinitializer \brief Chroma top field */
+#define VDP_VIDEO_SURFACE_PLANE_CHROMA_TOP    ((VdpVideoSurfacePlane)2)
+/** \hideinitializer \brief Chroma bottom field */
+#define VDP_VIDEO_SURFACE_PLANE_CHROMA_BOTTOM ((VdpVideoSurfacePlane)3)
+
+typedef VdpStatus VdpVideoSurfaceDMABuf(
+   VdpVideoSurface               surface,
+   VdpVideoSurfacePlane          plane,
+   struct VdpSurfaceDMABufDesc * result
+);
+
+typedef VdpStatus VdpOutputSurfaceDMABuf(
+   VdpVideoSurface               surface,
+   struct VdpSurfaceDMABufDesc * result
+);
+
+#endif /* _VDPAU_DMABUF_H_ */
diff --git a/src/gallium/include/state_tracker/vdpau_funcs.h b/src/gallium/include/state_tracker/vdpau_funcs.h
new file mode 100644
index 00000000000..66e3c23ede4
--- /dev/null
+++ b/src/gallium/include/state_tracker/vdpau_funcs.h
@@ -0,0 +1,65 @@
+/**************************************************************************
+ *
+ * Copyright 2016 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Authors:
+ *      Christian König <[email protected]>
+ *
+ */
+
+#ifndef _VDPAU_FUNCS_H_
+#define _VDPAU_FUNCS_H_
+
+#include "vdpau_dmabuf.h"
+
+/* Used for implementing NV_vdpau_interop */
+static inline enum pipe_format
+VdpFormatRGBAToPipe(uint32_t vdpau_format)
+{
+   switch (vdpau_format) {
+   case VDP_RGBA_FORMAT_R8:
+      return PIPE_FORMAT_R8_UNORM;
+   case VDP_RGBA_FORMAT_R8G8:
+      return PIPE_FORMAT_R8G8_UNORM;
+   case VDP_RGBA_FORMAT_A8:
+      return PIPE_FORMAT_A8_UNORM;
+   case VDP_RGBA_FORMAT_B10G10R10A2:
+      return PIPE_FORMAT_B10G10R10A2_UNORM;
+   case VDP_RGBA_FORMAT_B8G8R8A8:
+      return PIPE_FORMAT_B8G8R8A8_UNORM;
+   case VDP_RGBA_FORMAT_R10G10B10A2:
+      return PIPE_FORMAT_R10G10B10A2_UNORM;
+   case VDP_RGBA_FORMAT_R8G8B8A8:
+      return PIPE_FORMAT_R8G8B8A8_UNORM;
+   default:
+      assert(0);
+   }
+
+   return PIPE_FORMAT_NONE;
+}
+
+#endif /* _VDPAU_FUNCS_H_ */
diff --git a/src/gallium/include/state_tracker/vdpau_interop.h b/src/gallium/include/state_tracker/vdpau_interop.h
index 3ca7c9d4aa6..04d455a370a 100644
--- a/src/gallium/include/state_tracker/vdpau_interop.h
+++ b/src/gallium/include/state_tracker/vdpau_interop.h
@@ -35,8 +35,13 @@
 #define _VDPAU_INTEROP_H_
 
 /* driver specific functions for NV_vdpau_interop */
-
+#ifndef VDP_FUNC_ID_BASE_DRIVER
 #define VDP_FUNC_ID_BASE_DRIVER 0x2000
+#endif
+
+/* Older implementation relying on passing pipe_video_buffer and
+ * pipe_resources around. Deprecated and shouldn't be used for new things.
+ */
 #define VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 0)
 #define VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM (VDP_FUNC_ID_BASE_DRIVER + 1)
 
diff --git a/src/gallium/state_trackers/vdpau/bitmap.c b/src/gallium/state_trackers/vdpau/bitmap.c
index 97a428727a5..35c8820433d 100644
--- a/src/gallium/state_trackers/vdpau/bitmap.c
+++ b/src/gallium/state_trackers/vdpau/bitmap.c
@@ -71,7 +71,7 @@ vlVdpBitmapSurfaceCreate(VdpDevice device,
 
    memset(&res_tmpl, 0, sizeof(res_tmpl));
    res_tmpl.target = PIPE_TEXTURE_2D;
-   res_tmpl.format = FormatRGBAToPipe(rgba_format);
+   res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
    res_tmpl.width0 = width;
    res_tmpl.height0 = height;
    res_tmpl.depth0 = 1;
diff --git a/src/gallium/state_trackers/vdpau/ftab.c b/src/gallium/state_trackers/vdpau/ftab.c
index add465983e5..901a444f1c7 100644
--- a/src/gallium/state_trackers/vdpau/ftab.c
+++ b/src/gallium/state_trackers/vdpau/ftab.c
@@ -107,10 +107,12 @@ static void* ftab_winsys[1] =
    &vlVdpPresentationQueueTargetCreateX11  /* VDP_FUNC_ID_PRESENTATION_QUEUE_TARGET_CREATE_X11 */
 };
 
-static void* ftab_driver[2] =
+static void* ftab_driver[4] =
 {
    &vlVdpVideoSurfaceGallium, /* VDP_FUNC_ID_SURFACE_GALLIUM */
-   &vlVdpOutputSurfaceGallium /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */
+   &vlVdpOutputSurfaceGallium, /* VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM */
+   &vlVdpVideoSurfaceDMABuf, /* VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF */
+   &vlVdpOutputSurfaceDMABuf /* VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF */
 };
 
 boolean vlGetFuncFTAB(VdpFuncId function_id, void **func)
diff --git a/src/gallium/state_trackers/vdpau/output.c b/src/gallium/state_trackers/vdpau/output.c
index 3248f76808d..c644cc8ba85 100644
--- a/src/gallium/state_trackers/vdpau/output.c
+++ b/src/gallium/state_trackers/vdpau/output.c
@@ -36,6 +36,8 @@
 
 #include "vl/vl_csc.h"
 
+#include "state_tracker/drm_driver.h"
+
 #include "vdpau_private.h"
 
 /**
@@ -74,12 +76,13 @@ vlVdpOutputSurfaceCreate(VdpDevice device,
    memset(&res_tmpl, 0, sizeof(res_tmpl));
 
    res_tmpl.target = PIPE_TEXTURE_2D;
-   res_tmpl.format = FormatRGBAToPipe(rgba_format);
+   res_tmpl.format = VdpFormatRGBAToPipe(rgba_format);
    res_tmpl.width0 = width;
    res_tmpl.height0 = height;
    res_tmpl.depth0 = 1;
    res_tmpl.array_size = 1;
-   res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+   res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET |
+                   PIPE_BIND_LINEAR | PIPE_BIND_SHARED;
    res_tmpl.usage = PIPE_USAGE_DEFAULT;
 
    pipe_mutex_lock(dev->mutex);
@@ -763,3 +766,40 @@ struct pipe_resource *vlVdpOutputSurfaceGallium(VdpOutputSurface surface)
 
    return vlsurface->surface->texture;
 }
+
+VdpStatus vlVdpOutputSurfaceDMABuf(VdpVideoSurface surface,
+                                   struct VdpSurfaceDMABufDesc *result)
+{
+   vlVdpOutputSurface *vlsurface;
+   struct pipe_screen *pscreen;
+   struct winsys_handle whandle;
+
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
+   vlsurface = vlGetDataHTAB(surface);
+   if (!vlsurface || !vlsurface->surface)
+      return VDP_STATUS_INVALID_HANDLE;
+
+   pipe_mutex_lock(vlsurface->device->mutex);
+   vlVdpResolveDelayedRendering(vlsurface->device, NULL, NULL);
+   vlsurface->device->context->flush(vlsurface->device->context, NULL, 0);
+   pipe_mutex_unlock(vlsurface->device->mutex);
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+
+   pscreen = vlsurface->surface->texture->screen;
+   if (!pscreen->resource_get_handle(pscreen, vlsurface->surface->texture, &whandle,
+				     PIPE_HANDLE_USAGE_READ_WRITE))
+      return VDP_STATUS_NO_IMPLEMENTATION;
+
+   result->handle = whandle.handle;
+   result->width = vlsurface->surface->width;
+   result->height = vlsurface->surface->height;
+   result->offset = whandle.offset;
+   result->stride = whandle.stride;
+   result->format = PipeToFormatRGBA(vlsurface->surface->format);
+
+   return VDP_STATUS_OK;
+}
diff --git a/src/gallium/state_trackers/vdpau/query.c b/src/gallium/state_trackers/vdpau/query.c
index d41e6d950a7..a279ad3d020 100644
--- a/src/gallium/state_trackers/vdpau/query.c
+++ b/src/gallium/state_trackers/vdpau/query.c
@@ -224,7 +224,7 @@ vlVdpOutputSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba
    if (!pscreen)
       return VDP_STATUS_RESOURCES;
 
-   format = FormatRGBAToPipe(surface_rgba_format);
+   format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -276,7 +276,7 @@ vlVdpOutputSurfaceQueryGetPutBitsNativeCapabilities(VdpDevice device, VdpRGBAFor
    if (!pscreen)
       return VDP_STATUS_ERROR;
 
-   format = FormatRGBAToPipe(surface_rgba_format);
+   format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (format == PIPE_FORMAT_NONE || format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -317,7 +317,7 @@ vlVdpOutputSurfaceQueryPutBitsIndexedCapabilities(VdpDevice device,
    if (!pscreen)
       return VDP_STATUS_ERROR;
 
-   rgba_format = FormatRGBAToPipe(surface_rgba_format);
+   rgba_format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -376,7 +376,7 @@ vlVdpOutputSurfaceQueryPutBitsYCbCrCapabilities(VdpDevice device, VdpRGBAFormat
    if (!pscreen)
       return VDP_STATUS_ERROR;
 
-   rgba_format = FormatRGBAToPipe(surface_rgba_format);
+   rgba_format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (rgba_format == PIPE_FORMAT_NONE || rgba_format == PIPE_FORMAT_A8_UNORM)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
@@ -424,7 +424,7 @@ vlVdpBitmapSurfaceQueryCapabilities(VdpDevice device, VdpRGBAFormat surface_rgba
    if (!pscreen)
       return VDP_STATUS_RESOURCES;
 
-   format = FormatRGBAToPipe(surface_rgba_format);
+   format = VdpFormatRGBAToPipe(surface_rgba_format);
    if (format == PIPE_FORMAT_NONE)
       return VDP_STATUS_INVALID_RGBA_FORMAT;
 
diff --git a/src/gallium/state_trackers/vdpau/surface.c b/src/gallium/state_trackers/vdpau/surface.c
index ffcedc12de6..d418d56a26a 100644
--- a/src/gallium/state_trackers/vdpau/surface.c
+++ b/src/gallium/state_trackers/vdpau/surface.c
@@ -37,6 +37,8 @@
 #include "util/u_video.h"
 #include "vl/vl_defines.h"
 
+#include "state_tracker/drm_driver.h"
+
 #include "vdpau_private.h"
 
 enum getbits_conversion {
@@ -412,3 +414,70 @@ struct pipe_video_buffer *vlVdpVideoSurfaceGallium(VdpVideoSurface surface)
 
    return p_surf->video_buffer;
 }
+
+VdpStatus vlVdpVideoSurfaceDMABuf(VdpVideoSurface surface,
+                                  VdpVideoSurfacePlane plane,
+                                  struct VdpSurfaceDMABufDesc *result)
+{
+   vlVdpSurface *p_surf = vlGetDataHTAB(surface);
+
+   struct pipe_screen *pscreen;
+   struct winsys_handle whandle;
+
+   struct pipe_surface *surf;
+
+   if (!p_surf)
+      return VDP_STATUS_INVALID_HANDLE;
+
+   if (plane > 3)
+      return VDP_STATUS_INVALID_VALUE;
+
+   if (!result)
+      return VDP_STATUS_INVALID_POINTER;
+
+   memset(result, 0, sizeof(*result));
+   result->handle = -1;
+
+   pipe_mutex_lock(p_surf->device->mutex);
+   if (p_surf->video_buffer == NULL) {
+      struct pipe_context *pipe = p_surf->device->context;
+
+      /* try to create a video buffer if we don't already have one */
+      p_surf->video_buffer = pipe->create_video_buffer(pipe, &p_surf->templat);
+   }
+
+   /* Check if surface match interop requirements */
+   if (p_surf->video_buffer == NULL || !p_surf->video_buffer->interlaced ||
+       p_surf->video_buffer->buffer_format != PIPE_FORMAT_NV12) {
+      pipe_mutex_unlock(p_surf->device->mutex);
+      return VDP_STATUS_NO_IMPLEMENTATION;
+   }
+
+   surf = p_surf->video_buffer->get_surfaces(p_surf->video_buffer)[plane];
+   pipe_mutex_unlock(p_surf->device->mutex);
+
+   if (!surf)
+      return VDP_STATUS_RESOURCES;
+
+   memset(&whandle, 0, sizeof(struct winsys_handle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.layer = surf->u.tex.first_layer;
+
+   pscreen = surf->texture->screen;
+   if (!pscreen->resource_get_handle(pscreen, surf->texture, &whandle,
+				     PIPE_HANDLE_USAGE_READ_WRITE))
+      return VDP_STATUS_NO_IMPLEMENTATION;
+
+   result->handle = whandle.handle;
+   result->width = surf->width;
+   result->height = surf->height;
+   result->offset = whandle.offset;
+   result->stride = whandle.stride;
+
+   if (surf->format == PIPE_FORMAT_R8_UNORM)
+      result->format = VDP_RGBA_FORMAT_R8;
+   else
+      result->format = VDP_RGBA_FORMAT_R8G8;
+
+   return VDP_STATUS_OK;
+}
diff --git a/src/gallium/state_trackers/vdpau/vdpau_private.h b/src/gallium/state_trackers/vdpau/vdpau_private.h
index 27ac44cd9c1..3b6647e9975 100644
--- a/src/gallium/state_trackers/vdpau/vdpau_private.h
+++ b/src/gallium/state_trackers/vdpau/vdpau_private.h
@@ -37,6 +37,8 @@
 #include "pipe/p_video_codec.h"
 
 #include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_dmabuf.h"
+#include "state_tracker/vdpau_funcs.h"
 
 #include "util/u_debug.h"
 #include "util/u_rect.h"
@@ -161,27 +163,6 @@ PipeToFormatYCBCR(enum pipe_format p_format)
    return -1;
 }
 
-static inline enum pipe_format
-FormatRGBAToPipe(VdpRGBAFormat vdpau_format)
-{
-   switch (vdpau_format) {
-      case VDP_RGBA_FORMAT_A8:
-         return PIPE_FORMAT_A8_UNORM;
-      case VDP_RGBA_FORMAT_B10G10R10A2:
-         return PIPE_FORMAT_B10G10R10A2_UNORM;
-      case VDP_RGBA_FORMAT_B8G8R8A8:
-         return PIPE_FORMAT_B8G8R8A8_UNORM;
-      case VDP_RGBA_FORMAT_R10G10B10A2:
-         return PIPE_FORMAT_R10G10B10A2_UNORM;
-      case VDP_RGBA_FORMAT_R8G8B8A8:
-         return PIPE_FORMAT_R8G8B8A8_UNORM;
-      default:
-         assert(0);
-   }
-
-   return PIPE_FORMAT_NONE;
-}
-
 static inline VdpRGBAFormat
 PipeToFormatRGBA(enum pipe_format p_format)
 {
@@ -542,6 +523,8 @@ VdpPresentationQueueTargetCreateX11 vlVdpPresentationQueueTargetCreateX11;
 /* interop to mesa state tracker */
 VdpVideoSurfaceGallium vlVdpVideoSurfaceGallium;
 VdpOutputSurfaceGallium vlVdpOutputSurfaceGallium;
+VdpVideoSurfaceDMABuf vlVdpVideoSurfaceDMABuf;
+VdpOutputSurfaceDMABuf vlVdpOutputSurfaceDMABuf;
 
 #define VDPAU_OUT   0
 #define VDPAU_ERR   1
diff --git a/src/gallium/state_trackers/xa/xa_tgsi.c b/src/gallium/state_trackers/xa/xa_tgsi.c
index 5d8b8079c4b..a50393d7886 100644
--- a/src/gallium/state_trackers/xa/xa_tgsi.c
+++ b/src/gallium/state_trackers/xa/xa_tgsi.c
@@ -339,6 +339,16 @@ create_yuv_shader(struct pipe_context *pipe, struct ureg_program *ureg)
     u_sampler = ureg_DECL_sampler(ureg, 1);
     v_sampler = ureg_DECL_sampler(ureg, 2);
 
+    ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+    ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+    ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
+
     matrow0 = ureg_DECL_constant(ureg, 0);
     matrow1 = ureg_DECL_constant(ureg, 1);
     matrow2 = ureg_DECL_constant(ureg, 2);
@@ -475,6 +485,9 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits)
     }
     if (is_composite) {
 	src_sampler = ureg_DECL_sampler(ureg, 0);
+        ureg_DECL_sampler_view(ureg, 0, TGSI_TEXTURE_2D,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
 	src_input = ureg_DECL_fs_input(ureg,
 				       TGSI_SEMANTIC_GENERIC, 0,
 				       TGSI_INTERPOLATE_PERSPECTIVE);
@@ -494,12 +507,18 @@ create_fs(struct pipe_context *pipe, unsigned fs_traits)
 
     if (has_mask) {
 	mask_sampler = ureg_DECL_sampler(ureg, 1);
+        ureg_DECL_sampler_view(ureg, 1, TGSI_TEXTURE_2D,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                               TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
 	mask_pos = ureg_DECL_fs_input(ureg,
 				      TGSI_SEMANTIC_GENERIC, 1,
 				      TGSI_INTERPOLATE_PERSPECTIVE);
     }
 #if 0				/* unused right now */
     dst_sampler = ureg_DECL_sampler(ureg, 2);
+    ureg_DECL_sampler_view(ureg, 2, TGSI_TEXTURE_2D,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT,
+                           TGSI_RETURN_TYPE_FLOAT, TGSI_RETURN_TYPE_FLOAT);
     dst_pos = ureg_DECL_fs_input(ureg,
 				 TGSI_SEMANTIC_POSITION, 2,
 				 TGSI_INTERPOLATE_PERSPECTIVE);
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
index 570216241d1..7c5d29a2166 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.cpp
@@ -351,6 +351,8 @@ AddrChipFamily CIAddrLib::HwlConvertChipFamily(
             m_settings.isIceland         = ASICREV_IS_ICELAND_M(uChipRevision);
             m_settings.isTonga           = ASICREV_IS_TONGA_P(uChipRevision);
             m_settings.isFiji            = ASICREV_IS_FIJI_P(uChipRevision);
+            m_settings.isPolaris10       = ASICREV_IS_POLARIS10_P(uChipRevision);
+            m_settings.isPolaris11       = ASICREV_IS_POLARIS11_M(uChipRevision);
             break;
         case FAMILY_CZ:
             m_settings.isCarrizo         = 1;
@@ -403,7 +405,7 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
 
     // @todo: VI
     // Move this to VI code path once created
-    if (m_settings.isTonga)
+    if (m_settings.isTonga || m_settings.isPolaris10)
     {
         m_pipes = 8;
     }
@@ -415,6 +417,10 @@ BOOL_32 CIAddrLib::HwlInitGlobalParams(
     {
         m_pipes = 16;
     }
+    else if (m_settings.isPolaris11)
+    {
+        m_pipes = 4;
+    }
 
     if (valid)
     {
diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
index 4cbe9706baa..de995fa4058 100644
--- a/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
+++ b/src/gallium/winsys/amdgpu/drm/addrlib/r800/ciaddrlib.h
@@ -60,6 +60,8 @@ struct CIChipSettings
         UINT_32 isIceland         : 1;
         UINT_32 isTonga           : 1;
         UINT_32 isFiji            : 1;
+        UINT_32 isPolaris10       : 1;
+        UINT_32 isPolaris11       : 1;
         // VI fusion (Carrizo)
         UINT_32 isCarrizo         : 1;
     };
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
index 90fe0cd50f1..40b835c2248 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_id.h
@@ -138,6 +138,10 @@ enum {
 
 	VI_FIJI_P_A0      = 60,
 
+	VI_POLARIS10_P_A0 = 80,
+
+	VI_POLARIS11_M_A0 = 90,
+
 	VI_UNKNOWN        = 0xFF
 };
 
@@ -147,7 +151,11 @@ enum {
 #define ASICREV_IS_TONGA_P(eChipRev)	\
 	((eChipRev >= VI_TONGA_P_A0) && (eChipRev < VI_FIJI_P_A0))
 #define ASICREV_IS_FIJI_P(eChipRev)	\
-	(eChipRev >= VI_FIJI_P_A0)
+	((eChipRev >= VI_FIJI_P_A0)  && (eChipRev < VI_POLARIS10_P_A0))
+#define ASICREV_IS_POLARIS10_P(eChipRev)\
+	((eChipRev >= VI_POLARIS10_P_A0) && (eChipRev < VI_POLARIS11_M_A0))
+#define ASICREV_IS_POLARIS11_M(eChipRev)   \
+	(eChipRev >= VI_POLARIS11_M_A0)
 
 /* CZ specific rev IDs */
 enum {
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index 938b9c244b2..87d9a6aebec 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -237,6 +237,14 @@ static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd)
       ws->family = FAMILY_VI;
       ws->rev_id = VI_FIJI_P_A0;
       break;
+   case CHIP_POLARIS10:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_POLARIS10_P_A0;
+      break;
+   case CHIP_POLARIS11:
+      ws->family = FAMILY_VI;
+      ws->rev_id = VI_POLARIS11_M_A0;
+      break;
    default:
       fprintf(stderr, "amdgpu: Unknown family.\n");
       goto fail;
diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am
index 8421af48854..fff6805fbcd 100644
--- a/src/mapi/glapi/gen/Makefile.am
+++ b/src/mapi/glapi/gen/Makefile.am
@@ -89,18 +89,7 @@ EXTRA_DIST= \
 XORG_GLX_DIR = $(XORG_BASE)/glx
 XORG_GLAPI_DIR = $(XORG_BASE)/glx
 
-XORG_GLAPI_OUTPUTS = \
-	$(XORG_GLAPI_DIR)/glprocs.h \
-	$(XORG_GLAPI_DIR)/glapitable.h \
-	$(XORG_GLAPI_DIR)/dispatch.h
-
-if HAVE_APPLEDRI
-XORG_GLAPI_OUTPUTS += \
-	$(XORG_GLAPI_DIR)/glapi_gentable.c
-endif
-
 XORG_OUTPUTS = \
-	$(XORG_GLAPI_OUTPUTS) \
 	$(XORG_GLX_DIR)/indirect_dispatch.c \
 	$(XORG_GLX_DIR)/indirect_dispatch_swap.c \
 	$(XORG_GLX_DIR)/indirect_dispatch.h \
@@ -111,6 +100,8 @@ XORG_OUTPUTS = \
 	$(XORG_GLX_DIR)/indirect_size_get.h \
 	$(XORG_GLX_DIR)/indirect_table.c
 
+.PHONY: $(XORG_OUTPUTS)
+
 ######################################################################
 
 API_XML = \
@@ -330,7 +321,7 @@ $(XORG_GLX_DIR)/indirect_dispatch.h: glX_proto_recv.py gl_and_glX_API.xml $(COMM
 
 $(XORG_GLX_DIR)/indirect_size_get.h: glX_proto_size.py $(COMMON_GLX)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m size_h \
-           --only-get -h '_INDIRECT_SIZE_GET_H_' \
+           --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \
 	  | $(INDENT) $(XORG_INDENT_FLAGS) > $@
 
 $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX)
@@ -339,7 +330,7 @@ $(XORG_GLX_DIR)/indirect_size_get.c: glX_proto_size.py $(COMMON_GLX)
 
 $(XORG_GLX_DIR)/indirect_reqsize.h: glX_proto_size.py $(COMMON_GLX)
 	$(PYTHON_GEN) $< -f $(srcdir)/gl_API.xml -m reqsize_h \
-           --only-get -h '_INDIRECT_SIZE_GET_H_' \
+           --only-get --header-tag '_INDIRECT_SIZE_GET_H_' \
 	  | $(INDENT) $(XORG_INDENT_FLAGS) > $@
 
 $(XORG_GLX_DIR)/indirect_reqsize.c: glX_proto_size.py $(COMMON_GLX)
diff --git a/src/mapi/glapi/gen/apiexec.py b/src/mapi/glapi/gen/apiexec.py
index 2a8043264eb..b4f4cf6831b 100644
--- a/src/mapi/glapi/gen/apiexec.py
+++ b/src/mapi/glapi/gen/apiexec.py
@@ -68,7 +68,7 @@ class exec_info():
 functions = {
     # OpenGL 3.1 / GL_ARB_texture_buffer_object.  Mesa only exposes this
     # extension with core profile.
-    "TexBuffer": exec_info(core=31),
+    "TexBuffer": exec_info(core=31, es2=31),
 
     # OpenGL 3.2 / GL_OES_geometry_shader.
     "FramebufferTexture": exec_info(core=32, es2=31),
@@ -146,7 +146,7 @@ functions = {
 
     # OpenGL 4.3 / GL_ARB_texture_buffer_range.  Mesa can expose the extension
     # with OpenGL 3.1.
-    "TexBufferRange": exec_info(core=31),
+    "TexBufferRange": exec_info(core=31, es2=31),
 
     # OpenGL 4.3 / GL_ARB_framebuffer_no_attachments.  Mesa can expose the
     # extension with OpenGL 3.0.
diff --git a/src/mapi/glapi/gen/es_EXT.xml b/src/mapi/glapi/gen/es_EXT.xml
index 178f7c027bc..3b2c15ebf5c 100644
--- a/src/mapi/glapi/gen/es_EXT.xml
+++ b/src/mapi/glapi/gen/es_EXT.xml
@@ -798,6 +798,12 @@
     </function>
 </category>
 
+<category name="GL_OES_sample_shading" number="169">
+    <function name="MinSampleShadingOES" alias="MinSampleShading" es2="3.0">
+        <param name="value" type="GLfloat"/>
+    </function>
+</category>
+
 <!-- 174. GL_OES_texture_storage_multisample_2d_array -->
 <category name="GL_OES_texture_storage_multisample_2d_array" number="174">
     <enum name="TEXTURE_2D_MULTISAMPLE_ARRAY_OES"              value="0x9102"/>
@@ -817,6 +823,59 @@
     </function>
 </category>
 
+<category name="GL_EXT_draw_buffers_indexed" number="176">
+
+  <function name="BlendFunciEXT" alias="BlendFunciARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactor" type="GLenum"/>
+    <param name="dfactor" type="GLenum"/>
+  </function>
+
+  <function name="BlendFuncSeparateiEXT" alias="BlendFuncSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactorRGB" type="GLenum"/>
+    <param name="dfactorRGB" type="GLenum"/>
+    <param name="sfactorAlpha" type="GLenum"/>
+    <param name="dfactorAlpha" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationiEXT" alias="BlendEquationiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="mode" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationSeparateiEXT" alias="BlendEquationSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="modeRGB" type="GLenum"/>
+    <param name="modeA" type="GLenum"/>
+  </function>
+
+  <function name="ColorMaskiEXT" alias="ColorMaski" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="r" type="GLboolean"/>
+    <param name="g" type="GLboolean"/>
+    <param name="b" type="GLboolean"/>
+    <param name="a" type="GLboolean"/>
+  </function>
+
+  <function name="EnableiEXT" alias="Enablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="DisableiEXT" alias="Disablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="IsEnablediEXT" alias="IsEnabledi" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+    <return type="GLboolean"/>
+  </function>
+
+</category>
+
 <category name="GL_EXT_texture_border_clamp" number="182">
 
     <!-- The *TexParameter* functions are added in EXT_texture_integer -->
@@ -847,6 +906,24 @@
 
 </category>
 
+<category name="GL_EXT_texture_buffer" number="183">
+
+    <function name="TexBufferEXT" es2="3.1" alias="TexBuffer">
+        <param name="target" type="GLenum"/>
+        <param name="internalFormat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+    </function>
+
+    <function name="TexBufferRangeEXT" es2="3.1" alias="TexBufferRange">
+        <param name="target" type="GLenum"/>
+        <param name="internalformat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+        <param name="offset" type="GLintptr"/>
+        <param name="size" type="GLsizeiptr"/>
+    </function>
+
+</category>
+
 <category name="GL_EXT_draw_elements_base_vertex" number="204">
 
     <function name="DrawElementsBaseVertexEXT" alias="DrawElementsBaseVertex"
@@ -891,6 +968,99 @@
 
 </category>
 
+<category name="GL_EXT_copy_image" number="208">
+
+    <function name="CopyImageSubDataEXT" alias="CopyImageSubData" es2="3.0">
+        <param name="srcName" type="GLuint"/>
+        <param name="srcTarget" type="GLenum"/>
+        <param name="srcLevel" type="GLint"/>
+        <param name="srcX" type="GLint"/>
+        <param name="srcY" type="GLint"/>
+        <param name="srcZ" type="GLint"/>
+        <param name="dstName" type="GLuint"/>
+        <param name="dstTarget" type="GLenum"/>
+        <param name="dstLevel" type="GLint"/>
+        <param name="dstX" type="GLint"/>
+        <param name="dstY" type="GLint"/>
+        <param name="dstZ" type="GLint"/>
+        <param name="srcWidth" type="GLsizei"/>
+        <param name="srcHeight" type="GLsizei"/>
+        <param name="srcDepth" type="GLsizei"/>
+    </function>
+
+</category>
+
+<category name="GL_OES_draw_buffers_indexed" number="209">
+
+  <function name="BlendFunciOES" alias="BlendFunciARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactor" type="GLenum"/>
+    <param name="dfactor" type="GLenum"/>
+  </function>
+
+  <function name="BlendFuncSeparateiOES" alias="BlendFuncSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="sfactorRGB" type="GLenum"/>
+    <param name="dfactorRGB" type="GLenum"/>
+    <param name="sfactorAlpha" type="GLenum"/>
+    <param name="dfactorAlpha" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationiOES" alias="BlendEquationiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="mode" type="GLenum"/>
+  </function>
+
+  <function name="BlendEquationSeparateiOES" alias="BlendEquationSeparateiARB" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="modeRGB" type="GLenum"/>
+    <param name="modeA" type="GLenum"/>
+  </function>
+
+  <function name="ColorMaskiOES" alias="ColorMaski" es2="3.0">
+    <param name="buf" type="GLuint"/>
+    <param name="r" type="GLboolean"/>
+    <param name="g" type="GLboolean"/>
+    <param name="b" type="GLboolean"/>
+    <param name="a" type="GLboolean"/>
+  </function>
+
+  <function name="EnableiOES" alias="Enablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="DisableiOES" alias="Disablei" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+  </function>
+
+  <function name="IsEnablediOES" alias="IsEnabledi" es2="3.0">
+    <param name="target" type="GLenum"/>
+    <param name="index" type="GLuint"/>
+    <return type="GLboolean"/>
+  </function>
+
+</category>
+
+<category name="GL_OES_texture_buffer" number="216">
+
+    <function name="TexBufferOES" es2="3.1" alias="TexBuffer">
+        <param name="target" type="GLenum"/>
+        <param name="internalFormat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+    </function>
+
+    <function name="TexBufferRangeOES" es2="3.1" alias="TexBufferRange">
+        <param name="target" type="GLenum"/>
+        <param name="internalformat" type="GLenum"/>
+        <param name="buffer" type="GLuint"/>
+        <param name="offset" type="GLintptr"/>
+        <param name="size" type="GLsizeiptr"/>
+    </function>
+
+</category>
+
 <category name="GL_OES_draw_elements_base_vertex" number="219">
 
     <function name="DrawElementsBaseVertexOES" alias="DrawElementsBaseVertex"
@@ -971,6 +1141,28 @@
 
 </category>
 
+<category name="GL_OES_copy_image" number="208">
+
+    <function name="CopyImageSubDataOES" alias="CopyImageSubData" es2="3.0">
+        <param name="srcName" type="GLuint"/>
+        <param name="srcTarget" type="GLenum"/>
+        <param name="srcLevel" type="GLint"/>
+        <param name="srcX" type="GLint"/>
+        <param name="srcY" type="GLint"/>
+        <param name="srcZ" type="GLint"/>
+        <param name="dstName" type="GLuint"/>
+        <param name="dstTarget" type="GLenum"/>
+        <param name="dstLevel" type="GLint"/>
+        <param name="dstX" type="GLint"/>
+        <param name="dstY" type="GLint"/>
+        <param name="dstZ" type="GLint"/>
+        <param name="srcWidth" type="GLsizei"/>
+        <param name="srcHeight" type="GLsizei"/>
+        <param name="srcDepth" type="GLsizei"/>
+    </function>
+
+</category>
+
 <!-- 175. GL_OES_geometry_shader -->
 <category name="GL_OES_geometry_shader" number="210">
     <enum name="GEOMETRY_SHADER_OES"                             value="0x8DD9"/>
diff --git a/src/mapi/glapi/gen/glX_proto_recv.py b/src/mapi/glapi/gen/glX_proto_recv.py
index 5d95f278a91..afee3882254 100644
--- a/src/mapi/glapi/gen/glX_proto_recv.py
+++ b/src/mapi/glapi/gen/glX_proto_recv.py
@@ -55,15 +55,15 @@ class PrintGlxDispatch_h(gl_XML.gl_print_base):
             if not func.ignore and not func.vectorequiv:
                 if func.glx_rop:
                     print 'extern _X_HIDDEN void __glXDisp_%s(GLbyte * pc);' % (func.name)
-                    print 'extern _X_HIDDEN void __glXDispSwap_%s(GLbyte * pc);' % (func.name)
+                    print 'extern _X_HIDDEN _X_COLD void __glXDispSwap_%s(GLbyte * pc);' % (func.name)
                 elif func.glx_sop or func.glx_vendorpriv:
                     print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
-                    print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
+                    print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (func.name)
 
                     if func.glx_sop and func.glx_vendorpriv:
                         n = func.glx_vendorpriv_names[0]
                         print 'extern _X_HIDDEN int __glXDisp_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
-                        print 'extern _X_HIDDEN int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
+                        print 'extern _X_HIDDEN _X_COLD int __glXDispSwap_%s(struct __GLXclientStateRec *, GLbyte *);' % (n)
 
         return
 
@@ -80,21 +80,14 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
 
     def printRealHeader(self):
-        print '#include <X11/Xmd.h>'
-        print '#include <GL/gl.h>'
-        print '#include <GL/glxproto.h>'
-
         print '#include <inttypes.h>'
+        print '#include "glxserver.h"'
         print '#include "indirect_size.h"'
         print '#include "indirect_size_get.h"'
         print '#include "indirect_dispatch.h"'
-        print '#include "glxserver.h"'
         print '#include "glxbyteorder.h"'
         print '#include "indirect_util.h"'
         print '#include "singlesize.h"'
-        print '#include "glapi.h"'
-        print '#include "glapitable.h"'
-        print '#include "dispatch.h"'
         print ''
         print '#define __GLX_PAD(x)  (((x) + 3) & ~3)'
         print ''
@@ -124,6 +117,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
         return
 
+    def fptrType(self, name):
+	fptr = "pfngl" + name + "proc"
+	return fptr.upper()
 
     def printFunction(self, f, name):
         if (f.glx_sop or f.glx_vendorpriv) and (len(f.get_images()) != 0):
@@ -141,6 +137,9 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
         print '{'
 
+        if not f.is_abi():
+            print '    %s %s = __glGetProcAddress("gl%s");' % (self.fptrType(name), name, name)
+
         if f.glx_rop or f.vectorequiv:
             self.printRenderFunction(f)
         elif f.glx_sop or f.glx_vendorpriv:
@@ -225,6 +224,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
     def emit_function_call(self, f, retval_assign, indent):
         list = []
+        prefix = "gl" if f.is_abi() else ""
 
         for param in f.parameterIterator():
             if param.is_padding:
@@ -237,14 +237,7 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
 
             list.append( '%s        %s' % (indent, location) )
 
-
-        if len( list ):
-            print '%s    %sCALL_%s( GET_DISPATCH(), (' % (indent, retval_assign, f.name)
-            print string.join( list, ",\n" )
-            print '%s    ) );' % (indent)
-        else:
-            print '%s    %sCALL_%s( GET_DISPATCH(), () );' % (indent, retval_assign, f.name)
-        return
+        print '%s    %s%s%s(%s);' % (indent, retval_assign, prefix, f.name, string.join(list, ',\n'))
 
 
     def common_func_print_just_start(self, f, indent):
@@ -444,6 +437,10 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
                 print '        %s %s = __glXGetAnswerBuffer(cl, %s%s, answerBuffer, sizeof(answerBuffer), %u);' % (param.type_string(), param.name, param.counter, size_scale, type_size)
                 answer_string = param.name
                 answer_count = param.counter
+                print ''
+                print '        if (%s == NULL) return BadAlloc;' % (param.name)
+                print '        __glXClearErrorOccured();'
+                print ''
             elif c >= 1:
                 print '        %s %s[%u];' % (answer_type, param.name, c)
                 answer_string = param.name
@@ -507,18 +504,18 @@ class PrintGlxDispatchFunctions(glX_proto_common.glx_print_proto):
             # the must NEVER be byte-swapped.
 
             if not (img.img_type == "GL_BITMAP" and img.img_format == "GL_COLOR_INDEX"):
-                print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SWAP_BYTES,   hdr->swapBytes) );'
+                print '    glPixelStorei(GL_UNPACK_SWAP_BYTES, hdr->swapBytes);'
 
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_LSB_FIRST,    hdr->lsbFirst) );'
+            print '    glPixelStorei(GL_UNPACK_LSB_FIRST, hdr->lsbFirst);'
 
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ROW_LENGTH,   (GLint) %shdr->rowLength%s) );' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_ROW_LENGTH, (GLint) %shdr->rowLength%s);' % (pre, post)
             if img.depth:
-                print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s) );' % (pre, post)
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_ROWS,    (GLint) %shdr->skipRows%s) );' % (pre, post)
+                print '    glPixelStorei(GL_UNPACK_IMAGE_HEIGHT, (GLint) %shdr->imageHeight%s);' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_SKIP_ROWS, (GLint) %shdr->skipRows%s);' % (pre, post)
             if img.depth:
-                print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_IMAGES,  (GLint) %shdr->skipImages%s) );' % (pre, post)
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_SKIP_PIXELS,  (GLint) %shdr->skipPixels%s) );' % (pre, post)
-            print '    CALL_PixelStorei( GET_DISPATCH(), (GL_UNPACK_ALIGNMENT,    (GLint) %shdr->alignment%s) );' % (pre, post)
+                print '    glPixelStorei(GL_UNPACK_SKIP_IMAGES, (GLint) %shdr->skipImages%s);' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_SKIP_PIXELS, (GLint) %shdr->skipPixels%s);' % (pre, post)
+            print '    glPixelStorei(GL_UNPACK_ALIGNMENT, (GLint) %shdr->alignment%s);' % (pre, post)
             print ''
 
 
diff --git a/src/mesa/Android.libmesa_dricore.mk b/src/mesa/Android.libmesa_dricore.mk
index a3e6c6d55ae..d7647a76bd0 100644
--- a/src/mesa/Android.libmesa_dricore.mk
+++ b/src/mesa/Android.libmesa_dricore.mk
@@ -48,9 +48,8 @@ endif # x86
 endif # MESA_ENABLE_ASM
 
 ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
-LOCAL_SRC_FILES += \
-	main/streaming-load-memcpy.c \
-	main/sse_minmax.c
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libmesa_sse41
 LOCAL_CFLAGS := \
 	-msse4.1 \
        -DUSE_SSE41
@@ -63,7 +62,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/gallium/include \
 	$(MESA_TOP)/src/gallium/auxiliary
 
-LOCAL_WHOLE_STATIC_LIBRARIES := \
+LOCAL_WHOLE_STATIC_LIBRARIES += \
 	libmesa_program
 
 include $(LOCAL_PATH)/Android.gen.mk
diff --git a/src/mesa/Android.libmesa_sse41.mk b/src/mesa/Android.libmesa_sse41.mk
new file mode 100644
index 00000000000..8562da60193
--- /dev/null
+++ b/src/mesa/Android.libmesa_sse41.mk
@@ -0,0 +1,44 @@
+# Copyright 2012 Intel Corporation
+# Copyright (C) 2010-2011 Chia-I Wu <[email protected]>
+# Copyright (C) 2010-2011 LunarG Inc.
+#
+# Permission is hereby granted, free of charge, to any person obtaining a
+# copy of this software and associated documentation files (the "Software"),
+# to deal in the Software without restriction, including without limitation
+# the rights to use, copy, modify, merge, publish, distribute, sublicense,
+# and/or sell copies of the Software, and to permit persons to whom the
+# Software is furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included
+# in all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+# DEALINGS IN THE SOFTWARE.
+
+ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+
+LOCAL_PATH := $(call my-dir)
+
+include $(LOCAL_PATH)/Makefile.sources
+
+include $(CLEAR_VARS)
+
+LOCAL_MODULE := libmesa_sse41
+
+LOCAL_SRC_FILES += \
+	$(X86_SSE41_FILES)
+
+LOCAL_C_INCLUDES := \
+	$(MESA_TOP)/src/mapi \
+	$(MESA_TOP)/src/gallium/include \
+	$(MESA_TOP)/src/gallium/auxiliary
+
+include $(MESA_COMMON_MK)
+include $(BUILD_STATIC_LIBRARY)
+
+endif
diff --git a/src/mesa/Android.libmesa_st_mesa.mk b/src/mesa/Android.libmesa_st_mesa.mk
index 9fd9460a5ba..bbd39562785 100644
--- a/src/mesa/Android.libmesa_st_mesa.mk
+++ b/src/mesa/Android.libmesa_st_mesa.mk
@@ -47,6 +47,8 @@ endif # x86
 endif # MESA_ENABLE_ASM
 
 ifeq ($(ARCH_X86_HAVE_SSE4_1),true)
+LOCAL_WHOLE_STATIC_LIBRARIES := \
+	libmesa_sse41
 LOCAL_CFLAGS := \
        -DUSE_SSE41
 endif
@@ -58,7 +60,7 @@ LOCAL_C_INCLUDES := \
 	$(MESA_TOP)/src/gallium/auxiliary \
 	$(MESA_TOP)/src/gallium/include
 
-LOCAL_WHOLE_STATIC_LIBRARIES := \
+LOCAL_WHOLE_STATIC_LIBRARIES += \
 	libmesa_program
 
 include $(LOCAL_PATH)/Android.gen.mk
diff --git a/src/mesa/Android.mk b/src/mesa/Android.mk
index 20f781948be..9a1aef8b28e 100644
--- a/src/mesa/Android.mk
+++ b/src/mesa/Android.mk
@@ -24,5 +24,6 @@ include $(LOCAL_PATH)/Android.mesa_gen_matypes.mk
 include $(LOCAL_PATH)/Android.libmesa_glsl_utils.mk
 include $(LOCAL_PATH)/Android.libmesa_dricore.mk
 include $(LOCAL_PATH)/Android.libmesa_st_mesa.mk
+include $(LOCAL_PATH)/Android.libmesa_sse41.mk
 
 include $(LOCAL_PATH)/program/Android.mk
diff --git a/src/mesa/Makefile.sources b/src/mesa/Makefile.sources
index a6c12c64828..7425f01273d 100644
--- a/src/mesa/Makefile.sources
+++ b/src/mesa/Makefile.sources
@@ -395,6 +395,7 @@ VBO_FILES = \
 	vbo/vbo_split_inplace.c
 
 STATETRACKER_FILES = \
+	state_tracker/st_atifs_to_tgsi.c \
 	state_tracker/st_atom_array.c \
 	state_tracker/st_atom_atomicbuf.c \
 	state_tracker/st_atom_blend.c \
@@ -586,6 +587,10 @@ X86_64_FILES =		\
 	x86-64/x86-64.h	\
 	x86-64/xform4.S
 
+X86_SSE41_FILES = \
+	main/streaming-load-memcpy.c \
+	main/sse_minmax.c
+
 SPARC_FILES =			\
 	sparc/sparc.h		\
 	sparc/sparc_clip.S	\
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index e96f92af5bb..2730b7b2f2a 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -117,6 +117,9 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->NewProgram = _mesa_new_program;
    driver->DeleteProgram = _mesa_delete_program;
 
+   /* ATI_fragment_shader */
+   driver->NewATIfs = NULL;
+
    /* simple state commands */
    driver->AlphaFunc = NULL;
    driver->BlendColor = NULL;
diff --git a/src/mesa/drivers/common/meta_generate_mipmap.c b/src/mesa/drivers/common/meta_generate_mipmap.c
index d4b75390ebf..b81e179e2cd 100644
--- a/src/mesa/drivers/common/meta_generate_mipmap.c
+++ b/src/mesa/drivers/common/meta_generate_mipmap.c
@@ -137,21 +137,6 @@ _mesa_meta_glsl_generate_mipmap_cleanup(struct gl_context *ctx,
    _mesa_meta_blit_shader_table_cleanup(ctx, &mipmap->shaders);
 }
 
-static GLboolean
-prepare_mipmap_level(struct gl_context *ctx,
-                     struct gl_texture_object *texObj, GLuint level,
-                     GLsizei width, GLsizei height, GLsizei depth,
-                     GLenum intFormat, mesa_format format)
-{
-   if (texObj->Target == GL_TEXTURE_1D_ARRAY) {
-      /* Work around Mesa expecting the number of array slices in "height". */
-      height = depth;
-      depth = 1;
-   }
-
-   return _mesa_prepare_mipmap_level(ctx, texObj, level, width, height, depth,
-                                     0, intFormat, format);
-}
 
 /**
  * Called via ctx->Driver.GenerateMipmap()
@@ -270,6 +255,8 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
    /* texture is already locked, unlock now */
    _mesa_unlock_texture(ctx, texObj);
 
+   _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, maxLevel);
+
    for (dstLevel = baseLevel + 1; dstLevel <= maxLevel; dstLevel++) {
       const struct gl_texture_image *srcImage;
       struct gl_texture_image *dstImage;
@@ -309,17 +296,14 @@ _mesa_meta_GenerateMipmap(struct gl_context *ctx, GLenum target,
       _mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL,
                                 (GLint *) &dstLevel, false);
 
-      if (!prepare_mipmap_level(ctx, texObj, dstLevel,
-                                dstWidth, dstHeight, dstDepth,
-                                srcImage->InternalFormat,
-                                srcImage->TexFormat)) {
-         /* All done.  We either ran out of memory or we would go beyond the
-          * last valid level of an immutable texture if we continued.
-          */
-         break;
-      }
       dstImage = _mesa_select_tex_image(texObj, faceTarget, dstLevel);
 
+      /* All done.  We either ran out of memory or we would go beyond the last
+       * valid level of an immutable texture if we continued.
+       */
+      if (dstImage == NULL)
+         break;
+
       /* limit minification to src level */
       _mesa_texture_parameteriv(ctx, texObj, GL_TEXTURE_MAX_LEVEL,
                                 (GLint *) &srcLevel, false);
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index f1da218ba63..daabf708b06 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -228,10 +228,16 @@ fs_visitor::emit_texture(ir_texture_opcode op,
    }
 
    /* fixup #layers for cube map arrays */
-   if (op == ir_txs && is_cube_array) {
+   if (op == ir_txs && (devinfo->gen < 7 || is_cube_array)) {
       fs_reg depth = offset(dst, bld, 2);
       fs_reg fixed_depth = vgrf(glsl_type::int_type);
-      bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
+
+      if (is_cube_array) {
+         bld.emit(SHADER_OPCODE_INT_QUOTIENT, fixed_depth, depth, brw_imm_d(6));
+      } else if (devinfo->gen < 7) {
+         /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+         bld.emit_minmax(fixed_depth, depth, brw_imm_d(1), BRW_CONDITIONAL_GE);
+      }
 
       fs_reg *fixed_payload = ralloc_array(mem_ctx, fs_reg, inst->regs_written);
       int components = inst->regs_written / (inst->exec_size / 8);
diff --git a/src/mesa/drivers/dri/i965/brw_link.cpp b/src/mesa/drivers/dri/i965/brw_link.cpp
index b512f8b6ee1..c7d6fb8c79b 100644
--- a/src/mesa/drivers/dri/i965/brw_link.cpp
+++ b/src/mesa/drivers/dri/i965/brw_link.cpp
@@ -260,6 +260,6 @@ brw_link_shader(struct gl_context *ctx, struct gl_shader_program *shProg)
    if (brw->precompile && !brw_shader_precompile(ctx, shProg))
       return false;
 
-   build_program_resource_list(shProg);
+   build_program_resource_list(ctx, shProg);
    return true;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_pipe_control.c b/src/mesa/drivers/dri/i965/brw_pipe_control.c
index b41e28e1ec8..4672efdffc3 100644
--- a/src/mesa/drivers/dri/i965/brw_pipe_control.c
+++ b/src/mesa/drivers/dri/i965/brw_pipe_control.c
@@ -338,8 +338,6 @@ brw_emit_mi_flush(struct brw_context *brw)
       }
       brw_emit_pipe_control_flush(brw, flags);
    }
-
-   brw_render_cache_set_clear(brw);
 }
 
 int
diff --git a/src/mesa/drivers/dri/i965/brw_sampler_state.c b/src/mesa/drivers/dri/i965/brw_sampler_state.c
index c20a02817f9..1dc7d71929c 100644
--- a/src/mesa/drivers/dri/i965/brw_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sampler_state.c
@@ -459,10 +459,12 @@ brw_update_sampler_state(struct brw_context *brw,
        target == GL_TEXTURE_CUBE_MAP_ARRAY) {
       /* Cube maps must use the same wrap mode for all three coordinate
        * dimensions.  Prior to Haswell, only CUBE and CLAMP are valid.
+       *
+       * Ivybridge and Baytrail seem to have problems with CUBE mode and
+       * integer formats.  Fall back to CLAMP for now.
        */
       if ((tex_cube_map_seamless || sampler->CubeMapSeamless) &&
-         (sampler->MinFilter != GL_NEAREST ||
-          sampler->MagFilter != GL_NEAREST)) {
+          !(brw->gen == 7 && !brw->is_haswell && is_integer_format)) {
 	 wrap_s = BRW_TEXCOORDMODE_CUBE;
 	 wrap_t = BRW_TEXCOORDMODE_CUBE;
 	 wrap_r = BRW_TEXCOORDMODE_CUBE;
diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
index 5b54b51395c..8d925843732 100644
--- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
+++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp
@@ -783,26 +783,13 @@ schedule_node::schedule_node(backend_instruction *inst,
 void
 instruction_scheduler::add_insts_from_block(bblock_t *block)
 {
-   /* Removing the last instruction from a basic block removes the block as
-    * well, so put a NOP at the end to keep it alive.
-    */
-   if (!block->end()->is_control_flow()) {
-      backend_instruction *nop = new(mem_ctx) backend_instruction();
-      nop->opcode = BRW_OPCODE_NOP;
-      block->end()->insert_after(block, nop);
-   }
-
-   foreach_inst_in_block_safe(backend_instruction, inst, block) {
-      if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
-         continue;
-
+   foreach_inst_in_block(backend_instruction, inst, block) {
       schedule_node *n = new(mem_ctx) schedule_node(inst, this);
 
-      this->instructions_to_schedule++;
-
-      inst->remove(block);
       instructions.push_tail(n);
    }
+
+   this->instructions_to_schedule = block->end_ip - block->start_ip + 1;
 }
 
 /** Recursive computation of the delay member of a node. */
@@ -905,6 +892,15 @@ fs_instruction_scheduler::is_compressed(fs_inst *inst)
    return inst->exec_size == 16;
 }
 
+static bool
+is_scheduling_barrier(const fs_inst *inst)
+{
+   return inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
+          inst->is_control_flow() ||
+          inst->eot ||
+          (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE);
+}
+
 void
 fs_instruction_scheduler::calculate_deps()
 {
@@ -923,15 +919,6 @@ fs_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
-   /* The last instruction always needs to still be the last
-    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
-    * WHILE) and scheduling other things after it would disturb the
-    * basic block, or it's FB_WRITE and we should do a better job at
-    * dead code elimination anyway.
-    */
-   schedule_node *last = (schedule_node *)instructions.get_tail();
-   add_barrier_deps(last);
-
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
@@ -939,9 +926,7 @@ fs_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       fs_inst *inst = (fs_inst *)n->inst;
 
-      if ((inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
-           inst->has_side_effects()) &&
-          inst->opcode != FS_OPCODE_FB_WRITE)
+      if (is_scheduling_barrier(inst))
          add_barrier_deps(n);
 
       /* read-after-write deps. */
@@ -964,10 +949,7 @@ fs_instruction_scheduler::calculate_deps()
             }
          } else if (inst->src[i].is_accumulator()) {
             add_dep(last_accumulator_write, n);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            assert(inst->src[i].file != MRF);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1026,8 +1008,7 @@ fs_instruction_scheduler::calculate_deps()
       } else if (inst->dst.is_accumulator()) {
          add_dep(last_accumulator_write, n);
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1080,10 +1061,7 @@ fs_instruction_scheduler::calculate_deps()
             }
          } else if (inst->src[i].is_accumulator()) {
             add_dep(n, last_accumulator_write, 0);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            assert(inst->src[i].file != MRF);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1140,8 +1118,7 @@ fs_instruction_scheduler::calculate_deps()
          }
       } else if (inst->dst.is_accumulator()) {
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1161,6 +1138,13 @@ fs_instruction_scheduler::calculate_deps()
    }
 }
 
+static bool
+is_scheduling_barrier(const vec4_instruction *inst)
+{
+   return inst->is_control_flow() ||
+          inst->has_side_effects();
+}
+
 void
 vec4_instruction_scheduler::calculate_deps()
 {
@@ -1175,15 +1159,6 @@ vec4_instruction_scheduler::calculate_deps()
     */
    schedule_node *last_fixed_grf_write = NULL;
 
-   /* The last instruction always needs to still be the last instruction.
-    * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
-    * other things after it would disturb the basic block, or it's the EOT
-    * URB_WRITE and we should do a better job at dead code eliminating
-    * anything that could have been scheduled after it.
-    */
-   schedule_node *last = (schedule_node *)instructions.get_tail();
-   add_barrier_deps(last);
-
    memset(last_grf_write, 0, sizeof(last_grf_write));
    memset(last_mrf_write, 0, sizeof(last_mrf_write));
 
@@ -1191,7 +1166,7 @@ vec4_instruction_scheduler::calculate_deps()
    foreach_in_list(schedule_node, n, &instructions) {
       vec4_instruction *inst = (vec4_instruction *)n->inst;
 
-      if (inst->has_side_effects() && inst->opcode != FS_OPCODE_FB_WRITE)
+      if (is_scheduling_barrier(inst))
          add_barrier_deps(n);
 
       /* read-after-write deps. */
@@ -1204,12 +1179,7 @@ vec4_instruction_scheduler::calculate_deps()
          } else if (inst->src[i].is_accumulator()) {
             assert(last_accumulator_write);
             add_dep(last_accumulator_write, n);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            /* No reads from MRF, and ATTR is already translated away */
-            assert(inst->src[i].file != MRF &&
-                   inst->src[i].file != ATTR);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1248,8 +1218,7 @@ vec4_instruction_scheduler::calculate_deps()
       } else if (inst->dst.is_accumulator()) {
          add_dep(last_accumulator_write, n);
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1291,11 +1260,7 @@ vec4_instruction_scheduler::calculate_deps()
             add_dep(n, last_fixed_grf_write);
          } else if (inst->src[i].is_accumulator()) {
             add_dep(n, last_accumulator_write);
-         } else if (inst->src[i].file != BAD_FILE &&
-                    inst->src[i].file != IMM &&
-                    inst->src[i].file != UNIFORM) {
-            assert(inst->src[i].file != MRF &&
-                   inst->src[i].file != ATTR);
+         } else if (inst->src[i].file == ARF) {
             add_barrier_deps(n);
          }
       }
@@ -1330,8 +1295,7 @@ vec4_instruction_scheduler::calculate_deps()
          last_fixed_grf_write = n;
       } else if (inst->dst.is_accumulator()) {
          last_accumulator_write = n;
-      } else if (inst->dst.file != BAD_FILE &&
-                 !inst->dst.is_null()) {
+      } else if (inst->dst.file == ARF && !inst->dst.is_null()) {
          add_barrier_deps(n);
       }
 
@@ -1500,7 +1464,6 @@ void
 instruction_scheduler::schedule_instructions(bblock_t *block)
 {
    const struct brw_device_info *devinfo = bs->devinfo;
-   backend_instruction *inst = block->end();
    time = 0;
    if (!post_reg_alloc)
       reg_pressure = reg_pressure_in[block->num];
@@ -1519,7 +1482,8 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       /* Schedule this instruction. */
       assert(chosen);
       chosen->remove();
-      inst->insert_before(block, chosen->inst);
+      chosen->inst->exec_node::remove();
+      block->instructions.push_tail(chosen->inst);
       instructions_to_schedule--;
 
       if (!post_reg_alloc) {
@@ -1588,8 +1552,6 @@ instruction_scheduler::schedule_instructions(bblock_t *block)
       }
    }
 
-   if (block->end()->opcode == BRW_OPCODE_NOP)
-      block->end()->remove(block);
    assert(instructions_to_schedule == 0);
 
    block->cycle_count = time;
@@ -1674,11 +1636,6 @@ fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
                                   cfg->num_blocks, mode);
    sched.run(cfg);
 
-   if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
-      fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
-              stage_abbrev, dispatch_width, sched.time);
-   }
-
    invalidate_live_intervals();
 }
 
@@ -1688,10 +1645,5 @@ vec4_visitor::opt_schedule_instructions()
    vec4_instruction_scheduler sched(this, prog_data->total_grf);
    sched.run(cfg);
 
-   if (unlikely(debug_enabled)) {
-      fprintf(stderr, "%s estimated execution time: %d cycles\n",
-              stage_abbrev, sched.time);
-   }
-
    invalidate_live_intervals();
 }
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index 21977a23130..736deb443dd 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -948,6 +948,8 @@ adjust_later_block_ips(bblock_t *start_block, int ip_adjustment)
 void
 backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
 {
+   assert(this != inst);
+
    if (!this->is_head_sentinel())
       assert(inst_is_in_block(block, this) || !"Instruction not in block");
 
@@ -961,6 +963,8 @@ backend_instruction::insert_after(bblock_t *block, backend_instruction *inst)
 void
 backend_instruction::insert_before(bblock_t *block, backend_instruction *inst)
 {
+   assert(this != inst);
+
    if (!this->is_tail_sentinel())
       assert(inst_is_in_block(block, this) || !"Instruction not in block");
 
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index c9728bfb694..4b3b08903c9 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1973,7 +1973,6 @@ generate_code(struct brw_codegen *p,
       case TCS_OPCODE_SRC0_010_IS_ZERO:
          /* If src_reg had stride like fs_reg, we wouldn't need this. */
          brw_MOV(p, brw_null_reg(), stride(src[0], 0, 1, 0));
-         brw_inst_set_cond_modifier(devinfo, brw_last_inst, BRW_CONDITIONAL_Z);
          break;
 
       case TCS_OPCODE_RELEASE_INPUT:
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
index 0ce48b8df5f..28aaaebd0b3 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_tcs.cpp
@@ -182,7 +182,9 @@ vec4_tcs_visitor::emit_thread_end()
        * we don't have stride in the vec4 world, nor UV immediates in
        * align16, so we need an opcode to get invocation_id<0,4,0>.
        */
-      emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(), invocation_id);
+      set_condmod(BRW_CONDITIONAL_Z,
+                  emit(TCS_OPCODE_SRC0_010_IS_ZERO, dst_null_d(),
+                       invocation_id));
       emit(IF(BRW_PREDICATE_NORMAL));
       for (unsigned i = 0; i < key->input_vertices; i += 2) {
          /* If we have an odd number of input vertices, the last will be
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
index 4cfbc143d5a..33c5f07cec9 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp
@@ -1056,10 +1056,16 @@ vec4_visitor::emit_texture(ir_texture_opcode op,
    /* fixup num layers (z) for cube arrays: hardware returns faces * layers;
     * spec requires layers.
     */
-   if (op == ir_txs && is_cube_array) {
-      emit_math(SHADER_OPCODE_INT_QUOTIENT,
-                writemask(inst->dst, WRITEMASK_Z),
-                src_reg(inst->dst), brw_imm_d(6));
+   if (op == ir_txs) {
+      if (is_cube_array) {
+         emit_math(SHADER_OPCODE_INT_QUOTIENT,
+                   writemask(inst->dst, WRITEMASK_Z),
+                   src_reg(inst->dst), brw_imm_d(6));
+      } else if (devinfo->gen < 7) {
+         /* Gen4-6 return 0 instead of 1 for single layer surfaces. */
+         emit_minmax(BRW_CONDITIONAL_GE, writemask(inst->dst, WRITEMASK_Z),
+                     src_reg(inst->dst), brw_imm_d(1));
+      }
    }
 
    if (devinfo->gen == 6 && op == ir_tg4) {
diff --git a/src/mesa/drivers/dri/i965/gen6_sol.c b/src/mesa/drivers/dri/i965/gen6_sol.c
index 2f6eadffd2e..24bb4b41b1e 100644
--- a/src/mesa/drivers/dri/i965/gen6_sol.c
+++ b/src/mesa/drivers/dri/i965/gen6_sol.c
@@ -69,13 +69,13 @@ gen6_update_sol_surfaces(struct brw_context *brw)
                brw, xfb_obj->Buffers[buffer],
                &brw->gs.base.surf_offset[surf_index],
                linked_xfb_info->Outputs[i].NumComponents,
-               linked_xfb_info->BufferStride[buffer], buffer_offset);
+               linked_xfb_info->Buffers[buffer].Stride, buffer_offset);
          } else {
             brw_update_sol_surface(
                brw, xfb_obj->Buffers[buffer],
                &brw->ff_gs.surf_offset[surf_index],
                linked_xfb_info->Outputs[i].NumComponents,
-               linked_xfb_info->BufferStride[buffer], buffer_offset);
+               linked_xfb_info->Buffers[buffer].Stride, buffer_offset);
          }
       } else {
          if (!brw->geometry_program)
@@ -256,7 +256,7 @@ brw_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
     * overflowing any of the buffers currently being used for feedback.
     */
    unsigned max_index
-      = _mesa_compute_max_transform_feedback_vertices(xfb_obj,
+      = _mesa_compute_max_transform_feedback_vertices(ctx, xfb_obj,
                                                       linked_xfb_info);
 
    /* Initialize the SVBI 0 register to zero and set the maximum index. */
diff --git a/src/mesa/drivers/dri/i965/gen7_blorp.cpp b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
index 89b73ca7519..eae1e30e150 100644
--- a/src/mesa/drivers/dri/i965/gen7_blorp.cpp
+++ b/src/mesa/drivers/dri/i965/gen7_blorp.cpp
@@ -55,11 +55,8 @@ gen7_blorp_emit_urb_config(struct brw_context *brw)
                                  0 /* gs_size */,
                                  urb_size / 2 /* fs_size */);
 
-   /* The minimum valid number of VS entries is 32. See 3DSTATE_URB_VS, Dword
-    * 1.15:0 "VS Number of URB Entries".
-    */
    gen7_emit_urb_state(brw,
-                       32 /* num_vs_entries */,
+                       brw->urb.min_vs_entries /* num_vs_entries */,
                        2 /* vs_size */,
                        2 /* vs_start */,
                        0 /* num_hs_entries */,
diff --git a/src/mesa/drivers/dri/i965/gen7_sol_state.c b/src/mesa/drivers/dri/i965/gen7_sol_state.c
index 8cd2fc4b48a..c44572c3438 100644
--- a/src/mesa/drivers/dri/i965/gen7_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen7_sol_state.c
@@ -70,7 +70,7 @@ upload_3dstate_so_buffers(struct brw_context *brw)
 	 continue;
       }
 
-      stride = linked_xfb_info->BufferStride[i] * 4;
+      stride = linked_xfb_info->Buffers[i].Stride * 4;
 
       start = xfb_obj->Offset[i];
       assert(start % 4 == 0);
diff --git a/src/mesa/drivers/dri/i965/gen8_ps_state.c b/src/mesa/drivers/dri/i965/gen8_ps_state.c
index b9a06e7b2c7..7dfd4bfb8de 100644
--- a/src/mesa/drivers/dri/i965/gen8_ps_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_ps_state.c
@@ -91,10 +91,15 @@ gen8_upload_ps_extra(struct brw_context *brw,
     * GEN8_PSX_PIXEL_SHADER_NO_RT_WRITE is not set it shouldn't make any
     * difference so we may just disable it here.
     *
+    * Gen8 hardware tries to compute ThreadDispatchEnable for us but doesn't
+    * take into account KillPixels when no depth or stencil writes are enabled.
+    * In order for occlusion queries to work correctly with no attachments, we
+    * need to force-enable here.
+    *
     * BRW_NEW_FS_PROG_DATA | BRW_NEW_FRAGMENT_PROGRAM | _NEW_BUFFERS | _NEW_COLOR
     */
-   if (_mesa_active_fragment_shader_has_side_effects(&brw->ctx) &&
-       !brw_color_buffer_write_enabled(brw))
+   if ((_mesa_active_fragment_shader_has_side_effects(ctx) ||
+        prog_data->uses_kill) && !brw_color_buffer_write_enabled(brw))
       dw1 |= GEN8_PSX_SHADER_HAS_UAV;
 
    if (prog_data->computed_stencil) {
diff --git a/src/mesa/drivers/dri/i965/gen8_sol_state.c b/src/mesa/drivers/dri/i965/gen8_sol_state.c
index 58ead68e90c..f30818031f4 100644
--- a/src/mesa/drivers/dri/i965/gen8_sol_state.c
+++ b/src/mesa/drivers/dri/i965/gen8_sol_state.c
@@ -139,13 +139,13 @@ gen8_upload_3dstate_streamout(struct brw_context *brw, bool active,
 
       /* Set buffer pitches; 0 means unbound. */
       if (xfb_obj->Buffers[0])
-         dw3 |= linked_xfb_info->BufferStride[0] * 4;
+         dw3 |= linked_xfb_info->Buffers[0].Stride * 4;
       if (xfb_obj->Buffers[1])
-         dw3 |= (linked_xfb_info->BufferStride[1] * 4) << 16;
+         dw3 |= (linked_xfb_info->Buffers[1].Stride * 4) << 16;
       if (xfb_obj->Buffers[2])
-         dw4 |= linked_xfb_info->BufferStride[2] * 4;
+         dw4 |= linked_xfb_info->Buffers[2].Stride * 4;
       if (xfb_obj->Buffers[3])
-         dw4 |= (linked_xfb_info->BufferStride[3] * 4) << 16;
+         dw4 |= (linked_xfb_info->Buffers[3].Stride * 4) << 16;
    }
 
    BEGIN_BATCH(5);
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.c b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
index f77807472fd..e41f927819e 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.c
@@ -106,6 +106,32 @@ intel_batchbuffer_free(struct brw_context *brw)
    drm_intel_bo_unreference(brw->batch.bo);
 }
 
+void
+intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
+                                enum brw_gpu_ring ring)
+{
+   /* If we're switching rings, implicitly flush the batch. */
+   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
+       brw->gen >= 6) {
+      intel_batchbuffer_flush(brw);
+   }
+
+#ifdef DEBUG
+   assert(sz < BATCH_SZ - BATCH_RESERVED);
+#endif
+   if (intel_batchbuffer_space(brw) < sz)
+      intel_batchbuffer_flush(brw);
+
+   enum brw_gpu_ring prev_ring = brw->batch.ring;
+   /* The intel_batchbuffer_flush() calls above might have changed
+    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
+    */
+   brw->batch.ring = ring;
+
+   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
+      intel_batchbuffer_emit_render_ring_prelude(brw);
+}
+
 static void
 do_batch_dump(struct brw_context *brw)
 {
diff --git a/src/mesa/drivers/dri/i965/intel_batchbuffer.h b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
index f47369029a0..aa1dc38babc 100644
--- a/src/mesa/drivers/dri/i965/intel_batchbuffer.h
+++ b/src/mesa/drivers/dri/i965/intel_batchbuffer.h
@@ -44,6 +44,8 @@ void intel_batchbuffer_init(struct brw_context *brw);
 void intel_batchbuffer_free(struct brw_context *brw);
 void intel_batchbuffer_save_state(struct brw_context *brw);
 void intel_batchbuffer_reset_to_saved(struct brw_context *brw);
+void intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
+                                     enum brw_gpu_ring ring);
 
 int _intel_batchbuffer_flush(struct brw_context *brw,
 			     const char *file, int line);
@@ -117,32 +119,6 @@ intel_batchbuffer_emit_float(struct brw_context *brw, float f)
 }
 
 static inline void
-intel_batchbuffer_require_space(struct brw_context *brw, GLuint sz,
-                                enum brw_gpu_ring ring)
-{
-   /* If we're switching rings, implicitly flush the batch. */
-   if (unlikely(ring != brw->batch.ring) && brw->batch.ring != UNKNOWN_RING &&
-       brw->gen >= 6) {
-      intel_batchbuffer_flush(brw);
-   }
-
-#ifdef DEBUG
-   assert(sz < BATCH_SZ - BATCH_RESERVED);
-#endif
-   if (intel_batchbuffer_space(brw) < sz)
-      intel_batchbuffer_flush(brw);
-
-   enum brw_gpu_ring prev_ring = brw->batch.ring;
-   /* The intel_batchbuffer_flush() calls above might have changed
-    * brw->batch.ring to UNKNOWN_RING, so we need to set it here at the end.
-    */
-   brw->batch.ring = ring;
-
-   if (unlikely(prev_ring == UNKNOWN_RING && ring == RENDER_RING))
-      intel_batchbuffer_emit_render_ring_prelude(brw);
-}
-
-static inline void
 intel_batchbuffer_begin(struct brw_context *brw, int n, enum brw_gpu_ring ring)
 {
    intel_batchbuffer_require_space(brw, n * 4, ring);
diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c
index b7b679686e5..7eb21acc40b 100644
--- a/src/mesa/drivers/dri/i965/intel_fbo.c
+++ b/src/mesa/drivers/dri/i965/intel_fbo.c
@@ -1065,7 +1065,28 @@ brw_render_cache_set_check_flush(struct brw_context *brw, drm_intel_bo *bo)
    if (!_mesa_set_search(brw->render_cache, bo))
       return;
 
-   brw_emit_mi_flush(brw);
+   if (brw->gen >= 6) {
+      if (brw->gen == 6) {
+         /* [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
+          * Flush Enable = 1, a PIPE_CONTROL with any non-zero
+          * post-sync-op is required.
+          */
+         brw_emit_post_sync_nonzero_flush(brw);
+      }
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
+                                  PIPE_CONTROL_RENDER_TARGET_FLUSH |
+                                  PIPE_CONTROL_CS_STALL);
+
+      brw_emit_pipe_control_flush(brw,
+                                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
+                                  PIPE_CONTROL_CONST_CACHE_INVALIDATE);
+   } else {
+      brw_emit_mi_flush(brw);
+   }
+
+   brw_render_cache_set_clear(brw);
 }
 
 /**
diff --git a/src/mesa/drivers/x11/fakeglx.c b/src/mesa/drivers/x11/fakeglx.c
index 9286f718d00..80b71765e6c 100644
--- a/src/mesa/drivers/x11/fakeglx.c
+++ b/src/mesa/drivers/x11/fakeglx.c
@@ -74,6 +74,7 @@
    "GLX_MESA_copy_sub_buffer " \
    "GLX_MESA_pixmap_colormap " \
    "GLX_MESA_release_buffers " \
+   "GLX_ARB_create_context " \
    "GLX_ARB_get_proc_address " \
    "GLX_EXT_texture_from_pixmap " \
    "GLX_EXT_visual_info " \
@@ -2831,6 +2832,56 @@ Fake_glXReleaseTexImageEXT(Display *dpy, GLXDrawable drawable, int buffer)
 }
 
 
+static GLXContext
+Fake_glXCreateContextAttribs(Display *dpy, GLXFBConfig config,
+                             GLXContext share_context, Bool direct,
+                             const int *attrib_list)
+{
+   XMesaContext xmCtx;
+   XMesaVisual xmvis = (XMesaVisual) config;
+   int i;
+   int major = 0, minor = 0, ctxFlags = 0, profileFlags = 0;
+
+   for (i = 0; attrib_list[i]; i += 2) {
+      switch (attrib_list[i]) {
+      case GLX_CONTEXT_MAJOR_VERSION_ARB:
+         major = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_MINOR_VERSION_ARB:
+         minor = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_FLAGS_ARB:
+         ctxFlags = attrib_list[i + 1];
+         break;
+      case GLX_CONTEXT_PROFILE_MASK_ARB:
+         profileFlags = attrib_list[i + 1];
+         break;
+      default:
+         fprintf(stderr, "Bad attribute in glXCreateContextAttribs()\n");
+         return 0;
+      }
+   }
+
+   if (major * 10 + minor > 21) {
+      /* swrast only supports GL 2.1 and earlier */
+      return 0;
+   }
+
+   /* These are ignored for now.  We'd have to enhance XMesaCreateContext
+    * to take these flags and the version, at least.
+    */
+   (void) ctxFlags;
+   (void) profileFlags;
+
+   /* deallocate unused windows/buffers */
+   XMesaGarbageCollect(dpy);
+
+   xmCtx = XMesaCreateContext(xmvis, (XMesaContext) share_context);
+
+   return (GLXContext) xmCtx;
+}
+
+
 /* silence warning */
 extern struct _glxapi_table *_mesa_GetGLXDispatchTable(void);
 
@@ -2990,5 +3041,6 @@ _mesa_GetGLXDispatchTable(void)
    glx.BindTexImageEXT = Fake_glXBindTexImageEXT;
    glx.ReleaseTexImageEXT = Fake_glXReleaseTexImageEXT;
 
+   glx.CreateContextAttribs = Fake_glXCreateContextAttribs;
    return &glx;
 }
diff --git a/src/mesa/drivers/x11/glxapi.c b/src/mesa/drivers/x11/glxapi.c
index a870e94ed4a..cc1bb2ab4b3 100644
--- a/src/mesa/drivers/x11/glxapi.c
+++ b/src/mesa/drivers/x11/glxapi.c
@@ -1319,6 +1319,9 @@ static struct name_address_pair GLX_functions[] = {
    { "glXBindTexImageEXT", (__GLXextFuncPtr) glXBindTexImageEXT },
    { "glXReleaseTexImageEXT", (__GLXextFuncPtr) glXReleaseTexImageEXT },
 
+   /*** GLX_ARB_create_context ***/
+   { "glXCreateContextAttribsARB", (__GLXextFuncPtr) glXCreateContextAttribsARB },
+
    { NULL, NULL }   /* end of list */
 };
 
@@ -1370,3 +1373,20 @@ void PUBLIC
 {
    return glXGetProcAddressARB(procName);
 }
+
+
+/**
+ * Added in GLX_ARB_create_context.
+ */
+GLXContext PUBLIC
+glXCreateContextAttribsARB(Display *dpy, GLXFBConfig config,
+                           GLXContext share_context, Bool direct,
+                           const int *attrib_list)
+{
+   struct _glxapi_table *t;
+   GET_DISPATCH(dpy, t);
+   if (!t)
+      return 0;
+   return (t->CreateContextAttribs)(dpy, config, share_context, direct,
+                                    attrib_list);
+}
diff --git a/src/mesa/drivers/x11/glxapi.h b/src/mesa/drivers/x11/glxapi.h
index bd6e97053e6..aff38f7531d 100644
--- a/src/mesa/drivers/x11/glxapi.h
+++ b/src/mesa/drivers/x11/glxapi.h
@@ -201,6 +201,11 @@ struct _glxapi_table {
    void (*BindTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer,
                            const int *attrib_list);
    void (*ReleaseTexImageEXT)(Display *dpy, GLXDrawable drawable, int buffer);
+
+   /*** GLX_ARB_create_context ***/
+   GLXContext (*CreateContextAttribs)(Display *dpy, GLXFBConfig config,
+                                      GLXContext share_context, Bool direct,
+                                      const int *attrib_list);
 };
 
 
diff --git a/src/mesa/main/atifragshader.c b/src/mesa/main/atifragshader.c
index 8fcbff6a7a4..34f45c68008 100644
--- a/src/mesa/main/atifragshader.c
+++ b/src/mesa/main/atifragshader.c
@@ -30,6 +30,7 @@
 #include "main/mtypes.h"
 #include "main/dispatch.h"
 #include "main/atifragshader.h"
+#include "program/program.h"
 
 #define MESA_DEBUG_ATI_FS 0
 
@@ -63,6 +64,7 @@ _mesa_delete_ati_fragment_shader(struct gl_context *ctx, struct ati_fragment_sha
       free(s->Instructions[i]);
       free(s->SetupInst[i]);
    }
+   _mesa_reference_program(ctx, &s->Program, NULL);
    free(s);
 }
 
@@ -321,6 +323,8 @@ _mesa_BeginFragmentShaderATI(void)
          free(ctx->ATIFragmentShader.Current->SetupInst[i]);
    }
 
+   _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, NULL);
+
    /* malloc the instructions here - not sure if the best place but its
       a start */
    for (i = 0; i < MAX_NUM_PASSES_ATI; i++) {
@@ -405,7 +409,14 @@ _mesa_EndFragmentShaderATI(void)
    }
 #endif
 
-   if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI, NULL)) {
+   if (ctx->Driver.NewATIfs) {
+      struct gl_program *prog = ctx->Driver.NewATIfs(ctx,
+                                                     ctx->ATIFragmentShader.Current);
+      _mesa_reference_program(ctx, &ctx->ATIFragmentShader.Current->Program, prog);
+   }
+
+   if (!ctx->Driver.ProgramStringNotify(ctx, GL_FRAGMENT_SHADER_ATI,
+                                        curProg->Program)) {
       ctx->ATIFragmentShader.Current->isValid = GL_FALSE;
       /* XXX is this the right error? */
       _mesa_error(ctx, GL_INVALID_OPERATION,
diff --git a/src/mesa/main/atifragshader.h b/src/mesa/main/atifragshader.h
index 59011341018..0e32795da3b 100644
--- a/src/mesa/main/atifragshader.h
+++ b/src/mesa/main/atifragshader.h
@@ -16,6 +16,7 @@ struct gl_context;
 #define MAX_NUM_INSTRUCTIONS_PER_PASS_ATI 8
 #define MAX_NUM_PASSES_ATI                2
 #define MAX_NUM_FRAGMENT_REGISTERS_ATI    6
+#define MAX_NUM_FRAGMENT_CONSTANTS_ATI    8
 
 struct ati_fs_opcode_st
 {
diff --git a/src/mesa/main/bufferobj.c b/src/mesa/main/bufferobj.c
index 9aec42508a7..731b62ebe21 100644
--- a/src/mesa/main/bufferobj.c
+++ b/src/mesa/main/bufferobj.c
@@ -148,8 +148,8 @@ get_buffer_target(struct gl_context *ctx, GLenum target)
       }
       break;
    case GL_TEXTURE_BUFFER:
-      if (ctx->API == API_OPENGL_CORE &&
-          ctx->Extensions.ARB_texture_buffer_object) {
+      if (_mesa_has_ARB_texture_buffer_object(ctx) ||
+          _mesa_has_OES_texture_buffer(ctx)) {
          return &ctx->Texture.BufferObject;
       }
       break;
diff --git a/src/mesa/main/buffers.c b/src/mesa/main/buffers.c
index 26dafd1b786..a28c5831576 100644
--- a/src/mesa/main/buffers.c
+++ b/src/mesa/main/buffers.c
@@ -222,6 +222,12 @@ read_buffer_enum_to_index(GLenum buffer)
    }
 }
 
+static bool
+is_legal_es3_readbuffer_enum(GLenum buf)
+{
+   return buf == GL_BACK || buf == GL_NONE ||
+          (buf >= GL_COLOR_ATTACHMENT0 && buf <= GL_COLOR_ATTACHMENT31);
+}
 
 /**
  * Called by glDrawBuffer() and glNamedFramebufferDrawBuffer().
@@ -715,7 +721,11 @@ read_buffer(struct gl_context *ctx, struct gl_framebuffer *fb,
    }
    else {
       /* general case / window-system framebuffer */
-      srcBuffer = read_buffer_enum_to_index(buffer);
+      if (_mesa_is_gles3(ctx) && !is_legal_es3_readbuffer_enum(buffer))
+         srcBuffer = -1;
+      else
+         srcBuffer = read_buffer_enum_to_index(buffer);
+
       if (srcBuffer == -1) {
          _mesa_error(ctx, GL_INVALID_ENUM,
                      "%s(invalid buffer %s)", caller,
diff --git a/src/mesa/main/copyimage.c b/src/mesa/main/copyimage.c
index d571d221bce..a0f1c691220 100644
--- a/src/mesa/main/copyimage.c
+++ b/src/mesa/main/copyimage.c
@@ -25,6 +25,7 @@
  *    Jason Ekstrand <[email protected]>
  */
 
+#include "context.h"
 #include "glheader.h"
 #include "errors.h"
 #include "enums.h"
@@ -360,8 +361,32 @@ compressed_format_compatible(const struct gl_context *ctx,
       case GL_COMPRESSED_SIGNED_RED_RGTC1:
          compressedClass = BLOCK_CLASS_64_BITS;
          break;
+      case GL_COMPRESSED_RGBA8_ETC2_EAC:
+      case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC:
+      case GL_COMPRESSED_RG11_EAC:
+      case GL_COMPRESSED_SIGNED_RG11_EAC:
+         if (_mesa_is_gles(ctx))
+            compressedClass = BLOCK_CLASS_128_BITS;
+         else
+            return false;
+         break;
+      case GL_COMPRESSED_RGB8_ETC2:
+      case GL_COMPRESSED_SRGB8_ETC2:
+      case GL_COMPRESSED_R11_EAC:
+      case GL_COMPRESSED_SIGNED_R11_EAC:
+      case GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+      case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2:
+         if (_mesa_is_gles(ctx))
+            compressedClass = BLOCK_CLASS_64_BITS;
+         else
+            return false;
+         break;
       default:
-         return false;
+         if (_mesa_is_gles(ctx) && _mesa_is_astc_format(compressedFormat))
+            compressedClass = BLOCK_CLASS_128_BITS;
+         else
+            return false;
+         break;
    }
 
    switch (otherFormat) {
diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h
index 60bc8ef4411..d62fee690f4 100644
--- a/src/mesa/main/dd.h
+++ b/src/mesa/main/dd.h
@@ -477,6 +477,11 @@ struct dd_function_table {
    /** Delete a program */
    void (*DeleteProgram)(struct gl_context *ctx, struct gl_program *prog);   
    /**
+    * Allocate a program to associate with the new ATI fragment shader (optional)
+    */
+   struct gl_program * (*NewATIfs)(struct gl_context *ctx,
+                                   struct ati_fragment_shader *curProg);
+   /**
     * Notify driver that a program string (and GPU code) has been specified
     * or modified.  Return GL_TRUE or GL_FALSE to indicate if the program is
     * supported by the driver.
diff --git a/src/mesa/main/enable.c b/src/mesa/main/enable.c
index b90a60ba03f..d2830770ec2 100644
--- a/src/mesa/main/enable.c
+++ b/src/mesa/main/enable.c
@@ -807,7 +807,7 @@ _mesa_set_enable(struct gl_context *ctx, GLenum cap, GLboolean state)
 
       /* GL_ARB_sample_shading */
       case GL_SAMPLE_SHADING:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_sample_shading, cap);
          if (ctx->Multisample.SampleShading == state)
@@ -1606,7 +1606,7 @@ _mesa_IsEnabled( GLenum cap )
 
       /* ARB_sample_shading */
       case GL_SAMPLE_SHADING:
-         if (!_mesa_is_desktop_gl(ctx))
+         if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx))
             goto invalid_enum_error;
          CHECK_EXTENSION(ARB_sample_shading);
          return ctx->Multisample.SampleShading;
diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h
index 54a5bb057a3..7c36b1e1d0b 100644
--- a/src/mesa/main/extensions_table.h
+++ b/src/mesa/main/extensions_table.h
@@ -186,11 +186,13 @@ EXT(EXT_blend_subtract                      , dummy_true
 EXT(EXT_buffer_storage                      , ARB_buffer_storage                     ,  x ,  x ,  x ,  31, 2015)
 EXT(EXT_color_buffer_float                  , dummy_true                             ,  x ,  x , ES1,  30, 2013)
 EXT(EXT_compiled_vertex_array               , dummy_true                             , GLL,  x ,  x ,  x , 1996)
+EXT(EXT_copy_image                          , OES_copy_image                         ,  x ,  x ,  x ,  30, 2014)
 EXT(EXT_copy_texture                        , dummy_true                             , GLL,  x ,  x ,  x , 1995)
 EXT(EXT_depth_bounds_test                   , EXT_depth_bounds_test                  , GLL, GLC,  x ,  x , 2002)
 EXT(EXT_discard_framebuffer                 , dummy_true                             ,  x ,  x , ES1, ES2, 2009)
 EXT(EXT_draw_buffers                        , dummy_true                             ,  x ,  x ,  x , ES2, 2012)
 EXT(EXT_draw_buffers2                       , EXT_draw_buffers2                      , GLL, GLC,  x ,  x , 2006)
+EXT(EXT_draw_buffers_indexed                , ARB_draw_buffers_blend                 ,  x ,  x ,  x ,  30, 2014)
 EXT(EXT_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          ,  x ,  x ,  x , ES2, 2014)
 EXT(EXT_draw_instanced                      , ARB_draw_instanced                     , GLL, GLC,  x ,  x , 2006)
 EXT(EXT_draw_range_elements                 , dummy_true                             , GLL,  x ,  x ,  x , 1997)
@@ -228,6 +230,7 @@ EXT(EXT_texture                             , dummy_true
 EXT(EXT_texture3D                           , dummy_true                             , GLL,  x ,  x ,  x , 1996)
 EXT(EXT_texture_array                       , EXT_texture_array                      , GLL, GLC,  x ,  x , 2006)
 EXT(EXT_texture_border_clamp                , ARB_texture_border_clamp               ,  x ,  x ,  x , ES2, 2014)
+EXT(EXT_texture_buffer                      , OES_texture_buffer                     ,  x ,  x ,  x ,  31, 2014)
 EXT(EXT_texture_compression_dxt1            , ANGLE_texture_compression_dxt          , GLL, GLC, ES1, ES2, 2004)
 EXT(EXT_texture_compression_latc            , EXT_texture_compression_latc           , GLL,  x ,  x ,  x , 2006)
 EXT(EXT_texture_compression_rgtc            , ARB_texture_compression_rgtc           , GLL, GLC,  x ,  x , 2004)
@@ -308,10 +311,12 @@ EXT(OES_blend_subtract                      , dummy_true
 EXT(OES_byte_coordinates                    , dummy_true                             ,  x ,  x , ES1,  x , 2002)
 EXT(OES_compressed_ETC1_RGB8_texture        , OES_compressed_ETC1_RGB8_texture       ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_compressed_paletted_texture         , dummy_true                             ,  x ,  x , ES1,  x , 2003)
+EXT(OES_copy_image                          , OES_copy_image                         ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_depth24                             , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
 EXT(OES_depth32                             , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
 EXT(OES_depth_texture                       , ARB_depth_texture                      ,  x ,  x ,  x , ES2, 2006)
 EXT(OES_depth_texture_cube_map              , OES_depth_texture_cube_map             ,  x ,  x ,  x , ES2, 2012)
+EXT(OES_draw_buffers_indexed                , ARB_draw_buffers_blend                 ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_draw_elements_base_vertex           , ARB_draw_elements_base_vertex          ,  x ,  x ,  x , ES2, 2014)
 EXT(OES_draw_texture                        , OES_draw_texture                       ,  x ,  x , ES1,  x , 2004)
 EXT(OES_element_index_uint                  , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
@@ -329,7 +334,10 @@ EXT(OES_point_sprite                        , ARB_point_sprite
 EXT(OES_query_matrix                        , dummy_true                             ,  x ,  x , ES1,  x , 2003)
 EXT(OES_read_format                         , dummy_true                             , GLL, GLC, ES1,  x , 2003)
 EXT(OES_rgb8_rgba8                          , dummy_true                             ,  x ,  x , ES1, ES2, 2005)
+EXT(OES_sample_shading                      , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
+EXT(OES_sample_variables                    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_shader_image_atomic                 , ARB_shader_image_load_store            ,  x ,  x ,  x ,  31, 2015)
+EXT(OES_shader_multisample_interpolation    , OES_sample_variables                   ,  x ,  x ,  x ,  30, 2014)
 EXT(OES_single_precision                    , dummy_true                             ,  x ,  x , ES1,  x , 2003)
 EXT(OES_standard_derivatives                , OES_standard_derivatives               ,  x ,  x ,  x , ES2, 2005)
 EXT(OES_stencil1                            , dummy_false                            ,  x ,  x ,  x ,  x , 2005)
@@ -339,6 +347,7 @@ EXT(OES_stencil_wrap                        , dummy_true
 EXT(OES_surfaceless_context                 , dummy_true                             ,  x ,  x , ES1, ES2, 2012)
 EXT(OES_texture_3D                          , dummy_true                             ,  x ,  x ,  x , ES2, 2005)
 EXT(OES_texture_border_clamp                , ARB_texture_border_clamp               ,  x ,  x ,  x , ES2, 2014)
+EXT(OES_texture_buffer                      , OES_texture_buffer                     ,  x ,  x ,  x ,  31, 2014)
 EXT(OES_texture_cube_map                    , ARB_texture_cube_map                   ,  x ,  x , ES1,  x , 2007)
 EXT(OES_texture_env_crossbar                , ARB_texture_env_crossbar               ,  x ,  x , ES1,  x , 2005)
 EXT(OES_texture_float                       , OES_texture_float                      ,  x ,  x ,  x , ES2, 2005)
diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c
index b0fadc93aef..6829c33254c 100644
--- a/src/mesa/main/get.c
+++ b/src/mesa/main/get.c
@@ -408,6 +408,11 @@ static const int extra_ARB_gpu_shader5_or_oes_geometry_shader[] = {
    EXTRA_END
 };
 
+static const int extra_ARB_gpu_shader5_or_OES_sample_variables[] = {
+   EXT(ARB_gpu_shader5),
+   EXT(OES_sample_variables),
+};
+
 EXTRA_EXT(ARB_texture_cube_map);
 EXTRA_EXT(EXT_texture_array);
 EXTRA_EXT(NV_fog_distance);
@@ -1907,8 +1912,8 @@ tex_binding_to_index(const struct gl_context *ctx, GLenum binding)
          || _mesa_is_gles3(ctx)
          ? TEXTURE_2D_ARRAY_INDEX : -1;
    case GL_TEXTURE_BINDING_BUFFER:
-      return ctx->API == API_OPENGL_CORE &&
-             ctx->Extensions.ARB_texture_buffer_object ?
+      return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+              _mesa_has_OES_texture_buffer(ctx)) ?
              TEXTURE_BUFFER_INDEX : -1;
    case GL_TEXTURE_BINDING_CUBE_MAP_ARRAY:
       return _mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_texture_cube_map_array
diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py
index 12c21899cb1..a0cc4f8e842 100644
--- a/src/mesa/main/get_hash_params.py
+++ b/src/mesa/main/get_hash_params.py
@@ -503,6 +503,14 @@ descriptor=[
   [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31" ],
 ]},
 
+# Enums in OpenGL Core profile and ES 3.0
+{ "apis": ["GL_CORE", "GLES3"], "params": [
+  # GL_ARB_gpu_shader5 / GL_OES_shader_multisample_interpolation
+  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5_or_OES_sample_variables" ],
+]},
+
 # Enums in OpenGL Core profile and ES 3.1
 { "apis": ["GL_CORE", "GLES31"], "params": [
 # GL_ARB_draw_indirect / GLES 3.1
@@ -535,6 +543,16 @@ descriptor=[
 
 # GL_ARB_gpu_shader5 / GL_OES_geometry_shader
   [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5_or_oes_geometry_shader" ],
+
+# GL_ARB_texture_buffer_object / GL_OES_texture_buffer
+  [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ],
+  [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+  [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ],
+  [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+  [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
+
+# GL_ARB_texture_buffer_range
+  [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
 ]},
 
 # Remaining enums are only in OpenGL
@@ -805,13 +823,6 @@ descriptor=[
 # GL_ARB_color_buffer_float
   [ "RGBA_FLOAT_MODE_ARB", "BUFFER_FIELD(Visual.floatMode, TYPE_BOOLEAN), extra_core_ARB_color_buffer_float_and_new_buffers" ],
 
-# GL_ARB_texture_buffer_object
-  [ "MAX_TEXTURE_BUFFER_SIZE_ARB", "CONTEXT_INT(Const.MaxTextureBufferSize), extra_texture_buffer_object" ],
-  [ "TEXTURE_BINDING_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-  [ "TEXTURE_BUFFER_DATA_STORE_BINDING_ARB", "LOC_CUSTOM, TYPE_INT, TEXTURE_BUFFER_INDEX, extra_texture_buffer_object" ],
-  [ "TEXTURE_BUFFER_FORMAT_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-  [ "TEXTURE_BUFFER_ARB", "LOC_CUSTOM, TYPE_INT, 0, extra_texture_buffer_object" ],
-
 # GL 3.0
   [ "CONTEXT_FLAGS", "CONTEXT_INT(Const.ContextFlags), extra_version_30" ],
 
@@ -871,21 +882,12 @@ descriptor=[
 
 # Enums restricted to OpenGL Core profile
 { "apis": ["GL_CORE"], "params": [
-# GL_ARB_texture_buffer_range
-  [ "TEXTURE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.TextureBufferOffsetAlignment), extra_ARB_texture_buffer_range" ],
-
 # GL_ARB_viewport_array
   [ "MAX_VIEWPORTS", "CONTEXT_INT(Const.MaxViewports), extra_ARB_viewport_array" ],
   [ "VIEWPORT_SUBPIXEL_BITS", "CONTEXT_INT(Const.ViewportSubpixelBits), extra_ARB_viewport_array" ],
   [ "VIEWPORT_BOUNDS_RANGE", "CONTEXT_FLOAT2(Const.ViewportBounds), extra_ARB_viewport_array" ],
   [ "VIEWPORT_INDEX_PROVOKING_VERTEX", "CONTEXT_ENUM(Const.LayerAndVPIndexProvokingVertex), extra_ARB_viewport_array" ],
 
-# GL_ARB_gpu_shader5
-  [ "MAX_GEOMETRY_SHADER_INVOCATIONS", "CONST(MAX_GEOMETRY_SHADER_INVOCATIONS), extra_ARB_gpu_shader5" ],
-  [ "MIN_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MinFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "MAX_FRAGMENT_INTERPOLATION_OFFSET", "CONTEXT_FLOAT(Const.MaxFragmentInterpolationOffset), extra_ARB_gpu_shader5" ],
-  [ "FRAGMENT_INTERPOLATION_OFFSET_BITS", "CONST(FRAGMENT_INTERPOLATION_OFFSET_BITS), extra_ARB_gpu_shader5" ],
-
 # GL_ARB_tessellation_shader
   [ "PATCH_VERTICES", "CONTEXT_INT(TessCtrlProgram.patch_vertices), extra_ARB_tessellation_shader" ],
   [ "PATCH_DEFAULT_OUTER_LEVEL", "CONTEXT_FLOAT4(TessCtrlProgram.patch_default_outer_level), extra_ARB_tessellation_shader" ],
diff --git a/src/mesa/main/mipmap.c b/src/mesa/main/mipmap.c
index 5a02780b960..5ff53f4265c 100644
--- a/src/mesa/main/mipmap.c
+++ b/src/mesa/main/mipmap.c
@@ -1810,11 +1810,11 @@ _mesa_next_mipmap_level_size(GLenum target, GLint border,
  * for mipmap generation.  If not, (re) allocate it.
  * \return GL_TRUE if successful, GL_FALSE if mipmap generation should stop
  */
-GLboolean
-_mesa_prepare_mipmap_level(struct gl_context *ctx,
-                           struct gl_texture_object *texObj, GLuint level,
-                           GLsizei width, GLsizei height, GLsizei depth,
-                           GLsizei border, GLenum intFormat, mesa_format format)
+static GLboolean
+prepare_mipmap_level(struct gl_context *ctx,
+                     struct gl_texture_object *texObj, GLuint level,
+                     GLsizei width, GLsizei height, GLsizei depth,
+                     GLsizei border, GLenum intFormat, mesa_format format)
 {
    const GLuint numFaces = _mesa_num_tex_faces(texObj->Target);
    GLuint face;
@@ -1872,6 +1872,49 @@ _mesa_prepare_mipmap_level(struct gl_context *ctx,
 }
 
 
+/**
+ * Prepare all mipmap levels beyond 'baseLevel' for mipmap generation.
+ * When finished, all the gl_texture_image structures for the smaller
+ * mipmap levels will be consistent with the base level (in terms of
+ * dimensions, format, etc).
+ */
+void
+_mesa_prepare_mipmap_levels(struct gl_context *ctx,
+                            struct gl_texture_object *texObj,
+                            unsigned baseLevel, unsigned maxLevel)
+{
+   const struct gl_texture_image *baseImage =
+      _mesa_select_tex_image(texObj, texObj->Target, baseLevel);
+   const GLint border = 0;
+   GLint width = baseImage->Width;
+   GLint height = baseImage->Height;
+   GLint depth = baseImage->Depth;
+   const GLenum intFormat = baseImage->InternalFormat;
+   const mesa_format texFormat = baseImage->TexFormat;
+   GLint newWidth, newHeight, newDepth;
+
+   /* Prepare baseLevel + 1, baseLevel + 2, ... */
+   for (unsigned level = baseLevel + 1; level <= maxLevel; level++) {
+      if (!_mesa_next_mipmap_level_size(texObj->Target, border,
+                                        width, height, depth,
+                                        &newWidth, &newHeight, &newDepth)) {
+         /* all done */
+         break;
+      }
+
+      if (!prepare_mipmap_level(ctx, texObj, level,
+                                newWidth, newHeight, newDepth,
+                                border, intFormat, texFormat)) {
+         break;
+      }
+
+      width = newWidth;
+      height = newHeight;
+      depth = newDepth;
+   }
+}
+
+
 static void
 generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
 			     struct gl_texture_object *texObj,
@@ -1892,7 +1935,6 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
       GLint dstWidth, dstHeight, dstDepth;
       GLint border;
       GLint slice;
-      GLboolean nextLevel;
       GLubyte **srcMaps, **dstMaps;
       GLboolean success = GL_TRUE;
 
@@ -1904,22 +1946,14 @@ generate_mipmap_uncompressed(struct gl_context *ctx, GLenum target,
       srcDepth = srcImage->Depth;
       border = srcImage->Border;
 
-      nextLevel = _mesa_next_mipmap_level_size(target, border,
-                                         srcWidth, srcHeight, srcDepth,
-                                         &dstWidth, &dstHeight, &dstDepth);
-      if (!nextLevel)
-         return;
-
-      if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1,
-                                      dstWidth, dstHeight, dstDepth,
-                                      border, srcImage->InternalFormat,
-                                      srcImage->TexFormat)) {
-         return;
-      }
-
       /* get dest gl_texture_image */
       dstImage = _mesa_select_tex_image(texObj, target, level + 1);
-      assert(dstImage);
+      if (!dstImage) {
+         break;
+      }
+      dstWidth = dstImage->Width;
+      dstHeight = dstImage->Height;
+      dstDepth = dstImage->Depth;
 
       if (target == GL_TEXTURE_1D_ARRAY) {
 	 srcDepth = srcHeight;
@@ -2087,7 +2121,6 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
       GLint srcWidth, srcHeight, srcDepth;
       GLint dstWidth, dstHeight, dstDepth;
       GLint border;
-      GLboolean nextLevel;
       GLuint temp_dst_row_stride, temp_dst_img_stride; /* in bytes */
       GLint i;
 
@@ -2099,23 +2132,14 @@ generate_mipmap_compressed(struct gl_context *ctx, GLenum target,
       srcDepth = srcImage->Depth;
       border = srcImage->Border;
 
-      nextLevel = _mesa_next_mipmap_level_size(target, border,
-                                         srcWidth, srcHeight, srcDepth,
-                                         &dstWidth, &dstHeight, &dstDepth);
-      if (!nextLevel)
-	 goto end;
-
-      if (!_mesa_prepare_mipmap_level(ctx, texObj, level + 1,
-                                      dstWidth, dstHeight, dstDepth,
-                                      border, srcImage->InternalFormat,
-                                      srcImage->TexFormat)) {
-         /* all done */
-         goto end;
-      }
-
       /* get dest gl_texture_image */
       dstImage = _mesa_select_tex_image(texObj, target, level + 1);
-      assert(dstImage);
+      if (!dstImage) {
+         break;
+      }
+      dstWidth = dstImage->Width;
+      dstHeight = dstImage->Height;
+      dstDepth = dstImage->Depth;
 
       /* Compute dst image strides and alloc memory on first iteration */
       temp_dst_row_stride = _mesa_format_row_stride(temp_format, dstWidth);
@@ -2194,6 +2218,8 @@ _mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
 
    maxLevel = MIN2(maxLevel, texObj->MaxLevel);
 
+   _mesa_prepare_mipmap_levels(ctx, texObj, texObj->BaseLevel, maxLevel);
+
    if (_mesa_is_format_compressed(srcImage->TexFormat)) {
       generate_mipmap_compressed(ctx, target, texObj, srcImage, maxLevel);
    } else {
diff --git a/src/mesa/main/mipmap.h b/src/mesa/main/mipmap.h
index c0366d329a2..d11c7fada37 100644
--- a/src/mesa/main/mipmap.h
+++ b/src/mesa/main/mipmap.h
@@ -40,12 +40,10 @@ _mesa_generate_mipmap_level(GLenum target,
                             GLubyte **dstData,
                             GLint dstRowStride);
 
-
-extern GLboolean
-_mesa_prepare_mipmap_level(struct gl_context *ctx,
-                           struct gl_texture_object *texObj, GLuint level,
-                           GLsizei width, GLsizei height, GLsizei depth,
-                           GLsizei border, GLenum intFormat, mesa_format format);
+void
+_mesa_prepare_mipmap_levels(struct gl_context *ctx,
+                            struct gl_texture_object *texObj,
+                            unsigned baseLevel, unsigned maxLevel);
 
 extern void
 _mesa_generate_mipmap(struct gl_context *ctx, GLenum target,
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 71aae178adf..d609ae92e0c 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1618,7 +1618,9 @@ struct gl_transform_feedback_varying_info
 {
    char *Name;
    GLenum Type;
+   GLint BufferIndex;
    GLint Size;
+   GLint Offset;
 };
 
 
@@ -1644,15 +1646,33 @@ struct gl_transform_feedback_output
 };
 
 
+struct gl_transform_feedback_buffer
+{
+   unsigned Binding;
+
+   unsigned NumVaryings;
+
+   /**
+    * Total number of components stored in each buffer.  This may be used by
+    * hardware back-ends to determine the correct stride when interleaving
+    * multiple transform feedback outputs in the same buffer.
+    */
+   unsigned Stride;
+
+   /**
+    * Which transform feedback stream this buffer binding is associated with.
+    */
+   unsigned Stream;
+};
+
+
 /** Post-link transform feedback info. */
 struct gl_transform_feedback_info
 {
    unsigned NumOutputs;
 
-   /**
-    * Number of transform feedback buffers in use by this program.
-    */
-   unsigned NumBuffers;
+   /* Bitmask of active buffer indices. */
+   unsigned ActiveBuffers;
 
    struct gl_transform_feedback_output *Outputs;
 
@@ -1663,17 +1683,7 @@ struct gl_transform_feedback_info
    struct gl_transform_feedback_varying_info *Varyings;
    GLint NumVarying;
 
-   /**
-    * Total number of components stored in each buffer.  This may be used by
-    * hardware back-ends to determine the correct stride when interleaving
-    * multiple transform feedback outputs in the same buffer.
-    */
-   unsigned BufferStride[MAX_FEEDBACK_BUFFERS];
-
-   /**
-    * Which transform feedback stream this buffer binding is associated with.
-    */
-   unsigned BufferStream[MAX_FEEDBACK_BUFFERS];
+   struct gl_transform_feedback_buffer Buffers[MAX_FEEDBACK_BUFFERS];
 };
 
 
@@ -2196,6 +2206,7 @@ struct ati_fragment_shader
    GLboolean interpinp1;
    GLboolean isValid;
    GLuint swizzlerq;
+   struct gl_program *Program;
 };
 
 /**
@@ -2306,7 +2317,7 @@ struct gl_shader
     * duplicated.
     */
    unsigned NumBufferInterfaceBlocks;
-   struct gl_uniform_block *BufferInterfaceBlocks;
+   struct gl_uniform_block **BufferInterfaceBlocks;
 
    unsigned NumUniformBlocks;
    struct gl_uniform_block **UniformBlocks;
@@ -2330,6 +2341,11 @@ struct gl_shader
    bool origin_upper_left;
    bool pixel_center_integer;
 
+   struct {
+      /** Global xfb_stride out qualifier if any */
+      GLuint BufferStride[MAX_FEEDBACK_BUFFERS];
+   } TransformFeedback;
+
    /**
     * Tessellation Control shader state from layout qualifiers.
     */
@@ -2672,6 +2688,8 @@ struct gl_shader_program
     */
    struct {
       GLenum BufferMode;
+      /** Global xfb_stride out qualifier if any */
+      GLuint BufferStride[MAX_FEEDBACK_BUFFERS];
       GLuint NumVarying;
       GLchar **VaryingNames;  /**< Array [NumVarying] of char * */
    } TransformFeedback;
@@ -2827,13 +2845,6 @@ struct gl_shader_program
    int *InterfaceBlockStageIndex[MESA_SHADER_STAGES];
 
    /**
-    * Indices into the BufferInterfaceBlocks[] array for Uniform Buffer
-    * Objects and Shader Storage Buffer Objects.
-    */
-   unsigned *UboInterfaceBlockIndex;
-   unsigned *SsboInterfaceBlockIndex;
-
-   /**
     * Map of active uniform names to locations
     *
     * Maps any active uniform that is not an array element to a location.
@@ -3905,7 +3916,10 @@ struct gl_extensions
    GLboolean EXT_transform_feedback;
    GLboolean EXT_timer_query;
    GLboolean EXT_vertex_array_bgra;
+   GLboolean OES_copy_image;
+   GLboolean OES_sample_variables;
    GLboolean OES_standard_derivatives;
+   GLboolean OES_texture_buffer;
    /* vendor extensions */
    GLboolean AMD_performance_monitor;
    GLboolean AMD_pinned_memory;
diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c
index 77773a20883..5453e38632e 100644
--- a/src/mesa/main/multisample.c
+++ b/src/mesa/main/multisample.c
@@ -127,7 +127,8 @@ _mesa_MinSampleShading(GLclampf value)
 {
    GET_CURRENT_CONTEXT(ctx);
 
-   if (!ctx->Extensions.ARB_sample_shading || !_mesa_is_desktop_gl(ctx)) {
+   if (!_mesa_has_ARB_sample_shading(ctx) &&
+       !_mesa_has_OES_sample_shading(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION, "glMinSampleShading");
       return;
    }
diff --git a/src/mesa/main/program_resource.c b/src/mesa/main/program_resource.c
index 0d9f8aecf08..f2a9f006dd8 100644
--- a/src/mesa/main/program_resource.c
+++ b/src/mesa/main/program_resource.c
@@ -39,6 +39,7 @@ supported_interface_enum(struct gl_context *ctx, GLenum iface)
    case GL_UNIFORM_BLOCK:
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
+   case GL_TRANSFORM_FEEDBACK_BUFFER:
    case GL_TRANSFORM_FEEDBACK_VARYING:
    case GL_ATOMIC_COUNTER_BUFFER:
    case GL_BUFFER_VARIABLE:
@@ -105,7 +106,8 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
             (*params)++;
       break;
    case GL_MAX_NAME_LENGTH:
-      if (programInterface == GL_ATOMIC_COUNTER_BUFFER) {
+      if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
+          programInterface == GL_TRANSFORM_FEEDBACK_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glGetProgramInterfaceiv(%s pname %s)",
                      _mesa_enum_to_string(programInterface),
@@ -165,6 +167,16 @@ _mesa_GetProgramInterfaceiv(GLuint program, GLenum programInterface,
             }
          }
          break;
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
+         for (i = 0, *params = 0; i < shProg->NumProgramResourceList; i++) {
+            if (shProg->ProgramResourceList[i].Type == programInterface) {
+               struct gl_transform_feedback_buffer *buffer =
+                  (struct gl_transform_feedback_buffer *)
+                  shProg->ProgramResourceList[i].Data;
+               *params = MAX2(*params, buffer->NumVaryings);
+            }
+         }
+         break;
       default:
         _mesa_error(ctx, GL_INVALID_OPERATION,
                     "glGetProgramInterfaceiv(%s pname %s)",
@@ -289,6 +301,7 @@ _mesa_GetProgramResourceIndex(GLuint program, GLenum programInterface,
 
       return _mesa_program_resource_index(shProg, res);
    case GL_ATOMIC_COUNTER_BUFFER:
+   case GL_TRANSFORM_FEEDBACK_BUFFER:
    default:
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceIndex(%s)",
                   _mesa_enum_to_string(programInterface));
@@ -318,6 +331,7 @@ _mesa_GetProgramResourceName(GLuint program, GLenum programInterface,
       return;
 
    if (programInterface == GL_ATOMIC_COUNTER_BUFFER ||
+       programInterface == GL_TRANSFORM_FEEDBACK_BUFFER ||
        !supported_interface_enum(ctx, programInterface)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetProgramResourceName(%s)",
                   _mesa_enum_to_string(programInterface));
diff --git a/src/mesa/main/shader_query.cpp b/src/mesa/main/shader_query.cpp
index 4967e4b1df1..993dc863220 100644
--- a/src/mesa/main/shader_query.cpp
+++ b/src/mesa/main/shader_query.cpp
@@ -60,7 +60,8 @@ DECL_RESOURCE_FUNC(VAR, gl_shader_variable);
 DECL_RESOURCE_FUNC(UBO, gl_uniform_block);
 DECL_RESOURCE_FUNC(UNI, gl_uniform_storage);
 DECL_RESOURCE_FUNC(ATC, gl_active_atomic_buffer);
-DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(XFV, gl_transform_feedback_varying_info);
+DECL_RESOURCE_FUNC(XFB, gl_transform_feedback_buffer);
 DECL_RESOURCE_FUNC(SUB, gl_subroutine_function);
 
 void GLAPIENTRY
@@ -433,7 +434,7 @@ _mesa_program_resource_name(struct gl_program_resource *res)
    case GL_SHADER_STORAGE_BLOCK:
       return RESOURCE_UBO(res)->Name;
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      return RESOURCE_XFB(res)->Name;
+      return RESOURCE_XFV(res)->Name;
    case GL_PROGRAM_INPUT:
       var = RESOURCE_VAR(res);
       /* Special case gl_VertexIDMESA -> gl_VertexID. */
@@ -473,8 +474,8 @@ _mesa_program_resource_array_size(struct gl_program_resource *res)
 {
    switch (res->Type) {
    case GL_TRANSFORM_FEEDBACK_VARYING:
-      return RESOURCE_XFB(res)->Size > 1 ?
-             RESOURCE_XFB(res)->Size : 0;
+      return RESOURCE_XFV(res)->Size > 1 ?
+             RESOURCE_XFV(res)->Size : 0;
    case GL_PROGRAM_INPUT:
    case GL_PROGRAM_OUTPUT:
       return RESOURCE_VAR(res)->type->length;
@@ -670,6 +671,7 @@ _mesa_program_resource_index(struct gl_shader_program *shProg,
       return RESOURCE_SUB(res)->index;
    case GL_UNIFORM_BLOCK:
    case GL_SHADER_STORAGE_BLOCK:
+   case GL_TRANSFORM_FEEDBACK_BUFFER:
    case GL_TRANSFORM_FEEDBACK_VARYING:
    default:
       return calc_resource_index(shProg, res);
@@ -707,6 +709,7 @@ _mesa_program_resource_find_index(struct gl_shader_program *shProg,
       case GL_UNIFORM_BLOCK:
       case GL_ATOMIC_COUNTER_BUFFER:
       case GL_SHADER_STORAGE_BLOCK:
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
          if (_mesa_program_resource_index(shProg, res) == index)
             return res;
          break;
@@ -1009,7 +1012,8 @@ get_buffer_property(struct gl_shader_program *shProg,
    GET_CURRENT_CONTEXT(ctx);
    if (res->Type != GL_UNIFORM_BLOCK &&
        res->Type != GL_ATOMIC_COUNTER_BUFFER &&
-       res->Type != GL_SHADER_STORAGE_BLOCK)
+       res->Type != GL_SHADER_STORAGE_BLOCK &&
+       res->Type != GL_TRANSFORM_FEEDBACK_BUFFER)
       goto invalid_operation;
 
    if (res->Type == GL_UNIFORM_BLOCK) {
@@ -1110,6 +1114,30 @@ get_buffer_property(struct gl_shader_program *shProg,
          }
          return RESOURCE_ATC(res)->NumUniforms;
       }
+   } else if (res->Type == GL_TRANSFORM_FEEDBACK_BUFFER) {
+      switch (prop) {
+      case GL_BUFFER_BINDING:
+         *val = RESOURCE_XFB(res)->Binding;
+         return 1;
+      case GL_NUM_ACTIVE_VARIABLES:
+         *val = RESOURCE_XFB(res)->NumVaryings;
+         return 1;
+      case GL_ACTIVE_VARIABLES:
+         int i = 0;
+         for ( ; i < shProg->LinkedTransformFeedback.NumVarying; i++) {
+            unsigned index =
+               shProg->LinkedTransformFeedback.Varyings[i].BufferIndex;
+            struct gl_program_resource *buf_res =
+               _mesa_program_resource_find_index(shProg,
+                                                 GL_TRANSFORM_FEEDBACK_BUFFER,
+                                                 index);
+            assert(buf_res);
+            if (res == buf_res) {
+               *val++ = i;
+            }
+         }
+         return RESOURCE_XFB(res)->NumVaryings;
+      }
    }
    assert(!"support for property type not implemented");
 
@@ -1140,6 +1168,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
    case GL_NAME_LENGTH:
       switch (res->Type) {
       case GL_ATOMIC_COUNTER_BUFFER:
+      case GL_TRANSFORM_FEEDBACK_BUFFER:
          goto invalid_operation;
       default:
          /* Resource name length + terminator. */
@@ -1157,7 +1186,7 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          *val = RESOURCE_VAR(res)->type->gl_type;
          return 1;
       case GL_TRANSFORM_FEEDBACK_VARYING:
-         *val = RESOURCE_XFB(res)->Type;
+         *val = RESOURCE_XFV(res)->Type;
          return 1;
       default:
          goto invalid_operation;
@@ -1180,15 +1209,23 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
          *val = MAX2(_mesa_program_resource_array_size(res), 1);
          return 1;
       case GL_TRANSFORM_FEEDBACK_VARYING:
-         *val = MAX2(RESOURCE_XFB(res)->Size, 1);
+         *val = MAX2(RESOURCE_XFV(res)->Size, 1);
          return 1;
       default:
          goto invalid_operation;
       }
    case GL_OFFSET:
-      VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE);
-      *val = RESOURCE_UNI(res)->offset;
-      return 1;
+      switch (res->Type) {
+      case GL_UNIFORM:
+      case GL_BUFFER_VARIABLE:
+         *val = RESOURCE_UNI(res)->offset;
+         return 1;
+      case GL_TRANSFORM_FEEDBACK_VARYING:
+         *val = RESOURCE_XFV(res)->Offset;
+         return 1;
+      default:
+         goto invalid_operation;
+      }
    case GL_BLOCK_INDEX:
       VALIDATE_TYPE_2(GL_UNIFORM, GL_BUFFER_VARIABLE);
       *val = RESOURCE_UNI(res)->block_index;
@@ -1314,6 +1351,16 @@ _mesa_program_resource_prop(struct gl_shader_program *shProg,
       default:
          goto invalid_operation;
       }
+
+   case GL_TRANSFORM_FEEDBACK_BUFFER_INDEX:
+      VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_VARYING);
+      *val = RESOURCE_XFV(res)->BufferIndex;
+      return 1;
+   case GL_TRANSFORM_FEEDBACK_BUFFER_STRIDE:
+      VALIDATE_TYPE(GL_TRANSFORM_FEEDBACK_BUFFER);
+      *val = RESOURCE_XFB(res)->Stride * 4;
+      return 1;
+
    default:
       goto invalid_enum;
    }
diff --git a/src/mesa/main/shaderapi.c b/src/mesa/main/shaderapi.c
index 32fad56f651..ba2607221d9 100644
--- a/src/mesa/main/shaderapi.c
+++ b/src/mesa/main/shaderapi.c
@@ -2568,7 +2568,6 @@ _mesa_UniformSubroutinesuiv(GLenum shadertype, GLsizei count,
       memcpy(&uni->storage[0], &indices[i],
              sizeof(GLuint) * uni_count);
 
-      uni->initialized = true;
       _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
       i += uni_count;
    } while(i < count);
@@ -2742,7 +2741,7 @@ _mesa_shader_init_subroutine_defaults(struct gl_shader *sh)
 
       for (j = 0; j < uni_count; j++)
          memcpy(&uni->storage[j], &val, sizeof(int));
-      uni->initialized = true;
+
       _mesa_propagate_uniforms_to_driver_storage(uni, 0, uni_count);
    }
 }
diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c
index fd5934f939f..90643c4ed6d 100644
--- a/src/mesa/main/shaderimage.c
+++ b/src/mesa/main/shaderimage.c
@@ -583,8 +583,13 @@ _mesa_BindImageTexture(GLuint unit, GLuint texture, GLint level,
        *
        * "An INVALID_OPERATION error is generated if texture is not the name
        *  of an immutable texture object."
+       *
+       * However note that issue 7 of the GL_OES_texture_buffer spec
+       * recognizes that there is no way to create immutable buffer textures,
+       * so those are excluded from this requirement.
        */
-      if (_mesa_is_gles(ctx) && !t->Immutable) {
+      if (_mesa_is_gles(ctx) && !t->Immutable &&
+          t->Target != GL_TEXTURE_BUFFER) {
          _mesa_error(ctx, GL_INVALID_OPERATION,
                      "glBindImageTexture(!immutable)");
          return;
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 917ae4da023..bf6035e0142 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -124,7 +124,8 @@ update_program(struct gl_context *ctx)
     * follows:
     *   1. OpenGL 2.0/ARB vertex/fragment shaders
     *   2. ARB/NV vertex/fragment programs
-    *   3. Programs derived from fixed-function state.
+    *   3. ATI fragment shader
+    *   4. Programs derived from fixed-function state.
     *
     * Note: it's possible for a vertex shader to get used with a fragment
     * program (and vice versa) here, but in practice that shouldn't ever
@@ -152,6 +153,17 @@ update_program(struct gl_context *ctx)
       _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram,
 			       NULL);
    }
+   else if (ctx->ATIFragmentShader._Enabled &&
+            ctx->ATIFragmentShader.Current->Program) {
+       /* Use the enabled ATI fragment shader's associated program */
+      _mesa_reference_shader_program(ctx,
+                                     &ctx->_Shader->_CurrentFragmentProgram,
+                                     NULL);
+      _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._Current,
+                               gl_fragment_program(ctx->ATIFragmentShader.Current->Program));
+      _mesa_reference_fragprog(ctx, &ctx->FragmentProgram._TexEnvProgram,
+                               NULL);
+   }
    else if (ctx->FragmentProgram._MaintainTexEnvProgram) {
       /* Use fragment program generated from fixed-function state */
       struct gl_shader_program *f = _mesa_get_fixed_func_fragment_program(ctx);
diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp
index 09b97c33074..9f278be47ca 100644
--- a/src/mesa/main/tests/dispatch_sanity.cpp
+++ b/src/mesa/main/tests/dispatch_sanity.cpp
@@ -2450,6 +2450,26 @@ const struct function gles3_functions_possible[] = {
    { "glGetSamplerParameterIivOES", 30, -1 },
    { "glGetSamplerParameterIuivOES", 30, -1 },
 
+   /* GL_OES_texture_buffer */
+   { "glTexBufferOES", 31, -1 },
+   { "glTexBufferRangeOES", 31, -1 },
+
+   /* GL_OES_sample_shading */
+   { "glMinSampleShadingOES", 30, -1 },
+
+   /* GL_OES_copy_image */
+   { "glCopyImageSubDataOES", 30, -1 },
+
+   /* GL_OES_draw_buffers_indexed */
+   { "glBlendFunciOES", 30, -1 },
+   { "glBlendFuncSeparateiOES", 30, -1 },
+   { "glBlendEquationiOES", 30, -1 },
+   { "glBlendEquationSeparateiOES", 30, -1 },
+   { "glColorMaskiOES", 30, -1 },
+   { "glEnableiOES", 30, -1 },
+   { "glDisableiOES", 30, -1 },
+   { "glIsEnablediOES", 30, -1 },
+
    { NULL, 0, -1 }
 };
 
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 616a92953e7..6ac6fb109d3 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -499,8 +499,8 @@ _mesa_max_texture_levels(struct gl_context *ctx, GLenum target)
       return ctx->Extensions.ARB_texture_cube_map_array
          ? ctx->Const.MaxCubeTextureLevels : 0;
    case GL_TEXTURE_BUFFER:
-      return ctx->API == API_OPENGL_CORE &&
-             ctx->Extensions.ARB_texture_buffer_object ? 1 : 0;
+      return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+              _mesa_has_OES_texture_buffer(ctx)) ? 1 : 0;
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
@@ -3484,6 +3484,24 @@ formats_differ_in_component_sizes(mesa_format f1, mesa_format f2)
    return GL_FALSE;
 }
 
+static bool
+can_avoid_reallocation(struct gl_texture_image *texImage, GLenum internalFormat,
+                       mesa_format texFormat, GLint x, GLint y, GLsizei width,
+                       GLsizei height, GLint border)
+{
+   if (texImage->InternalFormat != internalFormat)
+      return false;
+   if (texImage->TexFormat != texFormat)
+      return false;
+   if (texImage->Border != border)
+      return false;
+   if (texImage->Width2 != width)
+      return false;
+   if (texImage->Height2 != height)
+      return false;
+   return true;
+}
+
 /**
  * Implement the glCopyTexImage1/2D() functions.
  */
@@ -3527,6 +3545,24 @@ copyteximage(struct gl_context *ctx, GLuint dims,
    texFormat = _mesa_choose_texture_format(ctx, texObj, target, level,
                                            internalFormat, GL_NONE, GL_NONE);
 
+   /* First check if reallocating the texture buffer can be avoided.
+    * Without the realloc the copy can be 20x faster.
+    */
+   _mesa_lock_texture(ctx, texObj);
+   {
+      texImage = _mesa_select_tex_image(texObj, target, level);
+      if (texImage && can_avoid_reallocation(texImage, internalFormat, texFormat,
+                                             x, y, width, height, border)) {
+         _mesa_unlock_texture(ctx, texObj);
+         return _mesa_copy_texture_sub_image(ctx, dims, texObj, target, level,
+                                             0, 0, 0, x, y, width, height,
+                                             "CopyTexImage");
+      }
+   }
+   _mesa_unlock_texture(ctx, texObj);
+   _mesa_perf_debug(ctx, MESA_DEBUG_SEVERITY_LOW, "glCopyTexImage "
+                    "can't avoid reallocating texture storage\n");
+
    rb = _mesa_get_read_renderbuffer_for_format(ctx, internalFormat);
 
    if (_mesa_is_gles3(ctx)) {
@@ -4681,7 +4717,7 @@ _mesa_CompressedTextureSubImage3D(GLuint texture, GLint level, GLint xoffset,
 static mesa_format
 get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
 {
-   if (ctx->API != API_OPENGL_CORE) {
+   if (ctx->API == API_OPENGL_COMPAT) {
       switch (internalFormat) {
       case GL_ALPHA8:
          return MESA_FORMAT_A_UNORM8;
@@ -4768,8 +4804,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
       }
    }
 
-   if (ctx->API == API_OPENGL_CORE &&
-       ctx->Extensions.ARB_texture_buffer_object_rgb32) {
+   if (_mesa_has_ARB_texture_buffer_object_rgb32(ctx) ||
+       _mesa_has_OES_texture_buffer(ctx)) {
       switch (internalFormat) {
       case GL_RGB32F:
          return MESA_FORMAT_RGB_FLOAT32;
@@ -4786,6 +4822,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
    case GL_RGBA8:
       return MESA_FORMAT_R8G8B8A8_UNORM;
    case GL_RGBA16:
+      if (_mesa_is_gles(ctx))
+         return MESA_FORMAT_NONE;
       return MESA_FORMAT_RGBA_UNORM16;
    case GL_RGBA16F_ARB:
       return MESA_FORMAT_RGBA_FLOAT16;
@@ -4807,6 +4845,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
    case GL_RG8:
       return MESA_FORMAT_R8G8_UNORM;
    case GL_RG16:
+      if (_mesa_is_gles(ctx))
+         return MESA_FORMAT_NONE;
       return MESA_FORMAT_R16G16_UNORM;
    case GL_RG16F:
       return MESA_FORMAT_RG_FLOAT16;
@@ -4828,6 +4868,8 @@ get_texbuffer_format(const struct gl_context *ctx, GLenum internalFormat)
    case GL_R8:
       return MESA_FORMAT_R_UNORM8;
    case GL_R16:
+      if (_mesa_is_gles(ctx))
+         return MESA_FORMAT_NONE;
       return MESA_FORMAT_R_UNORM16;
    case GL_R16F:
       return MESA_FORMAT_R_FLOAT16;
@@ -4905,8 +4947,8 @@ _mesa_texture_buffer_range(struct gl_context *ctx,
    /* NOTE: ARB_texture_buffer_object has interactions with
     * the compatibility profile that are not implemented.
     */
-   if (!(ctx->API == API_OPENGL_CORE &&
-         ctx->Extensions.ARB_texture_buffer_object)) {
+   if (!_mesa_has_ARB_texture_buffer_object(ctx) &&
+       !_mesa_has_OES_texture_buffer(ctx)) {
       _mesa_error(ctx, GL_INVALID_OPERATION,
                   "%s(ARB_texture_buffer_object is not"
                   " implemented for the compatibility profile)", caller);
diff --git a/src/mesa/main/texobj.c b/src/mesa/main/texobj.c
index d8407f04340..c9502bda236 100644
--- a/src/mesa/main/texobj.c
+++ b/src/mesa/main/texobj.c
@@ -204,8 +204,8 @@ _mesa_get_current_tex_object(struct gl_context *ctx, GLenum target)
       case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
          return arrayTex ? ctx->Texture.ProxyTex[TEXTURE_2D_ARRAY_INDEX] : NULL;
       case GL_TEXTURE_BUFFER:
-         return ctx->API == API_OPENGL_CORE &&
-                ctx->Extensions.ARB_texture_buffer_object ?
+         return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+                 _mesa_has_OES_texture_buffer(ctx)) ?
                 texUnit->CurrentTex[TEXTURE_BUFFER_INDEX] : NULL;
       case GL_TEXTURE_EXTERNAL_OES:
          return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external
@@ -1574,8 +1574,8 @@ _mesa_tex_target_to_index(const struct gl_context *ctx, GLenum target)
          || _mesa_is_gles3(ctx)
          ? TEXTURE_2D_ARRAY_INDEX : -1;
    case GL_TEXTURE_BUFFER:
-      return ctx->API == API_OPENGL_CORE &&
-             ctx->Extensions.ARB_texture_buffer_object ?
+      return (_mesa_has_ARB_texture_buffer_object(ctx) ||
+              _mesa_has_OES_texture_buffer(ctx)) ?
              TEXTURE_BUFFER_INDEX : -1;
    case GL_TEXTURE_EXTERNAL_OES:
       return _mesa_is_gles(ctx) && ctx->Extensions.OES_EGL_image_external
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 9350ca5c035..ba83f8fda9a 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -1223,6 +1223,26 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target
    case GL_TEXTURE_2D_MULTISAMPLE:
    case GL_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return ctx->Extensions.ARB_texture_multisample;
+   case GL_TEXTURE_BUFFER:
+      /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts,
+       * but not in earlier versions that expose ARB_texture_buffer_object.
+       *
+       * From the ARB_texture_buffer_object spec:
+       * "(7) Do buffer textures support texture parameters (TexParameter) or
+       *      queries (GetTexParameter, GetTexLevelParameter, GetTexImage)?
+       *
+       *    RESOLVED:  No. [...] Note that the spec edits above don't add
+       *    explicit error language for any of these cases.  That is because
+       *    each of the functions enumerate the set of valid <target>
+       *    parameters.  Not editing the spec to allow TEXTURE_BUFFER_ARB in
+       *    these cases means that target is not legal, and an INVALID_ENUM
+       *    error should be generated."
+       *
+       * From the OpenGL 3.1 spec:
+       * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
+       */
+      return (ctx->API == API_OPENGL_CORE && ctx->Version >= 31) ||
+         _mesa_has_OES_texture_buffer(ctx);
    }
 
    if (!_mesa_is_desktop_gl(ctx))
@@ -1247,25 +1267,6 @@ _mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target
    case GL_PROXY_TEXTURE_1D_ARRAY_EXT:
    case GL_PROXY_TEXTURE_2D_ARRAY_EXT:
       return ctx->Extensions.EXT_texture_array;
-   case GL_TEXTURE_BUFFER:
-      /* GetTexLevelParameter accepts GL_TEXTURE_BUFFER in GL 3.1+ contexts,
-       * but not in earlier versions that expose ARB_texture_buffer_object.
-       *
-       * From the ARB_texture_buffer_object spec:
-       * "(7) Do buffer textures support texture parameters (TexParameter) or
-       *      queries (GetTexParameter, GetTexLevelParameter, GetTexImage)?
-       *
-       *    RESOLVED:  No. [...] Note that the spec edits above don't add
-       *    explicit error language for any of these cases.  That is because
-       *    each of the functions enumerate the set of valid <target>
-       *    parameters.  Not editing the spec to allow TEXTURE_BUFFER_ARB in
-       *    these cases means that target is not legal, and an INVALID_ENUM
-       *    error should be generated."
-       *
-       * From the OpenGL 3.1 spec:
-       * "target may also be TEXTURE_BUFFER, indicating the texture buffer."
-       */
-      return ctx->API == API_OPENGL_CORE && ctx->Version >= 31;
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE:
    case GL_PROXY_TEXTURE_2D_MULTISAMPLE_ARRAY:
       return ctx->Extensions.ARB_texture_multisample;
@@ -1447,6 +1448,29 @@ get_tex_level_parameter_image(struct gl_context *ctx,
          *params = img->FixedSampleLocations;
          break;
 
+      /* There is never a buffer data store here, but these pnames still have
+       * to work.
+       */
+
+      /* GL_ARB_texture_buffer_object */
+      case GL_TEXTURE_BUFFER_DATA_STORE_BINDING:
+         if (!ctx->Extensions.ARB_texture_buffer_object)
+            goto invalid_pname;
+         *params = 0;
+         break;
+
+      /* GL_ARB_texture_buffer_range */
+      case GL_TEXTURE_BUFFER_OFFSET:
+         if (!ctx->Extensions.ARB_texture_buffer_range)
+            goto invalid_pname;
+         *params = 0;
+         break;
+      case GL_TEXTURE_BUFFER_SIZE:
+         if (!ctx->Extensions.ARB_texture_buffer_range)
+            goto invalid_pname;
+         *params = 0;
+         break;
+
       default:
          goto invalid_pname;
    }
@@ -1468,13 +1492,24 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
 {
    const struct gl_buffer_object *bo = texObj->BufferObject;
    mesa_format texFormat = texObj->_BufferObjectFormat;
+   int bytes = MAX2(1, _mesa_get_format_bytes(texFormat));
    GLenum internalFormat = texObj->BufferObjectFormat;
    GLenum baseFormat = _mesa_get_format_base_format(texFormat);
    const char *suffix = dsa ? "ture" : "";
 
    if (!bo) {
       /* undefined texture buffer object */
-      *params = pname == GL_TEXTURE_COMPONENTS ? 1 : 0;
+      switch (pname) {
+      case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS:
+         *params = GL_TRUE;
+         break;
+      case GL_TEXTURE_INTERNAL_FORMAT:
+         *params = internalFormat;
+         break;
+      default:
+         *params = 0;
+         break;
+      }
       return;
    }
 
@@ -1483,10 +1518,13 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
          *params = bo->Name;
          break;
       case GL_TEXTURE_WIDTH:
-         *params = bo->Size;
+         *params = ((texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize)
+            / bytes;
          break;
       case GL_TEXTURE_HEIGHT:
       case GL_TEXTURE_DEPTH:
+         *params = 1;
+         break;
       case GL_TEXTURE_BORDER:
       case GL_TEXTURE_SHARED_SIZE:
       case GL_TEXTURE_COMPRESSED:
@@ -1536,6 +1574,19 @@ get_tex_level_parameter_buffer(struct gl_context *ctx,
          *params = (texObj->BufferSize == -1) ? bo->Size : texObj->BufferSize;
          break;
 
+      /* GL_ARB_texture_multisample */
+      case GL_TEXTURE_SAMPLES:
+         if (!ctx->Extensions.ARB_texture_multisample)
+            goto invalid_pname;
+         *params = 0;
+         break;
+
+      case GL_TEXTURE_FIXED_SAMPLE_LOCATIONS:
+         if (!ctx->Extensions.ARB_texture_multisample)
+            goto invalid_pname;
+         *params = GL_TRUE;
+         break;
+
       /* GL_ARB_texture_compression */
       case GL_TEXTURE_COMPRESSED_IMAGE_SIZE:
          /* Always illegal for GL_TEXTURE_BUFFER */
diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c
index 419fbebf2f0..4b3b3245c2e 100644
--- a/src/mesa/main/textureview.c
+++ b/src/mesa/main/textureview.c
@@ -82,6 +82,39 @@
     |                       | COMPRESSED_RGB_BPTC_UNSIGNED_FLOAT              |
     ---------------------------------------------------------------------------
  */
+
+#define VIEW_CLASS_GLES(x)             (GL_VIEW_CLASS_BPTC_FLOAT + 1 + x)
+#define VIEW_CLASS_EAC_R11             VIEW_CLASS_GLES(0)
+#define VIEW_CLASS_EAC_RG11            VIEW_CLASS_GLES(1)
+#define VIEW_CLASS_ETC2_RGB            VIEW_CLASS_GLES(2)
+#define VIEW_CLASS_ETC2_RGBA           VIEW_CLASS_GLES(3)
+#define VIEW_CLASS_ETC2_EAC_RGBA       VIEW_CLASS_GLES(4)
+#define VIEW_CLASS_ASTC_4x4_RGBA       VIEW_CLASS_GLES(5)
+#define VIEW_CLASS_ASTC_5x4_RGBA       VIEW_CLASS_GLES(6)
+#define VIEW_CLASS_ASTC_5x5_RGBA       VIEW_CLASS_GLES(7)
+#define VIEW_CLASS_ASTC_6x5_RGBA       VIEW_CLASS_GLES(8)
+#define VIEW_CLASS_ASTC_6x6_RGBA       VIEW_CLASS_GLES(9)
+#define VIEW_CLASS_ASTC_8x5_RGBA       VIEW_CLASS_GLES(10)
+#define VIEW_CLASS_ASTC_8x6_RGBA       VIEW_CLASS_GLES(11)
+#define VIEW_CLASS_ASTC_8x8_RGBA       VIEW_CLASS_GLES(12)
+#define VIEW_CLASS_ASTC_10x5_RGBA      VIEW_CLASS_GLES(13)
+#define VIEW_CLASS_ASTC_10x6_RGBA      VIEW_CLASS_GLES(14)
+#define VIEW_CLASS_ASTC_10x8_RGBA      VIEW_CLASS_GLES(15)
+#define VIEW_CLASS_ASTC_10x10_RGBA     VIEW_CLASS_GLES(16)
+#define VIEW_CLASS_ASTC_12x10_RGBA     VIEW_CLASS_GLES(17)
+#define VIEW_CLASS_ASTC_12x12_RGBA     VIEW_CLASS_GLES(18)
+#define VIEW_CLASS_ASTC_3x3x3_RGBA     VIEW_CLASS_GLES(19)
+#define VIEW_CLASS_ASTC_4x3x3_RGBA     VIEW_CLASS_GLES(20)
+#define VIEW_CLASS_ASTC_4x4x3_RGBA     VIEW_CLASS_GLES(21)
+#define VIEW_CLASS_ASTC_4x4x4_RGBA     VIEW_CLASS_GLES(22)
+#define VIEW_CLASS_ASTC_5x4x4_RGBA     VIEW_CLASS_GLES(23)
+#define VIEW_CLASS_ASTC_5x5x4_RGBA     VIEW_CLASS_GLES(24)
+#define VIEW_CLASS_ASTC_5x5x5_RGBA     VIEW_CLASS_GLES(25)
+#define VIEW_CLASS_ASTC_6x5x5_RGBA     VIEW_CLASS_GLES(26)
+#define VIEW_CLASS_ASTC_6x6x5_RGBA     VIEW_CLASS_GLES(27)
+#define VIEW_CLASS_ASTC_6x6x6_RGBA     VIEW_CLASS_GLES(28)
+
+
 struct internal_format_class_info {
    GLenum view_class;
    GLenum internal_format;
@@ -162,6 +195,41 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[
    {GL_VIEW_CLASS_S3TC_DXT5_RGBA, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT},
 };
 
+static const struct internal_format_class_info gles_etc2_compatible_internal_formats[] = {
+   {VIEW_CLASS_EAC_R11, GL_COMPRESSED_R11_EAC},
+   {VIEW_CLASS_EAC_R11, GL_COMPRESSED_SIGNED_R11_EAC},
+   {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_RG11_EAC},
+   {VIEW_CLASS_EAC_RG11, GL_COMPRESSED_SIGNED_RG11_EAC},
+   {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_RGB8_ETC2},
+   {VIEW_CLASS_ETC2_RGB, GL_COMPRESSED_SRGB8_ETC2},
+   {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_RGB8_PUNCHTHROUGH_ALPHA1_ETC2},
+   {VIEW_CLASS_ETC2_RGBA, GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2},
+   {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_RGBA8_ETC2_EAC},
+   {VIEW_CLASS_ETC2_EAC_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC},
+};
+
+static const struct internal_format_class_info gles_astc_compatible_internal_formats[] = {
+#define ASTC_FMT(size) \
+   {VIEW_CLASS_ASTC_##size## _RGBA, GL_COMPRESSED_RGBA_ASTC_##size##_KHR}, \
+   {VIEW_CLASS_ASTC_##size##_RGBA, GL_COMPRESSED_SRGB8_ALPHA8_ASTC_##size##_KHR}
+
+   ASTC_FMT(4x4),
+   ASTC_FMT(5x4),
+   ASTC_FMT(5x5),
+   ASTC_FMT(6x5),
+   ASTC_FMT(6x6),
+   ASTC_FMT(8x5),
+   ASTC_FMT(8x6),
+   ASTC_FMT(8x8),
+   ASTC_FMT(10x5),
+   ASTC_FMT(10x6),
+   ASTC_FMT(10x8),
+   ASTC_FMT(10x10),
+   ASTC_FMT(12x10),
+   ASTC_FMT(12x12),
+#undef ASTC_FMT
+};
+
 GLenum
 _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum internalformat)
 {
@@ -180,6 +248,24 @@ _mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum intern
             return s3tc_compatible_internal_formats[i].view_class;
       }
    }
+
+   if (_mesa_is_gles3(ctx)) {
+      for (i = 0; i < ARRAY_SIZE(gles_etc2_compatible_internal_formats); i++) {
+         if (gles_etc2_compatible_internal_formats[i].internal_format
+             == internalformat)
+            return gles_etc2_compatible_internal_formats[i].view_class;
+      }
+
+      if (ctx->Extensions.KHR_texture_compression_astc_ldr) {
+         for (i = 0; i < ARRAY_SIZE(gles_astc_compatible_internal_formats); i++) {
+            if (gles_astc_compatible_internal_formats[i].internal_format
+                == internalformat)
+               return gles_astc_compatible_internal_formats[i].view_class;
+         }
+      }
+
+      /* FINISHME: Add 3D OES formats when supported */
+   }
    return GL_FALSE;
 }
 
diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c
index f73a89f6c0f..c92f0ccd5a5 100644
--- a/src/mesa/main/transformfeedback.c
+++ b/src/mesa/main/transformfeedback.c
@@ -347,23 +347,25 @@ compute_transform_feedback_buffer_sizes(
  * enabled transform feedback buffers without overflowing any of them.
  */
 unsigned
-_mesa_compute_max_transform_feedback_vertices(
+_mesa_compute_max_transform_feedback_vertices(struct gl_context *ctx,
       const struct gl_transform_feedback_object *obj,
       const struct gl_transform_feedback_info *info)
 {
    unsigned max_index = 0xffffffff;
    unsigned i;
 
-   for (i = 0; i < info->NumBuffers; ++i) {
-      unsigned stride = info->BufferStride[i];
-      unsigned max_for_this_buffer;
+   for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((info->ActiveBuffers >> i) & 1) {
+         unsigned stride = info->Buffers[i].Stride;
+         unsigned max_for_this_buffer;
 
-      /* Skip any inactive buffers, which have a stride of 0. */
-      if (stride == 0)
-	 continue;
+         /* Skip any inactive buffers, which have a stride of 0. */
+         if (stride == 0)
+	    continue;
 
-      max_for_this_buffer = obj->Size[i] / (4 * stride);
-      max_index = MIN2(max_index, max_for_this_buffer);
+         max_for_this_buffer = obj->Size[i] / (4 * stride);
+         max_index = MIN2(max_index, max_for_this_buffer);
+      }
    }
 
    return max_index;
@@ -445,12 +447,14 @@ _mesa_BeginTransformFeedback(GLenum mode)
       return;
    }
 
-   for (i = 0; i < info->NumBuffers; ++i) {
-      if (obj->BufferNames[i] == 0) {
-         _mesa_error(ctx, GL_INVALID_OPERATION,
-                     "glBeginTransformFeedback(binding point %d does not have "
-                     "a buffer object bound)", i);
-         return;
+   for (i = 0; i < ctx->Const.MaxTransformFeedbackBuffers; i++) {
+      if ((info->ActiveBuffers >> i) & 1) {
+         if (obj->BufferNames[i] == 0) {
+            _mesa_error(ctx, GL_INVALID_OPERATION,
+                        "glBeginTransformFeedback(binding point %d does not "
+                        "have a buffer object bound)", i);
+            return;
+         }
       }
    }
 
@@ -470,7 +474,7 @@ _mesa_BeginTransformFeedback(GLenum mode)
        * feedback.
        */
       unsigned max_vertices
-         = _mesa_compute_max_transform_feedback_vertices(obj, info);
+         = _mesa_compute_max_transform_feedback_vertices(ctx, obj, info);
       obj->GlesRemainingPrims = max_vertices / vertices_per_prim;
    }
 
diff --git a/src/mesa/main/transformfeedback.h b/src/mesa/main/transformfeedback.h
index eb274ad6540..c83f917a532 100644
--- a/src/mesa/main/transformfeedback.h
+++ b/src/mesa/main/transformfeedback.h
@@ -50,7 +50,7 @@ extern void
 _mesa_init_transform_feedback_functions(struct dd_function_table *driver);
 
 extern unsigned
-_mesa_compute_max_transform_feedback_vertices(
+_mesa_compute_max_transform_feedback_vertices( struct gl_context *ctx,
       const struct gl_transform_feedback_object *obj,
       const struct gl_transform_feedback_info *info);
 
diff --git a/src/mesa/main/uniform_query.cpp b/src/mesa/main/uniform_query.cpp
index 2ced201ebca..ab5c3cd9249 100644
--- a/src/mesa/main/uniform_query.cpp
+++ b/src/mesa/main/uniform_query.cpp
@@ -815,8 +815,6 @@ _mesa_uniform(struct gl_context *ctx, struct gl_shader_program *shProg,
       }
    }
 
-   uni->initialized = true;
-
    _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
 
    /* If the uniform is a sampler, do the extra magic necessary to propagate
@@ -1030,8 +1028,6 @@ _mesa_uniform_matrix(struct gl_context *ctx, struct gl_shader_program *shProg,
       }
    }
 
-   uni->initialized = true;
-
    _mesa_propagate_uniforms_to_driver_storage(uni, offset, count);
 }
 
diff --git a/src/mesa/main/uniforms.c b/src/mesa/main/uniforms.c
index b1968b3f795..7dcbdccf442 100644
--- a/src/mesa/main/uniforms.c
+++ b/src/mesa/main/uniforms.c
@@ -1018,26 +1018,11 @@ _mesa_UniformBlockBinding(GLuint program,
 
    if (shProg->UniformBlocks[uniformBlockIndex]->Binding !=
        uniformBlockBinding) {
-      int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewUniformBuffer;
 
-      const int interface_block_index =
-         shProg->UboInterfaceBlockIndex[uniformBlockIndex];
-
-      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
-         uniformBlockBinding;
-
-      for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index =
-            shProg->InterfaceBlockStageIndex[i][interface_block_index];
-
-	 if (stage_index != -1) {
-	    struct gl_shader *sh = shProg->_LinkedShaders[i];
-	    sh->BufferInterfaceBlocks[stage_index].Binding = uniformBlockBinding;
-	 }
-      }
+      shProg->UniformBlocks[uniformBlockIndex]->Binding = uniformBlockBinding;
    }
 }
 
@@ -1076,26 +1061,12 @@ _mesa_ShaderStorageBlockBinding(GLuint program,
 
    if (shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding !=
        shaderStorageBlockBinding) {
-      int i;
 
       FLUSH_VERTICES(ctx, 0);
       ctx->NewDriverState |= ctx->DriverFlags.NewShaderStorageBuffer;
 
-      const int interface_block_index =
-         shProg->SsboInterfaceBlockIndex[shaderStorageBlockIndex];
-
-      shProg->BufferInterfaceBlocks[interface_block_index].Binding =
+      shProg->ShaderStorageBlocks[shaderStorageBlockIndex]->Binding =
          shaderStorageBlockBinding;
-
-      for (i = 0; i < MESA_SHADER_STAGES; i++) {
-	 int stage_index =
-            shProg->InterfaceBlockStageIndex[i][interface_block_index];
-
-	 if (stage_index != -1) {
-	    struct gl_shader *sh = shProg->_LinkedShaders[i];
-	    sh->BufferInterfaceBlocks[stage_index].Binding = shaderStorageBlockBinding;
-	 }
-      }
    }
 }
 
diff --git a/src/mesa/program/ir_to_mesa.cpp b/src/mesa/program/ir_to_mesa.cpp
index 1d9047ee6fd..35a68562001 100644
--- a/src/mesa/program/ir_to_mesa.cpp
+++ b/src/mesa/program/ir_to_mesa.cpp
@@ -2976,7 +2976,7 @@ _mesa_ir_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       _mesa_reference_program(ctx, &linked_prog, NULL);
    }
 
-   build_program_resource_list(prog);
+   build_program_resource_list(ctx, prog);
    return prog->LinkStatus;
 }
 
diff --git a/src/mesa/program/prog_to_nir.c b/src/mesa/program/prog_to_nir.c
index 16b79c94c84..a6119ae4e7c 100644
--- a/src/mesa/program/prog_to_nir.c
+++ b/src/mesa/program/prog_to_nir.c
@@ -59,7 +59,6 @@ struct ptn_compile {
 
 #define SWIZ(X, Y, Z, W) \
    (unsigned[4]){ SWIZZLE_##X, SWIZZLE_##Y, SWIZZLE_##Z, SWIZZLE_##W }
-#define ptn_swizzle(b, src, x, y, z, w) nir_swizzle(b, src, SWIZ(x, y, z, w), 4, true)
 #define ptn_channel(b, src, ch) nir_swizzle(b, src, SWIZ(ch, ch, ch, ch), 1, true)
 
 static nir_ssa_def *
@@ -491,11 +490,11 @@ ptn_xpd(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
    ptn_move_dest_masked(b, dest,
                         nir_fsub(b,
                                  nir_fmul(b,
-                                          ptn_swizzle(b, src[0], Y, Z, X, X),
-                                          ptn_swizzle(b, src[1], Z, X, Y, X)),
+                                          nir_swizzle(b, src[0], SWIZ(Y, Z, X, W), 3, true),
+                                          nir_swizzle(b, src[1], SWIZ(Z, X, Y, W), 3, true)),
                                  nir_fmul(b,
-                                          ptn_swizzle(b, src[1], Y, Z, X, X),
-                                          ptn_swizzle(b, src[0], Z, X, Y, X))),
+                                          nir_swizzle(b, src[1], SWIZ(Y, Z, X, W), 3, true),
+                                          nir_swizzle(b, src[0], SWIZ(Z, X, Y, W), 3, true))),
                         WRITEMASK_XYZ);
    ptn_move_dest_masked(b, dest, nir_imm_float(b, 1.0), WRITEMASK_W);
 }
@@ -545,7 +544,7 @@ ptn_lrp(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
 }
 
 static void
-ptn_kil(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src)
+ptn_kil(nir_builder *b, nir_ssa_def **src)
 {
    nir_ssa_def *cmp = b->shader->options->native_integers ?
       nir_bany_inequal4(b, nir_flt(b, src[0], nir_imm_float(b, 0.0)), nir_imm_int(b, 0)) :
@@ -642,7 +641,8 @@ ptn_tex(nir_builder *b, nir_alu_dest dest, nir_ssa_def **src,
    unsigned src_number = 0;
 
    instr->src[src_number].src =
-      nir_src_for_ssa(ptn_swizzle(b, src[0], X, Y, Z, W));
+      nir_src_for_ssa(nir_swizzle(b, src[0], SWIZ(X, Y, Z, W),
+                                  instr->coord_components, true));
    instr->src[src_number].src_type = nir_tex_src_coord;
    src_number++;
 
@@ -830,7 +830,7 @@ ptn_emit_instruction(struct ptn_compile *c, struct prog_instruction *prog_inst)
       break;
 
    case OPCODE_KIL:
-      ptn_kil(b, dest, src);
+      ptn_kil(b, src);
       break;
 
    case OPCODE_CMP:
diff --git a/src/mesa/program/program.h b/src/mesa/program/program.h
index 24e05974dc3..09e69280d46 100644
--- a/src/mesa/program/program.h
+++ b/src/mesa/program/program.h
@@ -172,6 +172,8 @@ _mesa_program_enum_to_shader_stage(GLenum v)
       return MESA_SHADER_VERTEX;
    case GL_FRAGMENT_PROGRAM_ARB:
       return MESA_SHADER_FRAGMENT;
+   case GL_FRAGMENT_SHADER_ATI:
+      return MESA_SHADER_FRAGMENT;
    case GL_GEOMETRY_PROGRAM_NV:
       return MESA_SHADER_GEOMETRY;
    case GL_TESS_CONTROL_PROGRAM_NV:
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.c b/src/mesa/state_tracker/st_atifs_to_tgsi.c
new file mode 100644
index 00000000000..66f442aee5a
--- /dev/null
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.c
@@ -0,0 +1,845 @@
+/*
+ * Copyright (C) 2016 Miklós Máté
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/mtypes.h"
+#include "main/atifragshader.h"
+#include "main/errors.h"
+#include "program/prog_parameter.h"
+
+#include "tgsi/tgsi_ureg.h"
+#include "tgsi/tgsi_scan.h"
+#include "tgsi/tgsi_transform.h"
+
+#include "st_program.h"
+#include "st_atifs_to_tgsi.h"
+
+/**
+ * Intermediate state used during shader translation.
+ */
+struct st_translate {
+   struct ureg_program *ureg;
+   struct ati_fragment_shader *atifs;
+
+   struct ureg_dst temps[MAX_PROGRAM_TEMPS];
+   struct ureg_src *constants;
+   struct ureg_dst outputs[PIPE_MAX_SHADER_OUTPUTS];
+   struct ureg_src inputs[PIPE_MAX_SHADER_INPUTS];
+   struct ureg_src samplers[PIPE_MAX_SAMPLERS];
+
+   const GLuint *inputMapping;
+   const GLuint *outputMapping;
+
+   unsigned current_pass;
+
+   bool regs_written[MAX_NUM_PASSES_ATI][MAX_NUM_FRAGMENT_REGISTERS_ATI];
+
+   boolean error;
+};
+
+struct instruction_desc {
+   unsigned TGSI_opcode;
+   const char *name;
+   unsigned char arg_count;
+};
+
+static const struct instruction_desc inst_desc[] = {
+   {TGSI_OPCODE_MOV, "MOV", 1},
+   {TGSI_OPCODE_NOP, "UND", 0}, /* unused */
+   {TGSI_OPCODE_ADD, "ADD", 2},
+   {TGSI_OPCODE_MUL, "MUL", 2},
+   {TGSI_OPCODE_SUB, "SUB", 2},
+   {TGSI_OPCODE_DP3, "DOT3", 2},
+   {TGSI_OPCODE_DP4, "DOT4", 2},
+   {TGSI_OPCODE_MAD, "MAD", 3},
+   {TGSI_OPCODE_LRP, "LERP", 3},
+   {TGSI_OPCODE_NOP, "CND", 3},
+   {TGSI_OPCODE_NOP, "CND0", 3},
+   {TGSI_OPCODE_NOP, "DOT2_ADD", 3}
+};
+
+static struct ureg_dst
+get_temp(struct st_translate *t, unsigned index)
+{
+   if (ureg_dst_is_undef(t->temps[index]))
+      t->temps[index] = ureg_DECL_temporary(t->ureg);
+   return t->temps[index];
+}
+
+static struct ureg_src
+apply_swizzle(struct st_translate *t,
+              struct ureg_src src, GLuint swizzle)
+{
+   if (swizzle == GL_SWIZZLE_STR_ATI) {
+      return src;
+   } else if (swizzle == GL_SWIZZLE_STQ_ATI) {
+      return ureg_swizzle(src,
+                          TGSI_SWIZZLE_X,
+                          TGSI_SWIZZLE_Y,
+                          TGSI_SWIZZLE_W,
+                          TGSI_SWIZZLE_Z);
+   } else {
+      struct ureg_dst tmp[2];
+      struct ureg_src imm[3];
+
+      tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI);
+      tmp[1] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 1);
+      imm[0] = src;
+      imm[1] = ureg_imm4f(t->ureg, 1.0f, 1.0f, 0.0f, 0.0f);
+      imm[2] = ureg_imm4f(t->ureg, 0.0f, 0.0f, 1.0f, 1.0f);
+      ureg_insn(t->ureg, TGSI_OPCODE_MAD, &tmp[0], 1, imm, 3);
+
+      if (swizzle == GL_SWIZZLE_STR_DR_ATI) {
+         imm[0] = ureg_scalar(src, TGSI_SWIZZLE_Z);
+      } else {
+         imm[0] = ureg_scalar(src, TGSI_SWIZZLE_W);
+      }
+      ureg_insn(t->ureg, TGSI_OPCODE_RCP, &tmp[1], 1, &imm[0], 1);
+
+      imm[0] = ureg_src(tmp[0]);
+      imm[1] = ureg_src(tmp[1]);
+      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &tmp[0], 1, imm, 2);
+
+      return ureg_src(tmp[0]);
+   }
+}
+
+static struct ureg_src
+get_source(struct st_translate *t, GLuint src_type)
+{
+   if (src_type >= GL_REG_0_ATI && src_type <= GL_REG_5_ATI) {
+      if (t->regs_written[t->current_pass][src_type - GL_REG_0_ATI]) {
+         return ureg_src(get_temp(t, src_type - GL_REG_0_ATI));
+      } else {
+         return ureg_imm1f(t->ureg, 0.0f);
+      }
+   } else if (src_type >= GL_CON_0_ATI && src_type <= GL_CON_7_ATI) {
+      return t->constants[src_type - GL_CON_0_ATI];
+   } else if (src_type == GL_ZERO) {
+      return ureg_imm1f(t->ureg, 0.0f);
+   } else if (src_type == GL_ONE) {
+      return ureg_imm1f(t->ureg, 1.0f);
+   } else if (src_type == GL_PRIMARY_COLOR_ARB) {
+      return t->inputs[t->inputMapping[VARYING_SLOT_COL0]];
+   } else if (src_type == GL_SECONDARY_INTERPOLATOR_ATI) {
+      return t->inputs[t->inputMapping[VARYING_SLOT_COL1]];
+   } else {
+      /* frontend prevents this */
+      unreachable("unknown source");
+   }
+}
+
+static struct ureg_src
+prepare_argument(struct st_translate *t, const unsigned argId,
+                 const struct atifragshader_src_register *srcReg)
+{
+   struct ureg_src src = get_source(t, srcReg->Index);
+   struct ureg_dst arg = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + argId);
+
+   switch (srcReg->argRep) {
+   case GL_NONE:
+      break;
+   case GL_RED:
+      src = ureg_scalar(src, TGSI_SWIZZLE_X);
+      break;
+   case GL_GREEN:
+      src = ureg_scalar(src, TGSI_SWIZZLE_Y);
+      break;
+   case GL_BLUE:
+      src = ureg_scalar(src, TGSI_SWIZZLE_Z);
+      break;
+   case GL_ALPHA:
+      src = ureg_scalar(src, TGSI_SWIZZLE_W);
+      break;
+   }
+   ureg_insn(t->ureg, TGSI_OPCODE_MOV, &arg, 1, &src, 1);
+
+   if (srcReg->argMod & GL_COMP_BIT_ATI) {
+      struct ureg_src modsrc[2];
+      modsrc[0] = ureg_imm1f(t->ureg, 1.0f);
+      modsrc[1] = ureg_src(arg);
+
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, modsrc, 2);
+   }
+   if (srcReg->argMod & GL_BIAS_BIT_ATI) {
+      struct ureg_src modsrc[2];
+      modsrc[0] = ureg_src(arg);
+      modsrc[1] = ureg_imm1f(t->ureg, 0.5f);
+
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, &arg, 1, modsrc, 2);
+   }
+   if (srcReg->argMod & GL_2X_BIT_ATI) {
+      struct ureg_src modsrc[2];
+      modsrc[0] = ureg_src(arg);
+      modsrc[1] = ureg_src(arg);
+
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, &arg, 1, modsrc, 2);
+   }
+   if (srcReg->argMod & GL_NEGATE_BIT_ATI) {
+      struct ureg_src modsrc[2];
+      modsrc[0] = ureg_src(arg);
+      modsrc[1] = ureg_imm1f(t->ureg, -1.0f);
+
+      ureg_insn(t->ureg, TGSI_OPCODE_MUL, &arg, 1, modsrc, 2);
+   }
+   return  ureg_src(arg);
+}
+
+/* These instructions need special treatment */
+static void
+emit_special_inst(struct st_translate *t, const struct instruction_desc *desc,
+                  struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
+{
+   struct ureg_dst tmp[1];
+   struct ureg_src src[3];
+
+   if (!strcmp(desc->name, "CND")) {
+      tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI + 2); /* re-purpose a3 */
+      src[0] = ureg_imm1f(t->ureg, 0.5f);
+      src[1] = args[2];
+      ureg_insn(t->ureg, TGSI_OPCODE_SUB, tmp, 1, src, 2);
+      src[0] = ureg_src(tmp[0]);
+      src[1] = args[0];
+      src[2] = args[1];
+      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
+   } else if (!strcmp(desc->name, "CND0")) {
+      src[0] = args[2];
+      src[1] = args[1];
+      src[2] = args[0];
+      ureg_insn(t->ureg, TGSI_OPCODE_CMP, dst, 1, src, 3);
+   } else if (!strcmp(desc->name, "DOT2_ADD")) {
+      /* note: DP2A is not implemented in most pipe drivers */
+      tmp[0] = get_temp(t, MAX_NUM_FRAGMENT_REGISTERS_ATI); /* re-purpose a1 */
+      src[0] = args[0];
+      src[1] = args[1];
+      ureg_insn(t->ureg, TGSI_OPCODE_DP2, tmp, 1, src, 2);
+      src[0] = ureg_src(tmp[0]);
+      src[1] = ureg_scalar(args[2], TGSI_SWIZZLE_Z);
+      ureg_insn(t->ureg, TGSI_OPCODE_ADD, dst, 1, src, 2);
+   }
+}
+
+static void
+emit_arith_inst(struct st_translate *t,
+                const struct instruction_desc *desc,
+                struct ureg_dst *dst, struct ureg_src *args, unsigned argcount)
+{
+   if (desc->TGSI_opcode == TGSI_OPCODE_NOP) {
+      return emit_special_inst(t, desc, dst, args, argcount);
+   }
+
+   ureg_insn(t->ureg, desc->TGSI_opcode, dst, 1, args, argcount);
+}
+
+static void
+emit_dstmod(struct st_translate *t,
+            struct ureg_dst dst, GLuint dstMod)
+{
+   float imm;
+   struct ureg_src src[3];
+   GLuint scale = dstMod & ~GL_SATURATE_BIT_ATI;
+
+   if (dstMod == GL_NONE) {
+      return;
+   }
+
+   switch (scale) {
+   case GL_2X_BIT_ATI:
+      imm = 2.0f;
+      break;
+   case GL_4X_BIT_ATI:
+      imm = 4.0f;
+      break;
+   case GL_8X_BIT_ATI:
+      imm = 8.0f;
+      break;
+   case GL_HALF_BIT_ATI:
+      imm = 0.5f;
+      break;
+   case GL_QUARTER_BIT_ATI:
+      imm = 0.25f;
+      break;
+   case GL_EIGHTH_BIT_ATI:
+      imm = 0.125f;
+      break;
+   default:
+      imm = 1.0f;
+   }
+
+   src[0] = ureg_src(dst);
+   src[1] = ureg_imm1f(t->ureg, imm);
+   if (dstMod & GL_SATURATE_BIT_ATI) {
+      dst = ureg_saturate(dst);
+   }
+   ureg_insn(t->ureg, TGSI_OPCODE_MUL, &dst, 1, src, 2);
+}
+
+/**
+ * Compile one setup instruction to TGSI instructions.
+ */
+static void
+compile_setupinst(struct st_translate *t,
+                  const unsigned r,
+                  const struct atifs_setupinst *texinst)
+{
+   struct ureg_dst dst[1];
+   struct ureg_src src[2];
+
+   if (!texinst->Opcode)
+      return;
+
+   dst[0] = get_temp(t, r);
+
+   GLuint pass_tex = texinst->src;
+
+   if (pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB) {
+      unsigned attr = pass_tex - GL_TEXTURE0_ARB + VARYING_SLOT_TEX0;
+
+      src[0] = t->inputs[t->inputMapping[attr]];
+   } else if (pass_tex >= GL_REG_0_ATI && pass_tex <= GL_REG_5_ATI) {
+      unsigned reg = pass_tex - GL_REG_0_ATI;
+
+      /* the frontend already validated that REG is only allowed in second pass */
+      if (t->regs_written[0][reg]) {
+         src[0] = ureg_src(t->temps[reg]);
+      } else {
+         src[0] = ureg_imm1f(t->ureg, 0.0f);
+      }
+   }
+   src[0] = apply_swizzle(t, src[0], texinst->swizzle);
+
+   if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) {
+      /* by default texture and sampler indexes are the same */
+      src[1] = t->samplers[r];
+      /* the texture target is still unknown, it will be fixed in the draw call */
+      ureg_tex_insn(t->ureg, TGSI_OPCODE_TEX, dst, 1, TGSI_TEXTURE_2D,
+                    NULL, 0, src, 2);
+   } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+   }
+
+   t->regs_written[t->current_pass][r] = true;
+}
+
+/**
+ * Compile one arithmetic operation COLOR&ALPHA pair into TGSI instructions.
+ */
+static void
+compile_instruction(struct st_translate *t,
+                    const struct atifs_instruction *inst)
+{
+   unsigned optype;
+
+   for (optype = 0; optype < 2; optype++) { /* color, alpha */
+      const struct instruction_desc *desc;
+      struct ureg_dst dst[1];
+      struct ureg_src args[3]; /* arguments for the main operation */
+      unsigned arg;
+      unsigned dstreg = inst->DstReg[optype].Index - GL_REG_0_ATI;
+
+      if (!inst->Opcode[optype])
+         continue;
+
+      desc = &inst_desc[inst->Opcode[optype] - GL_MOV_ATI];
+
+      /* prepare the arguments */
+      for (arg = 0; arg < desc->arg_count; arg++) {
+         if (arg >= inst->ArgCount[optype]) {
+            _mesa_warning(0, "Using 0 for missing argument %d of %s\n",
+                          arg, desc->name);
+            args[arg] = ureg_imm1f(t->ureg, 0.0f);
+         } else {
+            args[arg] = prepare_argument(t, arg,
+                                         &inst->SrcReg[optype][arg]);
+         }
+      }
+
+      /* prepare dst */
+      dst[0] = get_temp(t, dstreg);
+
+      if (optype) {
+         dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_W);
+      } else {
+         GLuint dstMask = inst->DstReg[optype].dstMask;
+         if (dstMask == GL_NONE) {
+            dst[0] = ureg_writemask(dst[0], TGSI_WRITEMASK_XYZ);
+         } else {
+            dst[0] = ureg_writemask(dst[0], dstMask); /* the enum values match */
+         }
+      }
+
+      /* emit the main instruction */
+      emit_arith_inst(t, desc, dst, args, arg);
+
+      emit_dstmod(t, *dst, inst->DstReg[optype].dstMod);
+
+      t->regs_written[t->current_pass][dstreg] = true;
+   }
+}
+
+static void
+finalize_shader(struct st_translate *t, unsigned numPasses)
+{
+   struct ureg_dst dst[1] = { { 0 } };
+   struct ureg_src src[1] = { { 0 } };
+
+   if (t->regs_written[numPasses-1][0]) {
+      /* copy the result into the OUT slot */
+      dst[0] = t->outputs[t->outputMapping[FRAG_RESULT_COLOR]];
+      src[0] = ureg_src(t->temps[0]);
+      ureg_insn(t->ureg, TGSI_OPCODE_MOV, dst, 1, src, 1);
+   }
+
+   /* signal the end of the program */
+   ureg_insn(t->ureg, TGSI_OPCODE_END, dst, 0, src, 0);
+}
+
+/**
+ * Called when a new variant is needed, we need to translate
+ * the ATI fragment shader to TGSI
+ */
+enum pipe_error
+st_translate_atifs_program(
+   struct ureg_program *ureg,
+   struct ati_fragment_shader *atifs,
+   struct gl_program *program,
+   GLuint numInputs,
+   const GLuint inputMapping[],
+   const ubyte inputSemanticName[],
+   const ubyte inputSemanticIndex[],
+   const GLuint interpMode[],
+   GLuint numOutputs,
+   const GLuint outputMapping[],
+   const ubyte outputSemanticName[],
+   const ubyte outputSemanticIndex[])
+{
+   enum pipe_error ret = PIPE_OK;
+
+   unsigned pass, i, r;
+
+   struct st_translate translate, *t;
+   t = &translate;
+   memset(t, 0, sizeof *t);
+
+   t->inputMapping = inputMapping;
+   t->outputMapping = outputMapping;
+   t->ureg = ureg;
+   t->atifs = atifs;
+
+   /*
+    * Declare input attributes.
+    */
+   for (i = 0; i < numInputs; i++) {
+      t->inputs[i] = ureg_DECL_fs_input(ureg,
+                                        inputSemanticName[i],
+                                        inputSemanticIndex[i],
+                                        interpMode[i]);
+   }
+
+   /*
+    * Declare output attributes:
+    *  we always have numOutputs=1 and it's FRAG_RESULT_COLOR
+    */
+   t->outputs[0] = ureg_DECL_output(ureg,
+                                    TGSI_SEMANTIC_COLOR,
+                                    outputSemanticIndex[0]);
+
+   /* Emit constants and immediates.  Mesa uses a single index space
+    * for these, so we put all the translated regs in t->constants.
+    */
+   if (program->Parameters) {
+      t->constants = calloc(program->Parameters->NumParameters,
+                            sizeof t->constants[0]);
+      if (t->constants == NULL) {
+         ret = PIPE_ERROR_OUT_OF_MEMORY;
+         goto out;
+      }
+
+      for (i = 0; i < program->Parameters->NumParameters; i++) {
+         switch (program->Parameters->Parameters[i].Type) {
+         case PROGRAM_STATE_VAR:
+         case PROGRAM_UNIFORM:
+            t->constants[i] = ureg_DECL_constant(ureg, i);
+            break;
+         case PROGRAM_CONSTANT:
+            t->constants[i] =
+               ureg_DECL_immediate(ureg,
+                                   (const float*)program->Parameters->ParameterValues[i],
+                                   4);
+            break;
+         default:
+            break;
+         }
+      }
+   }
+
+   /* texture samplers */
+   for (i = 0; i < MAX_NUM_FRAGMENT_REGISTERS_ATI; i++) {
+      if (program->SamplersUsed & (1 << i)) {
+         t->samplers[i] = ureg_DECL_sampler(ureg, i);
+         /* the texture target is still unknown, it will be fixed in the draw call */
+         ureg_DECL_sampler_view(ureg, i, TGSI_TEXTURE_2D,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT,
+                                TGSI_RETURN_TYPE_FLOAT);
+      }
+   }
+
+   /* emit instructions */
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      t->current_pass = pass;
+      for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) {
+         struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r];
+         compile_setupinst(t, r, texinst);
+      }
+      for (i = 0; i < atifs->numArithInstr[pass]; i++) {
+         struct atifs_instruction *inst = &atifs->Instructions[pass][i];
+         compile_instruction(t, inst);
+      }
+   }
+
+   finalize_shader(t, atifs->NumPasses);
+
+out:
+   free(t->constants);
+
+   if (t->error) {
+      debug_printf("%s: translate error flag set\n", __func__);
+   }
+
+   return ret;
+}
+
+/**
+ * Called in ProgramStringNotify, we need to fill the metadata of the
+ * gl_program attached to the ati_fragment_shader
+ */
+void
+st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog)
+{
+   /* we know this is st_fragment_program, because of st_new_ati_fs() */
+   struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
+   struct ati_fragment_shader *atifs = stfp->ati_fs;
+
+   unsigned pass, i, r, optype, arg;
+
+   static const gl_state_index fog_params_state[STATE_LENGTH] =
+      {STATE_INTERNAL, STATE_FOG_PARAMS_OPTIMIZED, 0, 0, 0};
+   static const gl_state_index fog_color[STATE_LENGTH] =
+      {STATE_FOG_COLOR, 0, 0, 0, 0};
+
+   prog->InputsRead = 0;
+   prog->OutputsWritten = BITFIELD64_BIT(FRAG_RESULT_COLOR);
+   prog->SamplersUsed = 0;
+   prog->Parameters = _mesa_new_parameter_list();
+
+   /* fill in InputsRead, SamplersUsed, TexturesUsed */
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      for (r = 0; r < MAX_NUM_FRAGMENT_REGISTERS_ATI; r++) {
+         struct atifs_setupinst *texinst = &atifs->SetupInst[pass][r];
+         GLuint pass_tex = texinst->src;
+
+         if (texinst->Opcode == ATI_FRAGMENT_SHADER_SAMPLE_OP) {
+            /* mark which texcoords are used */
+            prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB);
+            /* by default there is 1:1 mapping between samplers and textures */
+            prog->SamplersUsed |= (1 << r);
+            /* the target is unknown here, it will be fixed in the draw call */
+            prog->TexturesUsed[r] = TEXTURE_2D_BIT;
+         } else if (texinst->Opcode == ATI_FRAGMENT_SHADER_PASS_OP) {
+            if (pass_tex >= GL_TEXTURE0_ARB && pass_tex <= GL_TEXTURE7_ARB) {
+               prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_TEX0 + pass_tex - GL_TEXTURE0_ARB);
+            }
+         }
+      }
+   }
+   for (pass = 0; pass < atifs->NumPasses; pass++) {
+      for (i = 0; i < atifs->numArithInstr[pass]; i++) {
+         struct atifs_instruction *inst = &atifs->Instructions[pass][i];
+
+         for (optype = 0; optype < 2; optype++) { /* color, alpha */
+            if (inst->Opcode[optype]) {
+               for (arg = 0; arg < inst->ArgCount[optype]; arg++) {
+                  GLint index = inst->SrcReg[optype][arg].Index;
+                  if (index == GL_PRIMARY_COLOR_EXT) {
+                     prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL0);
+                  } else if (index == GL_SECONDARY_INTERPOLATOR_ATI) {
+                     /* note: ATI_fragment_shader.txt never specifies what
+                      * GL_SECONDARY_INTERPOLATOR_ATI is, swrast uses
+                      * VARYING_SLOT_COL1 for this input */
+                     prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_COL1);
+                  }
+               }
+            }
+         }
+      }
+   }
+   /* we may need fog */
+   prog->InputsRead |= BITFIELD64_BIT(VARYING_SLOT_FOGC);
+
+   /* we always have the ATI_fs constants, and the fog params */
+   for (i = 0; i < MAX_NUM_FRAGMENT_CONSTANTS_ATI; i++) {
+      _mesa_add_parameter(prog->Parameters, PROGRAM_UNIFORM,
+                          NULL, 4, GL_FLOAT, NULL, NULL);
+   }
+   _mesa_add_state_reference(prog->Parameters, fog_params_state);
+   _mesa_add_state_reference(prog->Parameters, fog_color);
+
+   prog->NumInstructions = 0;
+   prog->NumTemporaries = MAX_NUM_FRAGMENT_REGISTERS_ATI + 3; /* 3 input temps for arith ops */
+   prog->NumParameters = MAX_NUM_FRAGMENT_CONSTANTS_ATI + 2; /* 2 state variables for fog */
+}
+
+
+struct tgsi_atifs_transform {
+   struct tgsi_transform_context base;
+   struct tgsi_shader_info info;
+   const struct st_fp_variant_key *key;
+   bool first_instruction_emitted;
+   unsigned fog_factor_temp;
+   unsigned fog_clamp_imm;
+};
+
+static inline struct tgsi_atifs_transform *
+tgsi_atifs_transform(struct tgsi_transform_context *tctx)
+{
+   return (struct tgsi_atifs_transform *)tctx;
+}
+
+/* copied from st_cb_drawpixels_shader.c */
+static void
+set_src(struct tgsi_full_instruction *inst, unsigned i, unsigned file, unsigned index,
+        unsigned x, unsigned y, unsigned z, unsigned w)
+{
+   inst->Src[i].Register.File  = file;
+   inst->Src[i].Register.Index = index;
+   inst->Src[i].Register.SwizzleX = x;
+   inst->Src[i].Register.SwizzleY = y;
+   inst->Src[i].Register.SwizzleZ = z;
+   inst->Src[i].Register.SwizzleW = w;
+}
+
+#define SET_SRC(inst, i, file, index, x, y, z, w) \
+   set_src(inst, i, file, index, TGSI_SWIZZLE_##x, TGSI_SWIZZLE_##y, \
+           TGSI_SWIZZLE_##z, TGSI_SWIZZLE_##w)
+
+static void
+transform_decl(struct tgsi_transform_context *tctx,
+               struct tgsi_full_declaration *decl)
+{
+   struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx);
+
+   if (decl->Declaration.File == TGSI_FILE_SAMPLER_VIEW) {
+      /* fix texture target */
+      unsigned newtarget = ctx->key->texture_targets[decl->Range.First];
+      if (newtarget)
+         decl->SamplerView.Resource = newtarget;
+   }
+
+   tctx->emit_declaration(tctx, decl);
+}
+
+static void
+transform_instr(struct tgsi_transform_context *tctx,
+                struct tgsi_full_instruction *current_inst)
+{
+   struct tgsi_atifs_transform *ctx = tgsi_atifs_transform(tctx);
+
+   if (ctx->first_instruction_emitted)
+      goto transform_inst;
+
+   ctx->first_instruction_emitted = true;
+
+   if (ctx->key->fog) {
+      /* add a new temp for the fog factor */
+      ctx->fog_factor_temp = ctx->info.file_max[TGSI_FILE_TEMPORARY] + 1;
+      tgsi_transform_temp_decl(tctx, ctx->fog_factor_temp);
+
+      /* add immediates for clamp */
+      ctx->fog_clamp_imm = ctx->info.immediate_count;
+      tgsi_transform_immediate_decl(tctx, 1.0f, 0.0f, 0.0f, 0.0f);
+   }
+
+transform_inst:
+   if (current_inst->Instruction.Opcode == TGSI_OPCODE_TEX) {
+      /* fix texture target */
+      unsigned newtarget = ctx->key->texture_targets[current_inst->Src[1].Register.Index];
+      if (newtarget)
+         current_inst->Texture.Texture = newtarget;
+
+   } else if (ctx->key->fog && current_inst->Instruction.Opcode == TGSI_OPCODE_MOV &&
+              current_inst->Dst[0].Register.File == TGSI_FILE_OUTPUT) {
+      struct tgsi_full_instruction inst;
+      unsigned i;
+      int fogc_index = -1;
+
+      /* find FOGC input */
+      for (i = 0; i < ctx->info.num_inputs; i++) {
+         if (ctx->info.input_semantic_name[i] == TGSI_SEMANTIC_FOG) {
+            fogc_index = i;
+            break;
+         }
+      }
+      if (fogc_index < 0) {
+         /* should never be reached, because fog coord input is always declared */
+         tctx->emit_instruction(tctx, current_inst);
+         return;
+      }
+
+      /* compute the 1 component fog factor f */
+      if (ctx->key->fog == 1) {
+         /* LINEAR formula: f = (end - z) / (end - start)
+          * with optimized parameters:
+          *    f = MAD(fogcoord, oparams.x, oparams.y)
+          */
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MAD;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 3;
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, X, X, X, X);
+         SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, Y, Y, Y, Y);
+         tctx->emit_instruction(tctx, &inst);
+      } else if (ctx->key->fog == 2) {
+         /* EXP formula: f = exp(-dens * z)
+          * with optimized parameters:
+          *    f = MUL(fogcoord, oparams.z); f= EX2(-f)
+          */
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 2;
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, Z, Z, Z, Z);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_EX2;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 1;
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         inst.Src[0].Register.Negate = 1;
+         tctx->emit_instruction(tctx, &inst);
+      } else if (ctx->key->fog == 3) {
+         /* EXP2 formula: f = exp(-(dens * z)^2)
+          * with optimized parameters:
+          *    f = MUL(fogcoord, oparams.w); f=MUL(f, f); f= EX2(-f)
+          */
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 2;
+         SET_SRC(&inst, 0, TGSI_FILE_INPUT, fogc_index, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI, W, W, W, W);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_MUL;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 2;
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         tctx->emit_instruction(tctx, &inst);
+
+         inst = tgsi_default_full_instruction();
+         inst.Instruction.Opcode = TGSI_OPCODE_EX2;
+         inst.Instruction.NumDstRegs = 1;
+         inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+         inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+         inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+         inst.Instruction.NumSrcRegs = 1;
+         SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+         inst.Src[0].Register.Negate ^= 1;
+         tctx->emit_instruction(tctx, &inst);
+      }
+      /* f = CLAMP(f, 0.0, 1.0) */
+      inst = tgsi_default_full_instruction();
+      inst.Instruction.Opcode = TGSI_OPCODE_CLAMP;
+      inst.Instruction.NumDstRegs = 1;
+      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+      inst.Dst[0].Register.Index = ctx->fog_factor_temp;
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+      inst.Instruction.NumSrcRegs = 3;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, Y, Z, W);
+      SET_SRC(&inst, 1, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, Y, Y, Y, Y); // 0.0
+      SET_SRC(&inst, 2, TGSI_FILE_IMMEDIATE, ctx->fog_clamp_imm, X, X, X, X); // 1.0
+      tctx->emit_instruction(tctx, &inst);
+
+      /* REG0 = LRP(f, REG0, fogcolor) */
+      inst = tgsi_default_full_instruction();
+      inst.Instruction.Opcode = TGSI_OPCODE_LRP;
+      inst.Instruction.NumDstRegs = 1;
+      inst.Dst[0].Register.File  = TGSI_FILE_TEMPORARY;
+      inst.Dst[0].Register.Index = 0;
+      inst.Dst[0].Register.WriteMask = TGSI_WRITEMASK_XYZW;
+      inst.Instruction.NumSrcRegs = 3;
+      SET_SRC(&inst, 0, TGSI_FILE_TEMPORARY, ctx->fog_factor_temp, X, X, X, Y);
+      SET_SRC(&inst, 1, TGSI_FILE_TEMPORARY, 0, X, Y, Z, W);
+      SET_SRC(&inst, 2, TGSI_FILE_CONSTANT, MAX_NUM_FRAGMENT_CONSTANTS_ATI + 1, X, Y, Z, W);
+      tctx->emit_instruction(tctx, &inst);
+   }
+
+   tctx->emit_instruction(tctx, current_inst);
+}
+
+/*
+ * A post-process step in the draw call to fix texture targets and
+ * insert code for fog.
+ */
+const struct tgsi_token *
+st_fixup_atifs(const struct tgsi_token *tokens,
+               const struct st_fp_variant_key *key)
+{
+   struct tgsi_atifs_transform ctx;
+   struct tgsi_token *newtoks;
+   int newlen;
+
+   memset(&ctx, 0, sizeof(ctx));
+   ctx.base.transform_declaration = transform_decl;
+   ctx.base.transform_instruction = transform_instr;
+   ctx.key = key;
+   tgsi_scan_shader(tokens, &ctx.info);
+
+   newlen = tgsi_num_tokens(tokens) + 30;
+   newtoks = tgsi_alloc_tokens(newlen);
+   if (!newtoks)
+      return NULL;
+
+   tgsi_transform_shader(tokens, newtoks, newlen, &ctx.base);
+   return newtoks;
+}
+
diff --git a/src/mesa/state_tracker/st_atifs_to_tgsi.h b/src/mesa/state_tracker/st_atifs_to_tgsi.h
new file mode 100644
index 00000000000..c1b6758ba02
--- /dev/null
+++ b/src/mesa/state_tracker/st_atifs_to_tgsi.h
@@ -0,0 +1,67 @@
+/*
+ * Copyright (C) 2016 Miklós Máté
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef ST_ATIFS_TO_TGSI_H
+#define ST_ATIFS_TO_TGSI_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#include "main/glheader.h"
+#include "pipe/p_defines.h"
+
+struct gl_context;
+struct gl_program;
+struct ureg_program;
+struct tgsi_token;
+struct ati_fragment_shader;
+struct st_fp_variant_key;
+
+enum pipe_error
+st_translate_atifs_program(
+    struct ureg_program *ureg,
+    struct ati_fragment_shader *atifs,
+    struct gl_program *program,
+    GLuint numInputs,
+    const GLuint inputMapping[],
+    const ubyte inputSemanticName[],
+    const ubyte inputSemanticIndex[],
+    const GLuint interpMode[],
+    GLuint numOutputs,
+    const GLuint outputMapping[],
+    const ubyte outputSemanticName[],
+    const ubyte outputSemanticIndex[]);
+
+
+void
+st_init_atifs_prog(struct gl_context *ctx, struct gl_program *prog);
+
+const struct tgsi_token *
+st_fixup_atifs(const struct tgsi_token *tokens,
+               const struct st_fp_variant_key *key);
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* ST_ATIFS_TO_TGSI_H */
diff --git a/src/mesa/state_tracker/st_atom_constbuf.c b/src/mesa/state_tracker/st_atom_constbuf.c
index 407dfd31c80..a980dbedac5 100644
--- a/src/mesa/state_tracker/st_atom_constbuf.c
+++ b/src/mesa/state_tracker/st_atom_constbuf.c
@@ -64,6 +64,21 @@ void st_upload_constants( struct st_context *st,
           shader_type == PIPE_SHADER_TESS_EVAL ||
           shader_type == PIPE_SHADER_COMPUTE);
 
+   /* update the ATI constants before rendering */
+   if (shader_type == PIPE_SHADER_FRAGMENT && st->fp->ati_fs) {
+      struct ati_fragment_shader *ati_fs = st->fp->ati_fs;
+      unsigned c;
+
+      for (c = 0; c < MAX_NUM_FRAGMENT_CONSTANTS_ATI; c++) {
+         if (ati_fs->LocalConstDef & (1 << c))
+            memcpy(params->ParameterValues[c],
+                   ati_fs->Constants[c], sizeof(GLfloat) * 4);
+         else
+            memcpy(params->ParameterValues[c],
+                   st->ctx->ATIFragmentShader.GlobalConstants[c], sizeof(GLfloat) * 4);
+      }
+   }
+
    /* update constants */
    if (params && params->NumParameters) {
       struct pipe_constant_buffer cb;
diff --git a/src/mesa/state_tracker/st_atom_sampler.c b/src/mesa/state_tracker/st_atom_sampler.c
index 82dcf5ee0ca..a1cfa1c34c5 100644
--- a/src/mesa/state_tracker/st_atom_sampler.c
+++ b/src/mesa/state_tracker/st_atom_sampler.c
@@ -133,18 +133,19 @@ convert_sampler(struct st_context *st,
 {
    const struct gl_texture_object *texobj;
    struct gl_context *ctx = st->ctx;
-   struct gl_sampler_object *msamp;
+   const struct gl_sampler_object *msamp;
    GLenum texBaseFormat;
 
    texobj = ctx->Texture.Unit[texUnit]._Current;
    if (!texobj) {
       texobj = _mesa_get_fallback_texture(ctx, TEXTURE_2D_INDEX);
+      msamp = &texobj->Sampler;
+   } else {
+      msamp = _mesa_get_samplerobj(ctx, texUnit);
    }
 
    texBaseFormat = _mesa_texture_base_format(texobj);
 
-   msamp = _mesa_get_samplerobj(ctx, texUnit);
-
    memset(sampler, 0, sizeof(*sampler));
    sampler->wrap_s = gl_wrap_xlate(msamp->WrapS);
    sampler->wrap_t = gl_wrap_xlate(msamp->WrapT);
diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c
index 709f0cbcb91..d0c2429dcef 100644
--- a/src/mesa/state_tracker/st_atom_shader.c
+++ b/src/mesa/state_tracker/st_atom_shader.c
@@ -38,18 +38,69 @@
 #include "main/imports.h"
 #include "main/mtypes.h"
 #include "main/framebuffer.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
 #include "program/program.h"
 
 #include "pipe/p_context.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_simple_shaders.h"
 #include "cso_cache/cso_context.h"
+#include "util/u_debug.h"
 
 #include "st_context.h"
 #include "st_atom.h"
 #include "st_program.h"
 
 
+/** Compress the fog function enums into a 2-bit value */
+static GLuint
+translate_fog_mode(GLenum mode)
+{
+   switch (mode) {
+   case GL_LINEAR: return 1;
+   case GL_EXP:    return 2;
+   case GL_EXP2:   return 3;
+   default:
+      return 0;
+   }
+}
+
+static unsigned
+get_texture_target(struct gl_context *ctx, const unsigned unit)
+{
+   struct gl_texture_object *texObj = _mesa_get_tex_unit(ctx, unit)->_Current;
+   gl_texture_index index;
+
+   if (texObj) {
+      index = _mesa_tex_target_to_index(ctx, texObj->Target);
+   } else {
+      /* fallback for missing texture */
+      index = TEXTURE_2D_INDEX;
+   }
+
+   /* Map mesa texture target to TGSI texture target.
+    * Copied from st_mesa_to_tgsi.c, the shadow part is omitted */
+   switch(index) {
+   case TEXTURE_2D_MULTISAMPLE_INDEX: return TGSI_TEXTURE_2D_MSAA;
+   case TEXTURE_2D_MULTISAMPLE_ARRAY_INDEX: return TGSI_TEXTURE_2D_ARRAY_MSAA;
+   case TEXTURE_BUFFER_INDEX: return TGSI_TEXTURE_BUFFER;
+   case TEXTURE_1D_INDEX:   return TGSI_TEXTURE_1D;
+   case TEXTURE_2D_INDEX:   return TGSI_TEXTURE_2D;
+   case TEXTURE_3D_INDEX:   return TGSI_TEXTURE_3D;
+   case TEXTURE_CUBE_INDEX: return TGSI_TEXTURE_CUBE;
+   case TEXTURE_CUBE_ARRAY_INDEX: return TGSI_TEXTURE_CUBE_ARRAY;
+   case TEXTURE_RECT_INDEX: return TGSI_TEXTURE_RECT;
+   case TEXTURE_1D_ARRAY_INDEX:   return TGSI_TEXTURE_1D_ARRAY;
+   case TEXTURE_2D_ARRAY_INDEX:   return TGSI_TEXTURE_2D_ARRAY;
+   case TEXTURE_EXTERNAL_INDEX:   return TGSI_TEXTURE_2D;
+   default:
+      debug_assert(0);
+      return TGSI_TEXTURE_1D;
+   }
+}
+
+
 /**
  * Update fragment program state/atom.  This involves translating the
  * Mesa fragment program into a gallium fragment program and binding it.
@@ -79,6 +130,18 @@ update_fp( struct st_context *st )
       st->ctx->Multisample.MinSampleShadingValue *
       _mesa_geometric_samples(st->ctx->DrawBuffer) > 1;
 
+   if (stfp->ati_fs) {
+      unsigned u;
+
+      if (st->ctx->Fog.Enabled) {
+         key.fog = translate_fog_mode(st->ctx->Fog.Mode);
+      }
+
+      for (u = 0; u < MAX_NUM_FRAGMENT_REGISTERS_ATI; u++) {
+         key.texture_targets[u] = get_texture_target(st->ctx, u);
+      }
+   }
+
    st->fp_variant = st_get_fp_variant(st, stfp, &key);
 
    st_reference_fragprog(st, &st->fp, stfp);
@@ -91,7 +154,7 @@ update_fp( struct st_context *st )
 const struct st_tracked_state st_update_fp = {
    "st_update_fp",					/* name */
    {							/* dirty */
-      _NEW_BUFFERS | _NEW_MULTISAMPLE,			/* mesa */
+      _NEW_BUFFERS | _NEW_MULTISAMPLE | _NEW_FOG,	/* mesa */
       ST_NEW_FRAGMENT_PROGRAM                           /* st */
    },
    update_fp  					/* update */
diff --git a/src/mesa/state_tracker/st_cb_drawpixels.c b/src/mesa/state_tracker/st_cb_drawpixels.c
index 09f4d8e00d1..01ed5441d11 100644
--- a/src/mesa/state_tracker/st_cb_drawpixels.c
+++ b/src/mesa/state_tracker/st_cb_drawpixels.c
@@ -1302,6 +1302,7 @@ blit_copy_pixels(struct gl_context *ctx, GLint srcx, GLint srcy,
        !ctx->FragmentProgram.Enabled &&
        !ctx->VertexProgram.Enabled &&
        !ctx->_Shader->CurrentProgram[MESA_SHADER_FRAGMENT] &&
+       !ctx->ATIFragmentShader._Enabled &&
        ctx->DrawBuffer->_NumColorDrawBuffers == 1 &&
        !ctx->Query.CondRenderQuery &&
        !ctx->Query.CurrentOcclusionObject) {
diff --git a/src/mesa/state_tracker/st_cb_program.c b/src/mesa/state_tracker/st_cb_program.c
index 27cc0f3d154..d79cfe239e4 100644
--- a/src/mesa/state_tracker/st_cb_program.c
+++ b/src/mesa/state_tracker/st_cb_program.c
@@ -46,6 +46,7 @@
 #include "st_mesa_to_tgsi.h"
 #include "st_cb_program.h"
 #include "st_glsl_to_tgsi.h"
+#include "st_atifs_to_tgsi.h"
 
 
 
@@ -302,6 +303,22 @@ st_program_string_notify( struct gl_context *ctx,
       if (st->cp == stcp)
          st->dirty_cp.st |= ST_NEW_COMPUTE_PROGRAM;
    }
+   else if (target == GL_FRAGMENT_SHADER_ATI) {
+      assert(prog);
+
+      struct st_fragment_program *stfp = (struct st_fragment_program *) prog;
+      assert(stfp->ati_fs);
+      assert(stfp->ati_fs->Program == prog);
+
+      st_init_atifs_prog(ctx, prog);
+
+      st_release_fp_variants(st, stfp);
+      if (!st_translate_fragment_program(st, stfp))
+         return false;
+
+      if (st->fp == stfp)
+         st->dirty.st |= ST_NEW_FRAGMENT_PROGRAM;
+   }
 
    if (ST_DEBUG & DEBUG_PRECOMPILE ||
        st->shader_has_one_variant[stage])
@@ -310,6 +327,19 @@ st_program_string_notify( struct gl_context *ctx,
    return GL_TRUE;
 }
 
+/**
+ * Called via ctx->Driver.NewATIfs()
+ * Called in glEndFragmentShaderATI()
+ */
+static struct gl_program *
+st_new_ati_fs(struct gl_context *ctx, struct ati_fragment_shader *curProg)
+{
+   struct gl_program *prog = ctx->Driver.NewProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
+         curProg->Id);
+   struct st_fragment_program *stfp = (struct st_fragment_program *)prog;
+   stfp->ati_fs = curProg;
+   return prog;
+}
 
 /**
  * Plug in the program and shader-related device driver functions.
@@ -322,6 +352,7 @@ st_init_program_functions(struct dd_function_table *functions)
    functions->NewProgram = st_new_program;
    functions->DeleteProgram = st_delete_program;
    functions->ProgramStringNotify = st_program_string_notify;
+   functions->NewATIfs = st_new_ati_fs;
    
    functions->LinkShader = st_link_shader;
 }
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index 460c1790663..3980f5d2f51 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -2886,12 +2886,17 @@ st_finalize_texture(struct gl_context *ctx,
          /* Need to import images in main memory or held in other textures.
           */
          if (stImage && stObj->pt != stImage->pt) {
+            GLuint height = stObj->height0;
             GLuint depth = stObj->depth0;
+
+            if (stObj->base.Target != GL_TEXTURE_1D_ARRAY)
+               height = u_minify(height, level);
             if (stObj->base.Target == GL_TEXTURE_3D)
                depth = u_minify(depth, level);
+
             if (level == 0 ||
                 (stImage->base.Width == u_minify(stObj->width0, level) &&
-                 stImage->base.Height == u_minify(stObj->height0, level) &&
+                 stImage->base.Height == height &&
                  stImage->base.Depth == depth)) {
                /* src image fits expected dest mipmap level size */
                copy_image_data_to_texture(st, stObj, level, stImage);
diff --git a/src/mesa/state_tracker/st_cb_xformfb.c b/src/mesa/state_tracker/st_cb_xformfb.c
index 0c01cd5ab78..a5cf3dfd5a9 100644
--- a/src/mesa/state_tracker/st_cb_xformfb.c
+++ b/src/mesa/state_tracker/st_cb_xformfb.c
@@ -125,7 +125,7 @@ st_begin_transform_feedback(struct gl_context *ctx, GLenum mode,
 
       if (bo && bo->buffer) {
          unsigned stream =
-            obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+            obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream;
 
          /* Check whether we need to recreate the target. */
          if (!sobj->targets[i] ||
@@ -204,7 +204,7 @@ st_end_transform_feedback(struct gl_context *ctx,
 
    for (i = 0; i < ARRAY_SIZE(sobj->targets); i++) {
       unsigned stream =
-         obj->shader_program->LinkedTransformFeedback.BufferStream[i];
+         obj->shader_program->LinkedTransformFeedback.Buffers[i].Stream;
 
       /* Is it not bound or already set for this stream? */
       if (!sobj->targets[i] || sobj->draw_count[stream])
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index fdd59a383a9..3db5749725e 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -127,35 +127,6 @@ setup_index_buffer(struct st_context *st,
 
 
 /**
- * Prior to drawing, check that any uniforms referenced by the
- * current shader have been set.  If a uniform has not been set,
- * issue a warning.
- */
-static void
-check_uniforms(struct gl_context *ctx)
-{
-   struct gl_shader_program **shProg = ctx->_Shader->CurrentProgram;
-   unsigned j;
-
-   for (j = 0; j < 3; j++) {
-      unsigned i;
-
-      if (shProg[j] == NULL || !shProg[j]->LinkStatus)
-	 continue;
-
-      for (i = 0; i < shProg[j]->NumUniformStorage; i++) {
-         const struct gl_uniform_storage *u = &shProg[j]->UniformStorage[i];
-         if (!u->initialized) {
-            _mesa_warning(ctx,
-                          "Using shader with uninitialized uniform: %s",
-                          u->name);
-         }
-      }
-   }
-}
-
-
-/**
  * Translate OpenGL primtive type (GL_POINTS, GL_TRIANGLE_STRIP, etc) to
  * the corresponding Gallium type.
  */
@@ -203,14 +174,6 @@ st_draw_vbo(struct gl_context *ctx,
    /* Validate state. */
    if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) {
       st_validate_state(st, ST_PIPELINE_RENDER);
-
-#if 0
-      if (MESA_VERBOSE & VERBOSE_GLSL) {
-         check_uniforms(ctx);
-      }
-#else
-      (void) check_uniforms;
-#endif
    }
 
    if (st->vertex_array_out_of_memory) {
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 2fdaba073a2..8748ab5c876 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -552,7 +552,6 @@ void st_init_extensions(struct pipe_screen *screen,
                         boolean has_lib_dxtc)
 {
    unsigned i;
-   int glsl_feature_level;
    GLboolean *extension_table = (GLboolean *) extensions;
 
    static const struct st_extension_cap_mapping cap_mapping[] = {
@@ -811,6 +810,7 @@ void st_init_extensions(struct pipe_screen *screen,
    extensions->EXT_texture_env_dot3 = GL_TRUE;
    extensions->EXT_vertex_array_bgra = GL_TRUE;
 
+   extensions->ATI_fragment_shader = GL_TRUE;
    extensions->ATI_texture_env_combine3 = GL_TRUE;
 
    extensions->MESA_pack_invert = GL_TRUE;
@@ -844,12 +844,8 @@ void st_init_extensions(struct pipe_screen *screen,
                           ARRAY_SIZE(vertex_mapping), PIPE_BUFFER,
                           PIPE_BIND_VERTEX_BUFFER);
 
-   /* Figure out GLSL support. */
-   glsl_feature_level = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
-
-   consts->GLSLVersion = glsl_feature_level;
-   if (glsl_feature_level >= 410)
-      consts->GLSLVersion = 410;
+   /* Figure out GLSL support and set GLSLVersion to it. */
+   consts->GLSLVersion = screen->get_param(screen, PIPE_CAP_GLSL_FEATURE_LEVEL);
 
    _mesa_override_glsl_version(consts);
 
@@ -858,9 +854,9 @@ void st_init_extensions(struct pipe_screen *screen,
       consts->ForceGLSLVersion = options->force_glsl_version;
    }
 
-   if (glsl_feature_level >= 400)
+   if (consts->GLSLVersion >= 400)
       extensions->ARB_gpu_shader5 = GL_TRUE;
-   if (glsl_feature_level >= 410)
+   if (consts->GLSLVersion >= 410)
       extensions->ARB_shader_precision = GL_TRUE;
 
    /* This extension needs full OpenGL 3.2, but we don't know if that's
@@ -925,6 +921,23 @@ void st_init_extensions(struct pipe_screen *screen,
       extensions->ARB_sync = GL_TRUE;
    }
 
+   /* Needs PIPE_CAP_SAMPLE_SHADING + all the sample-related bits of
+    * ARB_gpu_shader5. This enables all the per-sample shading ES extensions.
+    */
+   extensions->OES_sample_variables = extensions->ARB_sample_shading &&
+      extensions->ARB_gpu_shader5;
+
+   /* If we don't have native ETC2 support, we don't keep track of the
+    * original ETC2 data. This is necessary to be able to copy images between
+    * compatible view classes.
+    */
+   if (extensions->ARB_copy_image && screen->is_format_supported(
+             screen, PIPE_FORMAT_ETC2_RGB8,
+             PIPE_TEXTURE_2D, 0,
+             PIPE_BIND_SAMPLER_VIEW)) {
+      extensions->OES_copy_image = GL_TRUE;
+   }
+
    /* Maximum sample count. */
    {
       enum pipe_format color_formats[] = {
@@ -1020,6 +1033,12 @@ void st_init_extensions(struct pipe_screen *screen,
                              PIPE_BIND_SAMPLER_VIEW);
    }
 
+   extensions->OES_texture_buffer =
+      extensions->ARB_texture_buffer_object &&
+      extensions->ARB_texture_buffer_range &&
+      extensions->ARB_texture_buffer_object_rgb32 &&
+      extensions->ARB_shader_image_load_store;
+
    /* Unpacking a varying in the fragment shader costs 1 texture indirection.
     * If the number of available texture indirections is very limited, then we
     * prefer to disable varying packing rather than run the risk of varying
@@ -1036,7 +1055,7 @@ void st_init_extensions(struct pipe_screen *screen,
 
    consts->MaxViewports = screen->get_param(screen, PIPE_CAP_MAX_VIEWPORTS);
    if (consts->MaxViewports >= 16) {
-      if (glsl_feature_level >= 400) {
+      if (consts->GLSLVersion >= 400) {
          consts->ViewportBounds.Min = -32768.0;
          consts->ViewportBounds.Max = 32767.0;
       } else {
diff --git a/src/mesa/state_tracker/st_gen_mipmap.c b/src/mesa/state_tracker/st_gen_mipmap.c
index c4b3492b0d3..a14bbfabaa3 100644
--- a/src/mesa/state_tracker/st_gen_mipmap.c
+++ b/src/mesa/state_tracker/st_gen_mipmap.c
@@ -82,7 +82,6 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
    const uint baseLevel = texObj->BaseLevel;
    enum pipe_format format;
    uint lastLevel, first_layer, last_layer;
-   uint dstLevel;
 
    if (!pt)
       return;
@@ -103,42 +102,33 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
    stObj->lastLevel = lastLevel;
 
    if (!texObj->Immutable) {
-      if (pt->last_level < lastLevel) {
-         /* The current gallium texture doesn't have space for all the
-         * mipmap levels we need to generate.  So allocate a new texture.
-         */
-         struct pipe_resource *oldTex = stObj->pt;
-
-         /* create new texture with space for more levels */
-         stObj->pt = st_texture_create(st,
-                                       oldTex->target,
-                                       oldTex->format,
-                                       lastLevel,
-                                       oldTex->width0,
-                                       oldTex->height0,
-                                       oldTex->depth0,
-                                       oldTex->array_size,
-                                       0,
-                                       oldTex->bind);
-
-         /* This will copy the old texture's base image into the new texture
-         * which we just allocated.
-         */
-         st_finalize_texture(ctx, st->pipe, texObj);
-
-         /* release the old tex (will likely be freed too) */
-         pipe_resource_reference(&oldTex, NULL);
-         st_texture_release_all_sampler_views(st, stObj);
-      }
-      else {
-         /* Make sure that the base texture image data is present in the
-         * texture buffer.
-         */
-         st_finalize_texture(ctx, st->pipe, texObj);
-      }
+      const GLboolean genSave = texObj->GenerateMipmap;
+
+      /* Temporarily set GenerateMipmap to true so that allocate_full_mipmap()
+       * makes the right decision about full mipmap allocation.
+       */
+      texObj->GenerateMipmap = GL_TRUE;
+
+      _mesa_prepare_mipmap_levels(ctx, texObj, baseLevel, lastLevel);
+
+      texObj->GenerateMipmap = genSave;
+
+      /* At this point, memory for all the texture levels has been
+       * allocated.  However, the base level image may be in one resource
+       * while the subsequent/smaller levels may be in another resource.
+       * Finalizing the texture will copy the base images from the former
+       * resource to the latter.
+       *
+       * After this, we'll have all mipmap levels in one resource.
+       */
+      st_finalize_texture(ctx, st->pipe, texObj);
    }
 
    pt = stObj->pt;
+   if (!pt) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "mipmap generation");
+      return;
+   }
 
    assert(pt->last_level >= lastLevel);
 
@@ -169,48 +159,4 @@ st_generate_mipmap(struct gl_context *ctx, GLenum target,
          _mesa_generate_mipmap(ctx, target, texObj);
       }
    }
-
-   /* Fill in the Mesa gl_texture_image fields */
-   for (dstLevel = baseLevel + 1; dstLevel <= lastLevel; dstLevel++) {
-      const uint srcLevel = dstLevel - 1;
-      const struct gl_texture_image *srcImage
-         = _mesa_get_tex_image(ctx, texObj, target, srcLevel);
-      struct gl_texture_image *dstImage;
-      struct st_texture_image *stImage;
-      uint border = srcImage->Border;
-      uint dstWidth, dstHeight, dstDepth;
-
-      dstWidth = u_minify(pt->width0, dstLevel);
-      if (texObj->Target == GL_TEXTURE_1D_ARRAY) {
-         dstHeight = pt->array_size;
-      }
-      else {
-         dstHeight = u_minify(pt->height0, dstLevel);
-      }
-      if (texObj->Target == GL_TEXTURE_2D_ARRAY ||
-          texObj->Target == GL_TEXTURE_CUBE_MAP_ARRAY) {
-         dstDepth = pt->array_size;
-      }
-      else {
-         dstDepth = u_minify(pt->depth0, dstLevel);
-      }
-
-      dstImage = _mesa_get_tex_image(ctx, texObj, target, dstLevel);
-      if (!dstImage) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "generating mipmaps");
-         return;
-      }
-
-      /* Free old image data */
-      ctx->Driver.FreeTextureImageBuffer(ctx, dstImage);
-
-      /* initialize new image */
-      _mesa_init_teximage_fields(ctx, dstImage, dstWidth, dstHeight,
-                                 dstDepth, border, srcImage->InternalFormat,
-                                 srcImage->TexFormat);
-
-      stImage = st_texture_image(dstImage);
-
-      pipe_resource_reference(&stImage->pt, pt);
-   }
 }
diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
index 06b4bb41a9b..23786b85529 100644
--- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
+++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
@@ -6811,7 +6811,7 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
       validate_ir_tree(ir);
    }
 
-   build_program_resource_list(prog);
+   build_program_resource_list(ctx, prog);
 
    for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) {
       struct gl_program *linked_prog;
@@ -6861,7 +6861,7 @@ st_translate_stream_output_info(glsl_to_tgsi_visitor *glsl_to_tgsi,
    }
 
    for (i = 0; i < PIPE_MAX_SO_BUFFERS; i++) {
-      so->stride[i] = info->BufferStride[i];
+      so->stride[i] = info->Buffers[i].Stride;
    }
    so->num_outputs = info->NumOutputs;
 }
diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c
index 80dcfd82743..94dc48971ec 100644
--- a/src/mesa/state_tracker/st_program.c
+++ b/src/mesa/state_tracker/st_program.c
@@ -53,6 +53,7 @@
 #include "st_context.h"
 #include "st_program.h"
 #include "st_mesa_to_tgsi.h"
+#include "st_atifs_to_tgsi.h"
 #include "cso_cache/cso_context.h"
 
 
@@ -811,7 +812,22 @@ st_translate_fragment_program(struct st_context *st,
 
       free_glsl_to_tgsi_visitor(stfp->glsl_to_tgsi);
       stfp->glsl_to_tgsi = NULL;
-   } else
+   } else if (stfp->ati_fs)
+      st_translate_atifs_program(ureg,
+                                 stfp->ati_fs,
+                                 &stfp->Base.Base,
+                                 /* inputs */
+                                 fs_num_inputs,
+                                 inputMapping,
+                                 input_semantic_name,
+                                 input_semantic_index,
+                                 interpMode,
+                                 /* outputs */
+                                 fs_num_outputs,
+                                 outputMapping,
+                                 fs_output_semantic_name,
+                                 fs_output_semantic_index);
+   else
       st_translate_mesa_program(st->ctx,
                                 TGSI_PROCESSOR_FRAGMENT,
                                 ureg,
@@ -849,6 +865,16 @@ st_create_fp_variant(struct st_context *st,
 
    assert(!(key->bitmap && key->drawpixels));
 
+   /* Fix texture targets and add fog for ATI_fs */
+   if (stfp->ati_fs) {
+      const struct tgsi_token *tokens = st_fixup_atifs(tgsi.tokens, key);
+
+      if (tokens)
+         tgsi.tokens = tokens;
+      else
+         fprintf(stderr, "mesa: cannot post-process ATI_fs\n");
+   }
+
    /* Emulate features. */
    if (key->clamp_color || key->persample_shading) {
       const struct tgsi_token *tokens;
@@ -858,9 +884,11 @@ st_create_fp_variant(struct st_context *st,
 
       tokens = tgsi_emulate(tgsi.tokens, flags);
 
-      if (tokens)
+      if (tokens) {
+         if (tgsi.tokens != stfp->tgsi.tokens)
+            tgsi_free_tokens(tgsi.tokens);
          tgsi.tokens = tokens;
-      else
+      } else
          fprintf(stderr, "mesa: cannot emulate deprecated features\n");
    }
 
diff --git a/src/mesa/state_tracker/st_program.h b/src/mesa/state_tracker/st_program.h
index 028fba99a74..7c90fd74e14 100644
--- a/src/mesa/state_tracker/st_program.h
+++ b/src/mesa/state_tracker/st_program.h
@@ -35,6 +35,7 @@
 #define ST_PROGRAM_H
 
 #include "main/mtypes.h"
+#include "main/atifragshader.h"
 #include "program/program.h"
 #include "pipe/p_state.h"
 #include "st_context.h"
@@ -65,6 +66,12 @@ struct st_fp_variant_key
 
    /** for ARB_sample_shading */
    GLuint persample_shading:1;
+
+   /** needed for ATI_fragment_shader */
+   GLuint fog:2;
+
+   /** needed for ATI_fragment_shader */
+   char texture_targets[MAX_NUM_FRAGMENT_REGISTERS_ATI];
 };
 
 
@@ -99,6 +106,7 @@ struct st_fragment_program
    struct gl_fragment_program Base;
    struct pipe_shader_state tgsi;
    struct glsl_to_tgsi_visitor* glsl_to_tgsi;
+   struct ati_fragment_shader *ati_fs;
 
    struct st_fp_variant *variants;
 };
diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c
index 71dd15bc4fe..b9abebfc7bf 100644
--- a/src/mesa/state_tracker/st_vdpau.c
+++ b/src/mesa/state_tracker/st_vdpau.c
@@ -39,8 +39,6 @@
 #include "pipe/p_state.h"
 #include "pipe/p_video_codec.h"
 
-#include "state_tracker/vdpau_interop.h"
-
 #include "util/u_inlines.h"
 
 #include "st_vdpau.h"
@@ -51,70 +49,155 @@
 
 #ifdef HAVE_ST_VDPAU
 
+#include "state_tracker/vdpau_interop.h"
+#include "state_tracker/vdpau_dmabuf.h"
+#include "state_tracker/vdpau_funcs.h"
+#include "state_tracker/drm_driver.h"
+
+static struct pipe_resource *
+st_vdpau_video_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface,
+                               GLuint index)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+   struct pipe_sampler_view *sv;
+   VdpVideoSurfaceGallium *f;
+
+   struct pipe_video_buffer *buffer;
+   struct pipe_sampler_view **samplers;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f))
+      return NULL;
+
+   buffer = f((uintptr_t)vdpSurface);
+   if (!buffer)
+      return NULL;
+
+   samplers = buffer->get_sampler_view_planes(buffer);
+   if (!samplers)
+      return NULL;
+
+   sv = samplers[index >> 1];
+   if (!sv)
+      return NULL;
+
+   return sv->texture;
+}
+
+static struct pipe_resource *
+st_vdpau_output_surface_gallium(struct gl_context *ctx, const GLvoid *vdpSurface)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+   VdpOutputSurfaceGallium *f;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f))
+      return NULL;
+
+   return f((uintptr_t)vdpSurface);
+}
+
+static struct pipe_resource *
+st_vdpau_resource_from_description(struct gl_context *ctx,
+                                   const struct VdpSurfaceDMABufDesc *desc)
+{
+   struct st_context *st = st_context(ctx);
+   struct pipe_resource templ, *res;
+   struct winsys_handle whandle;
+
+   if (desc->handle == -1)
+      return NULL;
+
+   memset(&templ, 0, sizeof(templ));
+   templ.target = PIPE_TEXTURE_2D;
+   templ.last_level = 0;
+   templ.depth0 = 1;
+   templ.array_size = 1;
+   templ.width0 = desc->width;
+   templ.height0 = desc->height;
+   templ.format = VdpFormatRGBAToPipe(desc->format);
+   templ.bind = PIPE_BIND_SAMPLER_VIEW | PIPE_BIND_RENDER_TARGET;
+   templ.usage = PIPE_USAGE_DEFAULT;
+
+   memset(&whandle, 0, sizeof(whandle));
+   whandle.type = DRM_API_HANDLE_TYPE_FD;
+   whandle.handle = desc->handle;
+   whandle.offset = desc->offset;
+   whandle.stride = desc->stride;
+
+   res = st->pipe->screen->resource_from_handle(st->pipe->screen, &templ, &whandle,
+						PIPE_HANDLE_USAGE_READ_WRITE);
+   close(desc->handle);
+
+   return res;
+}
+
+static struct pipe_resource *
+st_vdpau_output_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+
+   struct VdpSurfaceDMABufDesc desc;
+   VdpOutputSurfaceDMABuf *f;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_DMA_BUF, (void**)&f))
+      return NULL;
+
+   if (f((uintptr_t)vdpSurface, &desc) != VDP_STATUS_OK)
+      return NULL;
+
+   return st_vdpau_resource_from_description(ctx, &desc);
+}
+
+static struct pipe_resource *
+st_vdpau_video_surface_dma_buf(struct gl_context *ctx, const GLvoid *vdpSurface,
+                               GLuint index)
+{
+   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
+   uint32_t device = (uintptr_t)ctx->vdpDevice;
+
+   struct VdpSurfaceDMABufDesc desc;
+   VdpVideoSurfaceDMABuf *f;
+
+   getProcAddr = (void *)ctx->vdpGetProcAddress;
+   if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_DMA_BUF, (void**)&f))
+      return NULL;
+
+   if (f((uintptr_t)vdpSurface, index, &desc) != VDP_STATUS_OK)
+      return NULL;
+
+   return st_vdpau_resource_from_description(ctx, &desc);
+}
+
 static void
 st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access,
                      GLboolean output, struct gl_texture_object *texObj,
                      struct gl_texture_image *texImage,
                      const GLvoid *vdpSurface, GLuint index)
 {
-   int (*getProcAddr)(uint32_t device, uint32_t id, void **ptr);
-   uint32_t device = (uintptr_t)ctx->vdpDevice;
-
    struct st_context *st = st_context(ctx);
    struct st_texture_object *stObj = st_texture_object(texObj);
    struct st_texture_image *stImage = st_texture_image(texImage);
- 
+
    struct pipe_resource *res;
    struct pipe_sampler_view templ, **sampler_view;
    mesa_format texFormat;
 
-   getProcAddr = (void *)ctx->vdpGetProcAddress;
    if (output) {
-      VdpOutputSurfaceGallium *f;
-      
-      if (getProcAddr(device, VDP_FUNC_ID_OUTPUT_SURFACE_GALLIUM, (void**)&f)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      res = f((uintptr_t)vdpSurface);
+      res = st_vdpau_output_surface_dma_buf(ctx, vdpSurface);
 
-      if (!res) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
+      if (!res)
+         res = st_vdpau_output_surface_gallium(ctx, vdpSurface);
 
    } else {
-      struct pipe_sampler_view *sv;
-      VdpVideoSurfaceGallium *f;
-
-      struct pipe_video_buffer *buffer;
-      struct pipe_sampler_view **samplers;
-
-      if (getProcAddr(device, VDP_FUNC_ID_VIDEO_SURFACE_GALLIUM, (void**)&f)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      buffer = f((uintptr_t)vdpSurface);
-      if (!buffer) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      samplers = buffer->get_sampler_view_planes(buffer);
-      if (!samplers) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      sv = samplers[index >> 1];
-      if (!sv) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "VDPAUMapSurfacesNV");
-         return;
-      }
-
-      res = sv->texture;
+      res = st_vdpau_video_surface_dma_buf(ctx, vdpSurface, index);
+
+      if (!res)
+         res = st_vdpau_video_surface_gallium(ctx, vdpSurface, index);
    }
 
    if (!res) {
author	Jason Ekstrand <[email protected]>	2016-04-01 14:59:38 -0700
committer	Jason Ekstrand <[email protected]>	2016-04-01 15:16:21 -0700
commit	95106f6bfbbb87b702e4bbba98e2eaea71924cd9 (patch)
tree	9650d284ec7f7417b2bcf8a906dfa43dfc547cf7 /src
parent	cf2257069cbde19fd177a02c079206914aac5d14 (diff)
parent	14c46954c910efb1db94a068a866c7259deaa9d9 (diff)