author | Jason Ekstrand <[email protected]> | 2016-03-15 14:09:50 -0700
committer | Jason Ekstrand <[email protected]> | 2016-03-15 14:09:50 -0700
commit | 7f6a0cb29c89a03441be744680a2145445be3a3c (patch)
tree | 516824ab49962521563b95fa79430cf948baaccc
parent | b83785d86d2c7f07323920615c72a9f09695a9a7 (diff)
parent | e103b52aec773537d2821d8acc42ac9caa2a4b17 (diff)
Merge remote-tracking branch 'public/master' into vulkan
457 files changed, 62759 insertions, 2759 deletions
diff --git a/.gitignore b/.gitignore
index 21aa35cd36d..b4f88f7b7fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,3 +46,4 @@ manifest.txt
 Makefile
 Makefile.in
 .install-mesa-links
+.install-gallium-links
diff --git a/configure.ac b/configure.ac
index 5f686f5602a..384de4dbde6 100644
--- a/configure.ac
+++ b/configure.ac
@@ -68,7 +68,7 @@ OPENCL_VERSION=1
 AC_SUBST([OPENCL_VERSION])
 
 dnl Versions for external dependencies
-LIBDRM_REQUIRED=2.4.60
+LIBDRM_REQUIRED=2.4.66
 LIBDRM_RADEON_REQUIRED=2.4.56
 LIBDRM_AMDGPU_REQUIRED=2.4.63
 LIBDRM_INTEL_REQUIRED=2.4.61
@@ -1737,6 +1737,7 @@ AM_CONDITIONAL(HAVE_ST_XVMC, test "x$enable_xvmc" = xyes)
 if test "x$enable_vdpau" = xyes; then
     PKG_CHECK_MODULES([VDPAU], [vdpau >= $VDPAU_REQUIRED])
     gallium_st="$gallium_st vdpau"
+    DEFINES="$DEFINES -DHAVE_ST_VDPAU"
 fi
 AM_CONDITIONAL(HAVE_ST_VDPAU, test "x$enable_vdpau" = xyes)
@@ -2193,6 +2194,16 @@ radeon_llvm_check() {
     fi
 }
 
+swr_llvm_check() {
+    gallium_require_llvm $1
+    if test ${LLVM_VERSION_INT} -lt 306; then
+        AC_MSG_ERROR([LLVM version 3.6 or later required when building $1])
+    fi
+    if test "x$enable_gallium_llvm" != "xyes"; then
+        AC_MSG_ERROR([--enable-gallium-llvm is required when building $1])
+    fi
+}
+
 dnl Duplicates in GALLIUM_DRIVERS_DIRS are removed by sorting it after this block
 if test -n "$with_gallium_drivers"; then
     gallium_drivers=`IFS=', '; echo $with_gallium_drivers`
@@ -2265,6 +2276,30 @@ if test -n "$with_gallium_drivers"; then
             HAVE_GALLIUM_LLVMPIPE=yes
         fi
         ;;
+    xswr)
+        AX_CXX_COMPILE_STDCXX([11], [noext], [mandatory])
+        swr_llvm_check "swr"
+
+        AC_MSG_CHECKING([whether $CXX supports AVX/AVX2])
+        AVX_CXXFLAGS="-march=core-avx-i"
+        AVX2_CXXFLAGS="-march=core-avx2"
+
+        AC_LANG_PUSH([C++])
+        save_CXXFLAGS="$CXXFLAGS"
+        CXXFLAGS="$AVX_CXXFLAGS $CXXFLAGS"
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
+                          [AC_MSG_ERROR([AVX compiler support not detected])])
+        CXXFLAGS="$save_CXXFLAGS"
+
+        save_CFLAGS="$CXXFLAGS"
+        CXXFLAGS="$AVX2_CXXFLAGS $CXXFLAGS"
+        AC_COMPILE_IFELSE([AC_LANG_PROGRAM()],[],
+                          [AC_MSG_ERROR([AVX2 compiler support not detected])])
+        CXXFLAGS="$save_CXXFLAGS"
+        AC_LANG_POP([C++])
+
+        HAVE_GALLIUM_SWR=yes
+        ;;
     xvc4)
         HAVE_GALLIUM_VC4=yes
         gallium_require_drm "vc4"
@@ -2354,6 +2389,7 @@ AM_CONDITIONAL(HAVE_GALLIUM_NOUVEAU, test "x$HAVE_GALLIUM_NOUVEAU" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_FREEDRENO, test "x$HAVE_GALLIUM_FREEDRENO" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_SOFTPIPE, test "x$HAVE_GALLIUM_SOFTPIPE" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_LLVMPIPE, test "x$HAVE_GALLIUM_LLVMPIPE" = xyes)
+AM_CONDITIONAL(HAVE_GALLIUM_SWR, test "x$HAVE_GALLIUM_SWR" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VC4, test "x$HAVE_GALLIUM_VC4" = xyes)
 AM_CONDITIONAL(HAVE_GALLIUM_VIRGL, test "x$HAVE_GALLIUM_VIRGL" = xyes)
@@ -2470,6 +2506,9 @@ AC_CONFIG_FILES([Makefile
                 src/gallium/drivers/rbug/Makefile
                 src/gallium/drivers/softpipe/Makefile
                 src/gallium/drivers/svga/Makefile
+                src/gallium/drivers/swr/Makefile
+                src/gallium/drivers/swr/avx/Makefile
+                src/gallium/drivers/swr/avx2/Makefile
                 src/gallium/drivers/trace/Makefile
                 src/gallium/drivers/vc4/Makefile
                 src/gallium/drivers/virgl/Makefile
diff --git a/docs/GL3.txt b/docs/GL3.txt
index d141c221f19..ee7facafc95 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -158,7 +158,7 @@ GL 4.3, GLSL 4.30:
   GL_ARB_explicit_uniform_location             DONE (all drivers that support GLSL)
   GL_ARB_fragment_layer_viewport               DONE (i965, nv50, nvc0, r600, radeonsi, llvmpipe)
   GL_ARB_framebuffer_no_attachments            DONE (i965)
-  GL_ARB_internalformat_query2                 in progress (elima)
+  GL_ARB_internalformat_query2                 DONE (i965)
   GL_ARB_invalidate_subdata                    DONE (all drivers)
   GL_ARB_multi_draw_indirect                   DONE (i965, nvc0, r600, radeonsi, llvmpipe, softpipe)
   GL_ARB_program_interface_query               DONE (all drivers)
@@ -180,8 +180,8 @@ GL 4.4, GLSL 4.40:
   GL_ARB_clear_texture                         DONE (i965, nv50, nvc0)
   GL_ARB_enhanced_layouts                      in progress (Timothy)
   - compile-time constant expressions          DONE
-  - explicit byte offsets for blocks           in progress
-  - forced alignment within blocks             in progress
+  - explicit byte offsets for blocks           DONE
+  - forced alignment within blocks             DONE
   - specified vec4-slot component numbers      in progress
   - specified transform/feedback layout        in progress
   - input/output block locations               DONE
diff --git a/docs/relnotes/11.3.0.html b/docs/relnotes/11.3.0.html
index fa650830e23..c31296ef9b1 100644
--- a/docs/relnotes/11.3.0.html
+++ b/docs/relnotes/11.3.0.html
@@ -44,6 +44,8 @@ Note: some of the new features are only available with certain drivers.
 </p>
 <ul>
+<li>GL_ARB_internalformat_query2 on i965</li>
+<li>GL_ARB_shader_atomic_counter_ops on nvc0</li>
 <li>GL_OES_texture_border_clamp and GL_EXT_texture_border_clamp on all drivers that support GL_ARB_texture_border_clamp</li>
 <li>GL_OES_shader_image_atomic on all drivers that support GL_ARB_shader_image_load_store</li>
 </ul>
diff --git a/include/EGL/eglmesaext.h b/include/EGL/eglmesaext.h
index 917a2043c77..337dd2cb789 100644
--- a/include/EGL/eglmesaext.h
+++ b/include/EGL/eglmesaext.h
@@ -34,17 +34,6 @@ extern "C" {
 #include <EGL/eglplatform.h>
 
-#ifndef EGL_MESA_drm_display
-#define EGL_MESA_drm_display 1
-
-#ifdef EGL_EGLEXT_PROTOTYPES
-EGLAPI EGLDisplay EGLAPIENTRY eglGetDRMDisplayMESA(int fd);
-#endif /* EGL_EGLEXT_PROTOTYPES */
-
-typedef EGLDisplay (EGLAPIENTRYP PFNEGLGETDRMDISPLAYMESA) (int fd);
-
-#endif /* EGL_MESA_drm_display */
-
 #ifdef EGL_MESA_drm_image
 /* Mesa's extension to EGL_MESA_drm_image... */
 #ifndef EGL_DRM_BUFFER_USE_CURSOR_MESA
diff --git a/include/GL/internal/dri_interface.h b/include/GL/internal/dri_interface.h
index 6bbd3fa87f5..2b49a2941e1 100644
--- a/include/GL/internal/dri_interface.h
+++ b/include/GL/internal/dri_interface.h
@@ -1100,6 +1100,11 @@ struct __DRIdri2ExtensionRec {
 #define __DRI_IMAGE_USE_SCANOUT	0x0002
 #define __DRI_IMAGE_USE_CURSOR	0x0004 /* Deprecated */
 #define __DRI_IMAGE_USE_LINEAR	0x0008
+/* The buffer will only be read by an external process after SwapBuffers,
+ * in contrast to GBM buffers, front buffers and fake front buffers, which
+ * could be read after a flush.
+ */
+#define __DRI_IMAGE_USE_BACKBUFFER 0x0010
 
 /**
diff --git a/include/pci_ids/i965_pci_ids.h b/include/pci_ids/i965_pci_ids.h
index bdfbefe0b75..bd645fae640 100644
--- a/include/pci_ids/i965_pci_ids.h
+++ b/include/pci_ids/i965_pci_ids.h
@@ -156,8 +156,8 @@ CHIPSET(0x5932, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x593A, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x593B, kbl_gt4, "Intel(R) Kabylake GT4")
 CHIPSET(0x593D, kbl_gt4, "Intel(R) Kabylake GT4")
-CHIPSET(0x22B0, chv, "Intel(R) HD Graphics (Cherryview)")
-CHIPSET(0x22B1, chv, "Intel(R) HD Graphics (Cherryview)")
+CHIPSET(0x22B0, chv, "Intel(R) HD Graphics (Cherrytrail)")
+CHIPSET(0x22B1, chv, "Intel(R) HD Graphics XXX (Braswell)") /* Overridden in brw_get_renderer_string */
 CHIPSET(0x22B2, chv, "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x22B3, chv, "Intel(R) HD Graphics (Cherryview)")
 CHIPSET(0x0A84, bxt, "Intel(R) HD Graphics (Broxton)")
diff --git a/m4/ax_cxx_compile_stdcxx.m4 b/m4/ax_cxx_compile_stdcxx.m4
new file mode 100644
index 00000000000..079e17d2a62
--- /dev/null
+++ b/m4/ax_cxx_compile_stdcxx.m4
@@ -0,0 +1,558 @@
+# ===========================================================================
+#  http://www.gnu.org/software/autoconf-archive/ax_cxx_compile_stdcxx.html
+# ===========================================================================
+#
+# SYNOPSIS
+#
+#   AX_CXX_COMPILE_STDCXX(VERSION, [ext|noext], [mandatory|optional])
+#
+# DESCRIPTION
+#
+#   Check for baseline language coverage in the compiler for the specified
+#   version of the C++ standard.  If necessary, add switches to CXXFLAGS to
+#   enable support.  VERSION may be '11' (for the C++11 standard) or '14'
+#   (for the C++14 standard).
+#
+#   The second argument, if specified, indicates whether you insist on an
+#   extended mode (e.g. -std=gnu++11) or a strict conformance mode (e.g.
+#   -std=c++11).  If neither is specified, you get whatever works, with
+#   preference for an extended mode.
+#
+#   The third argument, if specified 'mandatory' or if left unspecified,
+#   indicates that baseline support for the specified C++ standard is
+#   required and that the macro should error out if no mode with that
+#   support is found.  If specified 'optional', then configuration proceeds
+#   regardless, after defining HAVE_CXX${VERSION} if and only if a
+#   supporting mode is found.
+#
+# LICENSE
+#
+#   Copyright (c) 2008 Benjamin Kosnik <[email protected]>
+#   Copyright (c) 2012 Zack Weinberg <[email protected]>
+#   Copyright (c) 2013 Roy Stogner <[email protected]>
+#   Copyright (c) 2014, 2015 Google Inc.; contributed by Alexey Sokolov <[email protected]>
+#   Copyright (c) 2015 Paul Norman <[email protected]>
+#   Copyright (c) 2015 Moritz Klammler <[email protected]>
+#
+#   Copying and distribution of this file, with or without modification, are
+#   permitted in any medium without royalty provided the copyright notice
+#   and this notice are preserved.  This file is offered as-is, without any
+#   warranty.
+
+#serial 1
+
+dnl  This macro is based on the code from the AX_CXX_COMPILE_STDCXX_11 macro
+dnl  (serial version number 13).
+
+AC_DEFUN([AX_CXX_COMPILE_STDCXX], [dnl
+  m4_if([$1], [11], [],
+        [$1], [14], [],
+        [$1], [17], [m4_fatal([support for C++17 not yet implemented in AX_CXX_COMPILE_STDCXX])],
+        [m4_fatal([invalid first argument `$1' to AX_CXX_COMPILE_STDCXX])])dnl
+  m4_if([$2], [], [],
+        [$2], [ext], [],
+        [$2], [noext], [],
+        [m4_fatal([invalid second argument `$2' to AX_CXX_COMPILE_STDCXX])])dnl
+  m4_if([$3], [], [ax_cxx_compile_cxx$1_required=true],
+        [$3], [mandatory], [ax_cxx_compile_cxx$1_required=true],
+        [$3], [optional], [ax_cxx_compile_cxx$1_required=false],
+        [m4_fatal([invalid third argument `$3' to AX_CXX_COMPILE_STDCXX])])
+  AC_LANG_PUSH([C++])dnl
+  ac_success=no
+  AC_CACHE_CHECK(whether $CXX supports C++$1 features by default,
+  ax_cv_cxx_compile_cxx$1,
+  [AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+    [ax_cv_cxx_compile_cxx$1=yes],
+    [ax_cv_cxx_compile_cxx$1=no])])
+  if test x$ax_cv_cxx_compile_cxx$1 = xyes; then
+    ac_success=yes
+  fi
+
+  m4_if([$2], [noext], [], [dnl
+  if test x$ac_success = xno; then
+    for switch in -std=gnu++$1 -std=gnu++0x; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
+                     $cachevar,
+        [ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXXFLAGS="$ac_save_CXXFLAGS"])
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+
+  m4_if([$2], [ext], [], [dnl
+  if test x$ac_success = xno; then
+    dnl HP's aCC needs +std=c++11 according to:
+    dnl http://h21007.www2.hp.com/portal/download/files/unprot/aCxx/PDF_Release_Notes/769149-001.pdf
+    dnl Cray's crayCC needs "-h std=c++11"
+    for switch in -std=c++$1 -std=c++0x +std=c++$1 "-h std=c++$1"; do
+      cachevar=AS_TR_SH([ax_cv_cxx_compile_cxx$1_$switch])
+      AC_CACHE_CHECK(whether $CXX supports C++$1 features with $switch,
+                     $cachevar,
+        [ac_save_CXXFLAGS="$CXXFLAGS"
+         CXXFLAGS="$CXXFLAGS $switch"
+         AC_COMPILE_IFELSE([AC_LANG_SOURCE([_AX_CXX_COMPILE_STDCXX_testbody_$1])],
+          [eval $cachevar=yes],
+          [eval $cachevar=no])
+         CXXFLAGS="$ac_save_CXXFLAGS"])
+      if eval test x\$$cachevar = xyes; then
+        CXXFLAGS="$CXXFLAGS $switch"
+        ac_success=yes
+        break
+      fi
+    done
+  fi])
+  AC_LANG_POP([C++])
+  if test x$ax_cxx_compile_cxx$1_required = xtrue; then
+    if test x$ac_success = xno; then
+      AC_MSG_ERROR([*** A compiler with support for C++$1 language features is required.])
+    fi
+  else
+    if test x$ac_success = xno; then
+      HAVE_CXX$1=0
+      AC_MSG_NOTICE([No compiler with C++$1 support was found])
+    else
+      HAVE_CXX$1=1
+      AC_DEFINE(HAVE_CXX$1,1,
+                [define if the compiler supports basic C++$1 syntax])
+    fi
+
+    AC_SUBST(HAVE_CXX$1)
+  fi
+])
+
+
+dnl  Test body for checking C++11 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_11],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+)
+
+
+dnl  Test body for checking C++14 support
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_14],
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_11
+  _AX_CXX_COMPILE_STDCXX_testbody_new_in_14
+)
+
+
+dnl  Tests for new features in C++11
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_11], [[
+
+// If the compiler admits that it is not ready for C++11, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201103L
+
+#error "This is not a C++11 compiler"
+
+#else
+
+namespace cxx11
+{
+
+  namespace test_static_assert
+  {
+
+    template <typename T>
+    struct check
+    {
+      static_assert(sizeof(int) <= sizeof(T), "not big enough");
+    };
+
+  }
+
+  namespace test_final_override
+  {
+
+    struct Base
+    {
+      virtual void f() {}
+    };
+
+    struct Derived : public Base
+    {
+      virtual void f() override {}
+    };
+
+  }
+
+  namespace test_double_right_angle_brackets
+  {
+
+    template < typename T >
+    struct check {};
+
+    typedef check<void> single_type;
+    typedef check<check<void>> double_type;
+    typedef check<check<check<void>>> triple_type;
+    typedef check<check<check<check<void>>>> quadruple_type;
+
+  }
+
+  namespace test_decltype
+  {
+
+    int
+    f()
+    {
+      int a = 1;
+      decltype(a) b = 2;
+      return a + b;
+    }
+
+  }
+
+  namespace test_type_deduction
+  {
+
+    template < typename T1, typename T2 >
+    struct is_same
+    {
+      static const bool value = false;
+    };
+
+    template < typename T >
+    struct is_same<T, T>
+    {
+      static const bool value = true;
+    };
+
+    template < typename T1, typename T2 >
+    auto
+    add(T1 a1, T2 a2) -> decltype(a1 + a2)
+    {
+      return a1 + a2;
+    }
+
+    int
+    test(const int c, volatile int v)
+    {
+      static_assert(is_same<int, decltype(0)>::value == true, "");
+      static_assert(is_same<int, decltype(c)>::value == false, "");
+      static_assert(is_same<int, decltype(v)>::value == false, "");
+      auto ac = c;
+      auto av = v;
+      auto sumi = ac + av + 'x';
+      auto sumf = ac + av + 1.0;
+      static_assert(is_same<int, decltype(ac)>::value == true, "");
+      static_assert(is_same<int, decltype(av)>::value == true, "");
+      static_assert(is_same<int, decltype(sumi)>::value == true, "");
+      static_assert(is_same<int, decltype(sumf)>::value == false, "");
+      static_assert(is_same<int, decltype(add(c, v))>::value == true, "");
+      return (sumf > 0.0) ? sumi : add(c, v);
+    }
+
+  }
+
+  namespace test_noexcept
+  {
+
+    int f() { return 0; }
+    int g() noexcept { return 0; }
+
+    static_assert(noexcept(f()) == false, "");
+    static_assert(noexcept(g()) == true, "");
+
+  }
+
+  namespace test_constexpr
+  {
+
+    template < typename CharT >
+    unsigned long constexpr
+    strlen_c_r(const CharT *const s, const unsigned long acc) noexcept
+    {
+      return *s ? strlen_c_r(s + 1, acc + 1) : acc;
+    }
+
+    template < typename CharT >
+    unsigned long constexpr
+    strlen_c(const CharT *const s) noexcept
+    {
+      return strlen_c_r(s, 0UL);
+    }
+
+    static_assert(strlen_c("") == 0UL, "");
+    static_assert(strlen_c("1") == 1UL, "");
+    static_assert(strlen_c("example") == 7UL, "");
+    static_assert(strlen_c("another\0example") == 7UL, "");
+
+  }
+
+  namespace test_rvalue_references
+  {
+
+    template < int N >
+    struct answer
+    {
+      static constexpr int value = N;
+    };
+
+    answer<1> f(int&)       { return answer<1>(); }
+    answer<2> f(const int&) { return answer<2>(); }
+    answer<3> f(int&&)      { return answer<3>(); }
+
+    void
+    test()
+    {
+      int i = 0;
+      const int c = 0;
+      static_assert(decltype(f(i))::value == 1, "");
+      static_assert(decltype(f(c))::value == 2, "");
+      static_assert(decltype(f(0))::value == 3, "");
+    }
+
+  }
+
+  namespace test_uniform_initialization
+  {
+
+    struct test
+    {
+      static const int zero {};
+      static const int one {1};
+    };
+
+    static_assert(test::zero == 0, "");
+    static_assert(test::one == 1, "");
+
+  }
+
+  namespace test_lambdas
+  {
+
+    void
+    test1()
+    {
+      auto lambda1 = [](){};
+      auto lambda2 = lambda1;
+      lambda1();
+      lambda2();
+    }
+
+    int
+    test2()
+    {
+      auto a = [](int i, int j){ return i + j; }(1, 2);
+      auto b = []() -> int { return '0'; }();
+      auto c = [=](){ return a + b; }();
+      auto d = [&](){ return c; }();
+      auto e = [a, &b](int x) mutable {
+        const auto identity = [](int y){ return y; };
+        for (auto i = 0; i < a; ++i)
+          a += b--;
+        return x + identity(a + b);
+      }(0);
+      return a + b + c + d + e;
+    }
+
+    int
+    test3()
+    {
+      const auto nullary = [](){ return 0; };
+      const auto unary = [](int x){ return x; };
+      using nullary_t = decltype(nullary);
+      using unary_t = decltype(unary);
+      const auto higher1st = [](nullary_t f){ return f(); };
+      const auto higher2nd = [unary](nullary_t f1){
+        return [unary, f1](unary_t f2){ return f2(unary(f1())); };
+      };
+      return higher1st(nullary) + higher2nd(nullary)(unary);
+    }
+
+  }
+
+  namespace test_variadic_templates
+  {
+
+    template <int...>
+    struct sum;
+
+    template <int N0, int... N1toN>
+    struct sum<N0, N1toN...>
+    {
+      static constexpr auto value = N0 + sum<N1toN...>::value;
+    };
+
+    template <>
+    struct sum<>
+    {
+      static constexpr auto value = 0;
+    };
+
+    static_assert(sum<>::value == 0, "");
+    static_assert(sum<1>::value == 1, "");
+    static_assert(sum<23>::value == 23, "");
+    static_assert(sum<1, 2>::value == 3, "");
+    static_assert(sum<5, 5, 11>::value == 21, "");
+    static_assert(sum<2, 3, 5, 7, 11, 13>::value == 41, "");
+
+  }
+
+  // http://stackoverflow.com/questions/13728184/template-aliases-and-sfinae
+  // Clang 3.1 fails with headers of libstd++ 4.8.3 when using std::function
+  // because of this.
+  namespace test_template_alias_sfinae
+  {
+
+    struct foo {};
+
+    template<typename T>
+    using member = typename T::member_type;
+
+    template<typename T>
+    void func(...) {}
+
+    template<typename T>
+    void func(member<T>*) {}
+
+    void test();
+
+    void test() { func<foo>(0); }
+
+  }
+
+}  // namespace cxx11
+
+#endif  // __cplusplus >= 201103L
+
+]])
+
+
+dnl  Tests for new features in C++14
+
+m4_define([_AX_CXX_COMPILE_STDCXX_testbody_new_in_14], [[
+
+// If the compiler admits that it is not ready for C++14, why torture it?
+// Hopefully, this will speed up the test.
+
+#ifndef __cplusplus
+
+#error "This is not a C++ compiler"
+
+#elif __cplusplus < 201402L
+
+#error "This is not a C++14 compiler"
+
+#else
+
+namespace cxx14
+{
+
+  namespace test_polymorphic_lambdas
+  {
+
+    int
+    test()
+    {
+      const auto lambda = [](auto&&... args){
+        const auto istiny = [](auto x){
+          return (sizeof(x) == 1UL) ? 1 : 0;
+        };
+        const int aretiny[] = { istiny(args)... };
+        return aretiny[0];
+      };
+      return lambda(1, 1L, 1.0f, '1');
+    }
+
+  }
+
+  namespace test_binary_literals
+  {
+
+    constexpr auto ivii = 0b0000000000101010;
+    static_assert(ivii == 42, "wrong value");
+
+  }
+
+  namespace test_generalized_constexpr
+  {
+
+    template < typename CharT >
+    constexpr unsigned long
+    strlen_c(const CharT *const s) noexcept
+    {
+      auto length = 0UL;
+      for (auto p = s; *p; ++p)
+        ++length;
+      return length;
+    }
+
+    static_assert(strlen_c("") == 0UL, "");
+    static_assert(strlen_c("x") == 1UL, "");
+    static_assert(strlen_c("test") == 4UL, "");
+    static_assert(strlen_c("another\0test") == 7UL, "");
+
+  }
+
+  namespace test_lambda_init_capture
+  {
+
+    int
+    test()
+    {
+      auto x = 0;
+      const auto lambda1 = [a = x](int b){ return a + b; };
+      const auto lambda2 = [a = lambda1(x)](){ return a; };
+      return lambda2();
+    }
+
+  }
+
+  namespace test_digit_seperators
+  {
+
+    constexpr auto ten_million = 100'000'000;
+    static_assert(ten_million == 100000000, "");
+
+  }
+
+  namespace test_return_type_deduction
+  {
+
+    auto f(int& x) { return x; }
+    decltype(auto) g(int& x) { return x; }
+
+    template < typename T1, typename T2 >
+    struct is_same
+    {
+      static constexpr auto value = false;
+    };
+
+    template < typename T >
+    struct is_same<T, T>
+    {
+      static constexpr auto value = true;
+    };
+
+    int
+    test()
+    {
+      auto x = 0;
+      static_assert(is_same<int, decltype(f(x))>::value, "");
+      static_assert(is_same<int&, decltype(g(x))>::value, "");
+      return x;
+    }
+
+  }
+
+}  // namespace cxx14
+
+#endif  // __cplusplus >= 201402L
+
+]])
diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 2a4568aa679..b0b8281869d 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -188,10 +188,10 @@ NIR_FILES = \
 	nir/nir_lower_clip.c \
 	nir/nir_lower_global_vars_to_local.c \
 	nir/nir_lower_gs_intrinsics.c \
-	nir/nir_lower_indirect_derefs.c \
 	nir/nir_lower_load_const_to_scalar.c \
 	nir/nir_lower_locals_to_regs.c \
 	nir/nir_lower_idiv.c \
+	nir/nir_lower_indirect_derefs.c \
 	nir/nir_lower_io.c \
 	nir/nir_lower_outputs_to_temporaries.c \
 	nir/nir_lower_phis_to_scalar.c \
diff --git a/src/compiler/glsl/ast.h b/src/compiler/glsl/ast.h
index 9aa5bb99f49..727aa432631 100644
--- a/src/compiler/glsl/ast.h
+++ b/src/compiler/glsl/ast.h
@@ -479,6 +479,12 @@ struct ast_type_qualifier {
       unsigned pixel_center_integer:1;
       /*@}*/
 
+      /**
+       * Flag set if GL_ARB_enhanced_layouts "align" layout qualifier is
+       * used.
+       */
+      unsigned explicit_align:1;
+
       /**
        * Flag set if GL_ARB_explicit_attrib_location "location" layout
        * qualifier is used.
@@ -577,6 +583,11 @@ struct ast_type_qualifier {
    /** Precision of the type (highp/medium/lowp). */
    unsigned precision:2;
 
+   /**
+    * Alignment specified via GL_ARB_enhanced_layouts "align" layout qualifier
+    */
+   ast_expression *align;
+
    /** Geometry shader invocations for GL_ARB_gpu_shader5. */
    ast_layout_expression *invocations;
@@ -1061,10 +1072,9 @@ public:
 class ast_interface_block : public ast_node {
 public:
-   ast_interface_block(ast_type_qualifier layout,
-                       const char *instance_name,
+   ast_interface_block(const char *instance_name,
                        ast_array_specifier *array_specifier)
-   : layout(layout), block_name(NULL), instance_name(instance_name),
+   : block_name(NULL), instance_name(instance_name),
      array_specifier(array_specifier)
    {
    }
diff --git a/src/compiler/glsl/ast_to_hir.cpp b/src/compiler/glsl/ast_to_hir.cpp
index db5ec9a4ad9..5262bd87655 100644
--- a/src/compiler/glsl/ast_to_hir.cpp
+++ b/src/compiler/glsl/ast_to_hir.cpp
@@ -4223,6 +4223,18 @@ ast_declarator_list::hir(exec_list *instructions,
                           type_name);
       } else {
          if (decl_type->base_type == GLSL_TYPE_ARRAY) {
+            /* From Section 13.22 (Array Declarations) of the GLSL ES 3.2
+             * spec:
+             *
+             *    "... any declaration that leaves the size undefined is
+             *    disallowed as this would add complexity and there are no
+             *    use-cases."
+             */
+            if (state->es_shader && decl_type->is_unsized_array()) {
+               _mesa_glsl_error(&loc, state, "array size must be explicitly "
+                                "or implicitly defined");
+            }
+
             /* From Section 4.12 (Empty Declarations) of the GLSL 4.5 spec:
              *
              *    "The combinations of types and qualifiers that cause
@@ -6244,9 +6256,11 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
                                           ir_variable_mode var_mode,
                                           ast_type_qualifier *layout,
                                           unsigned block_stream,
-                                          unsigned expl_location)
+                                          unsigned expl_location,
+                                          unsigned expl_align)
 {
    unsigned decl_count = 0;
+   unsigned next_offset = 0;
 
    /* Make an initial pass over the list of fields to determine how
    * many there are.  Each element in this list is an ast_declarator_list.
@@ -6460,13 +6474,93 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
          }
       }
 
+      /* Offset can only be used with std430 and std140 layouts; an initial
+       * value of 0 is used for error detection.
+       */
+      unsigned align = 0;
+      unsigned size = 0;
+      if (layout) {
+         bool row_major;
+         if (qual->flags.q.row_major ||
+             matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR) {
+            row_major = true;
+         } else {
+            row_major = false;
+         }
+
+         if (layout->flags.q.std140) {
+            align = field_type->std140_base_alignment(row_major);
+            size = field_type->std140_size(row_major);
+         } else if (layout->flags.q.std430) {
+            align = field_type->std430_base_alignment(row_major);
+            size = field_type->std430_size(row_major);
+         }
+      }
+
+      if (qual->flags.q.explicit_offset) {
+         unsigned qual_offset;
+         if (process_qualifier_constant(state, &loc, "offset",
+                                        qual->offset, &qual_offset)) {
+            if (align != 0 && size != 0) {
+               if (next_offset > qual_offset)
+                  _mesa_glsl_error(&loc, state, "layout qualifier "
+                                   "offset overlaps previous member");
+
+               if (qual_offset % align) {
+                  _mesa_glsl_error(&loc, state, "layout qualifier offset "
+                                   "must be a multiple of the base "
+                                   "alignment of %s", field_type->name);
+               }
+               fields[i].offset = qual_offset;
+               next_offset = glsl_align(qual_offset + size, align);
+            } else {
+               _mesa_glsl_error(&loc, state, "offset can only be used "
+                                "with std430 and std140 layouts");
+            }
+         }
+      } else {
+         fields[i].offset = -1;
+      }
+
+      if (qual->flags.q.explicit_align || expl_align != 0) {
+         unsigned offset = fields[i].offset != -1 ? fields[i].offset :
+            next_offset;
+         if (align == 0 || size == 0) {
+            _mesa_glsl_error(&loc, state, "align can only be used with "
+                             "std430 and std140 layouts");
+         } else if (qual->flags.q.explicit_align) {
+            unsigned member_align;
+            if (process_qualifier_constant(state, &loc, "align",
+                                           qual->align, &member_align)) {
+               if (member_align == 0 ||
+                   member_align & (member_align - 1)) {
+                  _mesa_glsl_error(&loc, state, "align layout qualifier "
+                                   "is not a power of 2");
+               } else {
+                  fields[i].offset = glsl_align(offset, member_align);
+                  next_offset = glsl_align(fields[i].offset + size, align);
+               }
+            }
+         } else {
+            fields[i].offset = glsl_align(offset, expl_align);
+            next_offset = glsl_align(fields[i].offset + size, align);
+         }
+      }
+
+      if (!qual->flags.q.explicit_offset) {
+         if (align != 0 && size != 0)
+            next_offset = glsl_align(next_offset + size, align);
+      }
+
       /* Propagate row- / column-major information down the fields of the
        * structure or interface block.  Structures need this data because
        * the structure may contain a structure that contains ... a matrix
        * that need the proper layout.
        */
-      if (field_type->without_array()->is_matrix()
-          || field_type->without_array()->is_record()) {
+      if (is_interface &&
+          (layout->flags.q.uniform || layout->flags.q.buffer) &&
+          (field_type->without_array()->is_matrix()
+           || field_type->without_array()->is_record())) {
          /* If no layout is specified for the field, inherit the layout
          * from the block.
          */
@@ -6477,11 +6571,10 @@ ast_process_struct_or_iface_block_members(exec_list *instructions,
         else if (qual->flags.q.column_major)
            fields[i].matrix_layout = GLSL_MATRIX_LAYOUT_COLUMN_MAJOR;
 
-         /* If we're processing an interface block, the matrix layout must
-          * be decided by this point.
+         /* If we're processing a uniform or buffer block, the matrix
+          * layout must be decided by this point.
          */
-         assert(!is_interface
-                || fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR
+         assert(fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_ROW_MAJOR
                 || fields[i].matrix_layout == GLSL_MATRIX_LAYOUT_COLUMN_MAJOR);
       }
@@ -6553,7 +6646,8 @@ ast_struct_specifier::hir(exec_list *instructions,
                                                 ir_var_auto,
                                                 layout,
                                                 0, /* for interface only */
-                                                expl_location);
+                                                expl_location,
+                                                0 /* for interface only */);
 
    validate_identifier(this->name, loc, state);
@@ -6721,6 +6815,20 @@ ast_interface_block::hir(exec_list *instructions,
       }
    }
 
+   unsigned expl_align = 0;
+   if (layout.flags.q.explicit_align) {
+      if (!process_qualifier_constant(state, &loc, "align",
+                                      layout.align, &expl_align)) {
+         return NULL;
+      } else {
+         if (expl_align == 0 || expl_align & (expl_align - 1)) {
+            _mesa_glsl_error(&loc, state, "align layout qualifier is not a "
+                             "power of 2.");
+            return NULL;
+         }
+      }
+   }
+
    unsigned int num_variables =
       ast_process_struct_or_iface_block_members(&declared_variables,
                                                 state,
@@ -6732,7 +6840,8 @@ ast_interface_block::hir(exec_list *instructions,
                                                 var_mode,
                                                 &this->layout,
                                                 qual_stream,
-                                                expl_location);
+                                                expl_location,
+                                                expl_align);
 
    if (!redeclaring_per_vertex) {
       validate_identifier(this->block_name, loc, state);
@@ -6833,6 +6942,8 @@ ast_interface_block::hir(exec_list *instructions,
          } else {
             fields[i].location =
                earlier_per_vertex->fields.structure[j].location;
+            fields[i].offset =
+               earlier_per_vertex->fields.structure[j].offset;
             fields[i].interpolation =
                earlier_per_vertex->fields.structure[j].interpolation;
             fields[i].centroid =
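The ast_to_hir.cpp hunks above implement the GL_ARB_enhanced_layouts "offset" and "align" member qualifiers: each member's std140/std430 size and base alignment feed a running next_offset, an explicit offset must not overlap the previous member and must be a multiple of the member's base alignment, and "align" rounds the member up to a power-of-two boundary. A minimal GLSL sketch of the accepted and rejected forms (the block and member names are illustrative, not taken from the patch):

#version 440

layout(std140) uniform Q {
    vec4 a;                      // bytes 0..15; next_offset becomes 16
    layout(offset = 32) vec3 b;  // ok: 32 is a multiple of b's 16-byte base alignment
    layout(align = 32) float c;  // placed at byte 64, the next multiple of 32 after b
    layout(offset = 8) float d;  // error: "layout qualifier offset overlaps previous member"
};

With a packed or shared layout, the same declarations instead draw "offset can only be used with std430 and std140 layouts", since align and size stay 0 there.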
diff --git a/src/compiler/glsl/ast_type.cpp b/src/compiler/glsl/ast_type.cpp
index dcd83efa6ff..07ed4f2356c 100644
--- a/src/compiler/glsl/ast_type.cpp
+++ b/src/compiler/glsl/ast_type.cpp
@@ -73,6 +73,7 @@ ast_type_qualifier::has_layout() const
           || this->flags.q.column_major
           || this->flags.q.row_major
           || this->flags.q.packed
+          || this->flags.q.explicit_align
           || this->flags.q.explicit_location
           || this->flags.q.explicit_image_format
           || this->flags.q.explicit_index
@@ -134,6 +135,28 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
    stream_layout_mask.flags.i = 0;
    stream_layout_mask.flags.q.stream = 1;
 
+   /* FIXME: We should probably do interface and function param validation
+    * separately.
+    */
+   ast_type_qualifier input_layout_mask;
+   input_layout_mask.flags.i = 0;
+   input_layout_mask.flags.q.centroid = 1;
+   /* Function params can have constant */
+   input_layout_mask.flags.q.constant = 1;
+   input_layout_mask.flags.q.explicit_location = 1;
+   input_layout_mask.flags.q.flat = 1;
+   input_layout_mask.flags.q.in = 1;
+   input_layout_mask.flags.q.invariant = 1;
+   input_layout_mask.flags.q.noperspective = 1;
+   input_layout_mask.flags.q.origin_upper_left = 1;
+   /* Function params 'inout' will set this */
+   input_layout_mask.flags.q.out = 1;
+   input_layout_mask.flags.q.patch = 1;
+   input_layout_mask.flags.q.pixel_center_integer = 1;
+   input_layout_mask.flags.q.precise = 1;
+   input_layout_mask.flags.q.sample = 1;
+   input_layout_mask.flags.q.smooth = 1;
+
    /* Uniform block layout qualifiers get to overwrite each
    * other (rightmost having priority), while all other
    * qualifiers currently don't allow duplicates.
@@ -258,6 +281,16 @@ ast_type_qualifier::merge_qualifier(YYLTYPE *loc,
 
    this->flags.i |= q.flags.i;
 
+   if (this->flags.q.in &&
+       (this->flags.i & ~input_layout_mask.flags.i) != 0) {
+      _mesa_glsl_error(loc, state,
+                       "invalid input layout qualifier used");
+      return false;
+   }
+
+   if (q.flags.q.explicit_align)
+      this->align = q.align;
+
    if (q.flags.q.explicit_location)
       this->location = q.location;
diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
index 0a0dcc68a05..ff6b628eb64 100644
--- a/src/compiler/glsl/builtin_functions.cpp
+++ b/src/compiler/glsl/builtin_functions.cpp
@@ -432,6 +432,12 @@ shader_atomic_counters(const _mesa_glsl_parse_state *state)
 }
 
 static bool
+shader_atomic_counter_ops(const _mesa_glsl_parse_state *state)
+{
+   return state->ARB_shader_atomic_counter_ops_enable;
+}
+
+static bool
 shader_clock(const _mesa_glsl_parse_state *state)
 {
    return state->ARB_shader_clock_enable;
@@ -578,7 +584,7 @@ private:
    ir_dereference_array *array_ref(ir_variable *var, int i);
    ir_swizzle *matrix_elt(ir_variable *var, int col, int row);
 
-   ir_expression *asin_expr(ir_variable *x);
+   ir_expression *asin_expr(ir_variable *x, float p0, float p1);
    void do_atan(ir_factory &body, const glsl_type *type, ir_variable *res, operand y_over_x);
 
    /**
@@ -792,8 +798,14 @@ private:
    B1(interpolateAtSample)
 
    ir_function_signature *_atomic_counter_intrinsic(builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_intrinsic1(builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_intrinsic2(builtin_available_predicate avail);
    ir_function_signature *_atomic_counter_op(const char *intrinsic,
                                              builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_op1(const char *intrinsic,
+                                              builtin_available_predicate avail);
+   ir_function_signature *_atomic_counter_op2(const char *intrinsic,
+                                              builtin_available_predicate avail);
 
    ir_function_signature *_atomic_intrinsic2(builtin_available_predicate avail,
                                              const glsl_type *type);
@@ -968,48 +980,59 @@ builtin_builder::create_intrinsics()
                             glsl_type::uint_type),
          _atomic_intrinsic2(buffer_atomics_supported,
                             glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
+         NULL);
+   add_function("__intrinsic_atomic_sub",
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
          NULL);
    add_function("__intrinsic_atomic_min",
          _atomic_intrinsic2(buffer_atomics_supported,
                             glsl_type::uint_type),
          _atomic_intrinsic2(buffer_atomics_supported,
                             glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
          NULL);
    add_function("__intrinsic_atomic_max",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_and",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_or",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_xor",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_exchange",
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic2(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic1(shader_atomic_counter_ops),
         NULL);
    add_function("__intrinsic_atomic_comp_swap",
         _atomic_intrinsic3(buffer_atomics_supported,
                            glsl_type::uint_type),
         _atomic_intrinsic3(buffer_atomics_supported,
                            glsl_type::int_type),
+         _atomic_counter_intrinsic2(shader_atomic_counter_ops),
         NULL);
 
    add_image_functions(false);
@@ -2714,6 +2737,43 @@ builtin_builder::create_builtins()
                                 shader_atomic_counters),
                 NULL);
 
+   add_function("atomicCounterAddARB",
+                _atomic_counter_op1("__intrinsic_atomic_add",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterSubtractARB",
+                _atomic_counter_op1("__intrinsic_atomic_sub",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterMinARB",
+                _atomic_counter_op1("__intrinsic_atomic_min",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterMaxARB",
+                _atomic_counter_op1("__intrinsic_atomic_max",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterAndARB",
+                _atomic_counter_op1("__intrinsic_atomic_and",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterOrARB",
+                _atomic_counter_op1("__intrinsic_atomic_or",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterXorARB",
+                _atomic_counter_op1("__intrinsic_atomic_xor",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterExchangeARB",
+                _atomic_counter_op1("__intrinsic_atomic_exchange",
+                                    shader_atomic_counter_ops),
+                NULL);
+   add_function("atomicCounterCompSwapARB",
+                _atomic_counter_op2("__intrinsic_atomic_comp_swap",
+                                    shader_atomic_counter_ops),
+                NULL);
+
    add_function("atomicAdd",
                 _atomic_op2("__intrinsic_atomic_add",
                             buffer_atomics_supported,
@@ -3212,7 +3272,7 @@ builtin_builder::_tan(const glsl_type *type)
 }
 
 ir_expression *
-builtin_builder::asin_expr(ir_variable *x)
+builtin_builder::asin_expr(ir_variable *x, float p0, float p1)
 {
    return mul(sign(x),
               sub(imm(M_PI_2f),
@@ -3221,8 +3281,8 @@ builtin_builder::asin_expr(ir_variable *x)
                           mul(abs(x),
                               add(imm(M_PI_4f - 1.0f),
                                   mul(abs(x),
-                                      add(imm(0.086566724f),
-                                          mul(abs(x), imm(-0.03102955f))))))))));
+                                      add(imm(p0),
+                                          mul(abs(x), imm(p1))))))))));
 }
 
 ir_call *
@@ -3251,7 +3311,7 @@ builtin_builder::_asin(const glsl_type *type)
    ir_variable *x = in_var(type, "x");
    MAKE_SIG(type, always_available, 1, x);
 
-   body.emit(ret(asin_expr(x)));
+   body.emit(ret(asin_expr(x, 0.086566724f, -0.03102955f)));
 
    return sig;
 }
@@ -3262,7 +3322,7 @@ builtin_builder::_acos(const glsl_type *type)
    ir_variable *x = in_var(type, "x");
    MAKE_SIG(type, always_available, 1, x);
 
-   body.emit(ret(sub(imm(M_PI_2f), asin_expr(x))));
+   body.emit(ret(sub(imm(M_PI_2f), asin_expr(x, 0.08132463f, -0.02363318f))));
 
    return sig;
 }
@@ -5145,6 +5205,25 @@ builtin_builder::_atomic_counter_intrinsic(builtin_available_predicate avail)
 }
 
 ir_function_signature *
+builtin_builder::_atomic_counter_intrinsic1(builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "counter");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_INTRINSIC(glsl_type::uint_type, avail, 2, counter, data);
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_atomic_counter_intrinsic2(builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "counter");
+   ir_variable *compare = in_var(glsl_type::uint_type, "compare");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_INTRINSIC(glsl_type::uint_type, avail, 3, counter, compare, data);
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_atomic_intrinsic2(builtin_available_predicate avail,
                                     const glsl_type *type)
 {
@@ -5180,6 +5259,37 @@ builtin_builder::_atomic_counter_op(const char *intrinsic,
 }
 
 ir_function_signature *
+builtin_builder::_atomic_counter_op1(const char *intrinsic,
+                                     builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "atomic_counter");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_SIG(glsl_type::uint_type, avail, 2, counter, data);
+
+   ir_variable *retval = body.make_temp(glsl_type::uint_type, "atomic_retval");
+   body.emit(call(shader->symbols->get_function(intrinsic), retval,
+                  sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
+ir_function_signature *
+builtin_builder::_atomic_counter_op2(const char *intrinsic,
+                                     builtin_available_predicate avail)
+{
+   ir_variable *counter = in_var(glsl_type::atomic_uint_type, "atomic_counter");
+   ir_variable *compare = in_var(glsl_type::uint_type, "compare");
+   ir_variable *data = in_var(glsl_type::uint_type, "data");
+   MAKE_SIG(glsl_type::uint_type, avail, 3, counter, compare, data);
+
+   ir_variable *retval = body.make_temp(glsl_type::uint_type, "atomic_retval");
+   body.emit(call(shader->symbols->get_function(intrinsic), retval,
+                  sig->parameters));
+   body.emit(ret(retval));
+   return sig;
+}
+
+ir_function_signature *
 builtin_builder::_atomic_op2(const char *intrinsic,
                              builtin_available_predicate avail,
                              const glsl_type *type)
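The builtin additions above wire up GL_ARB_shader_atomic_counter_ops: each existing __intrinsic_atomic_* gains an atomic_uint overload, and the new atomicCounter*ARB built-ins forward to those intrinsics. A small fragment-shader sketch of the new entry points (the binding and variable names are illustrative):

#version 420
#extension GL_ARB_shader_atomic_counter_ops : require

layout(binding = 0) uniform atomic_uint counter;

void main()
{
    uint prev = atomicCounterAddARB(counter, 4u);       // returns the value before the add
    atomicCounterMinARB(counter, prev);
    atomicCounterCompSwapARB(counter, prev, prev + 1u); // swaps only if the counter still holds prev
}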
diff --git a/src/compiler/glsl/builtin_variables.cpp b/src/compiler/glsl/builtin_variables.cpp
index d20fc4a816c..4e2de37fbba 100644
--- a/src/compiler/glsl/builtin_variables.cpp
+++ b/src/compiler/glsl/builtin_variables.cpp
@@ -323,6 +323,7 @@ per_vertex_accumulator::add_field(int slot, const glsl_type *type,
    this->fields[this->num_fields].name = name;
    this->fields[this->num_fields].matrix_layout = GLSL_MATRIX_LAYOUT_INHERITED;
    this->fields[this->num_fields].location = slot;
+   this->fields[this->num_fields].offset = -1;
    this->fields[this->num_fields].interpolation = INTERP_QUALIFIER_NONE;
    this->fields[this->num_fields].centroid = 0;
    this->fields[this->num_fields].sample = 0;
diff --git a/src/compiler/glsl/glcpp/glcpp-lex.l b/src/compiler/glsl/glcpp/glcpp-lex.l
index fa9aa506912..d09441aac88 100644
--- a/src/compiler/glsl/glcpp/glcpp-lex.l
+++ b/src/compiler/glsl/glcpp/glcpp-lex.l
@@ -120,6 +120,11 @@ void glcpp_set_column (int  column_no , yyscan_t yyscanner);
 static int
 glcpp_lex_update_state_per_token (glcpp_parser_t *parser, int token)
 {
+	if (token != NEWLINE && token != SPACE && token != HASH_TOKEN &&
+	    !parser->lexing_version_directive) {
+		glcpp_parser_resolve_implicit_version(parser);
+	}
+
 	/* After the first non-space token in a line, we won't
 	 * allow any '#' to introduce a directive. */
 	if (token == NEWLINE) {
@@ -285,6 +290,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 <HASH>version{HSPACE}+ {
 	BEGIN INITIAL;
 	yyextra->space_tokens = 0;
+	yyextra->lexing_version_directive = 1;
 	RETURN_STRING_TOKEN (VERSION_TOKEN);
 }
@@ -314,6 +320,9 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 <HASH>{NEWLINE} {
 	BEGIN INITIAL;
+	yyextra->space_tokens = 0;
+	yylineno++;
+	yycolumn = 0;
 	RETURN_TOKEN_NEVER_SKIP (NEWLINE);
 }
@@ -536,6 +545,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	}
 	yyextra->space_tokens = 1;
 	yyextra->lexing_directive = 0;
+	yyextra->lexing_version_directive = 0;
 	yylineno++;
 	yycolumn = 0;
 	RETURN_TOKEN_NEVER_SKIP (NEWLINE);
@@ -546,6 +556,7 @@ HEXADECIMAL_INTEGER	0[xX][0-9a-fA-F]+[uU]?
 	glcpp_error(yylloc, yyextra, "Unterminated comment");
 	BEGIN DONE; /* Don't keep matching this rule forever. */
 	yyextra->lexing_directive = 0;
+	yyextra->lexing_version_directive = 0;
 	if (! parser->last_token_was_newline)
 	   RETURN_TOKEN (NEWLINE);
 }
diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
index 5c38f86d333..007b70b020d 100644
--- a/src/compiler/glsl/glcpp/glcpp-parse.y
+++ b/src/compiler/glsl/glcpp/glcpp-parse.y
@@ -266,45 +266,37 @@ control_line:
 		ralloc_asprintf_rewrite_tail (&parser->output, &parser->output_length, "\n");
 	}
 |	control_line_error
-|	HASH_TOKEN LINE {
-		glcpp_parser_resolve_implicit_version(parser);
-	} pp_tokens NEWLINE {
+|	HASH_TOKEN LINE pp_tokens NEWLINE {
 		if (parser->skip_stack == NULL ||
 		    parser->skip_stack->type == SKIP_NO_SKIP)
 		{
 			_glcpp_parser_expand_and_lex_from (parser,
-							   LINE_EXPANDED, $4,
+							   LINE_EXPANDED, $3,
 							   EXPANSION_MODE_IGNORE_DEFINED);
 		}
 	}
 ;
 
 control_line_success:
-	HASH_TOKEN DEFINE_TOKEN {
-		glcpp_parser_resolve_implicit_version(parser);
-	} define
-|	HASH_TOKEN UNDEF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} IDENTIFIER NEWLINE {
+	HASH_TOKEN DEFINE_TOKEN define
+|	HASH_TOKEN UNDEF IDENTIFIER NEWLINE {
 		macro_t *macro;
-		if (strcmp("__LINE__", $4) == 0
-		    || strcmp("__FILE__", $4) == 0
-		    || strcmp("__VERSION__", $4) == 0
-		    || strncmp("GL_", $4, 3) == 0)
+		if (strcmp("__LINE__", $3) == 0
+		    || strcmp("__FILE__", $3) == 0
+		    || strcmp("__VERSION__", $3) == 0
+		    || strncmp("GL_", $3, 3) == 0)
 			glcpp_error(& @1, parser, "Built-in (pre-defined)"
 				    " macro names cannot be undefined.");
-		macro = hash_table_find (parser->defines, $4);
+		macro = hash_table_find (parser->defines, $3);
 		if (macro) {
-			hash_table_remove (parser->defines, $4);
+			hash_table_remove (parser->defines, $3);
 			ralloc_free (macro);
 		}
-		ralloc_free ($4);
+		ralloc_free ($3);
 	}
-|	HASH_TOKEN IF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} pp_tokens NEWLINE {
+|	HASH_TOKEN IF pp_tokens NEWLINE {
 		/* Be careful to only evaluate the 'if' expression if
 		 * we are not skipping.  When we are skipping, we
 		 * simply push a new 0-valued 'if' onto the skip
@@ -316,7 +308,7 @@ control_line_success:
 		    parser->skip_stack->type == SKIP_NO_SKIP)
 		{
 			_glcpp_parser_expand_and_lex_from (parser,
-							   IF_EXPANDED, $4,
+							   IF_EXPANDED, $3,
 							   EXPANSION_MODE_EVALUATE_DEFINED);
 		}
 		else
@@ -335,18 +327,14 @@ control_line_success:
 		}
 		_glcpp_parser_skip_stack_push_if (parser, & @1, 0);
 	}
-|	HASH_TOKEN IFDEF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} IDENTIFIER junk NEWLINE {
-		macro_t *macro = hash_table_find (parser->defines, $4);
-		ralloc_free ($4);
+|	HASH_TOKEN IFDEF IDENTIFIER junk NEWLINE {
+		macro_t *macro = hash_table_find (parser->defines, $3);
+		ralloc_free ($3);
 		_glcpp_parser_skip_stack_push_if (parser, & @1, macro != NULL);
 	}
-|	HASH_TOKEN IFNDEF {
-		glcpp_parser_resolve_implicit_version(parser);
-	} IDENTIFIER junk NEWLINE {
-		macro_t *macro = hash_table_find (parser->defines, $4);
-		ralloc_free ($4);
+|	HASH_TOKEN IFNDEF IDENTIFIER junk NEWLINE {
+		macro_t *macro = hash_table_find (parser->defines, $3);
+		ralloc_free ($3);
 		_glcpp_parser_skip_stack_push_if (parser, & @3, macro == NULL);
 	}
 |	HASH_TOKEN ELIF pp_tokens NEWLINE {
@@ -2494,6 +2482,9 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
 	if (extensions->ARB_shader_atomic_counters)
 		add_builtin_define(parser, "GL_ARB_shader_atomic_counters", 1);
 
+	if (extensions->ARB_shader_atomic_counter_ops)
+		add_builtin_define(parser, "GL_ARB_shader_atomic_counter_ops", 1);
+
 	if (extensions->ARB_viewport_array)
 		add_builtin_define(parser, "GL_ARB_viewport_array", 1);
diff --git a/src/compiler/glsl/glcpp/glcpp.h b/src/compiler/glsl/glcpp/glcpp.h
index 70aa14b6ec0..d87e6b77dc5 100644
--- a/src/compiler/glsl/glcpp/glcpp.h
+++ b/src/compiler/glsl/glcpp/glcpp.h
@@ -176,6 +176,7 @@ struct glcpp_parser {
 	struct hash_table *defines;
 	active_list_t *active;
 	int lexing_directive;
+	int lexing_version_directive;
 	int space_tokens;
 	int last_token_was_newline;
 	int last_token_was_space;
diff --git a/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected b/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected
index fd0b41347fa..5206a5c553c 100644
--- a/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected
+++ b/src/compiler/glsl/glcpp/tests/129-define-non-identifier.c.expected
@@ -1,2 +1,2 @@
 0:1(9): preprocessor error: #define followed by a non-identifier: 123
-0:1(9): preprocessor error: syntax error, unexpected INTEGER_STRING, expecting FUNC_IDENTIFIER or OBJ_IDENTIFIER
+0:1(9): preprocessor error: syntax error, unexpected INTEGER_STRING, expecting FUNC_IDENTIFIER or OBJ_IDENTIFIER or NEWLINE
diff --git a/src/compiler/glsl/glcpp/tests/144-implicit-version.c b/src/compiler/glsl/glcpp/tests/144-implicit-version.c
new file mode 100644
index 00000000000..7bf72fc19e9
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/144-implicit-version.c
@@ -0,0 +1 @@
+int x = __VERSION__;
diff --git a/src/compiler/glsl/glcpp/tests/144-implicit-version.c.expected b/src/compiler/glsl/glcpp/tests/144-implicit-version.c.expected
new file mode 100644
index 00000000000..8c2dfd9ce30
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/144-implicit-version.c.expected
@@ -0,0 +1 @@
+int x = 110;
diff --git a/src/compiler/glsl/glcpp/tests/145-version-first.c b/src/compiler/glsl/glcpp/tests/145-version-first.c
new file mode 100644
index 00000000000..f9fcfb08246
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/145-version-first.c
@@ -0,0 +1,2 @@
+123
+#version 120
diff --git a/src/compiler/glsl/glcpp/tests/145-version-first.c.expected b/src/compiler/glsl/glcpp/tests/145-version-first.c.expected
new file mode 100644
index 00000000000..f4092b04af7
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/145-version-first.c.expected
@@ -0,0 +1,3 @@
+0:2(1): preprocessor error: #version must appear on the first line
+123
+
diff --git a/src/compiler/glsl/glcpp/tests/146-version-first-hash.c b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c
new file mode 100644
index 00000000000..14dbe964bd6
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c
@@ -0,0 +1,2 @@
+#
+#version 120
diff --git a/src/compiler/glsl/glcpp/tests/146-version-first-hash.c.expected b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c.expected
new file mode 100644
index 00000000000..e8e449793fd
--- /dev/null
+++ b/src/compiler/glsl/glcpp/tests/146-version-first-hash.c.expected
@@ -0,0 +1,3 @@
+0:2(1): preprocessor error: #version must appear on the first line
+
+
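The glcpp changes above (the new lexing_version_directive flag plus the simplified grammar actions) make the first non-space, non-hash token resolve the implicit GLSL version, so a #version directive that is not the very first thing in the shader is now diagnosed instead of silently accepted; tests 144-146 pin this behavior down. A two-line shader illustrating test 145:

123          // first token: the implicit version (110) is locked in here
#version 120 // error: #version must appear on the first line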
INVARIANT variable_identifier { @@ -1468,6 +1474,17 @@ layout_qualifier_id: "GLSL 4.40 or ARB_enhanced_layouts"); } + if (match_layout_qualifier("align", $1, state) == 0) { + if (!state->has_enhanced_layouts()) { + _mesa_glsl_error(& @1, state, + "align qualifier requires " + "GLSL 4.40 or ARB_enhanced_layouts"); + } else { + $$.flags.q.explicit_align = 1; + $$.align = $3; + } + } + if (match_layout_qualifier("location", $1, state) == 0) { $$.flags.q.explicit_location = 1; @@ -1498,7 +1515,8 @@ layout_qualifier_id: $$.binding = $3; } - if (state->has_atomic_counters() && + if ((state->has_atomic_counters() || + state->has_enhanced_layouts()) && match_layout_qualifier("offset", $1, state) == 0) { $$.flags.q.explicit_offset = 1; $$.offset = $3; @@ -2625,10 +2643,23 @@ basic_interface_block: $$ = block; } - | buffer_interface_qualifier NEW_IDENTIFIER '{' member_list '}' buffer_instance_name_opt ';' + | uniform_interface_qualifier NEW_IDENTIFIER '{' member_list '}' instance_name_opt ';' + { + ast_interface_block *const block = $6; + + block->layout = *state->default_uniform_qualifier; + block->block_name = $2; + block->declarations.push_degenerate_list_at_head(& $4->link); + + _mesa_ast_process_interface_block(& @1, state, block, $1); + + $$ = block; + } + | buffer_interface_qualifier NEW_IDENTIFIER '{' member_list '}' instance_name_opt ';' { ast_interface_block *const block = $6; + block->layout = *state->default_shader_storage_qualifier; block->block_name = $2; block->declarations.push_degenerate_list_at_head(& $4->link); @@ -2649,7 +2680,10 @@ interface_qualifier: memset(& $$, 0, sizeof($$)); $$.flags.q.out = 1; } - | UNIFORM + ; + +uniform_interface_qualifier: + UNIFORM { memset(& $$, 0, sizeof($$)); $$.flags.q.uniform = 1; @@ -2667,39 +2701,16 @@ buffer_interface_qualifier: instance_name_opt: /* empty */ { - $$ = new(state) ast_interface_block(*state->default_uniform_qualifier, - NULL, NULL); - } - | NEW_IDENTIFIER - { - $$ = new(state) ast_interface_block(*state->default_uniform_qualifier, - $1, NULL); - $$->set_location(@1); - } - | NEW_IDENTIFIER array_specifier - { - $$ = new(state) ast_interface_block(*state->default_uniform_qualifier, - $1, $2); - $$->set_location_range(@1, @2); - } - ; - -buffer_instance_name_opt: - /* empty */ - { - $$ = new(state) ast_interface_block(*state->default_shader_storage_qualifier, - NULL, NULL); + $$ = new(state) ast_interface_block(NULL, NULL); } | NEW_IDENTIFIER { - $$ = new(state) ast_interface_block(*state->default_shader_storage_qualifier, - $1, NULL); + $$ = new(state) ast_interface_block($1, NULL); $$->set_location(@1); } | NEW_IDENTIFIER array_specifier { - $$ = new(state) ast_interface_block(*state->default_shader_storage_qualifier, - $1, $2); + $$ = new(state) ast_interface_block($1, $2); $$->set_location_range(@1, @2); } ; diff --git a/src/compiler/glsl/glsl_parser_extras.cpp b/src/compiler/glsl/glsl_parser_extras.cpp index fe8b3bb2e79..1ac8489b45a 100644 --- a/src/compiler/glsl/glsl_parser_extras.cpp +++ b/src/compiler/glsl/glsl_parser_extras.cpp @@ -575,6 +575,7 @@ static const _mesa_glsl_extension _mesa_glsl_supported_extensions[] = { EXT(ARB_gpu_shader_fp64, true, false, ARB_gpu_shader_fp64), EXT(ARB_sample_shading, true, false, ARB_sample_shading), EXT(ARB_separate_shader_objects, true, false, dummy_true), + EXT(ARB_shader_atomic_counter_ops, true, false, ARB_shader_atomic_counter_ops), EXT(ARB_shader_atomic_counters, true, false, ARB_shader_atomic_counters), EXT(ARB_shader_bit_encoding, true, false, ARB_shader_bit_encoding), 
EXT(ARB_shader_clock, true, false, ARB_shader_clock), @@ -926,7 +927,8 @@ _mesa_ast_process_interface_block(YYLTYPE *locp, block->layout.flags.i |= block_interface_qualifier; if (state->stage == MESA_SHADER_GEOMETRY && - state->has_explicit_attrib_stream()) { + state->has_explicit_attrib_stream() && + block->layout.flags.q.out) { /* Assign global layout's stream value. */ block->layout.flags.q.stream = 1; block->layout.flags.q.explicit_stream = 0; diff --git a/src/compiler/glsl/glsl_parser_extras.h b/src/compiler/glsl/glsl_parser_extras.h index 74825a0bd35..12a3a46928c 100644 --- a/src/compiler/glsl/glsl_parser_extras.h +++ b/src/compiler/glsl/glsl_parser_extras.h @@ -533,6 +533,8 @@ struct _mesa_glsl_parse_state { bool ARB_sample_shading_warn; bool ARB_separate_shader_objects_enable; bool ARB_separate_shader_objects_warn; + bool ARB_shader_atomic_counter_ops_enable; + bool ARB_shader_atomic_counter_ops_warn; bool ARB_shader_atomic_counters_enable; bool ARB_shader_atomic_counters_warn; bool ARB_shader_bit_encoding_enable; diff --git a/src/compiler/glsl/ir.h b/src/compiler/glsl/ir.h index f6ed16de0c3..f4519679ff3 100644 --- a/src/compiler/glsl/ir.h +++ b/src/compiler/glsl/ir.h @@ -866,7 +866,7 @@ public: unsigned stream; /** - * Location an atomic counter is stored at. + * Atomic or block member offset. */ unsigned offset; diff --git a/src/compiler/glsl/ir_builder.cpp b/src/compiler/glsl/ir_builder.cpp index c9cf1240dfe..d68647f4234 100644 --- a/src/compiler/glsl/ir_builder.cpp +++ b/src/compiler/glsl/ir_builder.cpp @@ -51,7 +51,7 @@ assign(deref lhs, operand rhs, operand condition, int writemask) void *mem_ctx = ralloc_parent(lhs.val); ir_assignment *assign = new(mem_ctx) ir_assignment(lhs.val, - rhs.val, + rhs.val, condition.val, writemask); @@ -89,11 +89,11 @@ swizzle(operand a, int swizzle, int components) void *mem_ctx = ralloc_parent(a.val); return new(mem_ctx) ir_swizzle(a.val, - GET_SWZ(swizzle, 0), - GET_SWZ(swizzle, 1), - GET_SWZ(swizzle, 2), - GET_SWZ(swizzle, 3), - components); + GET_SWZ(swizzle, 0), + GET_SWZ(swizzle, 1), + GET_SWZ(swizzle, 2), + GET_SWZ(swizzle, 3), + components); } ir_swizzle * diff --git a/src/compiler/glsl/link_interface_blocks.cpp b/src/compiler/glsl/link_interface_blocks.cpp index 64c30fea9a3..4c6fb56f891 100644 --- a/src/compiler/glsl/link_interface_blocks.cpp +++ b/src/compiler/glsl/link_interface_blocks.cpp @@ -81,6 +81,66 @@ intrastage_match(ir_variable *a, return true; } +/** + * Return true if interface members mismatch and its not allowed by GLSL. + */ +static bool +interstage_member_mismatch(struct gl_shader_program *prog, + const glsl_type *c, const glsl_type *p) { + + if (c->length != p->length) + return true; + + for (unsigned i = 0; i < c->length; i++) { + if (c->fields.structure[i].type != p->fields.structure[i].type) + return true; + if (strcmp(c->fields.structure[i].name, + p->fields.structure[i].name) != 0) + return true; + if (c->fields.structure[i].location != + p->fields.structure[i].location) + return true; + if (c->fields.structure[i].patch != + p->fields.structure[i].patch) + return true; + + /* From Section 4.5 (Interpolation Qualifiers) of the GLSL 4.40 spec: + * + * "It is a link-time error if, within the same stage, the + * interpolation qualifiers of variables of the same name do not + * match." 
+ */ + if (prog->IsES || prog->Version < 440) + if (c->fields.structure[i].interpolation != + p->fields.structure[i].interpolation) + return true; + + /* From Section 4.3.4 (Input Variables) of the GLSL ES 3.0 spec: + * + * "The output of the vertex shader and the input of the fragment + * shader form an interface. For this interface, vertex shader + * output variables and fragment shader input variables of the same + * name must match in type and qualification (other than precision + * and out matching to in). + * + * The table in Section 9.2.1 Linked Shaders of the GLSL ES 3.1 spec + * says that centroid no longer needs to match for varyings. + * + * The table in Section 9.2.1 Linked Shaders of the GLSL ES 3.2 spec + * says that sample need not match for varyings. + */ + if (!prog->IsES || prog->Version < 310) + if (c->fields.structure[i].centroid != + p->fields.structure[i].centroid) + return true; + if (!prog->IsES) + if (c->fields.structure[i].sample != + p->fields.structure[i].sample) + return true; + } + + return false; +} /** * Check if two interfaces match, according to interstage (in/out) interface @@ -90,10 +150,9 @@ intrastage_match(ir_variable *a, * an array and the producer interface is required to be a non-array. * This is used for tessellation control and geometry shader consumers. */ -bool -interstage_match(ir_variable *producer, - ir_variable *consumer, - bool extra_array_level) +static bool +interstage_match(struct gl_shader_program *prog, ir_variable *producer, + ir_variable *consumer, bool extra_array_level) { /* Unsized arrays should not occur during interstage linking. They * should have all been assigned a size by link_intrastage_shaders. @@ -106,9 +165,16 @@ interstage_match(ir_variable *producer, /* Exception: if both the interface blocks are implicitly declared, * don't force their types to match. They might mismatch due to the two * shaders using different GLSL versions, and that's ok. + * + * Also we store some member information such as interpolation in + * glsl_type that doesn't always have to match across shader stages. + * Therefore we make a pass over the members glsl_struct_field to make + * sure we don't reject shaders where fields don't need to match. 
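The version checks above encode when member qualifiers may legitimately differ across stages. For instance, these two hypothetical shaders disagree only on the interpolation of `color`; under the code above that is a link-time mismatch for ES and for desktop GLSL before 4.40, but accepted on 4.40+:

static const char *vs_source =
   "#version 440\n"
   "out Iface { flat vec4 color; } vs_out;\n"
   "void main() { vs_out.color = vec4(1.0); gl_Position = vec4(0.0); }\n";

static const char *fs_source =
   "#version 440\n"
   "in Iface { vec4 color; } fs_in;\n"
   "out vec4 frag;\n"
   "void main() { frag = fs_in.color; }\n";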
*/ - if (consumer->data.how_declared != ir_var_declared_implicitly || - producer->data.how_declared != ir_var_declared_implicitly) + if ((consumer->data.how_declared != ir_var_declared_implicitly || + producer->data.how_declared != ir_var_declared_implicitly) && + interstage_member_mismatch(prog, consumer->get_interface_type(), + producer->get_interface_type())) return false; } @@ -311,7 +377,7 @@ validate_interstage_inout_blocks(struct gl_shader_program *prog, if (consumer_def == NULL) continue; - if (!interstage_match(var, consumer_def, extra_array_level)) { + if (!interstage_match(prog, var, consumer_def, extra_array_level)) { linker_error(prog, "definitions of interface block `%s' do not " "match\n", var->get_interface_type()->name); return; diff --git a/src/compiler/glsl/link_uniform_blocks.cpp b/src/compiler/glsl/link_uniform_blocks.cpp index 7d755765852..c8fa181a15d 100644 --- a/src/compiler/glsl/link_uniform_blocks.cpp +++ b/src/compiler/glsl/link_uniform_blocks.cpp @@ -97,6 +97,11 @@ private: this->offset, type->std140_base_alignment(row_major)); } + virtual void set_buffer_offset(unsigned offset) + { + this->offset = offset; + } + virtual void visit_field(const glsl_type *type, const char *name, bool row_major, const glsl_type *, const unsigned packing, diff --git a/src/compiler/glsl/link_uniforms.cpp b/src/compiler/glsl/link_uniforms.cpp index deaba94df1c..940cc61181d 100644 --- a/src/compiler/glsl/link_uniforms.cpp +++ b/src/compiler/glsl/link_uniforms.cpp @@ -188,12 +188,15 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, this->enter_record(t, *name, row_major, packing); for (unsigned i = 0; i < t->length; i++) { - const char *field = t->fields.structure[i].name; - size_t new_length = name_length; + const char *field = t->fields.structure[i].name; + size_t new_length = name_length; if (t->fields.structure[i].type->is_record()) this->visit_field(&t->fields.structure[i]); + if (t->is_interface() && t->fields.structure[i].offset != -1) + this->set_buffer_offset(t->fields.structure[i].offset); + /* Append '.field' to the current variable name. */ if (name_length == 0) { ralloc_asprintf_rewrite_tail(name, &new_length, "%s", field); @@ -247,10 +250,10 @@ program_resource_visitor::recursion(const glsl_type *t, char **name, record_array_count *= length; for (unsigned i = 0; i < length; i++) { - size_t new_length = name_length; + size_t new_length = name_length; - /* Append the subscript to the current variable name */ - ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i); + /* Append the subscript to the current variable name */ + ralloc_asprintf_rewrite_tail(name, &new_length, "[%u]", i); recursion(t->fields.array, name, new_length, row_major, record_type, @@ -298,6 +301,11 @@ program_resource_visitor::leave_record(const glsl_type *, const char *, bool, } void +program_resource_visitor::set_buffer_offset(unsigned) +{ +} + +void program_resource_visitor::set_record_array_count(unsigned) { } @@ -415,19 +423,19 @@ private: if(!is_shader_storage) this->num_shader_uniform_components += values; } else { - /* Accumulate the total number of uniform slots used by this shader. - * Note that samplers do not count against this limit because they - * don't use any storage on current hardware. - */ - if (!is_ubo_var && !is_shader_storage) - this->num_shader_uniform_components += values; + /* Accumulate the total number of uniform slots used by this shader. 
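The hunks above thread an explicit layout(offset=...) through the block walkers: glsl_struct_field::offset stays -1 unless the shader supplied one, and set_buffer_offset() resets the visitor's running byte offset when it did. A minimal sketch of the resulting placement rule, assuming std140-style base alignments (place_member() is not a Mesa API, just an illustration):

static unsigned glsl_align(unsigned value, unsigned alignment)
{
   /* Round up to a power-of-two boundary, as in the hunks above. */
   return (value + alignment - 1) & ~(alignment - 1);
}

static unsigned place_member(unsigned running_offset, int explicit_offset,
                             unsigned base_alignment)
{
   if (explicit_offset != -1)
      running_offset = (unsigned) explicit_offset;  /* layout(offset = N) */
   return glsl_align(running_offset, base_alignment);
}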
+ * Note that samplers do not count against this limit because they + * don't use any storage on current hardware. + */ + if (!is_ubo_var && !is_shader_storage) + this->num_shader_uniform_components += values; } /* If the uniform is already in the map, there's nothing more to do. */ unsigned id; if (this->map->get(id, name)) - return; + return; if (this->current_var->data.how_declared == ir_var_hidden) { this->hidden_map->put(this->num_hidden_uniforms, name); @@ -473,8 +481,8 @@ class parcel_out_uniform_storage : public program_resource_visitor { public: parcel_out_uniform_storage(struct gl_shader_program *prog, struct string_to_uint_map *map, - struct gl_uniform_storage *uniforms, - union gl_constant_value *values) + struct gl_uniform_storage *uniforms, + union gl_constant_value *values) : prog(prog), map(map), uniforms(uniforms), values(values) { } @@ -520,9 +528,9 @@ public: ubo_block_index = i; break; } - } - } - assert(ubo_block_index != -1); + } + } + assert(ubo_block_index != -1); /* Uniform blocks that were specified with an instance name must be * handled a little bit differently. The name of the variable is the @@ -676,6 +684,11 @@ private: } } + virtual void set_buffer_offset(unsigned offset) + { + this->ubo_byte_offset = offset; + } + virtual void set_record_array_count(unsigned record_array_count) { this->record_array_count = record_array_count; @@ -730,15 +743,15 @@ private: assert(found); if (!found) - return; + return; const glsl_type *base_type; if (type->is_array()) { - this->uniforms[id].array_elements = type->length; - base_type = type->fields.array; + this->uniforms[id].array_elements = type->length; + base_type = type->fields.array; } else { - this->uniforms[id].array_elements = 0; - base_type = type; + this->uniforms[id].array_elements = 0; + base_type = type; } /* Initialise opaque data */ @@ -822,11 +835,11 @@ private: this->uniforms[id].array_stride = glsl_align(type->without_array()->std140_size(row_major), 16); - } else { - this->uniforms[id].array_stride = 0; - } + } else { + this->uniforms[id].array_stride = 0; + } - if (type->without_array()->is_matrix()) { + if (type->without_array()->is_matrix()) { const glsl_type *matrix = type->without_array(); const unsigned N = matrix->base_type == GLSL_TYPE_DOUBLE ? 
8 : 4; const unsigned items = @@ -838,17 +851,17 @@ private: glsl_align(items * N, 16); else this->uniforms[id].matrix_stride = glsl_align(items * N, 16); - this->uniforms[id].row_major = row_major; - } else { - this->uniforms[id].matrix_stride = 0; - this->uniforms[id].row_major = false; - } + this->uniforms[id].row_major = row_major; + } else { + this->uniforms[id].matrix_stride = 0; + this->uniforms[id].row_major = false; + } } else { - this->uniforms[id].block_index = -1; - this->uniforms[id].offset = -1; - this->uniforms[id].array_stride = -1; - this->uniforms[id].matrix_stride = -1; - this->uniforms[id].row_major = false; + this->uniforms[id].block_index = -1; + this->uniforms[id].offset = -1; + this->uniforms[id].array_stride = -1; + this->uniforms[id].matrix_stride = -1; + this->uniforms[id].row_major = false; } this->values += values_for_type(type); @@ -914,36 +927,36 @@ public: */ int link_cross_validate_uniform_block(void *mem_ctx, - struct gl_uniform_block **linked_blocks, - unsigned int *num_linked_blocks, - struct gl_uniform_block *new_block) + struct gl_uniform_block **linked_blocks, + unsigned int *num_linked_blocks, + struct gl_uniform_block *new_block) { for (unsigned int i = 0; i < *num_linked_blocks; i++) { struct gl_uniform_block *old_block = &(*linked_blocks)[i]; if (strcmp(old_block->Name, new_block->Name) == 0) - return link_uniform_blocks_are_compatible(old_block, new_block) - ? i : -1; + return link_uniform_blocks_are_compatible(old_block, new_block) + ? i : -1; } *linked_blocks = reralloc(mem_ctx, *linked_blocks, - struct gl_uniform_block, - *num_linked_blocks + 1); + struct gl_uniform_block, + *num_linked_blocks + 1); int linked_block_index = (*num_linked_blocks)++; struct gl_uniform_block *linked_block = &(*linked_blocks)[linked_block_index]; memcpy(linked_block, new_block, sizeof(*new_block)); linked_block->Uniforms = ralloc_array(*linked_blocks, - struct gl_uniform_buffer_variable, - linked_block->NumUniforms); + struct gl_uniform_buffer_variable, + linked_block->NumUniforms); memcpy(linked_block->Uniforms, - new_block->Uniforms, - sizeof(*linked_block->Uniforms) * linked_block->NumUniforms); + new_block->Uniforms, + sizeof(*linked_block->Uniforms) * linked_block->NumUniforms); for (unsigned int i = 0; i < linked_block->NumUniforms; i++) { struct gl_uniform_buffer_variable *ubo_var = - &linked_block->Uniforms[i]; + &linked_block->Uniforms[i]; if (ubo_var->Name == ubo_var->IndexName) { ubo_var->Name = ralloc_strdup(*linked_blocks, ubo_var->Name); @@ -970,7 +983,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) ir_variable *const var = node->as_variable(); if ((var == NULL) || !var->is_in_buffer_block()) - continue; + continue; assert(var->data.mode == ir_var_uniform || var->data.mode == ir_var_shader_storage); @@ -992,7 +1005,7 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) const unsigned l = strlen(var->name); for (unsigned i = 0; i < shader->NumBufferInterfaceBlocks; i++) { - for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { + for (unsigned j = 0; j < shader->BufferInterfaceBlocks[i].NumUniforms; j++) { if (sentinel) { const char *begin = shader->BufferInterfaceBlocks[i].Uniforms[j].Name; const char *end = strchr(begin, sentinel); @@ -1010,13 +1023,13 @@ link_update_uniform_buffer_variables(struct gl_shader *shader) } } else if (!strcmp(var->name, shader->BufferInterfaceBlocks[i].Uniforms[j].Name)) { - found = true; - var->data.location = j; - break; - } - } - if (found) - break; + found = 
true; + var->data.location = j; + break; + } + } + if (found) + break; } assert(found); } @@ -1099,7 +1112,7 @@ link_assign_uniform_locations(struct gl_shader_program *prog, struct gl_shader *sh = prog->_LinkedShaders[i]; if (sh == NULL) - continue; + continue; /* Uniforms that lack an initializer in the shader code have an initial * value of zero. This includes sampler uniforms. @@ -1120,13 +1133,13 @@ link_assign_uniform_locations(struct gl_shader_program *prog, uniform_size.start_shader(); foreach_in_list(ir_instruction, node, sh->ir) { - ir_variable *const var = node->as_variable(); + ir_variable *const var = node->as_variable(); - if ((var == NULL) || (var->data.mode != ir_var_uniform && - var->data.mode != ir_var_shader_storage)) - continue; + if ((var == NULL) || (var->data.mode != ir_var_uniform && + var->data.mode != ir_var_shader_storage)) + continue; - uniform_size.process(var); + uniform_size.process(var); } sh->num_samplers = uniform_size.num_shader_samplers; @@ -1136,8 +1149,8 @@ link_assign_uniform_locations(struct gl_shader_program *prog, for (unsigned i = 0; i < sh->NumBufferInterfaceBlocks; i++) { if (!sh->BufferInterfaceBlocks[i].IsShaderStorage) { - sh->num_combined_uniform_components += - sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; + sh->num_combined_uniform_components += + sh->BufferInterfaceBlocks[i].UniformBufferSize / 4; } } } @@ -1170,18 +1183,18 @@ link_assign_uniform_locations(struct gl_shader_program *prog, for (unsigned i = 0; i < MESA_SHADER_STAGES; i++) { if (prog->_LinkedShaders[i] == NULL) - continue; + continue; parcel.start_shader((gl_shader_stage)i); foreach_in_list(ir_instruction, node, prog->_LinkedShaders[i]->ir) { - ir_variable *const var = node->as_variable(); + ir_variable *const var = node->as_variable(); if ((var == NULL) || (var->data.mode != ir_var_uniform && var->data.mode != ir_var_shader_storage)) - continue; + continue; - parcel.set_and_process(var); + parcel.set_and_process(var); } prog->_LinkedShaders[i]->active_samplers = parcel.shader_samplers_used; diff --git a/src/compiler/glsl/link_varyings.cpp b/src/compiler/glsl/link_varyings.cpp index 05cc1a2b7f8..34eb848a9c1 100644 --- a/src/compiler/glsl/link_varyings.cpp +++ b/src/compiler/glsl/link_varyings.cpp @@ -219,7 +219,7 @@ cross_validate_front_and_back_color(struct gl_shader_program *prog, */ void cross_validate_outputs_to_inputs(struct gl_shader_program *prog, - gl_shader *producer, gl_shader *consumer) + gl_shader *producer, gl_shader *consumer) { glsl_symbol_table parameters; ir_variable *explicit_locations[MAX_VARYING] = { NULL, }; @@ -312,8 +312,14 @@ cross_validate_outputs_to_inputs(struct gl_shader_program *prog, } if (output != NULL) { - cross_validate_types_and_qualifiers(prog, input, output, - consumer->Stage, producer->Stage); + /* Interface blocks have their own validation elsewhere so don't + * try validating them here. + */ + if (!(input->get_interface_type() && + output->get_interface_type())) + cross_validate_types_and_qualifiers(prog, input, output, + consumer->Stage, + producer->Stage); } else { /* Check for input vars with unmatched output vars in prev stage * taking into account that interface blocks could have a matching @@ -348,7 +354,7 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, ir_variable *const var = node->as_variable(); if ((var == NULL) || (var->data.mode != int(mode))) - continue; + continue; /* A shader 'in' or 'out' variable is only really an input or output if * its value is used by other shader stages. 
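The early-out added to cross_validate_outputs_to_inputs() above keeps loose-varying validation away from interface blocks, which validate_interstage_inout_blocks() already checks member by member. In a hypothetical pair like this, only `w` still flows through cross_validate_types_and_qualifiers():

static const char *vs_source =
   "#version 150\n"
   "out B { vec4 v; } blk;\n"
   "out vec4 w;\n"
   "void main() { blk.v = vec4(0.0); w = vec4(1.0); gl_Position = w; }\n";

static const char *fs_source =
   "#version 150\n"
   "in B { vec4 v; } blk;\n"
   "in vec4 w;\n"
   "out vec4 color;\n"
   "void main() { color = blk.v + w; }\n";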
This will cause the @@ -356,7 +362,7 @@ remove_unused_shader_inputs_and_outputs(bool is_separate_shader_object, */ if (var->data.is_unmatched_generic_inout) { assert(var->data.mode != ir_var_temporary); - var->data.mode = ir_var_auto; + var->data.mode = ir_var_auto; } } @@ -748,8 +754,8 @@ store_tfeedback_info(struct gl_context *ctx, struct gl_shader_program *prog, prog->LinkedTransformFeedback.Varyings = rzalloc_array(prog, - struct gl_transform_feedback_varying_info, - num_tfeedback_decls); + struct gl_transform_feedback_varying_info, + num_tfeedback_decls); unsigned num_outputs = 0; for (unsigned i = 0; i < num_tfeedback_decls; ++i) @@ -1561,9 +1567,9 @@ reserved_varying_slot(struct gl_shader *stage, ir_variable_mode io_mode) */ bool assign_varying_locations(struct gl_context *ctx, - void *mem_ctx, - struct gl_shader_program *prog, - gl_shader *producer, gl_shader *consumer, + void *mem_ctx, + struct gl_shader_program *prog, + gl_shader *producer, gl_shader *consumer, unsigned num_tfeedback_decls, tfeedback_decl *tfeedback_decls) { @@ -1755,7 +1761,7 @@ assign_varying_locations(struct gl_context *ctx, linker_error(prog, "%s shader varying %s not written " "by %s shader\n.", _mesa_shader_stage_to_string(consumer->Stage), - var->name, + var->name, _mesa_shader_stage_to_string(producer->Stage)); } else { linker_warning(prog, "%s shader varying %s not written " diff --git a/src/compiler/glsl/linker.cpp b/src/compiler/glsl/linker.cpp index 3039232162a..76b700d3451 100644 --- a/src/compiler/glsl/linker.cpp +++ b/src/compiler/glsl/linker.cpp @@ -2417,7 +2417,8 @@ assign_attribute_or_color_locations(gl_shader_program *prog, /* Reversed because we want a descending order sort below. */ return r->slots - l->slots; } - } to_assign[16]; + } to_assign[32]; + assert(max_index <= 32); unsigned num_attr = 0; @@ -2625,6 +2626,13 @@ assign_attribute_or_color_locations(gl_shader_program *prog, continue; } + if (num_attr >= max_index) { + linker_error(prog, "too many %s (max %u)", + target_index == MESA_SHADER_VERTEX ? 
+ "vertex shader inputs" : "fragment shader outputs", + max_index); + return false; + } to_assign[num_attr].slots = slots; to_assign[num_attr].var = var; num_attr++; diff --git a/src/compiler/glsl/linker.h b/src/compiler/glsl/linker.h index a60bb6ed087..4311d1659ec 100644 --- a/src/compiler/glsl/linker.h +++ b/src/compiler/glsl/linker.h @@ -182,6 +182,8 @@ protected: virtual void leave_record(const glsl_type *type, const char *name, bool row_major, const unsigned packing); + virtual void set_buffer_offset(unsigned offset); + virtual void set_record_array_count(unsigned record_array_count); private: diff --git a/src/compiler/glsl/lower_buffer_access.cpp b/src/compiler/glsl/lower_buffer_access.cpp index 9ad811de9f1..f85b421cf27 100644 --- a/src/compiler/glsl/lower_buffer_access.cpp +++ b/src/compiler/glsl/lower_buffer_access.cpp @@ -440,6 +440,10 @@ lower_buffer_access::setup_buffer_access(void *mem_ctx, else field_align = type->std140_base_alignment(field_row_major); + if (struct_type->fields.structure[i].offset != -1) { + intra_struct_offset = struct_type->fields.structure[i].offset; + } + intra_struct_offset = glsl_align(intra_struct_offset, field_align); if (strcmp(struct_type->fields.structure[i].name, diff --git a/src/compiler/glsl/opt_array_splitting.cpp b/src/compiler/glsl/opt_array_splitting.cpp index cceec6b6431..a294da56616 100644 --- a/src/compiler/glsl/opt_array_splitting.cpp +++ b/src/compiler/glsl/opt_array_splitting.cpp @@ -55,9 +55,9 @@ public: this->components = NULL; this->mem_ctx = NULL; if (var->type->is_array()) - this->size = var->type->length; + this->size = var->type->length; else - this->size = var->type->matrix_columns; + this->size = var->type->matrix_columns; } ir_variable *var; /* The key: the variable's pointer. */ @@ -137,7 +137,7 @@ ir_array_reference_visitor::get_variable_entry(ir_variable *var) foreach_in_list(variable_entry, entry, &this->variable_list) { if (entry->var == var) - return entry; + return entry; } variable_entry *entry = new(mem_ctx) variable_entry(var); @@ -185,8 +185,18 @@ ir_array_reference_visitor::visit_enter(ir_dereference_array *ir) /* If the access to the array has a variable index, we wouldn't * know which split variable this dereference should go to. */ - if (entry && !ir->array_index->as_constant()) - entry->split = false; + if (!ir->array_index->as_constant()) { + if (entry) + entry->split = false; + /* This variable indexing could come from a different array dereference + * that also has variable indexing, that is, something like a[b[a[b[0]]]]. + * If we return visit_continue_with_parent here for the first appearence + * of a, then we can miss that b also has indirect indexing (if this is + * the only place in the program where such indirect indexing into b + * happens), so keep going. + */ + return visit_continue; + } /* If the index is also array dereference, visit index. 
*/ if (ir->array_index->as_dereference_array()) @@ -208,7 +218,7 @@ ir_array_reference_visitor::visit_enter(ir_function_signature *ir) bool ir_array_reference_visitor::get_split_list(exec_list *instructions, - bool linked) + bool linked) { visit_list_elements(this, instructions); @@ -217,25 +227,25 @@ ir_array_reference_visitor::get_split_list(exec_list *instructions, */ if (!linked) { foreach_in_list(ir_instruction, node, instructions) { - ir_variable *var = node->as_variable(); - if (var) { - variable_entry *entry = get_variable_entry(var); - if (entry) - entry->remove(); - } + ir_variable *var = node->as_variable(); + if (var) { + variable_entry *entry = get_variable_entry(var); + if (entry) + entry->remove(); + } } } /* Trim out variables we found that we can't split. */ foreach_in_list_safe(variable_entry, entry, &variable_list) { if (debug) { - printf("array %s@%p: decl %d, split %d\n", - entry->var->name, (void *) entry->var, entry->declaration, - entry->split); + printf("array %s@%p: decl %d, split %d\n", + entry->var->name, (void *) entry->var, entry->declaration, + entry->split); } if (!(entry->declaration && entry->split)) { - entry->remove(); + entry->remove(); } } @@ -273,7 +283,7 @@ ir_array_splitting_visitor::get_splitting_entry(ir_variable *var) foreach_in_list(variable_entry, entry, this->variable_list) { if (entry->var == var) { - return entry; + return entry; } } @@ -301,7 +311,7 @@ ir_array_splitting_visitor::split_deref(ir_dereference **deref) if (constant->value.i[0] >= 0 && constant->value.i[0] < (int)entry->size) { *deref = new(entry->mem_ctx) - ir_dereference_variable(entry->components[constant->value.i[0]]); + ir_dereference_variable(entry->components[constant->value.i[0]]); } else { /* There was a constant array access beyond the end of the * array. This might have happened due to constant folding @@ -310,8 +320,8 @@ ir_array_splitting_visitor::split_deref(ir_dereference **deref) * variable. 
*/ ir_variable *temp = new(entry->mem_ctx) ir_variable(deref_array->type, - "undef", - ir_var_temporary); + "undef", + ir_var_temporary); entry->components[0]->insert_before(temp); *deref = new(entry->mem_ctx) ir_dereference_variable(temp); } @@ -373,23 +383,21 @@ optimize_split_arrays(exec_list *instructions, bool linked) const struct glsl_type *subtype; if (type->is_matrix()) - subtype = type->column_type(); + subtype = type->column_type(); else - subtype = type->fields.array; + subtype = type->fields.array; entry->mem_ctx = ralloc_parent(entry->var); - entry->components = ralloc_array(mem_ctx, - ir_variable *, - entry->size); + entry->components = ralloc_array(mem_ctx, ir_variable *, entry->size); for (unsigned int i = 0; i < entry->size; i++) { - const char *name = ralloc_asprintf(mem_ctx, "%s_%d", - entry->var->name, i); + const char *name = ralloc_asprintf(mem_ctx, "%s_%d", + entry->var->name, i); - entry->components[i] = - new(entry->mem_ctx) ir_variable(subtype, name, ir_var_temporary); - entry->var->insert_before(entry->components[i]); + entry->components[i] = + new(entry->mem_ctx) ir_variable(subtype, name, ir_var_temporary); + entry->var->insert_before(entry->components[i]); } entry->var->remove(); diff --git a/src/compiler/glsl_types.cpp b/src/compiler/glsl_types.cpp index c549230a83c..2421bd61954 100644 --- a/src/compiler/glsl_types.cpp +++ b/src/compiler/glsl_types.cpp @@ -120,6 +120,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].name = ralloc_strdup(this->fields.structure, fields[i].name); this->fields.structure[i].location = fields[i].location; + this->fields.structure[i].offset = fields[i].offset; this->fields.structure[i].interpolation = fields[i].interpolation; this->fields.structure[i].centroid = fields[i].centroid; this->fields.structure[i].sample = fields[i].sample; @@ -159,6 +160,7 @@ glsl_type::glsl_type(const glsl_struct_field *fields, unsigned num_fields, this->fields.structure[i].name = ralloc_strdup(this->fields.structure, fields[i].name); this->fields.structure[i].location = fields[i].location; + this->fields.structure[i].offset = fields[i].offset; this->fields.structure[i].interpolation = fields[i].interpolation; this->fields.structure[i].centroid = fields[i].centroid; this->fields.structure[i].sample = fields[i].sample; @@ -880,6 +882,9 @@ glsl_type::record_compare(const glsl_type *b) const if (this->fields.structure[i].location != b->fields.structure[i].location) return false; + if (this->fields.structure[i].offset + != b->fields.structure[i].offset) + return false; if (this->fields.structure[i].interpolation != b->fields.structure[i].interpolation) return false; diff --git a/src/compiler/glsl_types.h b/src/compiler/glsl_types.h index 2f612d8857d..b0e6f3f730f 100644 --- a/src/compiler/glsl_types.h +++ b/src/compiler/glsl_types.h @@ -838,6 +838,14 @@ struct glsl_struct_field { int location; /** + * For interface blocks, members may have an explicit byte offset + * specified; -1 otherwise. + * + * Ignored for structs. + */ + int offset; + + /** * For interface blocks, the interpolation mode (as in * ir_variable::interpolation). 0 otherwise. 
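record_compare() now distinguishes types whose members carry different explicit offsets, which keeps the glsl_type singleton machinery from unifying blocks with different layouts. A sketch of filling the new field (values are hypothetical; -1 preserves the old behaviour):

static void fill_field_example(glsl_struct_field *field)
{
   /* Only the members relevant to the new comparison are shown;
    * all other qualifiers keep their existing values. */
   field->type = glsl_type::vec4_type;
   field->name = "color";
   field->location = -1;   /* no explicit location */
   field->offset = 16;     /* layout(offset = 16); -1 when unspecified */
}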
*/ diff --git a/src/compiler/nir/Makefile.sources b/src/compiler/nir/Makefile.sources index 04e8ab88a35..a876eff289a 100644 --- a/src/compiler/nir/Makefile.sources +++ b/src/compiler/nir/Makefile.sources @@ -32,10 +32,10 @@ NIR_FILES = \ nir_lower_clip.c \ nir_lower_global_vars_to_local.c \ nir_lower_gs_intrinsics.c \ - nir_lower_indirect_derefs.c \ nir_lower_load_const_to_scalar.c \ nir_lower_locals_to_regs.c \ nir_lower_idiv.c \ + nir_lower_indirect_derefs.c \ nir_lower_io.c \ nir_lower_outputs_to_temporaries.c \ nir_lower_phis_to_scalar.c \ diff --git a/src/compiler/nir/glsl_to_nir.cpp b/src/compiler/nir/glsl_to_nir.cpp index 6671691fd06..da5d730b49e 100644 --- a/src/compiler/nir/glsl_to_nir.cpp +++ b/src/compiler/nir/glsl_to_nir.cpp @@ -454,34 +454,8 @@ nir_visitor::create_function(ir_function_signature *ir) nir_function *func = nir_function_create(shader, ir->function_name()); - unsigned num_params = ir->parameters.length(); - func->num_params = num_params; - func->params = ralloc_array(shader, nir_parameter, num_params); - - unsigned i = 0; - foreach_in_list(ir_variable, param, &ir->parameters) { - switch (param->data.mode) { - case ir_var_function_in: - func->params[i].param_type = nir_parameter_in; - break; - - case ir_var_function_out: - func->params[i].param_type = nir_parameter_out; - break; - - case ir_var_function_inout: - func->params[i].param_type = nir_parameter_inout; - break; - - default: - unreachable("not reached"); - } - - func->params[i].type = param->type; - i++; - } - - func->return_type = ir->return_type; + assert(ir->parameters.is_empty()); + assert(ir->return_type == glsl_type::void_type); _mesa_hash_table_insert(this->overload_table, ir, func); } @@ -509,24 +483,9 @@ nir_visitor::visit(ir_function_signature *ir) nir_function_impl *impl = nir_function_impl_create(func); this->impl = impl; - unsigned num_params = func->num_params; - impl->num_params = num_params; - impl->params = ralloc_array(this->shader, nir_variable *, num_params); - unsigned i = 0; - foreach_in_list(ir_variable, param, &ir->parameters) { - param->accept(this); - impl->params[i] = this->var; - i++; - } - - if (func->return_type == glsl_type::void_type) { - impl->return_var = NULL; - } else { - impl->return_var = ralloc(this->shader, nir_variable); - impl->return_var->name = ralloc_strdup(impl->return_var, - "return_var"); - impl->return_var->type = func->return_type; - } + assert(strcmp(func->name, "main") == 0); + assert(ir->parameters.is_empty()); + assert(func->return_type == glsl_type::void_type); this->is_global = false; diff --git a/src/compiler/nir/nir.c b/src/compiler/nir/nir.c index 91206a92717..7e41ed37b0d 100644 --- a/src/compiler/nir/nir.c +++ b/src/compiler/nir/nir.c @@ -323,6 +323,8 @@ nir_function_impl_create(nir_function *function) impl->return_var->type = function->return_type; impl->return_var->data.mode = nir_var_param; impl->return_var->data.location = -1; + } else { + impl->return_var = NULL; } return impl; diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h index 0f8c78100bf..ae37cbf7325 100644 --- a/src/compiler/nir/nir.h +++ b/src/compiler/nir/nir.h @@ -2094,8 +2094,8 @@ void nir_index_blocks(nir_function_impl *impl); void nir_print_shader(nir_shader *shader, FILE *fp); void nir_print_instr(const nir_instr *instr, FILE *fp); -nir_shader *nir_shader_clone(void *mem_ctx, const nir_shader *s); -nir_function_impl *nir_function_impl_clone(const nir_function_impl *impl); +nir_shader * nir_shader_clone(void *mem_ctx, const nir_shader *s); +nir_function_impl 
*nir_function_impl_clone(const nir_function_impl *fi); nir_constant *nir_constant_clone(const nir_constant *c, nir_variable *var); #ifdef DEBUG diff --git a/src/compiler/nir/nir_lower_indirect_derefs.c b/src/compiler/nir/nir_lower_indirect_derefs.c index 69f2df4ba6d..a4affa7bdcf 100644 --- a/src/compiler/nir/nir_lower_indirect_derefs.c +++ b/src/compiler/nir/nir_lower_indirect_derefs.c @@ -134,7 +134,7 @@ emit_load_store(nir_builder *b, nir_intrinsic_instr *orig_instr, nir_intrinsic_instr *store = nir_intrinsic_instr_create(b->shader, nir_intrinsic_store_var); store->num_components = orig_instr->num_components; - store->const_index[0] = orig_instr->const_index[0]; /* writemask */ + nir_intrinsic_set_write_mask(store, nir_intrinsic_write_mask(orig_instr)); store->variables[0] = nir_deref_as_var(nir_copy_deref(store, &deref->deref)); store->src[0] = nir_src_for_ssa(src); diff --git a/src/compiler/nir/nir_opt_algebraic.py b/src/compiler/nir/nir_opt_algebraic.py index c9c917b77a5..54f7d86843a 100644 --- a/src/compiler/nir/nir_opt_algebraic.py +++ b/src/compiler/nir/nir_opt_algebraic.py @@ -164,6 +164,8 @@ optimizations = [ (('ishr', a, 0), a), (('ushr', 0, a), 0), (('ushr', a, 0), a), + (('iand', 0xff, ('ushr', a, 24)), ('ushr', a, 24)), + (('iand', 0xffff, ('ushr', a, 16)), ('ushr', a, 16)), # Exponential/logarithmic identities (('fexp2', ('flog2', a)), a), # 2^lg2(a) = a (('flog2', ('fexp2', a)), a), # lg2(2^a) = a @@ -215,6 +217,16 @@ optimizations = [ (('f2i', ('ftrunc', a)), ('f2i', a)), (('f2u', ('ftrunc', a)), ('f2u', a)), + # Byte extraction + (('ushr', a, 24), ('extract_u8', a, 3), '!options->lower_extract_byte'), + (('iand', 0xff, ('ushr', a, 16)), ('extract_u8', a, 2), '!options->lower_extract_byte'), + (('iand', 0xff, ('ushr', a, 8)), ('extract_u8', a, 1), '!options->lower_extract_byte'), + (('iand', 0xff, a), ('extract_u8', a, 0), '!options->lower_extract_byte'), + + # Word extraction + (('ushr', a, 16), ('extract_u16', a, 1), '!options->lower_extract_word'), + (('iand', 0xffff, a), ('extract_u16', a, 0), '!options->lower_extract_word'), + # Subtracts (('fsub', a, ('fsub', 0.0, b)), ('fadd', a, b)), (('isub', a, ('isub', 0, b)), ('iadd', a, b)), diff --git a/src/compiler/nir/nir_print.c b/src/compiler/nir/nir_print.c index 63e34ea5255..24d5281ec54 100644 --- a/src/compiler/nir/nir_print.c +++ b/src/compiler/nir/nir_print.c @@ -381,6 +381,14 @@ print_var(nir_variable *var, print_state *state) } static void +print_arg(nir_variable *var, print_state *state) +{ + FILE *fp = state->fp; + glsl_print_type(var->type, fp); + fprintf(fp, " %s", get_var_name(var, state)); +} + +static void print_deref_var(nir_deref_var *deref, print_state *state) { print_var(deref->var, state); @@ -942,14 +950,14 @@ print_function_impl(nir_function_impl *impl, print_state *state) if (i != 0) fprintf(fp, ", "); - print_var(impl->params[i], state); + print_arg(impl->params[i], state); } if (impl->return_var != NULL) { if (impl->num_params != 0) fprintf(fp, ", "); fprintf(fp, "returning "); - print_var(impl->return_var, state); + print_arg(impl->return_var, state); } fprintf(fp, "{\n"); diff --git a/src/compiler/nir/nir_validate.c b/src/compiler/nir/nir_validate.c index a9d213b95c4..0c32d5fe07a 100644 --- a/src/compiler/nir/nir_validate.c +++ b/src/compiler/nir/nir_validate.c @@ -938,6 +938,7 @@ validate_function_impl(nir_function_impl *impl, validate_state *state) assert(impl->num_params == impl->function->num_params); for (unsigned i = 0; i < impl->num_params; i++) { assert(impl->params[i]->type == 
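The new nir_opt_algebraic patterns above fold shift-and-mask idioms into extract_u8/extract_u16 unless the backend sets lower_extract_byte/lower_extract_word. Their reference semantics, written out in plain C (the helper names here only mirror the opcodes, they are not NIR API):

#include <stdint.h>

static inline uint32_t extract_u8(uint32_t a, unsigned byte)
{
   return (a >> (byte * 8)) & 0xff;    /* byte 0 is least significant */
}

static inline uint32_t extract_u16(uint32_t a, unsigned word)
{
   return (a >> (word * 16)) & 0xffff;
}

/* e.g. (iand 0xff (ushr a 16)) == extract_u8(a, 2), and the
 * (ushr a 24) case needs no mask at all, hence the extra iand rules. */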
impl->function->params[i].type); + assert(impl->params[i]->data.mode == nir_var_param); assert(impl->params[i]->data.location == i); validate_var_decl(impl->params[i], false, state); } @@ -946,6 +947,7 @@ validate_function_impl(nir_function_impl *impl, validate_state *state) assert(impl->return_var == NULL); } else { assert(impl->return_var->type == impl->function->return_type); + assert(impl->return_var->data.mode == nir_var_param); assert(impl->return_var->data.location == -1); validate_var_decl(impl->return_var, false, state); } diff --git a/src/egl/drivers/dri2/platform_wayland.c b/src/egl/drivers/dri2/platform_wayland.c index 341acb7ed1b..ff0d5c802ac 100644 --- a/src/egl/drivers/dri2/platform_wayland.c +++ b/src/egl/drivers/dri2/platform_wayland.c @@ -305,7 +305,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) { struct dri2_egl_display *dri2_dpy = dri2_egl_display(dri2_surf->base.Resource.Display); - int i; + int i, use_flags; unsigned int dri_image_format; /* currently supports three WL DRM formats, @@ -352,6 +352,8 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) if (dri2_surf->back == NULL) return -1; + use_flags = __DRI_IMAGE_USE_SHARE | __DRI_IMAGE_USE_BACKBUFFER; + if (dri2_dpy->is_different_gpu && dri2_surf->back->linear_copy == NULL) { dri2_surf->back->linear_copy = @@ -359,7 +361,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) dri2_surf->base.Width, dri2_surf->base.Height, dri_image_format, - __DRI_IMAGE_USE_SHARE | + use_flags | __DRI_IMAGE_USE_LINEAR, NULL); if (dri2_surf->back->linear_copy == NULL) @@ -373,7 +375,7 @@ get_back_bo(struct dri2_egl_surface *dri2_surf) dri2_surf->base.Height, dri_image_format, dri2_dpy->is_different_gpu ? - 0 : __DRI_IMAGE_USE_SHARE, + 0 : use_flags, NULL); dri2_surf->back->age = 0; } diff --git a/src/egl/drivers/dri2/platform_x11.c b/src/egl/drivers/dri2/platform_x11.c index 420f567651c..3ab91886e01 100644 --- a/src/egl/drivers/dri2/platform_x11.c +++ b/src/egl/drivers/dri2/platform_x11.c @@ -1006,6 +1006,9 @@ dri2_create_image_khr_pixmap(_EGLDisplay *disp, _EGLContext *ctx, geometry_cookie = xcb_get_geometry (dri2_dpy->conn, drawable); buffers_reply = xcb_dri2_get_buffers_reply (dri2_dpy->conn, buffers_cookie, NULL); + if (buffers_reply == NULL) + return NULL; + buffers = xcb_dri2_get_buffers_buffers (buffers_reply); if (buffers == NULL) { return NULL; diff --git a/src/egl/main/eglapi.c b/src/egl/main/eglapi.c index 32f68233aeb..dd145a1195e 100644 --- a/src/egl/main/eglapi.c +++ b/src/egl/main/eglapi.c @@ -405,11 +405,9 @@ _eglCreateExtensionsString(_EGLDisplay *dpy) _EGL_CHECK_EXTENSION(KHR_image_pixmap); _EGL_CHECK_EXTENSION(KHR_reusable_sync); _EGL_CHECK_EXTENSION(KHR_surfaceless_context); - _EGL_CHECK_EXTENSION(KHR_vg_parent_image); _EGL_CHECK_EXTENSION(KHR_wait_sync); _EGL_CHECK_EXTENSION(MESA_configless_context); - _EGL_CHECK_EXTENSION(MESA_drm_display); _EGL_CHECK_EXTENSION(MESA_drm_image); _EGL_CHECK_EXTENSION(MESA_image_dma_buf_export); @@ -1198,13 +1196,6 @@ eglGetError(void) } -static EGLDisplay EGLAPIENTRY -eglGetDRMDisplayMESA(int fd) -{ - _EGLDisplay *dpy = _eglFindDisplay(_EGL_PLATFORM_DRM, (void *) (intptr_t) fd); - return _eglGetDisplayHandle(dpy); -} - /** ** EGL 1.2 **/ @@ -1858,7 +1849,6 @@ eglGetProcAddress(const char *procname) { "eglGetPlatformDisplay", (_EGLProc) eglGetPlatformDisplay }, { "eglCreatePlatformWindowSurface", (_EGLProc) eglCreatePlatformWindowSurface }, { "eglCreatePlatformPixmapSurface", (_EGLProc) eglCreatePlatformPixmapSurface }, - { "eglGetDRMDisplayMESA", (_EGLProc) eglGetDRMDisplayMESA 
}, { "eglCreateImageKHR", (_EGLProc) eglCreateImageKHR }, { "eglDestroyImageKHR", (_EGLProc) eglDestroyImage }, { "eglCreateSyncKHR", (_EGLProc) eglCreateSyncKHR }, diff --git a/src/egl/main/eglapi.h b/src/egl/main/eglapi.h index 6c54c7c410d..3f6d3c27a52 100644 --- a/src/egl/main/eglapi.h +++ b/src/egl/main/eglapi.h @@ -41,153 +41,153 @@ extern "C" { */ typedef void (*_EGLProc)(void); - -/** - * Typedefs for all EGL API entrypoint functions. - */ - -/* driver funcs */ -typedef EGLBoolean (*Initialize_t)(_EGLDriver *, _EGLDisplay *dpy); -typedef EGLBoolean (*Terminate_t)(_EGLDriver *, _EGLDisplay *dpy); - -/* config funcs */ -typedef EGLBoolean (*GetConfigs_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLConfig *configs, EGLint config_size, EGLint *num_config); -typedef EGLBoolean (*ChooseConfig_t)(_EGLDriver *drv, _EGLDisplay *dpy, const EGLint *attrib_list, EGLConfig *configs, EGLint config_size, EGLint *num_config); -typedef EGLBoolean (*GetConfigAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, EGLint attribute, EGLint *value); - -/* context funcs */ -typedef _EGLContext *(*CreateContext_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, _EGLContext *share_list, const EGLint *attrib_list); -typedef EGLBoolean (*DestroyContext_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx); -/* this is the only function (other than Initialize) that may be called with an uninitialized display */ -typedef EGLBoolean (*MakeCurrent_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw, _EGLSurface *read, _EGLContext *ctx); -typedef EGLBoolean (*QueryContext_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, EGLint attribute, EGLint *value); - -/* surface funcs */ -typedef _EGLSurface *(*CreateWindowSurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, void *native_window, const EGLint *attrib_list); -typedef _EGLSurface *(*CreatePixmapSurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, void *native_pixmap, const EGLint *attrib_list); -typedef _EGLSurface *(*CreatePbufferSurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLConfig *config, const EGLint *attrib_list); -typedef EGLBoolean (*DestroySurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface); -typedef EGLBoolean (*QuerySurface_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint attribute, EGLint *value); -typedef EGLBoolean (*SurfaceAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint attribute, EGLint value); -typedef EGLBoolean (*BindTexImage_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint buffer); -typedef EGLBoolean (*ReleaseTexImage_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, EGLint buffer); -typedef EGLBoolean (*SwapInterval_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surf, EGLint interval); -typedef EGLBoolean (*SwapBuffers_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *draw); -typedef EGLBoolean (*CopyBuffers_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, void *native_pixmap_target); - -/* misc funcs */ -typedef EGLBoolean (*WaitClient_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx); -typedef EGLBoolean (*WaitNative_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLint engine); - -/* this function may be called from multiple threads at the same time */ -typedef _EGLProc (*GetProcAddress_t)(_EGLDriver *drv, const char *procname); - - - -typedef _EGLSurface *(*CreatePbufferFromClientBuffer_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum buftype, EGLClientBuffer buffer, _EGLConfig *config, 
const EGLint *attrib_list); - - -typedef _EGLImage *(*CreateImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLContext *ctx, EGLenum target, EGLClientBuffer buffer, const EGLint *attr_list); -typedef EGLBoolean (*DestroyImageKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLImage *image); - - -typedef _EGLSync *(*CreateSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, const EGLint *attrib_list, const EGLAttrib *attrib_list64); -typedef EGLBoolean (*DestroySyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); -typedef EGLint (*ClientWaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint flags, EGLTime timeout); -typedef EGLint (*WaitSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); -typedef EGLBoolean (*SignalSyncKHR_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLenum mode); -typedef EGLBoolean (*GetSyncAttrib_t)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync, EGLint attribute, EGLAttrib *value); - - -typedef EGLBoolean (*SwapBuffersRegionNOK_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surf, EGLint numRects, const EGLint *rects); - -typedef _EGLImage *(*CreateDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, const EGLint *attr_list); -typedef EGLBoolean (*ExportDRMImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *name, EGLint *handle, EGLint *stride); - struct wl_display; -typedef EGLBoolean (*BindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display); -typedef EGLBoolean (*UnbindWaylandDisplayWL_t)(_EGLDriver *drv, _EGLDisplay *disp, struct wl_display *display); -typedef EGLBoolean (*QueryWaylandBufferWL_t)(_EGLDriver *drv, _EGLDisplay *displ, struct wl_resource *buffer, EGLint attribute, EGLint *value); - -typedef struct wl_buffer * (*CreateWaylandBufferFromImageWL_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img); - -typedef EGLBoolean (*PostSubBufferNV_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLSurface *surface, EGLint x, EGLint y, EGLint width, EGLint height); - -typedef EGLint (*QueryBufferAge_t)(_EGLDriver *drv, - _EGLDisplay *dpy, _EGLSurface *surface); - -typedef EGLBoolean (*SwapBuffersWithDamageEXT_t) (_EGLDriver *drv, _EGLDisplay *dpy, _EGLSurface *surface, const EGLint *rects, EGLint n_rects); - -typedef EGLBoolean (*GetSyncValuesCHROMIUM_t) (_EGLDisplay *dpy, _EGLSurface *surface, EGLuint64KHR *ust, EGLuint64KHR *msc, EGLuint64KHR *sbc); - -typedef EGLBoolean (*ExportDMABUFImageQueryMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fourcc, EGLint *nplanes, EGLuint64KHR *modifiers); -typedef EGLBoolean (*ExportDMABUFImageMESA_t)(_EGLDriver *drv, _EGLDisplay *disp, _EGLImage *img, EGLint *fds, EGLint *strides, EGLint *offsets); /** * The API dispatcher jumps through these functions */ struct _egl_api { - Initialize_t Initialize; - Terminate_t Terminate; - - GetConfigs_t GetConfigs; - ChooseConfig_t ChooseConfig; - GetConfigAttrib_t GetConfigAttrib; - - CreateContext_t CreateContext; - DestroyContext_t DestroyContext; - MakeCurrent_t MakeCurrent; - QueryContext_t QueryContext; - - CreateWindowSurface_t CreateWindowSurface; - CreatePixmapSurface_t CreatePixmapSurface; - CreatePbufferSurface_t CreatePbufferSurface; - DestroySurface_t DestroySurface; - QuerySurface_t QuerySurface; - SurfaceAttrib_t SurfaceAttrib; - BindTexImage_t BindTexImage; - ReleaseTexImage_t ReleaseTexImage; - SwapInterval_t SwapInterval; - SwapBuffers_t SwapBuffers; - CopyBuffers_t CopyBuffers; - - WaitClient_t WaitClient; - WaitNative_t WaitNative; - GetProcAddress_t 
GetProcAddress; - - CreatePbufferFromClientBuffer_t CreatePbufferFromClientBuffer; - - CreateImageKHR_t CreateImageKHR; - DestroyImageKHR_t DestroyImageKHR; - - CreateSyncKHR_t CreateSyncKHR; - DestroySyncKHR_t DestroySyncKHR; - ClientWaitSyncKHR_t ClientWaitSyncKHR; - WaitSyncKHR_t WaitSyncKHR; - SignalSyncKHR_t SignalSyncKHR; - GetSyncAttrib_t GetSyncAttrib; - - SwapBuffersRegionNOK_t SwapBuffersRegionNOK; - - CreateDRMImageMESA_t CreateDRMImageMESA; - ExportDRMImageMESA_t ExportDRMImageMESA; - - BindWaylandDisplayWL_t BindWaylandDisplayWL; - UnbindWaylandDisplayWL_t UnbindWaylandDisplayWL; - QueryWaylandBufferWL_t QueryWaylandBufferWL; - - CreateWaylandBufferFromImageWL_t CreateWaylandBufferFromImageWL; - - SwapBuffersWithDamageEXT_t SwapBuffersWithDamageEXT; - - PostSubBufferNV_t PostSubBufferNV; - - QueryBufferAge_t QueryBufferAge; - GetSyncValuesCHROMIUM_t GetSyncValuesCHROMIUM; - - ExportDMABUFImageQueryMESA_t ExportDMABUFImageQueryMESA; - ExportDMABUFImageMESA_t ExportDMABUFImageMESA; + /* driver funcs */ + EGLBoolean (*Initialize)(_EGLDriver *, _EGLDisplay *dpy); + EGLBoolean (*Terminate)(_EGLDriver *, _EGLDisplay *dpy); + + /* config funcs */ + EGLBoolean (*GetConfigs)(_EGLDriver *drv, _EGLDisplay *dpy, + EGLConfig *configs, EGLint config_size, + EGLint *num_config); + EGLBoolean (*ChooseConfig)(_EGLDriver *drv, _EGLDisplay *dpy, + const EGLint *attrib_list, EGLConfig *configs, + EGLint config_size, EGLint *num_config); + EGLBoolean (*GetConfigAttrib)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, EGLint attribute, + EGLint *value); + + /* context funcs */ + _EGLContext *(*CreateContext)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, _EGLContext *share_list, + const EGLint *attrib_list); + EGLBoolean (*DestroyContext)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLContext *ctx); + /* this is the only function (other than Initialize) that may be called + * with an uninitialized display + */ + EGLBoolean (*MakeCurrent)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *draw, _EGLSurface *read, + _EGLContext *ctx); + EGLBoolean (*QueryContext)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLContext *ctx, EGLint attribute, + EGLint *value); + + /* surface funcs */ + _EGLSurface *(*CreateWindowSurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, void *native_window, + const EGLint *attrib_list); + _EGLSurface *(*CreatePixmapSurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, void *native_pixmap, + const EGLint *attrib_list); + _EGLSurface *(*CreatePbufferSurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLConfig *config, + const EGLint *attrib_list); + EGLBoolean (*DestroySurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface); + EGLBoolean (*QuerySurface)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint attribute, + EGLint *value); + EGLBoolean (*SurfaceAttrib)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint attribute, + EGLint value); + EGLBoolean (*BindTexImage)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint buffer); + EGLBoolean (*ReleaseTexImage)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, EGLint buffer); + EGLBoolean (*SwapInterval)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surf, EGLint interval); + EGLBoolean (*SwapBuffers)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *draw); + EGLBoolean (*CopyBuffers)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, void *native_pixmap_target); + + /* misc functions */ + EGLBoolean (*WaitClient)(_EGLDriver 
*drv, _EGLDisplay *dpy, + _EGLContext *ctx); + EGLBoolean (*WaitNative)(_EGLDriver *drv, _EGLDisplay *dpy, + EGLint engine); + + /* this function may be called from multiple threads at the same time */ + _EGLProc (*GetProcAddress)(_EGLDriver *drv, const char *procname); + + _EGLSurface *(*CreatePbufferFromClientBuffer)(_EGLDriver *drv, + _EGLDisplay *dpy, + EGLenum buftype, + EGLClientBuffer buffer, + _EGLConfig *config, + const EGLint *attrib_list); + + _EGLImage *(*CreateImageKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLContext *ctx, EGLenum target, + EGLClientBuffer buffer, + const EGLint *attr_list); + EGLBoolean (*DestroyImageKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLImage *image); + + _EGLSync *(*CreateSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, EGLenum type, + const EGLint *attrib_list, + const EGLAttrib *attrib_list64); + EGLBoolean (*DestroySyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync); + EGLint (*ClientWaitSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync, EGLint flags, EGLTime timeout); + EGLint (*WaitSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, _EGLSync *sync); + EGLBoolean (*SignalSyncKHR)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync, EGLenum mode); + EGLBoolean (*GetSyncAttrib)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSync *sync, EGLint attribute, + EGLAttrib *value); + + EGLBoolean (*SwapBuffersRegionNOK)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLSurface *surf, EGLint numRects, + const EGLint *rects); + + _EGLImage *(*CreateDRMImageMESA)(_EGLDriver *drv, _EGLDisplay *disp, + const EGLint *attr_list); + EGLBoolean (*ExportDRMImageMESA)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLImage *img, EGLint *name, + EGLint *handle, EGLint *stride); + + EGLBoolean (*BindWaylandDisplayWL)(_EGLDriver *drv, _EGLDisplay *disp, + struct wl_display *display); + EGLBoolean (*UnbindWaylandDisplayWL)(_EGLDriver *drv, _EGLDisplay *disp, + struct wl_display *display); + EGLBoolean (*QueryWaylandBufferWL)(_EGLDriver *drv, _EGLDisplay *displ, + struct wl_resource *buffer, + EGLint attribute, EGLint *value); + + struct wl_buffer *(*CreateWaylandBufferFromImageWL)(_EGLDriver *drv, + _EGLDisplay *disp, + _EGLImage *img); + + EGLBoolean (*SwapBuffersWithDamageEXT)(_EGLDriver *drv, _EGLDisplay *dpy, + _EGLSurface *surface, + const EGLint *rects, EGLint n_rects); + + EGLBoolean (*PostSubBufferNV)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLSurface *surface, EGLint x, EGLint y, + EGLint width, EGLint height); + + EGLint (*QueryBufferAge)(_EGLDriver *drv, + _EGLDisplay *dpy, _EGLSurface *surface); + EGLBoolean (*GetSyncValuesCHROMIUM)(_EGLDisplay *dpy, _EGLSurface *surface, + EGLuint64KHR *ust, EGLuint64KHR *msc, + EGLuint64KHR *sbc); + + EGLBoolean (*ExportDMABUFImageQueryMESA)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLImage *img, EGLint *fourcc, + EGLint *nplanes, + EGLuint64KHR *modifiers); + EGLBoolean (*ExportDMABUFImageMESA)(_EGLDriver *drv, _EGLDisplay *disp, + _EGLImage *img, EGLint *fds, + EGLint *strides, EGLint *offsets); }; diff --git a/src/egl/main/egldisplay.h b/src/egl/main/egldisplay.h index 6c64980cf20..cec6d59e6a4 100644 --- a/src/egl/main/egldisplay.h +++ b/src/egl/main/egldisplay.h @@ -112,11 +112,9 @@ struct _egl_extensions EGLBoolean KHR_image_pixmap; EGLBoolean KHR_reusable_sync; EGLBoolean KHR_surfaceless_context; - EGLBoolean KHR_vg_parent_image; EGLBoolean KHR_wait_sync; EGLBoolean MESA_configless_context; - EGLBoolean MESA_drm_display; EGLBoolean MESA_drm_image; EGLBoolean MESA_image_dma_buf_export; diff --git a/src/egl/main/eglfallbacks.c 
b/src/egl/main/eglfallbacks.c index 65daf8fd0f5..d0fce8c20de 100644 --- a/src/egl/main/eglfallbacks.c +++ b/src/egl/main/eglfallbacks.c @@ -59,29 +59,29 @@ _eglInitDriverFallbacks(_EGLDriver *drv) drv->API.ChooseConfig = _eglChooseConfig; drv->API.GetConfigAttrib = _eglGetConfigAttrib; - drv->API.CreateContext = (CreateContext_t) _eglReturnFalse; - drv->API.DestroyContext = (DestroyContext_t) _eglReturnFalse; - drv->API.MakeCurrent = (MakeCurrent_t) _eglReturnFalse; + drv->API.CreateContext = (void*) _eglReturnFalse; + drv->API.DestroyContext = (void*) _eglReturnFalse; + drv->API.MakeCurrent = (void*) _eglReturnFalse; drv->API.QueryContext = _eglQueryContext; - drv->API.CreateWindowSurface = (CreateWindowSurface_t) _eglReturnFalse; - drv->API.CreatePixmapSurface = (CreatePixmapSurface_t) _eglReturnFalse; - drv->API.CreatePbufferSurface = (CreatePbufferSurface_t) _eglReturnFalse; + drv->API.CreateWindowSurface = (void*) _eglReturnFalse; + drv->API.CreatePixmapSurface = (void*) _eglReturnFalse; + drv->API.CreatePbufferSurface = (void*) _eglReturnFalse; drv->API.CreatePbufferFromClientBuffer = - (CreatePbufferFromClientBuffer_t) _eglReturnFalse; - drv->API.DestroySurface = (DestroySurface_t) _eglReturnFalse; + (void*) _eglReturnFalse; + drv->API.DestroySurface = (void*) _eglReturnFalse; drv->API.QuerySurface = _eglQuerySurface; drv->API.SurfaceAttrib = _eglSurfaceAttrib; - drv->API.BindTexImage = (BindTexImage_t) _eglReturnFalse; - drv->API.ReleaseTexImage = (ReleaseTexImage_t) _eglReturnFalse; - drv->API.CopyBuffers = (CopyBuffers_t) _eglReturnFalse; - drv->API.SwapBuffers = (SwapBuffers_t) _eglReturnFalse; + drv->API.BindTexImage = (void*) _eglReturnFalse; + drv->API.ReleaseTexImage = (void*) _eglReturnFalse; + drv->API.CopyBuffers = (void*) _eglReturnFalse; + drv->API.SwapBuffers = (void*) _eglReturnFalse; drv->API.SwapInterval = _eglSwapInterval; - drv->API.WaitClient = (WaitClient_t) _eglReturnFalse; - drv->API.WaitNative = (WaitNative_t) _eglReturnFalse; - drv->API.GetProcAddress = (GetProcAddress_t) _eglReturnFalse; + drv->API.WaitClient = (void*) _eglReturnFalse; + drv->API.WaitNative = (void*) _eglReturnFalse; + drv->API.GetProcAddress = (void*) _eglReturnFalse; drv->API.CreateImageKHR = NULL; drv->API.DestroyImageKHR = NULL; diff --git a/src/egl/wayland/wayland-egl/wayland-egl-priv.h b/src/egl/wayland/wayland-egl/wayland-egl-priv.h index 74a155202be..f1e3ba28309 100644 --- a/src/egl/wayland/wayland-egl/wayland-egl-priv.h +++ b/src/egl/wayland/wayland-egl/wayland-egl-priv.h @@ -1,10 +1,6 @@ #ifndef _WAYLAND_EGL_PRIV_H #define _WAYLAND_EGL_PRIV_H -#ifdef __cplusplus -extern "C" { -#endif - /* GCC visibility */ #if defined(__GNUC__) #define WL_EGL_EXPORT __attribute__ ((visibility("default"))) @@ -14,6 +10,10 @@ extern "C" { #include <wayland-client.h> +#ifdef __cplusplus +extern "C" { +#endif + struct wl_egl_window { struct wl_surface *surface; diff --git a/src/gallium/Makefile.am b/src/gallium/Makefile.am index e42a8f17703..086e1701128 100644 --- a/src/gallium/Makefile.am +++ b/src/gallium/Makefile.am @@ -78,6 +78,12 @@ SUBDIRS += drivers/llvmpipe endif endif +if HAVE_GALLIUM_SWR +SUBDIRS += drivers/swr +SUBDIRS += drivers/swr/avx +SUBDIRS += drivers/swr/avx2 +endif + ## vc4/rpi if HAVE_GALLIUM_VC4 SUBDIRS += drivers/vc4 winsys/vc4/drm diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c index f0013f70472..790e1211898 100644 --- a/src/gallium/auxiliary/cso_cache/cso_context.c +++ b/src/gallium/auxiliary/cso_cache/cso_context.c @@ 
-847,7 +847,7 @@ void cso_set_geometry_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_geometry_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->geometry_shader) { + if (handle == ctx->geometry_shader) { /* unbind before deleting */ ctx->pipe->bind_gs_state(ctx->pipe, NULL); ctx->geometry_shader = NULL; @@ -892,7 +892,7 @@ void cso_set_tessctrl_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_tessctrl_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->tessctrl_shader) { + if (handle == ctx->tessctrl_shader) { /* unbind before deleting */ ctx->pipe->bind_tcs_state(ctx->pipe, NULL); ctx->tessctrl_shader = NULL; @@ -937,7 +937,7 @@ void cso_set_tesseval_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_tesseval_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->tesseval_shader) { + if (handle == ctx->tesseval_shader) { /* unbind before deleting */ ctx->pipe->bind_tes_state(ctx->pipe, NULL); ctx->tesseval_shader = NULL; @@ -982,7 +982,7 @@ void cso_set_compute_shader_handle(struct cso_context *ctx, void *handle) void cso_delete_compute_shader(struct cso_context *ctx, void *handle) { - if (handle == ctx->compute_shader) { + if (handle == ctx->compute_shader) { /* unbind before deleting */ ctx->pipe->bind_compute_state(ctx->pipe, NULL); ctx->compute_shader = NULL; diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c index dcf05aac1d9..0d39ee4ec47 100644 --- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c +++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c @@ -108,11 +108,11 @@ emit_segment(struct draw_stage *stage, struct prim_header *header, } -static inline unsigned +static inline bool stipple_test(int counter, ushort pattern, int factor) { int b = (counter / factor) & 0xf; - return (1 << b) & pattern; + return !!((1 << b) & pattern); } @@ -126,7 +126,7 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) const float *pos0 = v0->data[pos]; const float *pos1 = v1->data[pos]; float start = 0; - int state = 0; + bool state = 0; float x0 = pos0[0]; float x1 = pos1[0]; @@ -143,29 +143,29 @@ stipple_line(struct draw_stage *stage, struct prim_header *header) stipple->counter = 0; - /* XXX ToDo: intead of iterating pixel-by-pixel, use a look-up table. + /* XXX ToDo: instead of iterating pixel-by-pixel, use a look-up table. 
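stipple_test() above selects one of the 16 pattern bits, with each bit stretched over `factor` pixels; returning bool tightens the old truthy-unsigned contract so the `result != state` comparison is well defined. A standalone restatement with a worked example in the comment:

#include <stdbool.h>

static bool stipple_on(int counter, unsigned short pattern, int factor)
{
   int bit = (counter / factor) & 0xf;   /* wraps every 16 * factor pixels */
   return ((1 << bit) & pattern) != 0;
}

/* pattern = 0x00ff, factor = 1: pixels 0..7 draw, 8..15 skip, repeat. */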
*/ for (i = 0; i < length; i++) { - int result = stipple_test( (int) stipple->counter+i, - (ushort) stipple->pattern, stipple->factor ); + bool result = stipple_test((int)stipple->counter + i, + (ushort)stipple->pattern, stipple->factor); if (result != state) { /* changing from "off" to "on" or vice versa */ - if (state) { - if (start != i) { + if (state) { + if (start != i) { /* finishing an "on" segment */ - emit_segment( stage, header, start / length, i / length ); + emit_segment(stage, header, start / length, i / length); } - } - else { + } + else { /* starting an "on" segment */ - start = (float) i; - } - state = result; + start = (float)i; + } + state = result; } } if (state && start < length) - emit_segment( stage, header, start / length, 1.0 ); + emit_segment(stage, header, start / length, 1.0); stipple->counter += length; } diff --git a/src/gallium/auxiliary/gallivm/lp_bld_flow.h b/src/gallium/auxiliary/gallivm/lp_bld_flow.h index 0da849bfe0c..083b0ad9fec 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_flow.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_flow.h @@ -37,6 +37,9 @@ #include "gallivm/lp_bld.h" +#ifdef __cplusplus +extern "C" { +#endif struct lp_type; @@ -198,4 +201,8 @@ lp_build_array_alloca(struct gallivm_state *gallivm, LLVMValueRef count, const char *name); +#ifdef __cplusplus +} +#endif + #endif /* !LP_BLD_FLOW_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_init.h b/src/gallium/auxiliary/gallivm/lp_bld_init.h index 9e50f88931d..ab44661a271 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_init.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_init.h @@ -35,6 +35,9 @@ #include "lp_bld.h" #include <llvm-c/ExecutionEngine.h> +#ifdef __cplusplus +extern "C" { +#endif struct gallivm_state { @@ -82,4 +85,8 @@ void lp_set_store_alignment(LLVMValueRef Inst, unsigned Align); +#ifdef __cplusplus +} +#endif + #endif /* !LP_BLD_INIT_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.h b/src/gallium/auxiliary/gallivm/lp_bld_sample.h index a6f0eff42f6..902ae41f960 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_sample.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.h @@ -42,6 +42,9 @@ #include "gallivm/lp_bld_type.h" #include "gallivm/lp_bld_swizzle.h" +#ifdef __cplusplus +extern "C" { +#endif struct pipe_resource; struct pipe_sampler_view; @@ -625,5 +628,8 @@ lp_build_minify(struct lp_build_context *bld, LLVMValueRef level, boolean lod_scalar); +#ifdef __cplusplus +} +#endif #endif /* LP_BLD_SAMPLE_H */ diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c index 1cbe47ca91f..614c6558ede 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.c @@ -315,7 +315,7 @@ lp_build_tgsi_inst_llvm( } } - if (info->num_dst > 0) { + if (info->num_dst > 0 && info->opcode != TGSI_OPCODE_STORE) { bld_base->emit_store(bld_base, inst, info, emit_data.output); } return TRUE; diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h index cc4549778a3..b005d7a0ac1 100644 --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi.h @@ -48,6 +48,10 @@ #include "tgsi/tgsi_scan.h" #include "tgsi/tgsi_info.h" +#ifdef __cplusplus +extern "C" { +#endif + #define LP_CHAN_ALL ~0 #define LP_MAX_INSTRUCTIONS 256 @@ -663,4 +667,8 @@ lp_build_tgsi_llvm( struct lp_build_tgsi_context * bld_base, const struct tgsi_token *tokens); +#ifdef __cplusplus +} +#endif + #endif /* LP_BLD_TGSI_H */ diff --git 
a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h index a9ab16f2b54..5bb77a5bde2 100644 --- a/src/gallium/auxiliary/target-helpers/inline_sw_helper.h +++ b/src/gallium/auxiliary/target-helpers/inline_sw_helper.h @@ -6,6 +6,9 @@ #include "util/u_debug.h" #include "state_tracker/sw_winsys.h" +#ifdef GALLIUM_SWR +#include "swr/swr_public.h" +#endif /* Helper function to choose and instantiate one of the software rasterizers: * llvmpipe, softpipe. @@ -43,10 +46,15 @@ sw_screen_create_named(struct sw_winsys *winsys, const char *driver) #endif #if defined(GALLIUM_SOFTPIPE) - if (screen == NULL) + if (screen == NULL && strcmp(driver, "softpipe") == 0) screen = softpipe_create_screen(winsys); #endif +#if defined(GALLIUM_SWR) + if (screen == NULL && strcmp(driver, "swr") == 0) + screen = swr_create_screen(winsys); +#endif + return screen; } @@ -61,6 +69,8 @@ sw_screen_create(struct sw_winsys *winsys) default_driver = "llvmpipe"; #elif defined(GALLIUM_SOFTPIPE) default_driver = "softpipe"; +#elif defined(GALLIUM_SWR) + default_driver = "swr"; #else default_driver = ""; #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c index cfe9b92ee1b..e5355f573bb 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.c +++ b/src/gallium/auxiliary/tgsi/tgsi_build.c @@ -1425,3 +1425,18 @@ tgsi_build_full_property( return size; } + +struct tgsi_full_src_register +tgsi_full_src_register_from_dst(const struct tgsi_full_dst_register *dst) +{ + struct tgsi_full_src_register src; + src.Register = tgsi_default_src_register(); + src.Register.File = dst->Register.File; + src.Register.Indirect = dst->Register.Indirect; + src.Register.Dimension = dst->Register.Dimension; + src.Register.Index = dst->Register.Index; + src.Indirect = dst->Indirect; + src.Dimension = dst->Dimension; + src.DimIndirect = dst->DimIndirect; + return src; +} diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.h b/src/gallium/auxiliary/tgsi/tgsi_build.h index c5127e1855c..34d181ab247 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_build.h +++ b/src/gallium/auxiliary/tgsi/tgsi_build.h @@ -30,6 +30,8 @@ struct tgsi_token; +struct tgsi_full_dst_register; +struct tgsi_full_src_register; #if defined __cplusplus @@ -111,6 +113,9 @@ tgsi_build_full_instruction( struct tgsi_instruction_predicate tgsi_default_instruction_predicate(void); +struct tgsi_full_src_register +tgsi_full_src_register_from_dst(const struct tgsi_full_dst_register *dst); + #if defined __cplusplus } #endif diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c index f232f3870d1..c8b91bba534 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_dump.c +++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c @@ -637,6 +637,14 @@ iter_instruction( TXT(", "); ENM(bit, tgsi_memory_names); } + if (inst->Memory.Texture) { + TXT( ", " ); + ENM( inst->Memory.Texture, tgsi_texture_names ); + } + if (inst->Memory.Format) { + TXT( ", " ); + TXT( util_format_name(inst->Memory.Format) ); + } } switch (inst->Instruction.Opcode) { diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c index 70fc4604537..462bd15f01c 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_info.c +++ b/src/gallium/auxiliary/tgsi/tgsi_info.c @@ -142,7 +142,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] = { 0, 0, 0, 0, 0, 1, 0, NONE, "ENDSUB", TGSI_OPCODE_ENDSUB }, { 1, 1, 1, 0, 0, 0, 0, OTHR, "TXQ_LZ", TGSI_OPCODE_TXQ_LZ }, { 1, 1, 1, 0, 0, 0, 0, OTHR, 
"TXQS", TGSI_OPCODE_TXQS }, - { 1, 1, 0, 0, 0, 0, 0, NONE, "RESQ", TGSI_OPCODE_RESQ }, + { 1, 1, 0, 0, 0, 0, 0, OTHR, "RESQ", TGSI_OPCODE_RESQ }, { 0, 0, 0, 0, 0, 0, 0, NONE, "", 106 }, /* removed */ { 0, 0, 0, 0, 0, 0, 0, NONE, "NOP", TGSI_OPCODE_NOP }, { 1, 2, 0, 0, 0, 0, 0, COMP, "FSEQ", TGSI_OPCODE_FSEQ }, diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.h b/src/gallium/auxiliary/tgsi/tgsi_sanity.h index 1ff7874b8ce..b78d1aba714 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_sanity.h +++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.h @@ -28,12 +28,12 @@ #ifndef TGSI_SANITY_H #define TGSI_SANITY_H +#include "pipe/p_compiler.h" + #if defined __cplusplus extern "C" { #endif -#include "pipe/p_compiler.h" - struct tgsi_token; /* Check the given token stream for errors and common mistakes. diff --git a/src/gallium/auxiliary/tgsi/tgsi_strings.c b/src/gallium/auxiliary/tgsi/tgsi_strings.c index b15ae69cf7a..6bd1a2e14d2 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_strings.c +++ b/src/gallium/auxiliary/tgsi/tgsi_strings.c @@ -144,6 +144,7 @@ const char *tgsi_property_names[TGSI_PROPERTY_COUNT] = "TES_POINT_MODE", "NUM_CLIPDIST_ENABLED", "NUM_CULLDIST_ENABLED", + "FS_EARLY_DEPTH_STENCIL", }; const char *tgsi_return_type_names[TGSI_RETURN_TYPE_COUNT] = diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.c b/src/gallium/auxiliary/tgsi/tgsi_text.c index 91baa01ad8b..77598d2cb79 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_text.c +++ b/src/gallium/auxiliary/tgsi/tgsi_text.c @@ -1388,7 +1388,9 @@ static boolean parse_declaration( struct translate_ctx *ctx ) if (str_match_nocase_whole(&cur, "ATOMIC")) { decl.Declaration.Atomic = 1; ctx->cur = cur; - } else if (str_match_nocase_whole(&cur, "SHARED")) { + } + } else if (file == TGSI_FILE_MEMORY) { + if (str_match_nocase_whole(&cur, "SHARED")) { decl.Declaration.Shared = 1; ctx->cur = cur; } diff --git a/src/gallium/auxiliary/tgsi/tgsi_text.h b/src/gallium/auxiliary/tgsi/tgsi_text.h index 6a306e6b674..a34565795a9 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_text.h +++ b/src/gallium/auxiliary/tgsi/tgsi_text.h @@ -28,12 +28,12 @@ #ifndef TGSI_TEXT_H #define TGSI_TEXT_H +#include "pipe/p_compiler.h" + #if defined __cplusplus extern "C" { #endif -#include "pipe/p_compiler.h" - struct tgsi_token; boolean diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.c b/src/gallium/auxiliary/tgsi/tgsi_ureg.c index e1a72786476..ab1d03458ef 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.c +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.c @@ -1242,7 +1242,9 @@ ureg_emit_texture_offset(struct ureg_program *ureg, void ureg_emit_memory(struct ureg_program *ureg, unsigned extended_token, - unsigned qualifier) + unsigned qualifier, + unsigned texture, + unsigned format) { union tgsi_any_token *out, *insn; @@ -1253,6 +1255,8 @@ ureg_emit_memory(struct ureg_program *ureg, out[0].value = 0; out[0].insn_memory.Qualifier = qualifier; + out[0].insn_memory.Texture = texture; + out[0].insn_memory.Format = format; } void @@ -1413,7 +1417,9 @@ ureg_memory_insn(struct ureg_program *ureg, unsigned nr_dst, const struct ureg_src *src, unsigned nr_src, - unsigned qualifier) + unsigned qualifier, + unsigned texture, + unsigned format) { struct ureg_emit_insn_result insn; unsigned i; @@ -1430,7 +1436,7 @@ ureg_memory_insn(struct ureg_program *ureg, nr_dst, nr_src); - ureg_emit_memory(ureg, insn.extended_token, qualifier); + ureg_emit_memory(ureg, insn.extended_token, qualifier, texture, format); for (i = 0; i < nr_dst; i++) ureg_emit_dst(ureg, dst[i]); diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h 
b/src/gallium/auxiliary/tgsi/tgsi_ureg.h index 6a3b5ddf017..04a62a6e160 100644 --- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h +++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h @@ -541,7 +541,9 @@ ureg_memory_insn(struct ureg_program *ureg, unsigned nr_dst, const struct ureg_src *src, unsigned nr_src, - unsigned qualifier); + unsigned qualifier, + unsigned texture, + unsigned format); /*********************************************************************** * Internal instruction helpers, don't call these directly: @@ -582,7 +584,9 @@ ureg_emit_texture_offset(struct ureg_program *ureg, void ureg_emit_memory(struct ureg_program *ureg, unsigned insn_token, - unsigned qualifier); + unsigned qualifier, + unsigned texture, + unsigned format); void ureg_emit_dst( struct ureg_program *ureg, diff --git a/src/gallium/auxiliary/util/u_debug.h b/src/gallium/auxiliary/util/u_debug.h index c2707b402cb..85d0cb64e6c 100644 --- a/src/gallium/auxiliary/util/u_debug.h +++ b/src/gallium/auxiliary/util/u_debug.h @@ -39,6 +39,11 @@ #define U_DEBUG_H_ +#if defined(PIPE_OS_HAIKU) +/* Haiku provides debug_printf in libroot with OS.h */ +#include <OS.h> +#endif + #include "os/os_misc.h" #include "pipe/p_format.h" @@ -94,9 +99,6 @@ debug_printf(const char *format, ...) (void) format; /* silence warning */ #endif } -#else /* is Haiku */ -/* Haiku provides debug_printf in libroot with OS.h */ -#include <OS.h> #endif diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.c b/src/gallium/auxiliary/util/u_debug_refcnt.c index 2c3dc986a90..0a4786442fc 100644 --- a/src/gallium/auxiliary/util/u_debug_refcnt.c +++ b/src/gallium/auxiliary/util/u_debug_refcnt.c @@ -26,9 +26,14 @@ #if defined(DEBUG) -/* see http://www.mozilla.org/performance/refcnt-balancer.html for what do with the output - * on Linux, use tools/addr2line.sh to postprocess it before anything else - **/ +/** + * If the GALLIUM_REFCNT_LOG env var is defined as a filename, gallium + * reference counting will be logged to the file. + * + * See http://www-archive.mozilla.org/performance/refcnt-balancer.html + * for what to do with the output. On Linux, use tools/addr2line.sh to + * postprocess it before anything else. + */ #include <stdio.h> @@ -42,30 +47,41 @@ int debug_refcnt_state; -FILE* stream; +static FILE *stream; -/* TODO: maybe move this serial machinery to a stand-alone module and expose it? */ +/* TODO: maybe move this serial machinery to a stand-alone module and + * expose it? + */ pipe_static_mutex(serials_mutex); -static struct util_hash_table* serials_hash; +static struct util_hash_table *serials_hash; static unsigned serials_last; -static unsigned hash_ptr(void* p) + +static unsigned +hash_ptr(void *p) { - return (unsigned)(uintptr_t)p; + return (unsigned) (uintptr_t) p; } -static int compare_ptr(void* a, void* b) + +static int +compare_ptr(void *a, void *b) { - if(a == b) + if (a == b) return 0; - else if(a < b) + else if (a < b) return -1; else return 1; } -static boolean debug_serial(void* p, unsigned* pserial) + +/** + * Return a small integer serial number for the given pointer. + */ +static boolean +debug_serial(void *p, unsigned *pserial) { unsigned serial; boolean found = TRUE; @@ -81,79 +97,99 @@ static boolean debug_serial(void* p, unsigned* pserial) pipe_mutex_lock(serials_mutex); if (!serials_hash) serials_hash = util_hash_table_create(hash_ptr, compare_ptr); - serial = (unsigned)(uintptr_t)util_hash_table_get(serials_hash, p); - if(!serial) - { - /* time to stop logging...
(you'll have a 100 GB logfile at least at this point) - * TODO: avoid this + + serial = (unsigned) (uintptr_t) util_hash_table_get(serials_hash, p); + if (!serial) { + /* time to stop logging... (you'll have a 100 GB logfile at least at + * this point) TODO: avoid this */ serial = ++serials_last; - if(!serial) - { + if (!serial) { debug_error("More than 2^32 objects detected, aborting.\n"); os_abort(); } - util_hash_table_set(serials_hash, p, (void*)(uintptr_t)serial); + util_hash_table_set(serials_hash, p, (void *) (uintptr_t) serial); found = FALSE; } pipe_mutex_unlock(serials_mutex); + *pserial = serial; + return found; } -static void debug_serial_delete(void* p) + +/** + * Free the serial number for the given pointer. + */ +static void +debug_serial_delete(void *p) { pipe_mutex_lock(serials_mutex); util_hash_table_remove(serials_hash, p); pipe_mutex_unlock(serials_mutex); } + #define STACK_LEN 64 -static void dump_stack(const char* symbols[STACK_LEN]) +static void +dump_stack(const char *symbols[STACK_LEN]) { unsigned i; - for(i = 0; i < STACK_LEN; ++i) - { - if(symbols[i]) + for (i = 0; i < STACK_LEN; ++i) { + if (symbols[i]) fprintf(stream, "%s\n", symbols[i]); } fprintf(stream, "\n"); } -void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) + +/** + * Log a reference count change to the log file (if enabled). + * This is called via the pipe_reference() and debug_reference() functions, + * basically whenever a reference count is initialized or changed. + * + * \param p the refcount being changed (the value is not changed here) + * \param get_desc a function which will be called to print an object's + * name/pointer into a string buffer during logging + * \param change the reference count change which must be +/-1 or 0 when + * creating the object and initializing the refcount. 
+ */ +void +debug_reference_slowpath(const struct pipe_reference *p, + debug_reference_descriptor get_desc, int change) { - if(debug_refcnt_state < 0) + assert(change >= -1); + assert(change <= 1); + + if (debug_refcnt_state < 0) return; - if(!debug_refcnt_state) - { - const char* filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); - if(filename && filename[0]) + if (!debug_refcnt_state) { + const char *filename = debug_get_option("GALLIUM_REFCNT_LOG", NULL); + if (filename && filename[0]) stream = fopen(filename, "wt"); - if(stream) + if (stream) debug_refcnt_state = 1; else debug_refcnt_state = -1; } - if(debug_refcnt_state > 0) - { + if (debug_refcnt_state > 0) { struct debug_stack_frame frames[STACK_LEN]; - const char* symbols[STACK_LEN]; + const char *symbols[STACK_LEN]; char buf[1024]; - unsigned i; unsigned refcnt = p->count; unsigned serial; - boolean existing = debug_serial((void*)p, &serial); + boolean existing = debug_serial((void *) p, &serial); debug_backtrace_capture(frames, 1, STACK_LEN); - for(i = 0; i < STACK_LEN; ++i) - { - if(frames[i].function) + for (i = 0; i < STACK_LEN; ++i) { + if (frames[i].function) symbols[i] = debug_symbol_name_cached(frames[i].function); else symbols[i] = 0; @@ -161,30 +197,28 @@ void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_de get_desc(buf, p); - if(!existing) - { + if (!existing) { fprintf(stream, "<%s> %p %u Create\n", buf, (void *) p, serial); dump_stack(symbols); - /* this is there to provide a gradual change even if we don't see the initialization */ - for(i = 1; i <= refcnt - change; ++i) - { + /* this is here to provide a gradual change even if we don't see + * the initialization + */ + for (i = 1; i <= refcnt - change; ++i) { fprintf(stream, "<%s> %p %u AddRef %u\n", buf, (void *) p, serial, i); dump_stack(symbols); } } - if(change) - { + if (change) { fprintf(stream, "<%s> %p %u %s %u\n", buf, (void *) p, serial, change > 0 ? 
"AddRef" : "Release", refcnt); dump_stack(symbols); } - if(!refcnt) - { - debug_serial_delete((void*)p); + if (!refcnt) { + debug_serial_delete((void *) p); fprintf(stream, "<%s> %p %u Destroy\n", buf, (void *) p, serial); dump_stack(symbols); } @@ -192,4 +226,5 @@ void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_de fflush(stream); } } -#endif + +#endif /* DEBUG */ diff --git a/src/gallium/auxiliary/util/u_debug_refcnt.h b/src/gallium/auxiliary/util/u_debug_refcnt.h index 1f9218fec9a..cf047776661 100644 --- a/src/gallium/auxiliary/util/u_debug_refcnt.h +++ b/src/gallium/auxiliary/util/u_debug_refcnt.h @@ -40,9 +40,13 @@ typedef void (*debug_reference_descriptor)(char*, const struct pipe_reference*); extern int debug_refcnt_state; -void debug_reference_slowpath(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change); +void +debug_reference_slowpath(const struct pipe_reference* p, + debug_reference_descriptor get_desc, int change); -static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +static inline void +debug_reference(const struct pipe_reference* p, + debug_reference_descriptor get_desc, int change) { if (debug_refcnt_state >= 0) debug_reference_slowpath(p, get_desc, change); @@ -50,7 +54,9 @@ static inline void debug_reference(const struct pipe_reference* p, debug_referen #else -static inline void debug_reference(const struct pipe_reference* p, debug_reference_descriptor get_desc, int change) +static inline void +debug_reference(const struct pipe_reference* p, + debug_reference_descriptor get_desc, int change) { } diff --git a/src/gallium/auxiliary/util/u_dl.h b/src/gallium/auxiliary/util/u_dl.h index 80a00ed6796..d2f4737d42a 100644 --- a/src/gallium/auxiliary/util/u_dl.h +++ b/src/gallium/auxiliary/util/u_dl.h @@ -32,6 +32,9 @@ #include "pipe/p_config.h" +#ifdef __cplusplus +extern "C" { +#endif #if defined(PIPE_OS_WINDOWS) # define UTIL_DL_EXT ".dll" @@ -79,5 +82,8 @@ util_dl_close(struct util_dl_library *library); const char * util_dl_error(void); +#ifdef __cplusplus +} +#endif #endif /* U_DL_H_ */ diff --git a/src/gallium/auxiliary/util/u_draw_quad.h b/src/gallium/auxiliary/util/u_draw_quad.h index b298ef2ae59..6553d5d7b6b 100644 --- a/src/gallium/auxiliary/util/u_draw_quad.h +++ b/src/gallium/auxiliary/util/u_draw_quad.h @@ -32,6 +32,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_context.h" +#include "util/u_draw.h" #ifdef __cplusplus extern "C" { @@ -40,8 +41,6 @@ extern "C" { struct pipe_resource; struct cso_context; -#include "util/u_draw.h" - extern void util_draw_vertex_buffer(struct pipe_context *pipe, struct cso_context *cso, struct pipe_resource *vbuf, uint vbuf_slot, diff --git a/src/gallium/auxiliary/util/u_helpers.h b/src/gallium/auxiliary/util/u_helpers.h index f25f2807fe5..a9a53e4347a 100644 --- a/src/gallium/auxiliary/util/u_helpers.h +++ b/src/gallium/auxiliary/util/u_helpers.h @@ -28,12 +28,12 @@ #ifndef U_HELPERS_H #define U_HELPERS_H +#include "pipe/p_state.h" + #ifdef __cplusplus extern "C" { #endif -#include "pipe/p_state.h" - void util_set_vertex_buffers_mask(struct pipe_vertex_buffer *dst, uint32_t *enabled_buffers, const struct pipe_vertex_buffer *src, diff --git a/src/gallium/auxiliary/util/u_inlines.h b/src/gallium/auxiliary/util/u_inlines.h index d0812039292..0e80cef0b08 100644 --- a/src/gallium/auxiliary/util/u_inlines.h +++ b/src/gallium/auxiliary/util/u_inlines.h @@ -622,6 +622,16 @@ util_copy_constant_buffer(struct 
pipe_constant_buffer *dst, } } +static inline void +util_copy_image_view(struct pipe_image_view *dst, + const struct pipe_image_view *src) +{ + pipe_resource_reference(&dst->resource, src->resource); + dst->format = src->format; + dst->access = src->access; + dst->u = src->u; +} + static inline unsigned util_max_layer(const struct pipe_resource *r, unsigned level) { diff --git a/src/gallium/auxiliary/util/u_transfer.c b/src/gallium/auxiliary/util/u_transfer.c index adae84bbfab..0610535cd2c 100644 --- a/src/gallium/auxiliary/util/u_transfer.c +++ b/src/gallium/auxiliary/util/u_transfer.c @@ -98,7 +98,8 @@ u_resource( struct pipe_resource *res ) boolean u_resource_get_handle_vtbl(struct pipe_screen *screen, struct pipe_resource *resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct u_resource *ur = u_resource(resource); return ur->vtbl->resource_get_handle(screen, resource, handle); diff --git a/src/gallium/auxiliary/util/u_transfer.h b/src/gallium/auxiliary/util/u_transfer.h index 6c25ee0f024..660dc161d33 100644 --- a/src/gallium/auxiliary/util/u_transfer.h +++ b/src/gallium/auxiliary/util/u_transfer.h @@ -78,7 +78,8 @@ struct u_resource { boolean u_resource_get_handle_vtbl(struct pipe_screen *screen, struct pipe_resource *resource, - struct winsys_handle *handle); + struct winsys_handle *handle, + unsigned usage); void u_resource_destroy_vtbl(struct pipe_screen *screen, struct pipe_resource *resource); diff --git a/src/gallium/auxiliary/util/u_video.h b/src/gallium/auxiliary/util/u_video.h index ddc00216105..9196afc11be 100644 --- a/src/gallium/auxiliary/util/u_video.h +++ b/src/gallium/auxiliary/util/u_video.h @@ -28,10 +28,6 @@ #ifndef U_VIDEO_H #define U_VIDEO_H -#ifdef __cplusplus -extern "C" { -#endif - #include "pipe/p_defines.h" #include "pipe/p_video_enums.h" @@ -40,6 +36,10 @@ extern "C" { #include "util/u_debug.h" #include "util/u_math.h" +#ifdef __cplusplus +extern "C" { +#endif + static inline enum pipe_video_format u_reduce_video_profile(enum pipe_video_profile profile) { diff --git a/src/gallium/auxiliary/vl/vl_winsys_dri.c b/src/gallium/auxiliary/vl/vl_winsys_dri.c index 758f50d7c23..01365260312 100644 --- a/src/gallium/auxiliary/vl/vl_winsys_dri.c +++ b/src/gallium/auxiliary/vl/vl_winsys_dri.c @@ -248,7 +248,8 @@ vl_dri2_screen_texture_from_drawable(struct vl_screen *vscreen, void *drawable) template.flags = 0; tex = scrn->base.pscreen->resource_from_handle(scrn->base.pscreen, &template, - &dri2_handle); + &dri2_handle, + PIPE_HANDLE_USAGE_READ_WRITE); free(reply); return tex; diff --git a/src/gallium/docs/source/drivers/openswr.rst b/src/gallium/docs/source/drivers/openswr.rst new file mode 100644 index 00000000000..84aa51f5d80 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr.rst @@ -0,0 +1,21 @@ +OpenSWR +======= + +The Gallium OpenSWR driver is a high performance, highly scalable +software renderer targeted towards visualization workloads. For such +geometry heavy workloads there is a considerable speedup over llvmpipe, +which is to be expected as the geometry frontend of llvmpipe is single +threaded. + +This rasterizer is x86 specific and requires AVX or AVX2. The driver +fits into the gallium framework, and reuses gallivm for doing the TGSI +to vectorized llvm-IR conversion of the shader kernels. + +.. 
toctree:: :glob: openswr/usage openswr/faq openswr/profiling openswr/knobs diff --git a/src/gallium/docs/source/drivers/openswr/faq.rst b/src/gallium/docs/source/drivers/openswr/faq.rst new file mode 100644 index 00000000000..596d77f3780 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/faq.rst @@ -0,0 +1,141 @@ +FAQ +=== + +Why another software rasterizer? +-------------------------------- + +Good question, given there are already three (swrast, softpipe, +llvmpipe) in the Mesa3D tree. Two important reasons for this: + + * Architecture - given our focus on scientific visualization, our + workloads are much different than the typical game; we have heavy + vertex load and relatively simple shaders. In addition, the core + counts of machines we run on are much higher. These parameters led + to design decisions much different than llvmpipe's. + + * Historical - Intel had developed a high performance software + graphics stack for internal purposes. Later we adapted this + graphics stack for use in visualization and decided to move forward + with Mesa3D to provide a high quality API layer while at the same + time benefiting from the excellent performance the software + rasterizer gives us. + +What's the architecture? +------------------------ + +SWR is a tile-based immediate-mode renderer with a sort-free threading +model which is arranged as a ring of queues. Each entry in the ring +represents a draw context that contains all of the draw state and work +queues. An API thread sets up each draw context, and worker threads +will execute both the frontend (vertex/geometry processing) and +backend (fragment) work as required. The ring allows for backend +threads to pull work in order. Large draws are split into chunks to +allow vertex processing to happen in parallel, with the backend work +pickup preserving draw ordering. + +Our pipeline uses just-in-time compiled code for the fetch shader that +does vertex attribute gathering and AOS to SOA conversions, the vertex +shader and fragment shaders, streamout, and fragment blending. The SWR +core also supports geometry and compute shaders, but we haven't exposed +them through our driver yet. The fetch shader, streamout, and blend are +built internally to the swr core using LLVM directly, while for the vertex +and pixel shaders we reuse bits of llvmpipe from +``gallium/auxiliary/gallivm`` to build the kernels, which we wrap +differently than llvmpipe's ``auxiliary/draw`` code. + +What's the performance? +----------------------- + +For the types of high-geometry workloads we're interested in, we are +significantly faster than llvmpipe. This is to be expected, as +llvmpipe only threads the fragment processing and not the geometry +frontend. The performance advantage over llvmpipe roughly scales +linearly with the number of cores available. + +While our current performance is quite good, we know there is more +potential in this architecture. When we switched from a prototype +OpenGL driver to Mesa we regressed performance severely, some due to +interface issues that need tuning, some due to differences in shader code +generation, and some due to conformance and feature additions to the +core swr. We are looking to recover most of this performance. + +What's the conformance? +----------------------- + +The major applications we are targeting are all based on the +Visualization Toolkit (VTK), and as such our development efforts have +been focused on making sure these work as well as possible.
Our +current code passes VTK's rendering tests with their new "OpenGL2" +(really OpenGL 3.2) backend at 99%. + +piglit testing shows a much lower pass rate, roughly 80% at the time +of writing. Core SWR undergoes rigorous unit testing, and we are quite +confident in the rasterizer and understand the areas where it +currently has issues (example: line rendering is done with triangles, +so it doesn't match the strict line rendering rules). The majority of +the piglit failures are errors in our driver layer interfacing Mesa +and SWR. Fixing these issues is one of our major future development +goals. + +Why are you open sourcing this? +------------------------------- + + * Our customers prefer open source, and allowing them to simply + download the Mesa source and enable our driver makes life much + easier for them. + + * The internal gallium APIs are not stable, so we'd like our driver + to be visible for changes. + + * It's easier to work with the Mesa community when the source we're + working with can be used as a reference. + +What are your development plans? +-------------------------------- + + * Performance - see the performance section earlier for details. + + * Conformance - see the conformance section earlier for details. + + * Features - core SWR has a lot of functionality we have yet to + expose through our driver, such as MSAA, geometry shaders, compute + shaders, and tessellation. + + * AVX512 support + +What is the licensing of the code? +---------------------------------- + + * All code is under the normal Mesa MIT license. + +Will this work on AMD? +---------------------- + + * If using an AMD processor with AVX or AVX2, it should work, though + we don't have that hardware around to test. Patches, if needed, + would be welcome. + +Will this work on ARM, MIPS, POWER, <other non-x86 architecture>? +------------------------------------------------------------------------- + + * Not without a lot of work. We make extensive use of AVX and AVX2 + intrinsics in our code and in the in-tree JIT creation. It is not the + intention for this codebase to support non-x86 architectures. + +What hardware do I need? +------------------------ + + * Any x86 processor with at least AVX (introduced in the Intel + SandyBridge and AMD Bulldozer microarchitectures in 2011) will + work. + + * You don't need a fire-breathing Xeon machine to work on SWR - we do + day-to-day development with laptops and desktop CPUs. + +Does one build work on both AVX and AVX2? +----------------------------------------- + +Yes. The build system creates two shared libraries, ``libswrAVX.so`` and +``libswrAVX2.so``, and ``swr_create_screen()`` loads the appropriate one at +runtime. + diff --git a/src/gallium/docs/source/drivers/openswr/knobs.rst b/src/gallium/docs/source/drivers/openswr/knobs.rst new file mode 100644 index 00000000000..06f228a2e92 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/knobs.rst @@ -0,0 +1,114 @@ +Knobs +===== + +OpenSWR has a number of environment variables which control its +operation, in addition to the normal Mesa and gallium controls. + +.. envvar:: KNOB_ENABLE_ASSERT_DIALOGS <bool> (true) + +Use dialogs when asserts fire. Asserts are only enabled in debug builds. + +.. envvar:: KNOB_SINGLE_THREADED <bool> (false) + +If enabled, all rendering is performed on the API thread. This is useful mainly for debugging purposes. + +.. envvar:: KNOB_DUMP_SHADER_IR <bool> (false) + +Dumps shader LLVM IR at various stages of JIT compilation. + +..
envvar:: KNOB_USE_GENERIC_STORETILE <bool> (false) + +Always use the generic function for performing StoreTile. Will be slightly slower than the optimized (jitted) path. + +.. envvar:: KNOB_FAST_CLEAR <bool> (true) + +Replace 3D primitive execute with a SWRClearRT operation and defer clear execution to the first backend op on the hottile, or the hottile store. + +.. envvar:: KNOB_MAX_NUMA_NODES <uint32_t> (0) + +Maximum # of NUMA-nodes per system used for worker threads. 0 == ALL NUMA-nodes in the system; N == use at most N NUMA-nodes for rendering. + +.. envvar:: KNOB_MAX_CORES_PER_NUMA_NODE <uint32_t> (0) + +Maximum # of cores per NUMA-node used for worker threads. 0 == ALL non-API thread cores per NUMA-node; N == use at most N cores per NUMA-node. + +.. envvar:: KNOB_MAX_THREADS_PER_CORE <uint32_t> (1) + +Maximum # of (hyper)threads per physical core used for worker threads. 0 == ALL hyper-threads per core; N == use at most N hyper-threads per physical core. + +.. envvar:: KNOB_MAX_WORKER_THREADS <uint32_t> (0) + +Maximum worker threads to spawn. IMPORTANT: If this is non-zero, no worker threads will be bound to specific HW threads. They will all be "floating" SW threads. In this case, the above 3 KNOBS will be ignored. + +.. envvar:: KNOB_BUCKETS_START_FRAME <uint32_t> (1200) + +Frame at which to start saving buckets data. NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h for this to have an effect. + +.. envvar:: KNOB_BUCKETS_END_FRAME <uint32_t> (1400) + +Frame at which to stop saving buckets data. NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h for this to have an effect. + +.. envvar:: KNOB_WORKER_SPIN_LOOP_COUNT <uint32_t> (5000) + +Number of spin-loop iterations worker threads will perform before going to sleep when waiting for work. + +.. envvar:: KNOB_MAX_DRAWS_IN_FLIGHT <uint32_t> (160) + +Maximum number of draws outstanding before the API thread blocks. + +.. envvar:: KNOB_MAX_PRIMS_PER_DRAW <uint32_t> (2040) + +Maximum primitives in a single Draw(). Larger draws are split into smaller Draw calls. Should be a multiple of (3 * vectorWidth). + +.. envvar:: KNOB_MAX_TESS_PRIMS_PER_DRAW <uint32_t> (16) + +Maximum primitives in a single Draw() with tessellation enabled. Larger draws are split into smaller Draw calls. Should be a multiple of (vectorWidth). + +.. envvar:: KNOB_MAX_FRAC_ODD_TESS_FACTOR <float> (63.0f) + +(DEBUG) Maximum tessellation factor for fractional-odd partitioning. + +.. envvar:: KNOB_MAX_FRAC_EVEN_TESS_FACTOR <float> (64.0f) + +(DEBUG) Maximum tessellation factor for fractional-even partitioning. + +.. envvar:: KNOB_MAX_INTEGER_TESS_FACTOR <uint32_t> (64) + +(DEBUG) Maximum tessellation factor for integer partitioning. + +.. envvar:: KNOB_BUCKETS_ENABLE_THREADVIZ <bool> (false) + +Enable threadviz output. + +.. envvar:: KNOB_TOSS_DRAW <bool> (false) + +Disable per-draw/dispatch execution. + +.. envvar:: KNOB_TOSS_QUEUE_FE <bool> (false) + +Stop per-draw execution at the worker frontend (FE). NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_FETCH <bool> (false) + +Stop per-draw execution at vertex fetch. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_IA <bool> (false) + +Stop per-draw execution at the input assembler. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_VS <bool> (false) + +Stop per-draw execution at the vertex shader. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +..
envvar:: KNOB_TOSS_SETUP_TRIS <bool> (false) + +Stop per-draw execution at primitive setup. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_BIN_TRIS <bool> (false) + +Stop per-draw execution at primitive binning. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + +.. envvar:: KNOB_TOSS_RS <bool> (false) + +Stop per-draw execution at the rasterizer. NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h + diff --git a/src/gallium/docs/source/drivers/openswr/profiling.rst b/src/gallium/docs/source/drivers/openswr/profiling.rst new file mode 100644 index 00000000000..357754c3506 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/profiling.rst @@ -0,0 +1,67 @@ +Profiling +========= + +OpenSWR contains built-in profiling which can be enabled +at build time to provide insight into performance tuning. + +To enable this, uncomment the following line in ``rasterizer/core/knobs.h`` and rebuild: :: + + //#define KNOB_ENABLE_RDTSC + +Running an application will result in a ``rdtsc.txt`` file being +created in the current working directory. This file contains profile +information captured between ``KNOB_BUCKETS_START_FRAME`` and +``KNOB_BUCKETS_END_FRAME`` (see the knobs section). + +The resulting file will contain sections for each thread with a +hierarchical breakdown of the time spent in the various operations. +For example: :: + + Thread 0 (API) + %Tot %Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket + 0.00 0.00 28370 2837 10 0 0 APIClearRenderTarget + 0.00 41.23 11698 1169 10 0 0 |-> APIDrawWakeAllThreads + 0.00 18.34 5202 520 10 0 0 |-> APIGetDrawContext + 98.72 98.72 12413773688 29957 414380 0 0 APIDraw + 0.36 0.36 44689364 107 414380 0 0 |-> APIDrawWakeAllThreads + 96.36 97.62 12117951562 9747 1243140 0 0 |-> APIGetDrawContext + 0.00 0.00 19904 995 20 0 0 APIStoreTiles + 0.00 7.88 1568 78 20 0 0 |-> APIDrawWakeAllThreads + 0.00 25.28 5032 251 20 0 0 |-> APIGetDrawContext + 1.28 1.28 161344902 64 2486370 0 0 APIGetDrawContext + 0.00 0.00 50368 2518 20 0 0 APISync + 0.00 2.70 1360 68 20 0 0 |-> APIDrawWakeAllThreads + 0.00 65.27 32876 1643 20 0 0 |-> APIGetDrawContext + + + Thread 1 (WORKER) + %Tot %Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket + 83.92 83.92 13198987522 96411 136902 0 0 FEProcessDraw + 24.91 29.69 3918184840 167 23410158 0 0 |-> FEFetchShader + 11.17 13.31 1756972646 75 23410158 0 0 |-> FEVertexShader + 8.89 10.59 1397902996 59 23410161 0 0 |-> FEPAAssemble + 19.06 22.71 2997794710 384 7803387 0 0 |-> FEClipTriangles + 11.67 61.21 1834958176 235 7803387 0 0 |-> FEBinTriangles + 0.00 0.00 0 0 187258 0 0 |-> FECullZeroAreaAndBackface + 0.00 0.00 0 0 60051033 0 0 |-> FECullBetweenCenters + 0.11 0.11 17217556 2869592 6 0 0 FEProcessStoreTiles + 15.97 15.97 2511392576 73665 34092 0 0 WorkerWorkOnFifoBE + 14.04 87.95 2208687340 9187 240408 0 0 |-> WorkerFoundWork + 0.06 0.43 9390536 13263 708 0 0 |-> BELoadTiles + 0.00 0.01 293020 182 1609 0 0 |-> BEClear + 12.63 89.94 1986508990 949 2093014 0 0 |-> BERasterizeTriangle + 2.37 18.75 372374596 177 2093014 0 0 |-> BETriangleSetup + 0.42 3.35 66539016 31 2093014 0 0 |-> BEStepSetup + 0.00 0.00 0 0 21766 0 0 |-> BETrivialReject + 1.05 8.33 165410662 79 2071248 0 0 |-> BERasterizePartial + 6.06 48.02 953847796 1260 756783 0 0 |-> BEPixelBackend + 0.20 3.30 31521202 41 756783 0 0 |-> BESetup + 0.16 2.69 25624304 33 756783 0 0 |-> BEBarycentric + 0.18 2.92 27884986 36 756783 0 0 |-> BEEarlyDepthTest + 0.19 3.20 30564174 41 744058 0 0 |-> BEPixelShader + 0.26 4.30
41058646 55 744058 0 0 |-> BEOutputMerger + 1.27 20.94 199750822 32 6054264 0 0 |-> BEEndTile + 0.33 2.34 51758160 23687 2185 0 0 |-> BEStoreTiles + 0.20 60.22 31169500 28807 1082 0 0 |-> B8G8R8A8_UNORM + 0.00 0.00 302752 302752 1 0 0 WorkerWaitForThreadEvent + diff --git a/src/gallium/docs/source/drivers/openswr/usage.rst b/src/gallium/docs/source/drivers/openswr/usage.rst new file mode 100644 index 00000000000..e55b4211a54 --- /dev/null +++ b/src/gallium/docs/source/drivers/openswr/usage.rst @@ -0,0 +1,38 @@ +Usage +===== + +Requirements +^^^^^^^^^^^^ + +* An x86 processor with AVX or AVX2 +* LLVM version 3.6 or later + +Building +^^^^^^^^ + +To build with GNU automake, select building the swr driver at +configure time, for example: :: + + configure --with-gallium-drivers=swrast,swr + +Using +^^^^^ + +On Linux, building will create a drop-in alternative for libGL.so into:: + + lib/gallium/libGL.so + +or:: + + build/foo/gallium/targets/libgl-xlib/libGL.so + +To use it, set the LD_LIBRARY_PATH environment variable accordingly. + +**IMPORTANT:** Mesa defaults to llvmpipe or softpipe as its software renderer. To select the OpenSWR driver, set the GALLIUM_DRIVER environment variable appropriately: :: + + GALLIUM_DRIVER=swr + +To verify OpenSWR is being used, check whether a message like the following is printed when the application is started: :: + + SWR detected AVX2 + diff --git a/src/gallium/docs/source/screen.rst b/src/gallium/docs/source/screen.rst index b5d691f4f7e..46ec3815412 100644 --- a/src/gallium/docs/source/screen.rst +++ b/src/gallium/docs/source/screen.rst @@ -319,6 +319,10 @@ The integer capabilities: adjusted appropriately. * ``PIPE_CAP_QUERY_BUFFER_OBJECT``: Driver supports context::get_query_result_resource callback. +* ``PIPE_CAP_PCI_GROUP``: Return the PCI segment group number. +* ``PIPE_CAP_PCI_BUS``: Return the PCI bus number. +* ``PIPE_CAP_PCI_DEVICE``: Return the PCI device number. +* ``PIPE_CAP_PCI_FUNCTION``: Return the PCI function number. .. _pipe_capf: diff --git a/src/gallium/docs/source/tgsi.rst b/src/gallium/docs/source/tgsi.rst index 489cbb0bc2f..af2df2251da 100644 --- a/src/gallium/docs/source/tgsi.rst +++ b/src/gallium/docs/source/tgsi.rst @@ -3206,6 +3206,12 @@ NUM_CULLDIST_ENABLED How many cull distance scalar outputs are enabled. +FS_EARLY_DEPTH_STENCIL +"""""""""""""""""""""" + +Whether depth test, stencil test, and occlusion query should run before +the fragment shader (regardless of fragment shader side effects). Corresponds +to GLSL early_fragment_tests.
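As a rough illustration (not part of this patch), a state tracker that sees GLSL's ``layout(early_fragment_tests) in;`` declaration could set this property through the existing tgsi_ureg interface. Only ``ureg_property()`` and the property token come from Mesa; the wrapper function below is a hypothetical sketch: ::

    /* Hedged sketch: mark a fragment shader for early depth/stencil.
     * emit_fs_properties() is a hypothetical helper; ureg_property()
     * is the existing tgsi_ureg entry point. */
    #include "tgsi/tgsi_ureg.h"

    static void
    emit_fs_properties(struct ureg_program *ureg, boolean early_tests)
    {
       if (early_tests)
          ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1);
    }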
Texture Sampling and Texture Formats ------------------------------------ diff --git a/src/gallium/drivers/ddebug/dd_screen.c b/src/gallium/drivers/ddebug/dd_screen.c index 3706b2d63f5..fbc0bec73dd 100644 --- a/src/gallium/drivers/ddebug/dd_screen.c +++ b/src/gallium/drivers/ddebug/dd_screen.c @@ -179,11 +179,12 @@ dd_screen_resource_create(struct pipe_screen *_screen, static struct pipe_resource * dd_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct pipe_screen *screen = dd_screen(_screen)->screen; struct pipe_resource *res = - screen->resource_from_handle(screen, templ, handle); + screen->resource_from_handle(screen, templ, handle, usage); if (!res) return NULL; @@ -218,11 +219,12 @@ dd_screen_resource_destroy(struct pipe_screen *_screen, static boolean dd_screen_resource_get_handle(struct pipe_screen *_screen, struct pipe_resource *resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct pipe_screen *screen = dd_screen(_screen)->screen; - return screen->resource_get_handle(screen, resource, handle); + return screen->resource_get_handle(screen, resource, handle, usage); } diff --git a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h index 71ee55054d3..606252e6726 100644 --- a/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h +++ b/src/gallium/drivers/freedreno/a2xx/a2xx.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2015 by the following authors: diff --git a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h index c6286a1f290..3838fdf44d0 100644 --- a/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h +++ b/src/gallium/drivers/freedreno/a3xx/a3xx.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - 
/home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c index b8a31d84b3f..f48d464c294 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_draw.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_draw.c @@ -167,6 +167,7 @@ fd3_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info) emit.key.binning_pass = false; emit.dirty = dirty; emit.vp = NULL; /* we changed key so need to refetch vp */ + emit.fp = NULL; draw_impl(ctx, ctx->ring, &emit); } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c index 8c37992e17d..adfa9a96a46 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.c @@ -353,7 +353,7 @@ fd3_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd3_emit *emit) int32_t i, j, last = -1; uint32_t total_in = 0; const struct fd_vertex_state *vtx = emit->vtx; - struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); + const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); unsigned vertex_regid = regid(63, 0); unsigned instance_regid = regid(63, 0); unsigned vtxcnt_regid = regid(63, 0); @@ -478,8 +478,8 @@ void fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd3_emit *emit) { - struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); - struct ir3_shader_variant *fp = fd3_emit_get_fp(emit); + const struct ir3_shader_variant *vp = fd3_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd3_emit_get_fp(emit); uint32_t dirty = emit->dirty; emit_marker(ring, 5); @@ -656,9 +656,9 @@ fd3_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, OUT_RING(ring, HLSQ_FLUSH); if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ - ir3_emit_consts(vp, ring, emit->info, dirty); + ir3_emit_consts(vp, ring, ctx, emit->info, dirty); if (!emit->key.binning_pass) - ir3_emit_consts(fp, ring, emit->info, dirty); + ir3_emit_consts(fp, ring, ctx, emit->info, dirty); /* mark clean after emitting consts: */ ctx->prog.dirty = 0; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h index 42483f6c39b..5dbb11599b5 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_emit.h +++ b/src/gallium/drivers/freedreno/a3xx/fd3_emit.h @@ -58,10 +58,10 @@ struct fd3_emit { bool rasterflat; /* cached to avoid repeated lookups of same variants: */ - struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vp, *fp; }; -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd3_emit_get_vp(struct fd3_emit *emit) { if (!emit->vp) { @@ -71,12 +71,18 @@ fd3_emit_get_vp(struct fd3_emit *emit) return emit->vp; } -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd3_emit_get_fp(struct fd3_emit *emit) { if (!emit->fp) { - struct fd3_shader_stateobj *so = emit->prog->fp; - emit->fp = ir3_shader_variant(so->shader, emit->key); + if (emit->key.binning_pass) { + /* use dummy stateobj to simplify binning vs 
non-binning: */ + static const struct ir3_shader_variant binning_fp = {}; + emit->fp = &binning_fp; + } else { + struct fd3_shader_stateobj *so = emit->prog->fp; + emit->fp = ir3_shader_variant(so->shader, emit->key); + } } return emit->fp; } diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c index 2ce393a41ae..815a310df83 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_gmem.c @@ -931,9 +931,6 @@ fd3_emit_tile_init(struct fd_context *ctx) update_vsc_pipe(ctx); if (use_hw_binning(ctx)) { - /* mark the end of the binning cmds: */ - fd_ringmarker_mark(ctx->binning_end); - /* emit hw binning pass: */ emit_binning_pass(ctx); @@ -1017,8 +1014,8 @@ fd3_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) OUT_PKT3(ring, CP_SET_BIN_DATA, 2); - OUT_RELOC(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ - OUT_RELOC(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOCW(ring, fd3_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ (tile->p * 4), 0, 0); } else { OUT_PKT0(ring, REG_A3XX_PC_VSTREAM_CONTROL, 1); diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_program.c b/src/gallium/drivers/freedreno/a3xx/fd3_program.c index a64ecf16eab..57e269cc21f 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_program.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_program.c @@ -51,7 +51,8 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state enum shader_t type) { struct fd3_shader_stateobj *so = CALLOC_STRUCT(fd3_shader_stateobj); - so->shader = ir3_shader_create(pctx, cso, type); + struct ir3_compiler *compiler = fd_context(pctx)->screen->compiler; + so->shader = ir3_shader_create(compiler, cso, type); return so; } @@ -139,14 +140,7 @@ fd3_program_emit(struct fd_ringbuffer *ring, struct fd3_emit *emit, debug_assert(nr <= ARRAY_SIZE(color_regid)); vp = fd3_emit_get_vp(emit); - - if (emit->key.binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - fp = &binning_fp; - } else { - fp = fd3_emit_get_fp(emit); - } + fp = fd3_emit_get_fp(emit); vsi = &vp->info; fsi = &fp->info; diff --git a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c index 722fe360202..4aea2fe0f37 100644 --- a/src/gallium/drivers/freedreno/a3xx/fd3_screen.c +++ b/src/gallium/drivers/freedreno/a3xx/fd3_screen.c @@ -106,7 +106,7 @@ fd3_screen_init(struct pipe_screen *pscreen) { struct fd_screen *screen = fd_screen(pscreen); screen->max_rts = A3XX_MAX_RENDER_TARGETS; - screen->compiler = ir3_compiler_create(screen->gpu_id); + screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); pscreen->context_create = fd3_context_create; pscreen->is_format_supported = fd3_screen_is_format_supported; } diff --git a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h index d6fd1bb583e..98750123291 100644 --- a/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h +++ b/src/gallium/drivers/freedreno/a4xx/a4xx.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 
32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: @@ -940,6 +940,7 @@ static inline uint32_t A4XX_RB_MODE_CONTROL_HEIGHT(uint32_t val) { return ((val >> 5) << A4XX_RB_MODE_CONTROL_HEIGHT__SHIFT) & A4XX_RB_MODE_CONTROL_HEIGHT__MASK; } +#define A4XX_RB_MODE_CONTROL_ENABLE_GMEM 0x00010000 #define REG_A4XX_RB_RENDER_CONTROL 0x000020a1 #define A4XX_RB_RENDER_CONTROL_BINNING_PASS 0x00000001 @@ -1613,6 +1614,7 @@ static inline uint32_t REG_A4XX_RBBM_CLOCK_DELAY_TP_REG(uint32_t i0) { return 0x #define REG_A4XX_RBBM_POWER_CNTL_IP 0x00000098 #define A4XX_RBBM_POWER_CNTL_IP_SW_COLLAPSE 0x00000001 +#define A4XX_RBBM_POWER_CNTL_IP_SP_TP_PWR_ON 0x00100000 #define REG_A4XX_RBBM_PERFCTR_CP_0_LO 0x0000009c @@ -3689,6 +3691,20 @@ static inline uint32_t A4XX_HLSQ_GS_CONTROL_REG_INSTRLENGTH(uint32_t val) #define REG_A4XX_PC_BIN_BASE 0x000021c0 +#define REG_A4XX_PC_VSTREAM_CONTROL 0x000021c2 +#define A4XX_PC_VSTREAM_CONTROL_SIZE__MASK 0x003f0000 +#define A4XX_PC_VSTREAM_CONTROL_SIZE__SHIFT 16 +static inline uint32_t A4XX_PC_VSTREAM_CONTROL_SIZE(uint32_t val) +{ + return ((val) << A4XX_PC_VSTREAM_CONTROL_SIZE__SHIFT) & A4XX_PC_VSTREAM_CONTROL_SIZE__MASK; +} +#define A4XX_PC_VSTREAM_CONTROL_N__MASK 0x07c00000 +#define A4XX_PC_VSTREAM_CONTROL_N__SHIFT 22 +static inline uint32_t A4XX_PC_VSTREAM_CONTROL_N(uint32_t val) +{ + return ((val) << A4XX_PC_VSTREAM_CONTROL_N__SHIFT) & A4XX_PC_VSTREAM_CONTROL_N__MASK; +} + #define REG_A4XX_PC_PRIM_VTX_CNTL 0x000021c4 #define A4XX_PC_PRIM_VTX_CNTL_VAROUT__MASK 0x0000000f #define A4XX_PC_PRIM_VTX_CNTL_VAROUT__SHIFT 0 diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.c b/src/gallium/drivers/freedreno/a4xx/fd4_context.c index 7d6365bbb6d..62cfda97ac3 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.c @@ -43,8 +43,6 @@ fd4_context_destroy(struct pipe_context *pctx) { struct fd4_context *fd4_ctx = fd4_context(fd_context(pctx)); - util_dynarray_fini(&fd4_ctx->rbrc_patches); - fd_bo_del(fd4_ctx->vs_pvt_mem); fd_bo_del(fd4_ctx->fs_pvt_mem); fd_bo_del(fd4_ctx->vsc_size_mem); @@ -127,8 +125,6 @@ fd4_context_create(struct pipe_screen *pscreen, void *priv, unsigned flags) if (!pctx) return NULL; - util_dynarray_init(&fd4_ctx->rbrc_patches); - fd4_ctx->vs_pvt_mem = fd_bo_new(screen->dev, 0x2000, DRM_FREEDRENO_GEM_TYPE_KMEM); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_context.h b/src/gallium/drivers/freedreno/a4xx/fd4_context.h index 0c1027d5804..8996de932b8 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_context.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_context.h @@ -40,11 +40,6 @@ struct fd4_context { struct fd_context base; - /* Keep track of writes to RB_RENDER_CONTROL which need to be patched - * once we know 
whether or not to use GMEM, and GMEM tile pitch. - */ - struct util_dynarray rbrc_patches; - struct fd_bo *vs_pvt_mem, *fs_pvt_mem; /* This only needs to be 4 * num_of_pipes bytes (ie. 32 bytes). We diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c index 8cbe68d5790..c34f9441c7b 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.c @@ -156,6 +156,7 @@ fd4_draw_vbo(struct fd_context *ctx, const struct pipe_draw_info *info) emit.key.binning_pass = false; emit.dirty = dirty; emit.vp = NULL; /* we changed key so need to refetch vp */ + emit.fp = NULL; draw_impl(ctx, ctx->ring, &emit); } @@ -175,6 +176,43 @@ reset_viewport(struct fd_ringbuffer *ring, struct pipe_framebuffer_state *pfb) OUT_RING(ring, A4XX_GRAS_CL_VPORT_YSCALE_0(-half_height)); } +/* TODO maybe we should just migrate u_blitter for clear and do it in + * core (so we get normal draw pass state mgmt and binning).. That should + * work well enough for a3xx/a4xx (but maybe not a2xx?) + */ + +static void +fd4_clear_binning(struct fd_context *ctx, unsigned dirty) +{ + struct fd4_context *fd4_ctx = fd4_context(ctx); + struct fd_ringbuffer *ring = ctx->binning_ring; + struct fd4_emit emit = { + .vtx = &fd4_ctx->solid_vbuf_state, + .prog = &ctx->solid_prog, + .key = { + .binning_pass = true, + .half_precision = true, + }, + .dirty = dirty, + }; + + fd4_emit_state(ctx, ring, &emit); + fd4_emit_vertex_bufs(ring, &emit); + reset_viewport(ring, &ctx->framebuffer); + + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 2); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_VAROUT(0) | + A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); + OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_FRONT_PTYPE(PC_DRAW_TRIANGLES) | + A4XX_PC_PRIM_VTX_CNTL2_POLYMODE_BACK_PTYPE(PC_DRAW_TRIANGLES)); + + OUT_PKT0(ring, REG_A4XX_GRAS_ALPHA_CONTROL, 1); + OUT_RING(ring, 0x00000002); + + fd4_draw(ctx, ring, DI_PT_RECTLIST, IGNORE_VISIBILITY, + DI_SRC_SEL_AUTO_INDEX, 2, 1, INDEX_SIZE_IGN, 0, 0, NULL); +} + static void fd4_clear(struct fd_context *ctx, unsigned buffers, const union pipe_color_union *color, double depth, unsigned stencil) @@ -197,6 +235,8 @@ fd4_clear(struct fd_context *ctx, unsigned buffers, dirty |= FD_DIRTY_PROG; emit.dirty = dirty; + fd4_clear_binning(ctx, dirty); + OUT_PKT0(ring, REG_A4XX_PC_PRIM_VTX_CNTL, 1); OUT_RING(ring, A4XX_PC_PRIM_VTX_CNTL_PROVOKING_VTX_LAST); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h index a6c56404a8a..2b23e33b42f 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_draw.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_draw.h @@ -41,9 +41,10 @@ static inline uint32_t DRAW4(enum pc_di_primtype prim_type, enum pc_di_src_sel source_select, enum a4xx_index_size index_size, enum pc_di_vis_cull_mode vis_cull_mode) { - return (prim_type << 0) | - (source_select << 6) | - (index_size << 10); + return CP_DRAW_INDX_OFFSET_0_PRIM_TYPE(prim_type) | + CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(source_select) | + CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(index_size) | + CP_DRAW_INDX_OFFSET_0_VIS_CULL(vis_cull_mode); } static inline void diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c index 72154bf286a..81ed16ce8ac 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.c @@ -328,7 +328,7 @@ fd4_emit_vertex_bufs(struct fd_ringbuffer *ring, struct fd4_emit *emit) int32_t i, j, last = -1; uint32_t 
total_in = 0; const struct fd_vertex_state *vtx = emit->vtx; - struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); + const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); unsigned vertex_regid = regid(63, 0); unsigned instance_regid = regid(63, 0); unsigned vtxcnt_regid = regid(63, 0); @@ -460,8 +460,8 @@ void fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, struct fd4_emit *emit) { - struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); - struct ir3_shader_variant *fp = fd4_emit_get_fp(emit); + const struct ir3_shader_variant *vp = fd4_emit_get_vp(emit); + const struct ir3_shader_variant *fp = fd4_emit_get_fp(emit); uint32_t dirty = emit->dirty; emit_marker(ring, 5); @@ -485,19 +485,6 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, A4XX_RB_RENDER_COMPONENTS_RT7(mrt_comp[7])); } - if ((dirty & (FD_DIRTY_ZSA | FD_DIRTY_PROG)) && !emit->key.binning_pass) { - uint32_t val = fd4_zsa_stateobj(ctx->zsa)->rb_render_control; - - /* I suppose if we needed to (which I don't *think* we need - * to), we could emit this for binning pass too. But we - * would need to keep a different patch-list for binning - * vs render pass. - */ - - OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); - OUT_RINGP(ring, val, &fd4_context(ctx)->rbrc_patches); - } - if (dirty & (FD_DIRTY_ZSA | FD_DIRTY_FRAMEBUFFER)) { struct fd4_zsa_stateobj *zsa = fd4_zsa_stateobj(ctx->zsa); struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -619,13 +606,17 @@ fd4_emit_state(struct fd_context *ctx, struct fd_ringbuffer *ring, if (dirty & (FD_DIRTY_PROG | FD_DIRTY_FRAMEBUFFER)) { struct pipe_framebuffer_state *pfb = &ctx->framebuffer; - fd4_program_emit(ring, emit, pfb->nr_cbufs, pfb->cbufs); + unsigned n = pfb->nr_cbufs; + /* if we have depth/stencil, we need at least one MRT: */ + if (pfb->zsbuf) + n = MAX2(1, n); + fd4_program_emit(ring, emit, n, pfb->cbufs); } if (emit->prog == &ctx->prog) { /* evil hack to deal sanely with clear path */ - ir3_emit_consts(vp, ring, emit->info, dirty); + ir3_emit_consts(vp, ring, ctx, emit->info, dirty); if (!emit->key.binning_pass) - ir3_emit_consts(fp, ring, emit->info, dirty); + ir3_emit_consts(fp, ring, ctx, emit->info, dirty); /* mark clean after emitting consts: */ ctx->prog.dirty = 0; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h index 3a1d4b617d3..d8d3fd88a69 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_emit.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_emit.h @@ -59,7 +59,7 @@ struct fd4_emit { bool no_decode_srgb; /* cached to avoid repeated lookups of same variants: */ - struct ir3_shader_variant *vp, *fp; + const struct ir3_shader_variant *vp, *fp; /* TODO: other shader stages..
*/ }; @@ -70,7 +70,7 @@ static inline enum a4xx_color_fmt fd4_emit_format(struct pipe_surface *surf) return fd4_pipe2color(surf->format); } -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd4_emit_get_vp(struct fd4_emit *emit) { if (!emit->vp) { @@ -80,12 +80,18 @@ fd4_emit_get_vp(struct fd4_emit *emit) return emit->vp; } -static inline struct ir3_shader_variant * +static inline const struct ir3_shader_variant * fd4_emit_get_fp(struct fd4_emit *emit) { if (!emit->fp) { - struct fd4_shader_stateobj *so = emit->prog->fp; - emit->fp = ir3_shader_variant(so->shader, emit->key); + if (emit->key.binning_pass) { + /* use dummy stateobj to simplify binning vs non-binning: */ + static const struct ir3_shader_variant binning_fp = {}; + emit->fp = &binning_fp; + } else { + struct fd4_shader_stateobj *so = emit->prog->fp; + emit->fp = ir3_shader_variant(so->shader, emit->key); + } } return emit->fp; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c index 221608127b4..0629c303656 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_gmem.c @@ -130,6 +130,19 @@ emit_mrt(struct fd_ringbuffer *ring, unsigned nr_bufs, } } +static bool +use_hw_binning(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + + /* this seems to be a hw bug.. but this hack fixes piglit fbo-maxsize: */ + if ((pfb->width > 4096) && (pfb->height > 4096)) + return false; + + return fd_binning_enabled && ((gmem->nbins_x * gmem->nbins_y) > 2); +} + /* transfer from gmem to system memory (ie. normal RAM) */ static void @@ -502,18 +515,6 @@ patch_draws(struct fd_context *ctx, enum pc_di_vis_cull_mode vismode) util_dynarray_resize(&ctx->draw_patches, 0); } -static void -patch_rbrc(struct fd_context *ctx, uint32_t val) -{ - struct fd4_context *fd4_ctx = fd4_context(ctx); - unsigned i; - for (i = 0; i < fd_patch_num_elements(&fd4_ctx->rbrc_patches); i++) { - struct fd_cs_patch *patch = fd_patch_element(&fd4_ctx->rbrc_patches, i); - *patch->cs = patch->val | val; - } - util_dynarray_resize(&fd4_ctx->rbrc_patches, 0); -} - /* for rendering directly to system memory: */ static void fd4_emit_sysmem_prep(struct fd_context *ctx) @@ -545,8 +546,10 @@ fd4_emit_sysmem_prep(struct fd_context *ctx) A4XX_RB_MODE_CONTROL_HEIGHT(0) | 0x00c00000); /* XXX */ + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); + patch_draws(ctx, IGNORE_VISIBILITY); - patch_rbrc(ctx, 0); // XXX } static void @@ -585,13 +588,76 @@ update_vsc_pipe(struct fd_context *ctx) } } +static void +emit_binning_pass(struct fd_context *ctx) +{ + struct fd_gmem_stateobj *gmem = &ctx->gmem; + struct pipe_framebuffer_state *pfb = &ctx->framebuffer; + struct fd_ringbuffer *ring = ctx->ring; + int i; + + uint32_t x1 = gmem->minx; + uint32_t y1 = gmem->miny; + uint32_t x2 = gmem->minx + gmem->width - 1; + uint32_t y2 = gmem->miny + gmem->height - 1; + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, A4XX_PC_BINNING_COMMAND_BINNING_ENABLE); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_TILING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + OUT_PKT0(ring, REG_A4XX_RB_FRAME_BUFFER_DIMENSION, 1); + OUT_RING(ring, A4XX_RB_FRAME_BUFFER_DIMENSION_WIDTH(pfb->width) | + 
A4XX_RB_FRAME_BUFFER_DIMENSION_HEIGHT(pfb->height)); + + /* setup scissor/offset for whole screen: */ + OUT_PKT0(ring, REG_A4XX_RB_BIN_OFFSET, 1); + OUT_RING(ring, A4XX_RB_BIN_OFFSET_X(x1) | + A4XX_RB_BIN_OFFSET_Y(y1)); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_SCREEN_SCISSOR_TL, 2); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_TL_X(x1) | + A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); + OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | + A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + for (i = 0; i < A4XX_MAX_RENDER_TARGETS; i++) { + OUT_PKT0(ring, REG_A4XX_RB_MRT_CONTROL(i), 1); + OUT_RING(ring, A4XX_RB_MRT_CONTROL_ROP_CODE(ROP_CLEAR) | + A4XX_RB_MRT_CONTROL_COMPONENT_ENABLE(0xf)); + } + + /* emit IB to binning drawcmds: */ + ctx->emit_ib(ring, ctx->binning_start, ctx->binning_end); + + fd_reset_wfi(ctx); + fd_wfi(ctx, ring); + + /* and then put stuff back the way it was: */ + + OUT_PKT0(ring, REG_A4XX_PC_BINNING_COMMAND, 1); + OUT_RING(ring, 0x00000000); + + OUT_PKT0(ring, REG_A4XX_GRAS_SC_CONTROL, 1); + OUT_RING(ring, A4XX_GRAS_SC_CONTROL_RENDER_MODE(RB_RENDERING_PASS) | + A4XX_GRAS_SC_CONTROL_MSAA_DISABLE | + A4XX_GRAS_SC_CONTROL_MSAA_SAMPLES(MSAA_ONE) | + A4XX_GRAS_SC_CONTROL_RASTER_MODE(0)); + + fd_event_write(ctx, ring, CACHE_FLUSH); + fd_wfi(ctx, ring); +} + /* before first tile */ static void fd4_emit_tile_init(struct fd_context *ctx) { struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; - uint32_t rb_render_control; fd4_emit_restore(ctx); @@ -599,16 +665,30 @@ fd4_emit_tile_init(struct fd_context *ctx) OUT_RING(ring, A4XX_VSC_BIN_SIZE_WIDTH(gmem->bin_w) | A4XX_VSC_BIN_SIZE_HEIGHT(gmem->bin_h)); + update_vsc_pipe(ctx); + + if (use_hw_binning(ctx)) { + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); + OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | + A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, A4XX_RB_RENDER_CONTROL_BINNING_PASS | + A4XX_RB_RENDER_CONTROL_DISABLE_COLOR_PIPE | + 0x8); + + /* emit hw binning pass: */ + emit_binning_pass(ctx); + + patch_draws(ctx, USE_VISIBILITY); + } else { + patch_draws(ctx, IGNORE_VISIBILITY); + } + OUT_PKT0(ring, REG_A4XX_RB_MODE_CONTROL, 1); OUT_RING(ring, A4XX_RB_MODE_CONTROL_WIDTH(gmem->bin_w) | A4XX_RB_MODE_CONTROL_HEIGHT(gmem->bin_h) | - 0x00010000); /* XXX */ - - update_vsc_pipe(ctx); - patch_draws(ctx, IGNORE_VISIBILITY); - - rb_render_control = 0; // XXX or BINNING_PASS.. 
but maybe we can emit only from gmem - patch_rbrc(ctx, rb_render_control); + A4XX_RB_MODE_CONTROL_ENABLE_GMEM); } /* before mem2gmem */ @@ -670,6 +750,7 @@ fd4_emit_tile_prep(struct fd_context *ctx, struct fd_tile *tile) static void fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) { + struct fd4_context *fd4_ctx = fd4_context(ctx); struct fd_ringbuffer *ring = ctx->ring; struct fd_gmem_stateobj *gmem = &ctx->gmem; struct pipe_framebuffer_state *pfb = &ctx->framebuffer; @@ -679,6 +760,27 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) uint32_t x2 = tile->xoff + tile->bin_w - 1; uint32_t y2 = tile->yoff + tile->bin_h - 1; + if (use_hw_binning(ctx)) { + struct fd_vsc_pipe *pipe = &ctx->pipe[tile->p]; + + assert(pipe->w * pipe->h); + + fd_event_write(ctx, ring, HLSQ_FLUSH); + fd_wfi(ctx, ring); + + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, A4XX_PC_VSTREAM_CONTROL_SIZE(pipe->w * pipe->h) | + A4XX_PC_VSTREAM_CONTROL_N(tile->n)); + + OUT_PKT3(ring, CP_SET_BIN_DATA, 2); + OUT_RELOCW(ring, pipe->bo, 0, 0, 0); /* BIN_DATA_ADDR <- VSC_PIPE[p].DATA_ADDRESS */ + OUT_RELOCW(ring, fd4_ctx->vsc_size_mem, /* BIN_SIZE_ADDR <- VSC_SIZE_ADDRESS + (p * 4) */ + (tile->p * 4), 0, 0); + } else { + OUT_PKT0(ring, REG_A4XX_PC_VSTREAM_CONTROL, 1); + OUT_RING(ring, 0x00000000); + } + OUT_PKT3(ring, CP_SET_BIN, 3); OUT_RING(ring, 0x00000000); OUT_RING(ring, CP_SET_BIN_1_X1(x1) | CP_SET_BIN_1_Y1(y1)); @@ -696,6 +798,9 @@ fd4_emit_tile_renderprep(struct fd_context *ctx, struct fd_tile *tile) A4XX_GRAS_SC_SCREEN_SCISSOR_TL_Y(y1)); OUT_RING(ring, A4XX_GRAS_SC_SCREEN_SCISSOR_BR_X(x2) | A4XX_GRAS_SC_SCREEN_SCISSOR_BR_Y(y2)); + + OUT_PKT0(ring, REG_A4XX_RB_RENDER_CONTROL, 1); + OUT_RING(ring, 0x8); } void diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_program.c b/src/gallium/drivers/freedreno/a4xx/fd4_program.c index 74716fb733f..d782b94f848 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_program.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_program.c @@ -51,7 +51,8 @@ create_shader_stateobj(struct pipe_context *pctx, const struct pipe_shader_state enum shader_t type) { struct fd4_shader_stateobj *so = CALLOC_STRUCT(fd4_shader_stateobj); - so->shader = ir3_shader_create(pctx, cso, type); + struct ir3_compiler *compiler = fd_context(pctx)->screen->compiler; + so->shader = ir3_shader_create(compiler, cso, type); return so; } @@ -150,14 +151,7 @@ setup_stages(struct fd4_emit *emit, struct stage *s) unsigned i; s[VS].v = fd4_emit_get_vp(emit); - - if (emit->key.binning_pass) { - /* use dummy stateobj to simplify binning vs non-binning: */ - static const struct ir3_shader_variant binning_fp = {}; - s[FS].v = &binning_fp; - } else { - s[FS].v = fd4_emit_get_fp(emit); - } + s[FS].v = fd4_emit_get_fp(emit); s[HS].v = s[DS].v = s[GS].v = NULL; /* for now */ @@ -223,6 +217,9 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, debug_assert(nr <= ARRAY_SIZE(color_regid)); + if (emit->key.binning_pass) + nr = 0; + setup_stages(emit, s); fssz = (s[FS].i->max_reg >= 24) ? 
TWO_QUADS : FOUR_QUADS; @@ -379,31 +376,49 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, A4XX_SP_VS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[VS].instroff)); OUT_RELOC(ring, s[VS].v->bo, 0, 0, 0); /* SP_VS_OBJ_START_REG */ - OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); - OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ - - OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | - A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | - A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | - A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | - A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | - A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | - COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); - OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | - 0x80000000 | /* XXX */ - COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | - COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | - COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); - - OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); - OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | - A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); - if (emit->key.binning_pass) + if (emit->key.binning_pass) { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, 0x00000000); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(0) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); OUT_RING(ring, 0x00000000); - else + } else { + OUT_PKT0(ring, REG_A4XX_SP_FS_LENGTH_REG, 1); + OUT_RING(ring, s[FS].v->instrlen); /* SP_FS_LENGTH_REG */ + + OUT_PKT0(ring, REG_A4XX_SP_FS_CTRL_REG0, 2); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG0_THREADMODE(MULTI) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG0_VARYING) | + A4XX_SP_FS_CTRL_REG0_HALFREGFOOTPRINT(s[FS].i->max_half_reg + 1) | + A4XX_SP_FS_CTRL_REG0_FULLREGFOOTPRINT(s[FS].i->max_reg + 1) | + A4XX_SP_FS_CTRL_REG0_INOUTREGOVERLAP(1) | + A4XX_SP_FS_CTRL_REG0_THREADSIZE(fssz) | + A4XX_SP_FS_CTRL_REG0_SUPERTHREADMODE | + COND(s[FS].v->has_samp, A4XX_SP_FS_CTRL_REG0_PIXLODENABLE)); + OUT_RING(ring, A4XX_SP_FS_CTRL_REG1_CONSTLENGTH(s[FS].constlen) | + 0x80000000 | /* XXX */ + COND(s[FS].v->frag_face, A4XX_SP_FS_CTRL_REG1_FACENESS) | + COND(s[FS].v->total_in > 0, A4XX_SP_FS_CTRL_REG1_VARYING) | + COND(s[FS].v->frag_coord, A4XX_SP_FS_CTRL_REG1_FRAGCOORD)); + + OUT_PKT0(ring, REG_A4XX_SP_FS_OBJ_OFFSET_REG, 2); + OUT_RING(ring, A4XX_SP_FS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[FS].constoff) | + A4XX_SP_FS_OBJ_OFFSET_REG_SHADEROBJOFFSET(s[FS].instroff)); OUT_RELOC(ring, s[FS].v->bo, 0, 0, 0); /* SP_FS_OBJ_START_REG */ + } OUT_PKT0(ring, REG_A4XX_SP_HS_OBJ_OFFSET_REG, 1); OUT_RING(ring, A4XX_SP_HS_OBJ_OFFSET_REG_CONSTOBJECTOFFSET(s[HS].constoff) | @@ -427,11 +442,11 @@ fd4_program_emit(struct fd_ringbuffer *ring, struct fd4_emit *emit, 
A4XX_RB_RENDER_CONTROL2_WCOORD)); OUT_PKT0(ring, REG_A4XX_RB_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + OUT_RING(ring, A4XX_RB_FS_OUTPUT_REG_MRT(nr) | COND(s[FS].v->writes_pos, A4XX_RB_FS_OUTPUT_REG_FRAG_WRITES_Z)); OUT_PKT0(ring, REG_A4XX_SP_FS_OUTPUT_REG, 1); - OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(MAX2(1, nr)) | + OUT_RING(ring, A4XX_SP_FS_OUTPUT_REG_MRT(nr) | COND(s[FS].v->writes_pos, A4XX_SP_FS_OUTPUT_REG_DEPTH_ENABLE) | A4XX_SP_FS_OUTPUT_REG_DEPTH_REGID(posz_regid)); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_query.c b/src/gallium/drivers/freedreno/a4xx/fd4_query.c index 14a809431ac..77e203f6c56 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_query.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_query.c @@ -173,7 +173,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) OUT_RING(ring, CP_REG_TO_MEM_0_REG(REG_A4XX_RBBM_PERFCTR_CP_0_LO) | CP_REG_TO_MEM_0_64B | CP_REG_TO_MEM_0_CNT(2-1)); /* write 2 regs to mem */ - OUT_RELOC(ring, scratch_bo, sample_off, 0, 0); + OUT_RELOCW(ring, scratch_bo, sample_off, 0, 0); /* ok... here we really *would* like to use the CP_SET_CONSTANT * mode which can add a constant to value in reg2 and write to @@ -187,7 +187,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) /* per-sample offset to scratch bo: */ OUT_PKT3(ring, CP_MEM_WRITE, 2); - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); OUT_RING(ring, samp->offset); /* now add to that the per-tile base: */ @@ -195,7 +195,7 @@ time_elapsed_get_sample(struct fd_context *ctx, struct fd_ringbuffer *ring) OUT_RING(ring, CP_REG_TO_MEM_0_REG(HW_QUERY_BASE_REG) | CP_REG_TO_MEM_0_ACCUMULATE | CP_REG_TO_MEM_0_CNT(1-1)); /* readback 1 regs */ - OUT_RELOC(ring, scratch_bo, addr_off, 0, 0); + OUT_RELOCW(ring, scratch_bo, addr_off, 0, 0); /* now copy that back to CP_ME_NRT_ADDR: */ OUT_PKT3(ring, CP_MEM_TO_REG, 2); diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c index b2a69cca56c..c193f361e4c 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_screen.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_screen.c @@ -105,7 +105,7 @@ fd4_screen_init(struct pipe_screen *pscreen) { struct fd_screen *screen = fd_screen(pscreen); screen->max_rts = A4XX_MAX_RENDER_TARGETS; - screen->compiler = ir3_compiler_create(screen->gpu_id); + screen->compiler = ir3_compiler_create(screen->dev, screen->gpu_id); pscreen->context_create = fd4_context_create; pscreen->is_format_supported = fd4_screen_is_format_supported; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c index e14b617570d..a9c8d5a3d62 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.c @@ -103,7 +103,5 @@ fd4_zsa_state_create(struct pipe_context *pctx, A4XX_RB_DEPTH_CONTROL_EARLY_Z_DISABLE; } - so->rb_render_control = 0x8; /* XXX */ - return so; } diff --git a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h index 6a92a9b6785..3c46117a3fe 100644 --- a/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h +++ b/src/gallium/drivers/freedreno/a4xx/fd4_zsa.h @@ -39,7 +39,6 @@ struct fd4_zsa_stateobj { struct pipe_depth_stencil_alpha_state base; uint32_t gras_alpha_control; uint32_t rb_alpha_control; - uint32_t rb_render_control; uint32_t rb_depth_control; uint32_t rb_stencil_control; uint32_t rb_stencil_control2; diff --git 
a/src/gallium/drivers/freedreno/adreno_common.xml.h b/src/gallium/drivers/freedreno/adreno_common.xml.h index ac5343f1a78..4e361b0a246 100644 --- a/src/gallium/drivers/freedreno/adreno_common.xml.h +++ b/src/gallium/drivers/freedreno/adreno_common.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: diff --git a/src/gallium/drivers/freedreno/adreno_pm4.xml.h b/src/gallium/drivers/freedreno/adreno_pm4.xml.h index 09b26a253f0..932cfc0d5e8 100644 --- a/src/gallium/drivers/freedreno/adreno_pm4.xml.h +++ b/src/gallium/drivers/freedreno/adreno_pm4.xml.h @@ -12,9 +12,9 @@ The rules-ng-ng source files this header was generated from are: - /home/robclark/src/freedreno/envytools/rnndb/freedreno_copyright.xml ( 1572 bytes, from 2016-02-10 17:07:21) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a2xx.xml ( 32901 bytes, from 2015-05-20 20:03:14) - /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_common.xml ( 11518 bytes, from 2016-02-10 21:03:25) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16166 bytes, from 2016-02-11 21:20:31) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/adreno_pm4.xml ( 16185 bytes, from 2016-03-05 03:08:05) - /home/robclark/src/freedreno/envytools/rnndb/adreno/a3xx.xml ( 83967 bytes, from 2016-02-10 17:07:21) -- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 109858 bytes, from 2016-02-10 17:07:21) +- /home/robclark/src/freedreno/envytools/rnndb/adreno/a4xx.xml ( 110342 bytes, from 2016-03-07 11:20:29) - /home/robclark/src/freedreno/envytools/rnndb/adreno/ocmem.xml ( 1773 bytes, from 2015-09-24 17:30:00) Copyright (C) 2013-2016 by the following authors: @@ -389,7 +389,12 @@ static inline uint32_t CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT(enum pc_di_src_sel va { return ((val) << CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__SHIFT) & CP_DRAW_INDX_OFFSET_0_SOURCE_SELECT__MASK; } -#define CP_DRAW_INDX_OFFSET_0_TESSELLATE 0x00000100 +#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK 0x00000300 +#define CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT 8 +static inline uint32_t CP_DRAW_INDX_OFFSET_0_VIS_CULL(enum pc_di_vis_cull_mode val) +{ + return ((val) << CP_DRAW_INDX_OFFSET_0_VIS_CULL__SHIFT) & CP_DRAW_INDX_OFFSET_0_VIS_CULL__MASK; +} #define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__MASK 0x00000c00 #define CP_DRAW_INDX_OFFSET_0_INDEX_SIZE__SHIFT 10 static inline uint32_t CP_DRAW_INDX_OFFSET_0_INDEX_SIZE(enum a4xx_index_size val) diff --git a/src/gallium/drivers/freedreno/freedreno_resource.c 
b/src/gallium/drivers/freedreno/freedreno_resource.c index bcdd518c8bf..9aded3bb7fe 100644 --- a/src/gallium/drivers/freedreno/freedreno_resource.c +++ b/src/gallium/drivers/freedreno/freedreno_resource.c @@ -637,7 +637,8 @@ fail: static struct pipe_resource * fd_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct fd_resource *rsc = CALLOC_STRUCT(fd_resource); struct fd_resource_slice *slice = &rsc->slices[0]; diff --git a/src/gallium/drivers/freedreno/freedreno_screen.c b/src/gallium/drivers/freedreno/freedreno_screen.c index 2b3ecfe664e..d47cb07f10b 100644 --- a/src/gallium/drivers/freedreno/freedreno_screen.c +++ b/src/gallium/drivers/freedreno/freedreno_screen.c @@ -166,6 +166,10 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_TEXTURE_MIRROR_CLAMP: case PIPE_CAP_COMPUTE: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_SM3: @@ -241,7 +245,6 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_DEPTH_BOUNDS_TEST: case PIPE_CAP_TGSI_TXQS: case PIPE_CAP_FORCE_PERSAMPLE_INTERP: - case PIPE_CAP_SHAREABLE_SHADERS: case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_CLEAR_TEXTURE: case PIPE_CAP_DRAW_PARAMETERS: @@ -257,6 +260,11 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_MAX_VIEWPORTS: return 1; + case PIPE_CAP_SHAREABLE_SHADERS: + if (is_ir3(screen)) + return 1; + return 0; + /* Stream output. */ case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: if (is_ir3(screen)) diff --git a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c index 481859efb17..7ae4e94f0b3 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_cmdline.c @@ -233,7 +233,7 @@ int main(int argc, char **argv) tgsi_dump(toks, 0); nir_shader *nir = ir3_tgsi_to_nir(toks); - s.compiler = ir3_compiler_create(gpu_id); + s.compiler = ir3_compiler_create(NULL, gpu_id); s.nir = ir3_optimize_nir(&s, nir, NULL); v.key = key; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c index 7c8eccb54e1..37ad73380ab 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.c @@ -30,9 +30,10 @@ #include "ir3_compiler.h" -struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id) +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id) { struct ir3_compiler *compiler = rzalloc(NULL, struct ir3_compiler); + compiler->dev = dev; compiler->gpu_id = gpu_id; compiler->set = ir3_ra_alloc_reg_set(compiler); return compiler; diff --git a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h index 697afeba61a..0ad689ca1e7 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_compiler.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_compiler.h @@ -34,12 +34,13 @@ struct ir3_ra_reg_set; struct ir3_compiler { + struct fd_device *dev; uint32_t gpu_id; struct ir3_ra_reg_set *set; uint32_t shader_count; }; -struct ir3_compiler * ir3_compiler_create(uint32_t gpu_id); +struct ir3_compiler * ir3_compiler_create(struct fd_device *dev, uint32_t gpu_id); void ir3_compiler_destroy(struct ir3_compiler *compiler); int 
ir3_compile_shader_nir(struct ir3_compiler *compiler, diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c index 565b9c32c1d..73c65d6ad27 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c @@ -45,6 +45,8 @@ ir3_tgsi_to_nir(const struct tgsi_token *tokens) .lower_flrp = true, .lower_ffract = true, .native_integers = true, + .lower_extract_byte = true, + .lower_extract_word = true, }; return tgsi_to_nir(tokens, &options); } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.c b/src/gallium/drivers/freedreno/ir3/ir3_shader.c index 7d17f426ad3..c05b52e7a5e 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.c +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.c @@ -127,14 +127,14 @@ void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id) static void assemble_variant(struct ir3_shader_variant *v) { - struct fd_context *ctx = fd_context(v->shader->pctx); - uint32_t gpu_id = v->shader->compiler->gpu_id; + struct ir3_compiler *compiler = v->shader->compiler; + uint32_t gpu_id = compiler->gpu_id; uint32_t sz, *bin; bin = ir3_shader_assemble(v, gpu_id); sz = v->info.sizedwords * 4; - v->bo = fd_bo_new(ctx->dev, sz, + v->bo = fd_bo_new(compiler->dev, sz, DRM_FREEDRENO_GEM_CACHE_WCOMBINE | DRM_FREEDRENO_GEM_TYPE_KMEM); @@ -266,14 +266,13 @@ ir3_shader_destroy(struct ir3_shader *shader) } struct ir3_shader * -ir3_shader_create(struct pipe_context *pctx, +ir3_shader_create(struct ir3_compiler *compiler, const struct pipe_shader_state *cso, enum shader_t type) { struct ir3_shader *shader = CALLOC_STRUCT(ir3_shader); - shader->compiler = fd_context(pctx)->screen->compiler; + shader->compiler = compiler; shader->id = ++shader->compiler->shader_count; - shader->pctx = pctx; shader->type = type; if (fd_mesa_debug & FD_DBG_DISASM) { DBG("dump tgsi: type=%d", shader->type); @@ -463,10 +462,9 @@ ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin) #include "freedreno_resource.h" static void -emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_constbuf_stateobj *constbuf) +emit_user_consts(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { - struct fd_context *ctx = fd_context(v->shader->pctx); const unsigned index = 0; /* user consts are index 0 */ /* TODO save/restore dirty_mask for binning pass instead: */ uint32_t dirty_mask = constbuf->enabled_mask; @@ -502,12 +500,11 @@ emit_user_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, } static void -emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - struct fd_constbuf_stateobj *constbuf) +emit_ubos(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring, struct fd_constbuf_stateobj *constbuf) { uint32_t offset = v->first_driver_param + IR3_UBOS_OFF; if (v->constlen > offset) { - struct fd_context *ctx = fd_context(v->shader->pctx); uint32_t params = MIN2(4, v->constlen - offset) * 4; uint32_t offsets[params]; struct fd_bo *bos[params]; @@ -532,9 +529,9 @@ emit_ubos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, } static void -emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +emit_immediates(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { - struct fd_context *ctx = fd_context(v->shader->pctx); int size = v->immediates_count; uint32_t base = v->first_immediate; 
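The hunks above all make the same change: the const-emit helpers stop digging the context out of a back-pointer (v->shader->pctx) and instead receive a struct fd_context * as an explicit argument, while the variant pointer becomes const. That is what lets ir3_shader drop its pctx field a little further down, and it goes hand in hand with PIPE_CAP_SHAREABLE_SHADERS being turned on for ir3 earlier in this diff. A minimal, self-contained C sketch of the pattern follows; the types and names here (context, shader_variant, emit_consts) are hypothetical stand-ins, not the real Mesa structs. The same change continues in the next hunks for emit_tfbos, max_tf_vtx and ir3_emit_consts.

   #include <stdio.h>

   struct context { int id; };               /* stand-in for fd_context */
   struct shader_variant { int constlen; };  /* stand-in for ir3_shader_variant */

   /* After the refactor: the context arrives as an explicit argument and
    * the variant stays const, so a compiled variant is no longer tied to
    * the context that created it. */
   static void
   emit_consts(struct context *ctx, const struct shader_variant *v)
   {
      printf("ctx %d emits %d consts\n", ctx->id, v->constlen);
   }

   int
   main(void)
   {
      struct shader_variant v = { .constlen = 4 };
      struct context a = { .id = 1 }, b = { .id = 2 };
      emit_consts(&a, &v);   /* the same variant ... */
      emit_consts(&b, &v);   /* ... works from any context */
      return 0;
   }
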
@@ -556,12 +553,12 @@ emit_immediates(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) /* emit stream-out buffers: */ static void -emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) +emit_tfbos(struct fd_context *ctx, const struct ir3_shader_variant *v, + struct fd_ringbuffer *ring) { /* streamout addresses after driver-params: */ uint32_t offset = v->first_driver_param + IR3_TFBOS_OFF; if (v->constlen > offset) { - struct fd_context *ctx = fd_context(v->shader->pctx); struct fd_streamout_stateobj *so = &ctx->streamout; struct pipe_stream_output_info *info = &v->shader->stream_output; uint32_t params = 4; @@ -587,9 +584,8 @@ emit_tfbos(struct ir3_shader_variant *v, struct fd_ringbuffer *ring) } static uint32_t -max_tf_vtx(struct ir3_shader_variant *v) +max_tf_vtx(struct fd_context *ctx, const struct ir3_shader_variant *v) { - struct fd_context *ctx = fd_context(v->shader->pctx); struct fd_streamout_stateobj *so = &ctx->streamout; struct pipe_stream_output_info *info = &v->shader->stream_output; uint32_t maxvtxcnt = 0x7fffffff; @@ -633,11 +629,9 @@ max_tf_vtx(struct ir3_shader_variant *v) } void -ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - const struct pipe_draw_info *info, uint32_t dirty) +ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t dirty) { - struct fd_context *ctx = fd_context(v->shader->pctx); - if (dirty & (FD_DIRTY_PROG | FD_DIRTY_CONSTBUF)) { struct fd_constbuf_stateobj *constbuf; bool shader_dirty; @@ -653,10 +647,10 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, return; } - emit_user_consts(v, ring, constbuf); - emit_ubos(v, ring, constbuf); + emit_user_consts(ctx, v, ring, constbuf); + emit_ubos(ctx, v, ring, constbuf); if (shader_dirty) - emit_immediates(v, ring); + emit_immediates(ctx, v, ring); } /* emit driver params every time: */ @@ -667,7 +661,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, uint32_t vertex_params[IR3_DP_COUNT] = { [IR3_DP_VTXID_BASE] = info->indexed ? 
info->index_bias : info->start, - [IR3_DP_VTXCNT_MAX] = max_tf_vtx(v), + [IR3_DP_VTXCNT_MAX] = max_tf_vtx(ctx, v), }; /* if no user-clip-planes, we don't need to emit the * entire thing: @@ -692,7 +686,7 @@ ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, /* if needed, emit stream-out buffer addresses: */ if (vertex_params[IR3_DP_VTXCNT_MAX] > 0) { - emit_tfbos(v, ring); + emit_tfbos(ctx, v, ring); } } } diff --git a/src/gallium/drivers/freedreno/ir3/ir3_shader.h b/src/gallium/drivers/freedreno/ir3/ir3_shader.h index 03d4fa2e927..c89dc29ff08 100644 --- a/src/gallium/drivers/freedreno/ir3/ir3_shader.h +++ b/src/gallium/drivers/freedreno/ir3/ir3_shader.h @@ -241,7 +241,6 @@ struct ir3_shader { struct ir3_compiler *compiler; - struct pipe_context *pctx; /* TODO replace w/ pipe_screen */ nir_shader *nir; struct pipe_stream_output_info stream_output; @@ -250,7 +249,7 @@ struct ir3_shader { void * ir3_shader_assemble(struct ir3_shader_variant *v, uint32_t gpu_id); -struct ir3_shader * ir3_shader_create(struct pipe_context *pctx, +struct ir3_shader * ir3_shader_create(struct ir3_compiler *compiler, const struct pipe_shader_state *cso, enum shader_t type); void ir3_shader_destroy(struct ir3_shader *shader); struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, @@ -258,8 +257,9 @@ struct ir3_shader_variant * ir3_shader_variant(struct ir3_shader *shader, void ir3_shader_disasm(struct ir3_shader_variant *so, uint32_t *bin); struct fd_ringbuffer; -void ir3_emit_consts(struct ir3_shader_variant *v, struct fd_ringbuffer *ring, - const struct pipe_draw_info *info, uint32_t dirty); +struct fd_context; +void ir3_emit_consts(const struct ir3_shader_variant *v, struct fd_ringbuffer *ring, + struct fd_context *ctx, const struct pipe_draw_info *info, uint32_t dirty); static inline const char * ir3_shader_stage(struct ir3_shader *shader) diff --git a/src/gallium/drivers/i915/i915_resource.c b/src/gallium/drivers/i915/i915_resource.c index 627ed2b4445..3ffb0b7a5d2 100644 --- a/src/gallium/drivers/i915/i915_resource.c +++ b/src/gallium/drivers/i915/i915_resource.c @@ -23,7 +23,8 @@ i915_resource_create(struct pipe_screen *screen, static struct pipe_resource * i915_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *template, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (template->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/i915/i915_screen.c b/src/gallium/drivers/i915/i915_screen.c index 8d010f9dc8c..f4aa310ecdc 100644 --- a/src/gallium/drivers/i915/i915_screen.c +++ b/src/gallium/drivers/i915/i915_screen.c @@ -265,6 +265,10 @@ i915_get_param(struct pipe_screen *screen, enum pipe_cap cap) case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS: diff --git a/src/gallium/drivers/ilo/ilo_resource.c b/src/gallium/drivers/ilo/ilo_resource.c index 9026ba9a983..8c888c529c4 100644 --- a/src/gallium/drivers/ilo/ilo_resource.c +++ b/src/gallium/drivers/ilo/ilo_resource.c @@ -714,7 +714,8 @@ ilo_resource_create(struct pipe_screen *screen, static struct pipe_resource * ilo_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { if (templ->target == 
PIPE_BUFFER) return NULL; @@ -725,7 +726,8 @@ ilo_resource_from_handle(struct pipe_screen *screen, static boolean ilo_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *res, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { if (res->target == PIPE_BUFFER) return false; diff --git a/src/gallium/drivers/ilo/ilo_screen.c b/src/gallium/drivers/ilo/ilo_screen.c index ef9da6b8315..548d215c718 100644 --- a/src/gallium/drivers/ilo/ilo_screen.c +++ b/src/gallium/drivers/ilo/ilo_screen.c @@ -493,6 +493,10 @@ ilo_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h index 34008e1c01e..d9be7f392ef 100644 --- a/src/gallium/drivers/llvmpipe/lp_rast.h +++ b/src/gallium/drivers/llvmpipe/lp_rast.h @@ -116,6 +116,12 @@ struct lp_rast_plane { /* one-pixel sized trivial reject offsets for each plane */ uint32_t eo; + /* + * We rely on this struct being 64bit aligned (ideally it would be 128bit + * but that's quite the waste) and therefore on 32bit we need padding + * since otherwise (even with the 64bit number in there) it wouldn't be. + */ + uint32_t pad; }; /** diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c index 2c66bf46332..2529b546564 100644 --- a/src/gallium/drivers/llvmpipe/lp_screen.c +++ b/src/gallium/drivers/llvmpipe/lp_screen.c @@ -315,6 +315,10 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c index 29aee726941..98243a12de1 100644 --- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c +++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c @@ -94,6 +94,8 @@ lp_setup_alloc_triangle(struct lp_scene *scene, unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane); struct lp_rast_triangle *tri; + STATIC_ASSERT(sizeof(struct lp_rast_plane) % 8 == 0); + *tri_size = (sizeof(struct lp_rast_triangle) + 3 * input_array_sz + plane_sz); diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c index ae266ceb082..c2ca8b8d6a0 100644 --- a/src/gallium/drivers/llvmpipe/lp_texture.c +++ b/src/gallium/drivers/llvmpipe/lp_texture.c @@ -434,7 +434,8 @@ llvmpipe_resource_data(struct pipe_resource *resource) static struct pipe_resource * llvmpipe_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *template, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys; struct llvmpipe_resource *lpr; @@ -485,7 +486,8 @@ no_lpr: static boolean llvmpipe_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *pt, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = llvmpipe_screen(screen)->winsys; struct llvmpipe_resource *lpr = llvmpipe_resource(pt); diff --git 
a/src/gallium/drivers/noop/noop_pipe.c b/src/gallium/drivers/noop/noop_pipe.c index 165284a90bf..fd0a5d0f830 100644 --- a/src/gallium/drivers/noop/noop_pipe.c +++ b/src/gallium/drivers/noop/noop_pipe.c @@ -114,14 +114,15 @@ static struct pipe_resource *noop_resource_create(struct pipe_screen *screen, static struct pipe_resource *noop_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct noop_pipe_screen *noop_screen = (struct noop_pipe_screen*)screen; struct pipe_screen *oscreen = noop_screen->oscreen; struct pipe_resource *result; struct pipe_resource *noop_resource; - result = oscreen->resource_from_handle(oscreen, templ, handle); + result = oscreen->resource_from_handle(oscreen, templ, handle, usage); noop_resource = noop_resource_create(screen, result); pipe_resource_reference(&result, NULL); return noop_resource; @@ -129,7 +130,8 @@ static struct pipe_resource *noop_resource_from_handle(struct pipe_screen *scree static boolean noop_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { return FALSE; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp index f58cf97646e..84ebfdb1cba 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_build_util.cpp @@ -585,6 +585,7 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, return NULL; srcNr = 2; break; + case OP_SELP: srcNr = 3; break; default: // TODO when needed return NULL; @@ -601,7 +602,10 @@ BuildUtil::split64BitOpPostRA(Function *fn, Instruction *i, for (int s = 0; s < srcNr; ++s) { if (lo->getSrc(s)->reg.size < 8) { - hi->setSrc(s, zero); + if (s == 2) + hi->setSrc(s, lo->getSrc(s)); + else + hi->setSrc(s, zero); } else { if (lo->getSrc(s)->refCount() > 1) lo->setSrc(s, cloneShallow(fn, lo->getSrc(s))); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h index 4504240ac5e..9f7d2572bbe 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h @@ -146,7 +146,6 @@ struct nv50_ir_prog_info bool earlyFragTests; bool separateFragData; bool usesDiscard; - bool sampleInterp; /* perform sample interp on all fp inputs */ } fp; struct { uint32_t inputOffset; /* base address for user args */ diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp index b6b3ec7b948..0d7d95e3105 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gk110.cpp @@ -1273,15 +1273,41 @@ CodeEmitterGK110::emitBAR(const Instruction *i) case NV50_IR_SUBOP_BAR_RED_OR: code[1] |= 0x90; break; case NV50_IR_SUBOP_BAR_RED_POPC: code[1] |= 0x10; break; default: - code[1] |= 0x20; assert(i->subOp == NV50_IR_SUBOP_BAR_SYNC); break; } emitPredicate(i); - srcId(i->src(0), 10); - srcId(i->src(1), 23); + // barrier id + if (i->src(0).getFile() == FILE_GPR) { + srcId(i->src(0), 10); + } else { + ImmediateValue *imm = i->getSrc(0)->asImm(); + assert(imm); + code[0] |= imm->reg.data.u32 << 10; + code[1] |= 0x8000; + } + + // thread count + if (i->src(1).getFile() == FILE_GPR) { + srcId(i->src(1), 23); + } else { + 
ImmediateValue *imm = i->getSrc(1)->asImm(); + assert(imm); + assert(imm->reg.data.u32 <= 0xfff); + code[0] |= imm->reg.data.u32 << 23; + code[1] |= imm->reg.data.u32 >> 9; + code[1] |= 0x4000; + } + + if (i->srcExists(2) && (i->predSrc != 2)) { + srcId(i->src(2), 32 + 10); + if (i->src(2).mod == Modifier(NV50_IR_MOD_NOT)) + code[1] |= 1 << 13; + } else { + code[1] |= 7 << 10; + } } void CodeEmitterGK110::emitMEMBAR(const Instruction *i) @@ -1386,7 +1412,7 @@ CodeEmitterGK110::emitVOTE(const Instruction *i) defId(i->def(0), 2); defId(i->def(1), 48); if (i->src(0).mod == Modifier(NV50_IR_MOD_NOT)) - code[0] |= 1 << 45; + code[1] |= 1 << 13; srcId(i->src(0), 42); } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp index a383c53fcd3..e079a574cc8 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_gm107.cpp @@ -177,6 +177,7 @@ private: void emitAL2P(); void emitIPA(); void emitATOM(); + void emitATOMS(); void emitCCTL(); void emitPIXLD(); @@ -194,6 +195,7 @@ private: void emitKIL(); void emitOUT(); + void emitBAR(); void emitMEMBAR(); void emitVOTE(); @@ -2373,6 +2375,45 @@ CodeEmitterGM107::emitATOM() } void +CodeEmitterGM107::emitATOMS() +{ + unsigned dType, subOp; + + if (insn->subOp == NV50_IR_SUBOP_ATOM_CAS) { + switch (insn->dType) { + case TYPE_U32: dType = 0; break; + case TYPE_U64: dType = 1; break; + default: assert(!"unexpected dType"); dType = 0; break; + } + subOp = 4; + + emitInsn (0xee000000); + emitField(0x34, 1, dType); + } else { + switch (insn->dType) { + case TYPE_U32: dType = 0; break; + case TYPE_S32: dType = 1; break; + case TYPE_U64: dType = 2; break; + case TYPE_S64: dType = 3; break; + default: assert(!"unexpected dType"); dType = 0; break; + } + + if (insn->subOp == NV50_IR_SUBOP_ATOM_EXCH) + subOp = 8; + else + subOp = insn->subOp; + + emitInsn (0xec000000); + emitField(0x1c, 3, dType); + } + + emitField(0x34, 4, subOp); + emitGPR (0x14, insn->src(1)); + emitADDR (0x08, 0x12, 22, 0, insn->src(0)); + emitGPR (0x00, insn->def(0)); +} + +void CodeEmitterGM107::emitCCTL() { unsigned width; @@ -2649,6 +2690,54 @@ CodeEmitterGM107::emitOUT() } void +CodeEmitterGM107::emitBAR() +{ + uint8_t subop; + + emitInsn (0xf0a80000); + + switch (insn->subOp) { + case NV50_IR_SUBOP_BAR_RED_POPC: subop = 0x02; break; + case NV50_IR_SUBOP_BAR_RED_AND: subop = 0x0a; break; + case NV50_IR_SUBOP_BAR_RED_OR: subop = 0x12; break; + case NV50_IR_SUBOP_BAR_ARRIVE: subop = 0x81; break; + default: + subop = 0x80; + assert(insn->subOp == NV50_IR_SUBOP_BAR_SYNC); + break; + } + + emitField(0x20, 8, subop); + + // barrier id + if (insn->src(0).getFile() == FILE_GPR) { + emitGPR(0x08, insn->src(0)); + } else { + ImmediateValue *imm = insn->getSrc(0)->asImm(); + assert(imm); + emitField(0x08, 8, imm->reg.data.u32); + emitField(0x2b, 1, 1); + } + + // thread count + if (insn->src(1).getFile() == FILE_GPR) { + emitGPR(0x14, insn->src(1)); + } else { + ImmediateValue *imm = insn->getSrc(1)->asImm(); + assert(imm); + emitField(0x14, 12, imm->reg.data.u32); + emitField(0x2c, 1, 1); + } + + if (insn->srcExists(2) && (insn->predSrc != 2)) { + emitPRED (0x27, insn->src(2)); + emitField(0x2a, 1, insn->src(2).mod == Modifier(NV50_IR_MOD_NOT)); + } else { + emitField(0x27, 3, 7); + } +} + +void CodeEmitterGM107::emitMEMBAR() { emitInsn (0xef980000); @@ -2918,7 +3007,10 @@ CodeEmitterGM107::emitInstruction(Instruction *i) } break; case OP_ATOM: - 
emitATOM(); + if (insn->src(0).getFile() == FILE_MEMORY_SHARED) + emitATOMS(); + else + emitATOM(); break; case OP_CCTL: emitCCTL(); @@ -2978,6 +3070,9 @@ CodeEmitterGM107::emitInstruction(Instruction *i) case OP_RESTART: emitOUT(); break; + case OP_BAR: + emitBAR(); + break; case OP_MEMBAR: emitMEMBAR(); break; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp index 7bd7c732c49..8b9328b6296 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_emit_nvc0.cpp @@ -1482,6 +1482,7 @@ CodeEmitterNVC0::emitBAR(const Instruction *i) } else { ImmediateValue *imm = i->getSrc(1)->asImm(); assert(imm); + assert(imm->reg.data.u32 <= 0xfff); code[0] |= imm->reg.data.u32 << 26; code[1] |= imm->reg.data.u32 >> 6; code[1] |= 0x4000; diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp index d06e9efa463..d284446f5d9 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_from_tgsi.cpp @@ -525,6 +525,7 @@ nv50_ir::DataType Instruction::inferSrcType() const case TGSI_OPCODE_DRCP: case TGSI_OPCODE_DSQRT: case TGSI_OPCODE_DMAD: + case TGSI_OPCODE_DFMA: case TGSI_OPCODE_DFRAC: case TGSI_OPCODE_DRSQ: case TGSI_OPCODE_DTRUNC: @@ -615,6 +616,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(RCP, RCP); NV50_IR_OPCODE_CASE(RSQ, RSQ); + NV50_IR_OPCODE_CASE(SQRT, SQRT); NV50_IR_OPCODE_CASE(MUL, MUL); NV50_IR_OPCODE_CASE(ADD, ADD); @@ -624,6 +626,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(SLT, SET); NV50_IR_OPCODE_CASE(SGE, SET); NV50_IR_OPCODE_CASE(MAD, MAD); + NV50_IR_OPCODE_CASE(FMA, FMA); NV50_IR_OPCODE_CASE(SUB, SUB); NV50_IR_OPCODE_CASE(FLR, FLOOR); @@ -723,6 +726,7 @@ static nv50_ir::operation translateOpcode(uint opcode) NV50_IR_OPCODE_CASE(DRCP, RCP); NV50_IR_OPCODE_CASE(DSQRT, SQRT); NV50_IR_OPCODE_CASE(DMAD, MAD); + NV50_IR_OPCODE_CASE(DFMA, FMA); NV50_IR_OPCODE_CASE(D2I, CVT); NV50_IR_OPCODE_CASE(D2U, CVT); NV50_IR_OPCODE_CASE(I2D, CVT); @@ -1182,10 +1186,6 @@ bool Source::scanDeclaration(const struct tgsi_full_declaration *decl) case TGSI_SEMANTIC_VERTEXID: info->io.vertexId = first; break; - case TGSI_SEMANTIC_SAMPLEID: - case TGSI_SEMANTIC_SAMPLEPOS: - info->prop.fp.sampleInterp = 1; - break; case TGSI_SEMANTIC_BASEVERTEX: case TGSI_SEMANTIC_BASEINSTANCE: case TGSI_SEMANTIC_DRAWID: @@ -1564,7 +1564,7 @@ Converter::translateInterpMode(const struct nv50_ir_varying *var, operation& op) op = (mode == NV50_IR_INTERP_PERSPECTIVE || mode == NV50_IR_INTERP_SC) ? 
OP_PINTERP : OP_LINTERP; - if (var->centroid || info->prop.fp.sampleInterp) + if (var->centroid) mode |= NV50_IR_INTERP_CENTROID; return mode; @@ -2676,6 +2676,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_MAD: case TGSI_OPCODE_UMAD: case TGSI_OPCODE_SAD: + case TGSI_OPCODE_FMA: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = fetchSrc(0, c); src1 = fetchSrc(1, c); @@ -2689,6 +2690,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) case TGSI_OPCODE_FLR: case TGSI_OPCODE_TRUNC: case TGSI_OPCODE_RCP: + case TGSI_OPCODE_SQRT: case TGSI_OPCODE_IABS: case TGSI_OPCODE_INEG: case TGSI_OPCODE_NOT: @@ -3399,6 +3401,7 @@ Converter::handleInstruction(const struct tgsi_full_instruction *insn) } break; case TGSI_OPCODE_DMAD: + case TGSI_OPCODE_DFMA: FOR_EACH_DST_ENABLED_CHANNEL(0, c, tgsi) { src0 = getSSA(8); src1 = getSSA(8); diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp index 8752b0c8c54..12c5f699603 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp @@ -1203,10 +1203,9 @@ NV50LoweringPreSSA::handleDIV(Instruction *i) bool NV50LoweringPreSSA::handleSQRT(Instruction *i) { - Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32, - bld.getSSA(), i->getSrc(0)); - i->op = OP_MUL; - i->setSrc(1, rsq->getDef(0)); + bld.setPosition(i, true); + i->op = OP_RSQ; + bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp index d181f1574f1..d0936d88d60 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp @@ -1778,22 +1778,21 @@ NVC0LoweringPass::handleMOD(Instruction *i) bool NVC0LoweringPass::handleSQRT(Instruction *i) { - Value *pred = bld.getSSA(1, FILE_PREDICATE); - Value *zero = bld.getSSA(); - Instruction *rsq; - - bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0)); - if (i->dType == TYPE_F64) - zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero); - bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); - bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred); - rsq = bld.mkOp1(OP_RSQ, i->dType, - bld.getSSA(typeSizeof(i->dType)), i->getSrc(0)); - rsq->setPredicate(CC_NOT_P, pred); - i->op = OP_MUL; - i->setSrc(1, rsq->getDef(0)); - i->setPredicate(CC_NOT_P, pred); - + if (i->dType == TYPE_F64) { + Value *pred = bld.getSSA(1, FILE_PREDICATE); + Value *zero = bld.loadImm(NULL, 0.0d); + Value *dst = bld.getSSA(8); + bld.mkOp1(OP_RSQ, i->dType, dst, i->getSrc(0)); + bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero); + bld.mkOp3(OP_SELP, TYPE_U64, dst, zero, dst, pred); + i->op = OP_MUL; + i->setSrc(1, dst); + // TODO: Handle this properly with a library function + } else { + bld.setPosition(i, true); + i->op = OP_RSQ; + bld.mkOp1(OP_RCP, i->dType, i->getDef(0), i->getDef(0)); + } return true; } diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp index 6192c0665e4..66e7b2e8243 100644 --- a/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp +++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_peephole.cpp @@ -1635,11 +1635,10 @@ AlgebraicOpt::tryADDToMADOrSAD(Instruction *add, 
operation toOp) if (src->getUniqueInsn() && src->getUniqueInsn()->bb != add->bb) return false; - if (src->getInsn()->saturate) + if (src->getInsn()->saturate || src->getInsn()->postFactor || + src->getInsn()->dnz) return false; - if (src->getInsn()->postFactor) - return false; if (toOp == OP_SAD) { ImmediateValue imm; if (!src->getInsn()->src(2).getImmediate(imm)) diff --git a/src/gallium/drivers/nouveau/nouveau_debug.h b/src/gallium/drivers/nouveau/nouveau_debug.h index d17df81fed2..546a4ad0af3 100644 --- a/src/gallium/drivers/nouveau/nouveau_debug.h +++ b/src/gallium/drivers/nouveau/nouveau_debug.h @@ -16,7 +16,7 @@ #define NOUVEAU_DEBUG 0 #define NOUVEAU_ERR(fmt, args...) \ - fprintf(stderr, "%s:%d - "fmt, __FUNCTION__, __LINE__, ##args) + fprintf(stderr, "%s:%d - " fmt, __FUNCTION__, __LINE__, ##args) #define NOUVEAU_DBG(ch, args...) \ if ((NOUVEAU_DEBUG) & (NOUVEAU_DEBUG_##ch)) \ diff --git a/src/gallium/drivers/nouveau/nv30/nv30_resource.c b/src/gallium/drivers/nouveau/nv30/nv30_resource.c index a98a6464de8..4d215d2e616 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_resource.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_resource.c @@ -66,7 +66,8 @@ nv30_resource_create(struct pipe_screen *pscreen, static struct pipe_resource * nv30_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { if (tmpl->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/nouveau/nv30/nv30_screen.c b/src/gallium/drivers/nouveau/nv30/nv30_screen.c index 5be7a3dab76..b105c6aeb80 100644 --- a/src/gallium/drivers/nouveau/nv30/nv30_screen.c +++ b/src/gallium/drivers/nouveau/nv30/nv30_screen.c @@ -188,6 +188,10 @@ nv30_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_resource.c b/src/gallium/drivers/nouveau/nv50/nv50_resource.c index 5d415ae77eb..ad5f3b814db 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_resource.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_resource.c @@ -22,7 +22,8 @@ nv50_resource_create(struct pipe_screen *screen, static struct pipe_resource * nv50_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (templ->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c index 0bd5de91d1f..5836bb23764 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c @@ -232,6 +232,10 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: @@ -301,7 +305,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 0; /* please inline, or provide function 
declarations */ case PIPE_SHADER_CAP_INTEGERS: diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c index 6a09808807a..8504ba466cc 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_state.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c @@ -1245,7 +1245,7 @@ nv50_set_global_bindings(struct pipe_context *pipe, nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL); - nv50->dirty_cp = NV50_NEW_CP_GLOBALS; + nv50->dirty_cp |= NV50_NEW_CP_GLOBALS; } void diff --git a/src/gallium/drivers/nouveau/nv50/nv50_surface.c b/src/gallium/drivers/nouveau/nv50/nv50_surface.c index 4db73cb7fef..84646f6adb1 100644 --- a/src/gallium/drivers/nouveau/nv50/nv50_surface.c +++ b/src/gallium/drivers/nouveau/nv50/nv50_surface.c @@ -1756,6 +1756,7 @@ nv50_blitter_destroy(struct nv50_screen *screen) } } + pipe_mutex_destroy(blitter->mutex); FREE(blitter); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c index 060f59d0c73..ffbb16f79de 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_compute.c @@ -262,35 +262,29 @@ nvc0_compute_validate_globals(struct nvc0_context *nvc0) } } +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nvc0_compute_validate_constbufs, NVC0_NEW_CP_CONSTBUF }, + { nvc0_compute_validate_driverconst, NVC0_NEW_CP_DRIVERCONST }, + { nvc0_compute_validate_buffers, NVC0_NEW_CP_BUFFERS }, + { nvc0_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nvc0_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, +}; + static bool -nvc0_compute_state_validate(struct nvc0_context *nvc0) +nvc0_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) { - nvc0_compprog_validate(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_CONSTBUF) - nvc0_compute_validate_constbufs(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_DRIVERCONST) - nvc0_compute_validate_driverconst(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_BUFFERS) - nvc0_compute_validate_buffers(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) - nvc0_compute_validate_textures(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) - nvc0_compute_validate_samplers(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS) - nvc0_compute_validate_globals(nvc0); - - /* TODO: surfaces */ - - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); - - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return false; - if (unlikely(nvc0->state.flushed)) - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + bool ret; - return true; + ret = nvc0_state_validate(nvc0, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, + nvc0->bufctx_cp); + if (unlikely(nvc0->state.flushed)) + nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); + return ret; } static void @@ -326,7 +320,7 @@ nvc0_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) unsigned s; int ret; - ret = !nvc0_compute_state_validate(nvc0); + ret = !nvc0_state_validate_cp(nvc0, ~0); if (ret) { NOUVEAU_ERR("Failed to launch grid !\n"); return; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h index 0f1ebb0a6e2..54afe887ebd 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_context.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_context.h @@ -262,7 +262,15 @@ void 
nvc0_tfb_validate(struct nvc0_context *); extern void nvc0_init_state_functions(struct nvc0_context *); /* nvc0_state_validate.c */ -bool nvc0_state_validate(struct nvc0_context *, uint32_t state_mask); +struct nvc0_state_validate { + void (*func)(struct nvc0_context *); + uint32_t states; +}; + +bool nvc0_state_validate(struct nvc0_context *, uint32_t, + struct nvc0_state_validate *, int, uint32_t *, + struct nouveau_bufctx *); +bool nvc0_state_validate_3d(struct nvc0_context *, uint32_t); /* nvc0_surface.c */ extern void nvc0_clear(struct pipe_context *, unsigned buffers, diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c index d2acce7d5be..92ca613cda1 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query.c @@ -204,10 +204,7 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, if (screen->base.drm->version >= 0x01000101) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += 2; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { count += 2; } } @@ -227,29 +224,16 @@ nvc0_screen_get_driver_query_group_info(struct pipe_screen *pscreen, * currently only used by AMD_performance_monitor. */ info->max_active_queries = 1; - - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->num_queries = NVE4_HW_SM_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - info->num_queries = NVC0_HW_SM_QUERY_COUNT; - return 1; - } + info->num_queries = nvc0_hw_sm_get_num_queries(screen); + return 1; } } else if (id == NVC0_HW_METRIC_QUERY_GROUP) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = "Performance metrics"; - info->max_active_queries = 1; - info->num_queries = NVE4_HW_METRIC_QUERY_COUNT; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { info->name = "Performance metrics"; info->max_active_queries = 1; - info->num_queries = NVC0_HW_METRIC_QUERY_COUNT; + info->num_queries = nvc0_hw_metric_get_num_queries(screen); return 1; } } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c index 7a64b69b1c1..b961cbf652e 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.c @@ -24,32 +24,51 @@ #include "nvc0/nvc0_query_hw_metric.h" #include "nvc0/nvc0_query_hw_sm.h" -/* === PERFORMANCE MONITORING METRICS for NVC0:NVE4 === */ -static const char *nvc0_hw_metric_names[] = -{ - "metric-achieved_occupancy", - "metric-branch_efficiency", - "metric-inst_issued", - "metric-inst_per_wrap", - "metric-inst_replay_overhead", - "metric-issued_ipc", - "metric-issue_slots", - "metric-issue_slot_utilization", - "metric-ipc", +#define _Q(t,n) { NVC0_HW_METRIC_QUERY_##t, n } +struct { + unsigned type; + const char *name; +} nvc0_hw_metric_queries[] = { + _Q(ACHIEVED_OCCUPANCY, "metric-achieved_occupancy" ), + _Q(BRANCH_EFFICIENCY, "metric-branch_efficiency" ), + _Q(INST_ISSUED, "metric-inst_issued" ), + _Q(INST_PER_WRAP, "metric-inst_per_wrap" ), + _Q(INST_REPLAY_OVERHEAD, "metric-inst_replay_overhead" ), + _Q(ISSUED_IPC, "metric-issued_ipc" ), + _Q(ISSUE_SLOTS, "metric-issue_slots" ), + _Q(ISSUE_SLOT_UTILIZATION, "metric-issue_slot_utilization" ), + _Q(IPC, "metric-ipc" ), + _Q(SHARED_REPLAY_OVERHEAD, "metric-shared_replay_overhead" ), }; 
+#undef _Q + +static inline const char * +nvc0_hw_metric_query_get_name(unsigned query_type) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_metric_queries); i++) { + if (nvc0_hw_metric_queries[i].type == query_type) + return nvc0_hw_metric_queries[i].name; + } + assert(0); + return NULL; +} + struct nvc0_hw_metric_query_cfg { + unsigned type; uint32_t queries[8]; uint32_t num_queries; }; #define _SM(n) NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_ ##n) -#define _M(n, c) [NVC0_HW_METRIC_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_metric_query_cfg sm20_achieved_occupancy = { + .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, .queries[0] = _SM(ACTIVE_WARPS), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -58,6 +77,7 @@ sm20_achieved_occupancy = static const struct nvc0_hw_metric_query_cfg sm20_branch_efficiency = { + .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, .queries[0] = _SM(BRANCH), .queries[1] = _SM(DIVERGENT_BRANCH), .num_queries = 2, @@ -66,6 +86,7 @@ sm20_branch_efficiency = static const struct nvc0_hw_metric_query_cfg sm20_inst_per_wrap = { + .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(WARPS_LAUNCHED), .num_queries = 2, @@ -74,6 +95,7 @@ sm20_inst_per_wrap = static const struct nvc0_hw_metric_query_cfg sm20_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED), .queries[1] = _SM(INST_EXECUTED), .num_queries = 2, @@ -82,6 +104,16 @@ sm20_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm20_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, + .queries[0] = _SM(INST_ISSUED), + .queries[1] = _SM(ACTIVE_CYCLES), + .num_queries = 2, +}; + +static const struct nvc0_hw_metric_query_cfg +sm20_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, .queries[0] = _SM(INST_ISSUED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -90,6 +122,7 @@ sm20_issued_ipc = static const struct nvc0_hw_metric_query_cfg sm20_ipc = { + .type = NVC0_HW_METRIC_QUERY_IPC, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -97,21 +130,20 @@ sm20_ipc = static const struct nvc0_hw_metric_query_cfg *sm20_hw_metric_queries[] = { - _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), - _M(INST_ISSUED, NULL), - _M(INST_PER_WRAP, &sm20_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm20_inst_replay_overhead), - _M(ISSUED_IPC, &sm20_issued_ipc), - _M(ISSUE_SLOTS, NULL), - _M(ISSUE_SLOT_UTILIZATION, &sm20_issued_ipc), - _M(IPC, &sm20_ipc), + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm20_inst_per_wrap, + &sm20_inst_replay_overhead, + &sm20_issued_ipc, + &sm20_issue_slot_utilization, + &sm20_ipc, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_metric_query_cfg sm21_inst_issued = { + .type = NVC0_HW_METRIC_QUERY_INST_ISSUED, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -122,6 +154,7 @@ sm21_inst_issued = static const struct nvc0_hw_metric_query_cfg sm21_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -133,6 +166,7 @@ sm21_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm21_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, .queries[0] = 
_SM(INST_ISSUED1_0), .queries[1] = _SM(INST_ISSUED1_1), .queries[2] = _SM(INST_ISSUED2_0), @@ -141,44 +175,36 @@ sm21_issued_ipc = .num_queries = 5, }; -static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +static const struct nvc0_hw_metric_query_cfg +sm21_issue_slot_utilization = { - _M(ACHIEVED_OCCUPANCY, &sm20_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm20_branch_efficiency), - _M(INST_ISSUED, &sm21_inst_issued), - _M(INST_PER_WRAP, &sm20_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm21_inst_replay_overhead), - _M(ISSUED_IPC, &sm21_issued_ipc), - _M(ISSUE_SLOTS, &sm21_inst_issued), - _M(ISSUE_SLOT_UTILIZATION, &sm21_issued_ipc), - _M(IPC, &sm20_ipc), + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, + .queries[0] = _SM(INST_ISSUED1_0), + .queries[1] = _SM(INST_ISSUED1_1), + .queries[2] = _SM(INST_ISSUED2_0), + .queries[3] = _SM(INST_ISSUED2_1), + .queries[4] = _SM(ACTIVE_CYCLES), + .num_queries = 5, }; -#undef _SM -#undef _M - -/* === PERFORMANCE MONITORING METRICS for NVE4+ === */ -static const char *nve4_hw_metric_names[] = -{ - "metric-achieved_occupancy", - "metric-branch_efficiency", - "metric-inst_issued", - "metric-inst_per_wrap", - "metric-inst_replay_overhead", - "metric-issued_ipc", - "metric-issue_slots", - "metric-issue_slot_utilization", - "metric-ipc", - "metric-shared_replay_overhead", +static const struct nvc0_hw_metric_query_cfg *sm21_hw_metric_queries[] = +{ + &sm20_achieved_occupancy, + &sm20_branch_efficiency, + &sm21_inst_issued, + &sm20_inst_per_wrap, + &sm21_inst_replay_overhead, + &sm21_issued_ipc, + &sm21_inst_issued, + &sm21_issue_slot_utilization, + &sm20_ipc, }; -#define _SM(n) NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_ ##n) -#define _M(n, c) [NVE4_HW_METRIC_QUERY_##n] = c - /* ==== Compute capability 3.0 (GK104/GK106/GK107) ==== */ static const struct nvc0_hw_metric_query_cfg sm30_achieved_occupancy = { + .type = NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY, .queries[0] = _SM(ACTIVE_WARPS), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -187,6 +213,7 @@ sm30_achieved_occupancy = static const struct nvc0_hw_metric_query_cfg sm30_branch_efficiency = { + .type = NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY, .queries[0] = _SM(BRANCH), .queries[1] = _SM(DIVERGENT_BRANCH), .num_queries = 2, @@ -195,6 +222,7 @@ sm30_branch_efficiency = static const struct nvc0_hw_metric_query_cfg sm30_inst_issued = { + .type = NVC0_HW_METRIC_QUERY_INST_ISSUED, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .num_queries = 2, @@ -203,6 +231,7 @@ sm30_inst_issued = static const struct nvc0_hw_metric_query_cfg sm30_inst_per_wrap = { + .type = NVC0_HW_METRIC_QUERY_INST_PER_WRAP, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(WARPS_LAUNCHED), .num_queries = 2, @@ -211,6 +240,7 @@ sm30_inst_per_wrap = static const struct nvc0_hw_metric_query_cfg sm30_inst_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .queries[2] = _SM(INST_EXECUTED), @@ -220,6 +250,17 @@ sm30_inst_replay_overhead = static const struct nvc0_hw_metric_query_cfg sm30_issued_ipc = { + .type = NVC0_HW_METRIC_QUERY_ISSUED_IPC, + .queries[0] = _SM(INST_ISSUED1), + .queries[1] = _SM(INST_ISSUED2), + .queries[2] = _SM(ACTIVE_CYCLES), + .num_queries = 3, +}; + +static const struct nvc0_hw_metric_query_cfg +sm30_issue_slot_utilization = +{ + .type = NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, .queries[0] = _SM(INST_ISSUED1), .queries[1] = _SM(INST_ISSUED2), .queries[2] = _SM(ACTIVE_CYCLES), @@ 
-229,6 +270,7 @@ sm30_issued_ipc = static const struct nvc0_hw_metric_query_cfg sm30_ipc = { + .type = NVC0_HW_METRIC_QUERY_IPC, .queries[0] = _SM(INST_EXECUTED), .queries[1] = _SM(ACTIVE_CYCLES), .num_queries = 2, @@ -237,6 +279,7 @@ sm30_ipc = static const struct nvc0_hw_metric_query_cfg sm30_shared_replay_overhead = { + .type = NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, .queries[0] = _SM(SHARED_LD_REPLAY), .queries[1] = _SM(SHARED_ST_REPLAY), .queries[2] = _SM(INST_EXECUTED), @@ -245,44 +288,89 @@ sm30_shared_replay_overhead = static const struct nvc0_hw_metric_query_cfg *sm30_hw_metric_queries[] = { - _M(ACHIEVED_OCCUPANCY, &sm30_achieved_occupancy), - _M(BRANCH_EFFICIENCY, &sm30_branch_efficiency), - _M(INST_ISSUED, &sm30_inst_issued), - _M(INST_PER_WRAP, &sm30_inst_per_wrap), - _M(INST_REPLAY_OVERHEAD, &sm30_inst_replay_overhead), - _M(ISSUED_IPC, &sm30_issued_ipc), - _M(ISSUE_SLOTS, &sm30_inst_issued), - _M(ISSUE_SLOT_UTILIZATION, &sm30_issued_ipc), - _M(IPC, &sm30_ipc), - _M(SHARED_REPLAY_OVERHEAD, &sm30_shared_replay_overhead), + &sm30_achieved_occupancy, + &sm30_branch_efficiency, + &sm30_inst_issued, + &sm30_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm30_issued_ipc, + &sm30_inst_issued, + &sm30_issue_slot_utilization, + &sm30_ipc, + &sm30_shared_replay_overhead, +}; + +/* ==== Compute capability 3.5 (GK110) ==== */ +static const struct nvc0_hw_metric_query_cfg *sm35_hw_metric_queries[] = +{ + &sm30_achieved_occupancy, + &sm30_inst_issued, + &sm30_inst_per_wrap, + &sm30_inst_replay_overhead, + &sm30_issued_ipc, + &sm30_inst_issued, + &sm30_issue_slot_utilization, + &sm30_ipc, + &sm30_shared_replay_overhead, }; #undef _SM -#undef _M static inline const struct nvc0_hw_metric_query_cfg ** nvc0_hw_metric_get_queries(struct nvc0_screen *screen) { struct nouveau_device *dev = screen->base.device; - if (dev->chipset == 0xc0 || dev->chipset == 0xc8) - return sm20_hw_metric_queries; - return sm21_hw_metric_queries; + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return sm35_hw_metric_queries; + case NVE4_3D_CLASS: + return sm30_hw_metric_queries; + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_metric_queries; + return sm21_hw_metric_queries; + } + assert(0); + return NULL; +} + +unsigned +nvc0_hw_metric_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return ARRAY_SIZE(sm35_hw_metric_queries); + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_metric_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_metric_queries); + return ARRAY_SIZE(sm21_hw_metric_queries); + } + return 0; } static const struct nvc0_hw_metric_query_cfg * -nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, - struct nvc0_hw_query *hq) +nvc0_hw_metric_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { const struct nvc0_hw_metric_query_cfg **queries; struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return sm30_hw_metric_queries[q->type - NVE4_HW_METRIC_QUERY(0)]; - + num_queries = nvc0_hw_metric_get_num_queries(screen); queries = nvc0_hw_metric_get_queries(screen); - return queries[q->type - NVC0_HW_METRIC_QUERY(0)]; + + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_METRIC_QUERY(queries[i]->type) == q->type) + return queries[i]; + } + assert(0); + return NULL; } static void @@ 
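Because every config now records its own .type, the per-chip tables can be dense and ordered however is convenient; selection happens in two steps: pick a table by 3D class, then search it by type. A compilable sketch of that shape (classes, types and tables are invented, and the SM20 path is omitted):

    #include <assert.h>
    #include <stddef.h>
    #include <stdio.h>

    enum chip_class { CHIP_SM20, CHIP_SM30, CHIP_SM35 };

    struct query_cfg { unsigned type; const char *name; };

    static const struct query_cfg cfg_ipc       = { 1, "ipc" };
    static const struct query_cfg cfg_occupancy = { 2, "occupancy" };

    static const struct query_cfg *sm30_table[] = { &cfg_occupancy, &cfg_ipc };
    static const struct query_cfg *sm35_table[] = { &cfg_ipc };

    #define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

    static const struct query_cfg **
    get_queries(enum chip_class cls, size_t *count)
    {
       switch (cls) {
       case CHIP_SM35: *count = ARRAY_SIZE(sm35_table); return sm35_table;
       case CHIP_SM30: *count = ARRAY_SIZE(sm30_table); return sm30_table;
       default:        *count = 0;                      return NULL;
       }
    }

    /* linear search by embedded type, replacing direct array indexing */
    static const struct query_cfg *
    get_cfg(enum chip_class cls, unsigned type)
    {
       size_t n, i;
       const struct query_cfg **table = get_queries(cls, &n);

       for (i = 0; i < n; i++)
          if (table[i]->type == type)
             return table[i];
       assert(!"invalid query type");
       return NULL;
    }

    int main(void)
    {
       printf("%s\n", get_cfg(CHIP_SM30, 1)->name);   /* prints: ipc */
       return 0;
    }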
-419,47 +507,47 @@ sm21_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) static uint64_t sm30_hw_metric_calc_result(struct nvc0_hw_query *hq, uint64_t res64[8]) { - switch (hq->base.type - NVE4_HW_METRIC_QUERY(0)) { - case NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: + switch (hq->base.type - NVC0_HW_METRIC_QUERY(0)) { + case NVC0_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY: /* (active_warps / active_cycles) / max. number of warps on a MP */ if (res64[1]) return (res64[0] / (double)res64[1]) / 64; break; - case NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY: + case NVC0_HW_METRIC_QUERY_BRANCH_EFFICIENCY: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_INST_ISSUED: + case NVC0_HW_METRIC_QUERY_INST_ISSUED: /* inst_issued1 + inst_issued2 * 2 */ return res64[0] + res64[1] * 2; - case NVE4_HW_METRIC_QUERY_INST_PER_WRAP: + case NVC0_HW_METRIC_QUERY_INST_PER_WRAP: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: + case NVC0_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD: /* (metric-inst_issued - inst_executed) / inst_executed */ if (res64[2]) return (((res64[0] + res64[1] * 2) - res64[2]) / (double)res64[2]); break; - case NVE4_HW_METRIC_QUERY_ISSUED_IPC: + case NVC0_HW_METRIC_QUERY_ISSUED_IPC: /* metric-inst_issued / active_cycles */ if (res64[2]) return (res64[0] + res64[1] * 2) / (double)res64[2]; break; - case NVE4_HW_METRIC_QUERY_ISSUE_SLOTS: + case NVC0_HW_METRIC_QUERY_ISSUE_SLOTS: /* inst_issued1 + inst_issued2 */ return res64[0] + res64[1]; - case NVE4_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: + case NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION: /* ((metric-issue_slots / 2) / active_cycles) * 100 */ if (res64[2]) return (((res64[0] + res64[1]) / 2) / (double)res64[2]) * 100; break; - case NVE4_HW_METRIC_QUERY_IPC: + case NVC0_HW_METRIC_QUERY_IPC: return sm20_hw_metric_calc_result(hq, res64); - case NVE4_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD: + case NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD: /* (shared_load_replay + shared_store_replay) / inst_executed */ if (res64[2]) return (res64[0] + res64[1]) / (double)res64[2]; break; default: debug_printf("invalid metric type: %d\n", - hq->base.type - NVE4_HW_METRIC_QUERY(0)); + hq->base.type - NVC0_HW_METRIC_QUERY(0)); break; } return 0; @@ -487,13 +575,17 @@ nvc0_hw_metric_get_query_result(struct nvc0_context *nvc0, res64[i] = *(uint64_t *)&results[i]; } - if (screen->base.class_3d >= NVE4_3D_CLASS) { + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + case NVE4_3D_CLASS: value = sm30_hw_metric_calc_result(hq, res64); - } else { + break; + default: if (dev->chipset == 0xc0 || dev->chipset == 0xc8) value = sm20_hw_metric_calc_result(hq, res64); else value = sm21_hw_metric_calc_result(hq, res64); + break; } *(uint64_t *)result = value; @@ -515,8 +607,7 @@ nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) struct nvc0_hw_query *hq; unsigned i; - if ((type < NVE4_HW_METRIC_QUERY(0) || type > NVE4_HW_METRIC_QUERY_LAST) && - (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST)) + if (type < NVC0_HW_METRIC_QUERY(0) || type > NVC0_HW_METRIC_QUERY_LAST) return NULL; hmq = CALLOC_STRUCT(nvc0_hw_metric_query); @@ -541,46 +632,15 @@ nvc0_hw_metric_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_metric_get_next_query_id(const struct nvc0_hw_metric_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; - - for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && 
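For concreteness, here is the arithmetic the SM30 branch of the switch above performs, with made-up counter values. res64[] follows the config's .queries[] order, here { inst_issued1, inst_issued2, active_cycles }:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
       uint64_t res64[3] = { 1000, 250, 2000 };

       /* metric-inst_issued: each dual-issue slot counts two instructions */
       uint64_t inst_issued = res64[0] + res64[1] * 2;        /* 1500 */

       /* metric-issued_ipc: instructions issued per active cycle */
       double issued_ipc = inst_issued / (double)res64[2];    /* 0.75 */

       /* metric-issue_slot_utilization: slots used per cycle, as a percent */
       double slots = res64[0] + res64[1];                    /* 1250 */
       double util = ((slots / 2) / (double)res64[2]) * 100;  /* 31.25% */

       printf("%llu %.2f %.2f%%\n",
              (unsigned long long)inst_issued, issued_ipc, util);
       return 0;   /* prints: 1500 0.75 31.25% */
    }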
queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) { - uint16_t class_3d = screen->base.class_3d; int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_METRIC_QUERY_COUNT; - } else - if (class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_metric_query_cfg **queries = - nvc0_hw_metric_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_METRIC_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_metric_get_num_queries(screen); } if (!info) @@ -588,19 +648,12 @@ nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_metric_names[id]; - info->query_type = NVE4_HW_METRIC_QUERY(id); - info->group_id = NVC0_HW_METRIC_QUERY_GROUP; - return 1; - } else - if (class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_metric_query_cfg **queries = + if (screen->base.class_3d <= NVF0_3D_CLASS) { + const struct nvc0_hw_metric_query_cfg **queries = nvc0_hw_metric_get_queries(screen); - id = nvc0_hw_metric_get_next_query_id(queries, id); - info->name = nvc0_hw_metric_names[id]; - info->query_type = NVC0_HW_METRIC_QUERY(id); + info->name = nvc0_hw_metric_query_get_name(queries[id]->type); + info->query_type = NVC0_HW_METRIC_QUERY(queries[id]->type); info->group_id = NVC0_HW_METRIC_QUERY_GROUP; return 1; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h index 06cb355db40..3203a8ca2b9 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_metric.h @@ -18,24 +18,7 @@ nvc0_hw_metric_query(struct nvc0_hw_query *hq) /* * Driver metrics queries: */ -#define NVE4_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) -#define NVE4_HW_METRIC_QUERY_LAST NVE4_HW_METRIC_QUERY(NVE4_HW_METRIC_QUERY_COUNT - 1) -enum nve4_hw_metric_queries -{ - NVE4_HW_METRIC_QUERY_ACHIEVED_OCCUPANCY = 0, - NVE4_HW_METRIC_QUERY_BRANCH_EFFICIENCY, - NVE4_HW_METRIC_QUERY_INST_ISSUED, - NVE4_HW_METRIC_QUERY_INST_PER_WRAP, - NVE4_HW_METRIC_QUERY_INST_REPLAY_OVERHEAD, - NVE4_HW_METRIC_QUERY_ISSUED_IPC, - NVE4_HW_METRIC_QUERY_ISSUE_SLOTS, - NVE4_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, - NVE4_HW_METRIC_QUERY_IPC, - NVE4_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, - NVE4_HW_METRIC_QUERY_COUNT -}; - -#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 3072 + (i)) +#define NVC0_HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) #define NVC0_HW_METRIC_QUERY_LAST NVC0_HW_METRIC_QUERY(NVC0_HW_METRIC_QUERY_COUNT - 1) enum nvc0_hw_metric_queries { @@ -48,6 +31,7 @@ enum nvc0_hw_metric_queries NVC0_HW_METRIC_QUERY_ISSUE_SLOTS, NVC0_HW_METRIC_QUERY_ISSUE_SLOT_UTILIZATION, NVC0_HW_METRIC_QUERY_IPC, + NVC0_HW_METRIC_QUERY_SHARED_REPLAY_OVERHEAD, NVC0_HW_METRIC_QUERY_COUNT }; @@ -56,4 +40,7 @@ nvc0_hw_metric_create_query(struct nvc0_context *, unsigned); int nvc0_hw_metric_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_metric_get_num_queries(struct nvc0_screen *); + #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c index f5f9bb39fd9..db36b8a1b9f 
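The header change above drops the separate NVE4_HW_METRIC_QUERY numbering and rebases the unified metric IDs at PIPE_QUERY_DRIVER_SPECIFIC + 2048, which is why a single range check now suffices in create_query. A sketch of the numbering scheme; note that PIPE_QUERY_DRIVER_SPECIFIC is stubbed with an arbitrary value here:

    #include <stdio.h>

    #define PIPE_QUERY_DRIVER_SPECIFIC 256   /* stand-in value */
    #define HW_SM_QUERY(i)     (PIPE_QUERY_DRIVER_SPECIFIC + (i))
    #define HW_METRIC_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i))

    /* SM counters and metrics occupy disjoint ranges, so one integer
     * identifies both the query family and the index within it. */
    static int is_metric_query(unsigned type, unsigned metric_count)
    {
       return type >= HW_METRIC_QUERY(0) &&
              type <= HW_METRIC_QUERY(metric_count - 1);
    }

    int main(void)
    {
       printf("%d\n", is_metric_query(HW_METRIC_QUERY(3), 10)); /* 1 */
       printf("%d\n", is_metric_query(HW_SM_QUERY(3), 10));     /* 0 */
       return 0;
    }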
100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.c @@ -30,59 +30,87 @@ #include "nvc0/nve4_compute.xml.h" #include "nvc0/nvc0_compute.xml.h" -/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ - /* NOTE: intentionally using the same names as NV */ -static const char *nve4_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_cas_count", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "global_ld_mem_divergence_replays", - "global_store_transaction", - "global_st_mem_divergence_replays", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued1", - "inst_issued2", - "l1_global_load_hit", - "l1_global_load_miss", - "__l1_global_load_transactions", - "__l1_global_store_transactions", - "l1_local_load_hit", - "l1_local_load_miss", - "l1_local_store_hit", - "l1_local_store_miss", - "l1_shared_load_transactions", - "l1_shared_store_transactions", - "local_load", - "local_load_transactions", - "local_store", - "local_store_transactions", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_load_replay", - "shared_store", - "shared_store_replay", - "sm_cta_launched", - "threads_launched", - "uncached_global_load_transaction", - "warps_launched", +#define _Q(t, n) { NVC0_HW_SM_QUERY_##t, n } +struct { + unsigned type; + const char *name; +} nvc0_hw_sm_queries[] = { + _Q(ACTIVE_CYCLES, "active_cycles" ), + _Q(ACTIVE_WARPS, "active_warps" ), + _Q(ATOM_CAS_COUNT, "atom_cas_count" ), + _Q(ATOM_COUNT, "atom_count" ), + _Q(BRANCH, "branch" ), + _Q(DIVERGENT_BRANCH, "divergent_branch" ), + _Q(GLD_REQUEST, "gld_request" ), + _Q(GLD_MEM_DIV_REPLAY, "global_ld_mem_divergence_replays" ), + _Q(GST_TRANSACTIONS, "global_store_transaction" ), + _Q(GST_MEM_DIV_REPLAY, "global_st_mem_divergence_replays" ), + _Q(GRED_COUNT, "gred_count" ), + _Q(GST_REQUEST, "gst_request" ), + _Q(INST_EXECUTED, "inst_executed" ), + _Q(INST_ISSUED, "inst_issued" ), + _Q(INST_ISSUED1, "inst_issued1" ), + _Q(INST_ISSUED2, "inst_issued2" ), + _Q(INST_ISSUED1_0, "inst_issued1_0" ), + _Q(INST_ISSUED1_1, "inst_issued1_1" ), + _Q(INST_ISSUED2_0, "inst_issued2_0" ), + _Q(INST_ISSUED2_1, "inst_issued2_1" ), + _Q(L1_GLD_HIT, "l1_global_load_hit" ), + _Q(L1_GLD_MISS, "l1_global_load_miss" ), + _Q(L1_GLD_TRANSACTIONS, "__l1_global_load_transactions" ), + _Q(L1_GST_TRANSACTIONS, "__l1_global_store_transactions" ), + _Q(L1_LOCAL_LD_HIT, "l1_local_load_hit" ), + _Q(L1_LOCAL_LD_MISS, "l1_local_load_miss" ), + _Q(L1_LOCAL_ST_HIT, "l1_local_store_hit" ), + _Q(L1_LOCAL_ST_MISS, "l1_local_store_miss" ), + _Q(L1_SHARED_LD_TRANSACTIONS, "l1_shared_load_transactions" ), + _Q(L1_SHARED_ST_TRANSACTIONS, "l1_shared_store_transactions" ), + _Q(LOCAL_LD, "local_load" ), + _Q(LOCAL_LD_TRANSACTIONS, "local_load_transactions" ), + _Q(LOCAL_ST, "local_store" ), + _Q(LOCAL_ST_TRANSACTIONS, "local_store_transactions" ), + _Q(NOT_PRED_OFF_INST_EXECUTED, "not_predicated_off_thread_inst_executed" ), + _Q(PROF_TRIGGER_0, "prof_trigger_00" ), + _Q(PROF_TRIGGER_1, "prof_trigger_01" ), + _Q(PROF_TRIGGER_2, "prof_trigger_02" ), + _Q(PROF_TRIGGER_3, "prof_trigger_03" ), + _Q(PROF_TRIGGER_4, "prof_trigger_04" ), + _Q(PROF_TRIGGER_5, "prof_trigger_05" ), + _Q(PROF_TRIGGER_6, "prof_trigger_06" ), + _Q(PROF_TRIGGER_7, "prof_trigger_07" ), + _Q(SHARED_LD, "shared_load" ), + _Q(SHARED_LD_REPLAY, 
"shared_load_replay" ), + _Q(SHARED_ST, "shared_store" ), + _Q(SHARED_ST_REPLAY, "shared_store_replay" ), + _Q(SM_CTA_LAUNCHED, "sm_cta_launched" ), + _Q(THREADS_LAUNCHED, "threads_launched" ), + _Q(TH_INST_EXECUTED, "thread_inst_executed" ), + _Q(TH_INST_EXECUTED_0, "thread_inst_executed_0" ), + _Q(TH_INST_EXECUTED_1, "thread_inst_executed_1" ), + _Q(TH_INST_EXECUTED_2, "thread_inst_executed_2" ), + _Q(TH_INST_EXECUTED_3, "thread_inst_executed_3" ), + _Q(UNCACHED_GLD_TRANSACTIONS, "uncached_global_load_transaction" ), + _Q(WARPS_LAUNCHED, "warps_launched" ), }; +#undef _Q + +static inline const char * +nvc0_hw_sm_query_get_name(unsigned query_type) +{ + unsigned i; + + for (i = 0; i < ARRAY_SIZE(nvc0_hw_sm_queries); i++) { + if (nvc0_hw_sm_queries[i].type == query_type) + return nvc0_hw_sm_queries[i].name; + } + assert(0); + return NULL; +} + +/* === PERFORMANCE MONITORING COUNTERS for NVE4+ === */ + /* Code to read out MP counters: They are accessible via mmio, too, but let's * just avoid mapping registers in userspace. We'd have to know which MPs are * enabled/present, too, and that information is not presently exposed. @@ -169,6 +197,49 @@ static const uint64_t nve4_read_hw_sm_counters_code[] = 0x8000000000001de7ULL }; +static const uint64_t nvf0_read_hw_sm_counters_code[] = +{ + /* Same kernel as GK104 */ + 0x0880808080808080ULL, + 0x86400000109c0022ULL, + 0x86400000019c0032ULL, + 0x86400000021c0002ULL, + 0x86400000029c0006ULL, + 0x86400000031c000aULL, + 0x86400000039c000eULL, + 0x86400000041c0012ULL, + 0x08ac1080108c8080ULL, + 0x86400000049c0016ULL, + 0x86400000051c001aULL, + 0x86400000059c001eULL, + 0xdb201c007f9c201eULL, + 0x64c03c00001c002aULL, + 0xc00000020a1c3021ULL, + 0x64c03c00009c002eULL, + 0x0810a0808010b810ULL, + 0xc0000001041c3025ULL, + 0x180000000020003cULL, + 0xdb201c007f9c243eULL, + 0xc1c00000301c2021ULL, + 0xc1c00000081c2431ULL, + 0xc1c00000021c2435ULL, + 0xe0800000069c2026ULL, + 0x08b010b010b010a0ULL, + 0xe0800000061c2022ULL, + 0xe4c03c00051c0032ULL, + 0xe0840000041c282aULL, + 0xe4c03c00059c0036ULL, + 0xe08040007f9c2c2eULL, + 0xe0840000049c3032ULL, + 0xfe800000001c2800ULL, + 0x080000b81080b010ULL, + 0x64c03c00011c0002ULL, + 0xe08040007f9c3436ULL, + 0xfe80000020043010ULL, + 0xfc800000281c3000ULL, + 0x18000000001c003cULL, +}; + /* For simplicity, we will allocate as many group slots as we allocate counter * slots. This means that a single counter which wants to source from 2 groups * will have to be declared as using 2 counter slots. 
This shouldn't really be @@ -187,69 +258,593 @@ struct nvc0_hw_sm_counter_cfg struct nvc0_hw_sm_query_cfg { + unsigned type; struct nvc0_hw_sm_counter_cfg ctr[8]; uint8_t num_counters; uint8_t norm[2]; /* normalization num,denom */ }; -#define _Q1A(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } -#define _Q1B(n, f, m, g, s, nu, dn) [NVE4_HW_SM_QUERY_##n] = { { { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s }, {}, {}, {} }, 1, { nu, dn } } +#define _CA(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 0, NVE4_COMPUTE_MP_PM_A_SIGSEL_##g, 0, s } +#define _CB(f, m, g, s) { f, NVE4_COMPUTE_MP_PM_FUNC_MODE_##m, 1, NVE4_COMPUTE_MP_PM_B_SIGSEL_##g, 0, s } +#define _Q(n, c) [NVE4_HW_SM_QUERY_##n] = c + +/* ==== Compute capability 3.0 (GK104:GK110) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm30_active_cycles = +{ + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, + .ctr[0] = _CB(0x0001, B6, WARP, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_active_warps = +{ + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, + .ctr[0] = _CB(0x003f, B6, WARP, 0x31483104), + .num_counters = 1, + .norm = { 2, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_atom_cas_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x000000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_atom_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_branch = +{ + .type = NVC0_HW_SM_QUERY_BRANCH, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_divergent_branch = +{ + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gld_request = +{ + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gld_mem_div_replay = +{ + .type = NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_transactions = +{ + .type = NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, MEM, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_mem_div_replay = +{ + .type = NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gred_count = +{ + .type = NVC0_HW_SM_QUERY_GRED_COUNT, + .ctr[0] = _CA(0x0001, B6, BRANCH, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_gst_request = +{ + .type = NVC0_HW_SM_QUERY_GST_REQUEST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, + .ctr[0] = _CA(0x0003, B6, EXEC, 0x00000398), + .num_counters = 1, + .norm = { 1, 1 }, 
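The _CA()/_CB() helpers introduced above build one hardware-counter config each: a function mask, a counting mode (token-pasted onto the register-name prefix), a flag for the A or B signal domain, a signal-group select and a source select. The sketch below mirrors the shape only; the struct field names are simplified and every numeric value is a placeholder, not a real NVE4 register encoding:

    #include <stdint.h>
    #include <stdio.h>

    struct sm_counter_cfg {
       uint32_t func;      /* logic function applied to the input signals */
       uint32_t mode;      /* counting mode, e.g. B6 vs. LOGOP */
       uint32_t sig_dom;   /* 0 = A-domain signals, 1 = B-domain signals */
       uint32_t sig_sel;   /* signal group select */
       uint32_t src_mask;  /* unused in these configs, kept for layout */
       uint32_t src_sel;   /* source select within the group */
    };

    #define MODE_B6 6      /* placeholder, not the hardware encoding */

    #define _CA(f, m, g, s) { (f), MODE_##m, 0, (g), 0, (s) }
    #define _CB(f, m, g, s) { (f), MODE_##m, 1, (g), 0, (s) }

    /* e.g. sm30_warps_launched above is one A-domain LAUNCH-group counter */
    static const struct sm_counter_cfg warps_launched =
       _CA(0x0001, B6, /* LAUNCH */ 0x0, 0x00000004);

    int main(void)
    {
       printf("dom=%u sel=0x%08x\n",
              (unsigned)warps_launched.sig_dom,
              (unsigned)warps_launched.src_sel);
       return 0;
    }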
+}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_issued1 = +{ + .type = NVC0_HW_SM_QUERY_INST_ISSUED1, + .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_inst_issued2 = +{ + .type = NVC0_HW_SM_QUERY_INST_ISSUED2, + .ctr[0] = _CA(0x0001, B6, ISSUE, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gld_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_gst_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, UNK0F, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_ld_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_ld_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_st_hit = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + .ctr[0] = _CB(0x0001, B6, L1, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_local_st_miss = +{ + .type = NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + .ctr[0] = _CB(0x0001, B6, L1, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_shared_ld_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_l1_shared_st_transactions = +{ + .type = NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_ld = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_LD, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_ld_transactions = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_st = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_ST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_local_st_transactions = +{ + .type = NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, TRANSACTION, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_0 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000000), + .num_counters = 1, + .norm = { 
1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_1 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_2 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_3 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, + .ctr[0] = _CA(0x0001, B6, USER, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_4 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_5 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_6 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, + .ctr[0] = _CA(0x0001, B6, USER, 0x00000018), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_prof_trigger_7 = +{ + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, + .ctr[0] = _CA(0x0001, B6, USER, 0x0000001c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_ld = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_ld_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x00000008), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_st = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST, + .ctr[0] = _CA(0x0001, B6, LDST, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_shared_st_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + .ctr[0] = _CB(0x0001, B6, REPLAY, 0x0000000c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_sm_cta_launched = +{ + .type = NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, + .ctr[0] = _CB(0x0001, B6, WARP, 0x0000001c), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_threads_launched = +{ + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + .ctr[0] = _CA(0x003f, B6, LAUNCH, 0x398a4188), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_uncached_gld_transactions = +{ + .type = NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, + .ctr[0] = _CB(0x0001, B6, MEM, 0x00000000), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm30_warps_launched = +{ + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, + .ctr[0] = _CA(0x0001, B6, LAUNCH, 0x00000004), + .num_counters = 1, + .norm = { 1, 1 }, +}; /* NOTES: * active_warps: bit 0 alternates btw 0 and 1 for odd nr of warps * inst_executed etc.: we only count a single warp scheduler */ -static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = -{ - _Q1B(ACTIVE_CYCLES, 0x0001, B6, WARP, 0x00000000, 1, 1), - _Q1B(ACTIVE_WARPS, 0x003f, B6, WARP, 0x31483104, 2, 1), - _Q1A(ATOM_CAS_COUNT, 0x0001, B6, BRANCH, 0x000000004, 1, 1), - _Q1A(ATOM_COUNT, 0x0001, B6, BRANCH, 0x00000000, 1, 1), 
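Each config also carries .norm = { numerator, denominator }, which per the struct comment is a normalization applied to the raw counter result; active_warps on SM30, for instance, uses { 2, 1 }. How the scaling is applied at readout is sketched below as an assumption, not Mesa's exact readback code:

    #include <stdint.h>
    #include <stdio.h>

    struct query_cfg { uint8_t norm[2]; };

    /* assumed application: scale the summed counter by num/denom */
    static uint64_t
    apply_norm(const struct query_cfg *cfg, uint64_t raw)
    {
       return raw * cfg->norm[0] / cfg->norm[1];
    }

    int main(void)
    {
       struct query_cfg active_warps = { .norm = { 2, 1 } };
       printf("%llu\n",
              (unsigned long long)apply_norm(&active_warps, 500)); /* 1000 */
       return 0;
    }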
- _Q1A(BRANCH, 0x0001, B6, BRANCH, 0x0000000c, 1, 1), - _Q1A(DIVERGENT_BRANCH, 0x0001, B6, BRANCH, 0x00000010, 1, 1), - _Q1A(GLD_REQUEST, 0x0001, B6, LDST, 0x00000010, 1, 1), - _Q1B(GLD_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000010, 1, 1), - _Q1B(GST_TRANSACTIONS, 0x0001, B6, MEM, 0x00000004, 1, 1), - _Q1B(GST_MEM_DIV_REPLAY, 0x0001, B6, REPLAY, 0x00000014, 1, 1), - _Q1A(GRED_COUNT, 0x0001, B6, BRANCH, 0x00000008, 1, 1), - _Q1A(GST_REQUEST, 0x0001, B6, LDST, 0x00000014, 1, 1), - _Q1A(INST_EXECUTED, 0x0003, B6, EXEC, 0x00000398, 1, 1), - _Q1A(INST_ISSUED1, 0x0001, B6, ISSUE, 0x00000004, 1, 1), - _Q1A(INST_ISSUED2, 0x0001, B6, ISSUE, 0x00000008, 1, 1), - _Q1B(L1_GLD_HIT, 0x0001, B6, L1, 0x00000010, 1, 1), - _Q1B(L1_GLD_MISS, 0x0001, B6, L1, 0x00000014, 1, 1), - _Q1B(L1_GLD_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000000, 1, 1), - _Q1B(L1_GST_TRANSACTIONS, 0x0001, B6, UNK0F, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_LD_HIT, 0x0001, B6, L1, 0x00000000, 1, 1), - _Q1B(L1_LOCAL_LD_MISS, 0x0001, B6, L1, 0x00000004, 1, 1), - _Q1B(L1_LOCAL_ST_HIT, 0x0001, B6, L1, 0x00000008, 1, 1), - _Q1B(L1_LOCAL_ST_MISS, 0x0001, B6, L1, 0x0000000c, 1, 1), - _Q1B(L1_SHARED_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000008, 1, 1), - _Q1B(L1_SHARED_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x0000000c, 1, 1), - _Q1A(LOCAL_LD, 0x0001, B6, LDST, 0x00000008, 1, 1), - _Q1B(LOCAL_LD_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000000, 1, 1), - _Q1A(LOCAL_ST, 0x0001, B6, LDST, 0x0000000c, 1, 1), - _Q1B(LOCAL_ST_TRANSACTIONS, 0x0001, B6, TRANSACTION, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_0, 0x0001, B6, USER, 0x00000000, 1, 1), - _Q1A(PROF_TRIGGER_1, 0x0001, B6, USER, 0x00000004, 1, 1), - _Q1A(PROF_TRIGGER_2, 0x0001, B6, USER, 0x00000008, 1, 1), - _Q1A(PROF_TRIGGER_3, 0x0001, B6, USER, 0x0000000c, 1, 1), - _Q1A(PROF_TRIGGER_4, 0x0001, B6, USER, 0x00000010, 1, 1), - _Q1A(PROF_TRIGGER_5, 0x0001, B6, USER, 0x00000014, 1, 1), - _Q1A(PROF_TRIGGER_6, 0x0001, B6, USER, 0x00000018, 1, 1), - _Q1A(PROF_TRIGGER_7, 0x0001, B6, USER, 0x0000001c, 1, 1), - _Q1A(SHARED_LD, 0x0001, B6, LDST, 0x00000000, 1, 1), - _Q1B(SHARED_LD_REPLAY, 0x0001, B6, REPLAY, 0x00000008, 1, 1), - _Q1A(SHARED_ST, 0x0001, B6, LDST, 0x00000004, 1, 1), - _Q1B(SHARED_ST_REPLAY, 0x0001, B6, REPLAY, 0x0000000c, 1, 1), - _Q1B(SM_CTA_LAUNCHED, 0x0001, B6, WARP, 0x0000001c, 1, 1), - _Q1A(THREADS_LAUNCHED, 0x003f, B6, LAUNCH, 0x398a4188, 1, 1), - _Q1B(UNCACHED_GLD_TRANSACTIONS, 0x0001, B6, MEM, 0x00000000, 1, 1), - _Q1A(WARPS_LAUNCHED, 0x0001, B6, LAUNCH, 0x00000004, 1, 1), -}; - -#undef _Q1A -#undef _Q1B +static const struct nvc0_hw_sm_query_cfg *sm30_hw_sm_queries[] = +{ + &sm30_active_cycles, + &sm30_active_warps, + &sm30_atom_cas_count, + &sm30_atom_count, + &sm30_branch, + &sm30_divergent_branch, + &sm30_gld_request, + &sm30_gld_mem_div_replay, + &sm30_gst_transactions, + &sm30_gst_mem_div_replay, + &sm30_gred_count, + &sm30_gst_request, + &sm30_inst_executed, + &sm30_inst_issued1, + &sm30_inst_issued2, + &sm30_l1_gld_hit, + &sm30_l1_gld_miss, + &sm30_l1_gld_transactions, + &sm30_l1_gst_transactions, + &sm30_l1_local_ld_hit, + &sm30_l1_local_ld_miss, + &sm30_l1_local_st_hit, + &sm30_l1_local_st_miss, + &sm30_l1_shared_ld_transactions, + &sm30_l1_shared_st_transactions, + &sm30_local_ld, + &sm30_local_ld_transactions, + &sm30_local_st, + &sm30_local_st_transactions, + &sm30_prof_trigger_0, + &sm30_prof_trigger_1, + &sm30_prof_trigger_2, + &sm30_prof_trigger_3, + &sm30_prof_trigger_4, + &sm30_prof_trigger_5, + &sm30_prof_trigger_6, + &sm30_prof_trigger_7, + &sm30_shared_ld, + 
&sm30_shared_ld_replay, + &sm30_shared_st, + &sm30_shared_st_replay, + &sm30_sm_cta_launched, + &sm30_threads_launched, + &sm30_uncached_gld_transactions, + &sm30_warps_launched, +}; + +/* ==== Compute capability 3.5 (GK110/GK208) ==== */ +static const struct nvc0_hw_sm_query_cfg +sm35_atom_cas_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000014), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_atom_count = +{ + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000010), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_gred_count = +{ + .type = NVC0_HW_SM_QUERY_GRED_COUNT, + .ctr[0] = _CA(0x0001, B6, UNK1A, 0x00000018), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_not_pred_off_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, + .ctr[0] = _CA(0x003f, B6, UNK14, 0x29062080), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_shared_ld_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, + .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018), + .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x00000151), + .num_counters = 2, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_shared_st_replay = +{ + .type = NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + .ctr[0] = _CB(0xaaaa, LOGOP, UNK13, 0x00000018), + .ctr[1] = _CB(0x8888, LOGOP, REPLAY, 0x000001d1), + .num_counters = 2, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg +sm35_th_inst_executed = +{ + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED, + .ctr[0] = _CA(0x003f, B6, UNK11, 0x29062080), + .num_counters = 1, + .norm = { 1, 1 }, +}; + +static const struct nvc0_hw_sm_query_cfg *sm35_hw_sm_queries[] = +{ + &sm30_active_cycles, + &sm30_active_warps, + &sm35_atom_cas_count, + &sm35_atom_count, + &sm30_gld_request, + &sm30_gld_mem_div_replay, + &sm30_gst_transactions, + &sm30_gst_mem_div_replay, + &sm35_gred_count, + &sm30_gst_request, + &sm30_inst_executed, + &sm30_inst_issued1, + &sm30_inst_issued2, + &sm30_l1_gld_hit, + &sm30_l1_gld_miss, + &sm30_l1_gld_transactions, + &sm30_l1_gst_transactions, + &sm30_l1_local_ld_hit, + &sm30_l1_local_ld_miss, + &sm30_l1_local_st_hit, + &sm30_l1_local_st_miss, + &sm30_l1_shared_ld_transactions, + &sm30_l1_shared_st_transactions, + &sm30_local_ld, + &sm30_local_ld_transactions, + &sm30_local_st, + &sm30_local_st_transactions, + &sm35_not_pred_off_inst_executed, + &sm30_prof_trigger_0, + &sm30_prof_trigger_1, + &sm30_prof_trigger_2, + &sm30_prof_trigger_3, + &sm30_prof_trigger_4, + &sm30_prof_trigger_5, + &sm30_prof_trigger_6, + &sm30_prof_trigger_7, + &sm30_shared_ld, + &sm35_shared_ld_replay, + &sm30_shared_st, + &sm35_shared_st_replay, + &sm30_sm_cta_launched, + &sm35_th_inst_executed, + &sm30_threads_launched, + &sm30_uncached_gld_transactions, + &sm30_warps_launched, +}; + +#undef _Q +#undef _CA +#undef _CB /* === PERFORMANCE MONITORING COUNTERS for NVC0:NVE4 === */ /* NOTES: @@ -257,43 +852,6 @@ static const struct nvc0_hw_sm_query_cfg nve4_hw_sm_queries[] = * because there is a context-switch problem that we need to fix. * Results might be wrong sometimes, be careful! 
*/ -static const char *nvc0_hw_sm_query_names[] = -{ - /* MP counters */ - "active_cycles", - "active_warps", - "atom_count", - "branch", - "divergent_branch", - "gld_request", - "gred_count", - "gst_request", - "inst_executed", - "inst_issued", - "inst_issued1_0", - "inst_issued1_1", - "inst_issued2_0", - "inst_issued2_1", - "local_load", - "local_store", - "prof_trigger_00", - "prof_trigger_01", - "prof_trigger_02", - "prof_trigger_03", - "prof_trigger_04", - "prof_trigger_05", - "prof_trigger_06", - "prof_trigger_07", - "shared_load", - "shared_store", - "threads_launched", - "thread_inst_executed_0", - "thread_inst_executed_1", - "thread_inst_executed_2", - "thread_inst_executed_3", - "warps_launched", -}; - static const uint64_t nvc0_read_hw_sm_counters_code[] = { /* mov b32 $r8 $tidx @@ -345,12 +903,12 @@ static const uint64_t nvc0_read_hw_sm_counters_code[] = }; #define _C(f, o, g, m, s) { f, NVC0_COMPUTE_MP_PM_OP_MODE_##o, 0, g, m, s } -#define _Q(n, c) [NVC0_HW_SM_QUERY_##n] = c /* ==== Compute capability 2.0 (GF100/GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm20_active_cycles = { + .type = NVC0_HW_SM_QUERY_ACTIVE_CYCLES, .ctr[0] = _C(0xaaaa, LOGOP, 0x11, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -359,6 +917,7 @@ sm20_active_cycles = static const struct nvc0_hw_sm_query_cfg sm20_active_warps = { + .type = NVC0_HW_SM_QUERY_ACTIVE_WARPS, .ctr[0] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x24, 0x000000ff, 0x00000030), @@ -372,6 +931,7 @@ sm20_active_warps = static const struct nvc0_hw_sm_query_cfg sm20_atom_count = { + .type = NVC0_HW_SM_QUERY_ATOM_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -380,6 +940,7 @@ sm20_atom_count = static const struct nvc0_hw_sm_query_cfg sm20_branch = { + .type = NVC0_HW_SM_QUERY_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x1a, 0x000000ff, 0x00000010), .num_counters = 2, @@ -389,6 +950,7 @@ sm20_branch = static const struct nvc0_hw_sm_query_cfg sm20_divergent_branch = { + .type = NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, .ctr[0] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000020), .ctr[1] = _C(0xaaaa, LOGOP, 0x19, 0x000000ff, 0x00000030), .num_counters = 2, @@ -398,6 +960,7 @@ sm20_divergent_branch = static const struct nvc0_hw_sm_query_cfg sm20_gld_request = { + .type = NVC0_HW_SM_QUERY_GLD_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -406,6 +969,7 @@ sm20_gld_request = static const struct nvc0_hw_sm_query_cfg sm20_gred_count = { + .type = NVC0_HW_SM_QUERY_GRED_COUNT, .ctr[0] = _C(0xaaaa, LOGOP, 0x63, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -414,6 +978,7 @@ sm20_gred_count = static const struct nvc0_hw_sm_query_cfg sm20_gst_request = { + .type = NVC0_HW_SM_QUERY_GST_REQUEST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -422,6 +987,7 @@ sm20_gst_request = static const struct nvc0_hw_sm_query_cfg sm20_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x0000ffff, 0x00001010), .num_counters = 2, @@ -431,6 +997,7 @@ sm20_inst_executed = static const struct nvc0_hw_sm_query_cfg sm20_inst_issued = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED, .ctr[0] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 
0x00007060), .ctr[1] = _C(0xaaaa, LOGOP, 0x27, 0x0000ffff, 0x00007070), .num_counters = 2, @@ -440,6 +1007,7 @@ sm20_inst_issued = static const struct nvc0_hw_sm_query_cfg sm20_local_ld = { + .type = NVC0_HW_SM_QUERY_LOCAL_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -448,6 +1016,7 @@ sm20_local_ld = static const struct nvc0_hw_sm_query_cfg sm20_local_st = { + .type = NVC0_HW_SM_QUERY_LOCAL_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -456,6 +1025,7 @@ sm20_local_st = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_0 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -464,6 +1034,7 @@ sm20_prof_trigger_0 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_1 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -472,6 +1043,7 @@ sm20_prof_trigger_1 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_2 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_2, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -480,6 +1052,7 @@ sm20_prof_trigger_2 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_3 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_3, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000030), .num_counters = 1, .norm = { 1, 1 }, @@ -488,6 +1061,7 @@ sm20_prof_trigger_3 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_4 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_4, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -496,6 +1070,7 @@ sm20_prof_trigger_4 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_5 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_5, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -504,6 +1079,7 @@ sm20_prof_trigger_5 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_6 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_6, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000060), .num_counters = 1, .norm = { 1, 1 }, @@ -512,6 +1088,7 @@ sm20_prof_trigger_6 = static const struct nvc0_hw_sm_query_cfg sm20_prof_trigger_7 = { + .type = NVC0_HW_SM_QUERY_PROF_TRIGGER_7, .ctr[0] = _C(0xaaaa, LOGOP, 0x01, 0x000000ff, 0x00000070), .num_counters = 1, .norm = { 1, 1 }, @@ -520,6 +1097,7 @@ sm20_prof_trigger_7 = static const struct nvc0_hw_sm_query_cfg sm20_shared_ld = { + .type = NVC0_HW_SM_QUERY_SHARED_LD, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -528,6 +1106,7 @@ sm20_shared_ld = static const struct nvc0_hw_sm_query_cfg sm20_shared_st = { + .type = NVC0_HW_SM_QUERY_SHARED_ST, .ctr[0] = _C(0xaaaa, LOGOP, 0x64, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -536,6 +1115,7 @@ sm20_shared_st = static const struct nvc0_hw_sm_query_cfg sm20_threads_launched = { + .type = NVC0_HW_SM_QUERY_THREADS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000010), .ctr[1] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000020), .ctr[2] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000030), @@ -549,6 +1129,7 @@ sm20_threads_launched = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000000), .ctr[1] = 
_C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2f, 0x000000ff, 0x00000020), @@ -562,6 +1143,7 @@ sm20_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm20_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x30, 0x000000ff, 0x00000020), @@ -575,6 +1157,7 @@ sm20_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm20_warps_launched = { + .type = NVC0_HW_SM_QUERY_WARPS_LAUNCHED, .ctr[0] = _C(0xaaaa, LOGOP, 0x26, 0x000000ff, 0x00000000), .num_counters = 1, .norm = { 1, 1 }, @@ -582,44 +1165,39 @@ sm20_warps_launched = static const struct nvc0_hw_sm_query_cfg *sm20_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm20_inst_executed), - _Q(INST_ISSUED, &sm20_inst_issued), - _Q(INST_ISSUED1_0, NULL), - _Q(INST_ISSUED1_1, NULL), - _Q(INST_ISSUED2_0, NULL), - _Q(INST_ISSUED2_1, NULL), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm20_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm20_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, NULL), - _Q(TH_INST_EXECUTED_3, NULL), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm20_inst_executed, + &sm20_inst_issued, + &sm20_local_ld, + &sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm20_th_inst_executed_0, + &sm20_th_inst_executed_1, + &sm20_warps_launched, }; /* ==== Compute capability 2.1 (GF108+ except GF110) ==== */ static const struct nvc0_hw_sm_query_cfg sm21_inst_executed = { + .type = NVC0_HW_SM_QUERY_INST_EXECUTED, .ctr[0] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0x2d, 0x000000ff, 0x00000020), @@ -630,6 +1208,7 @@ sm21_inst_executed = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000010), .num_counters = 1, .norm = { 1, 1 }, @@ -638,6 +1217,7 @@ sm21_inst_issued1_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued1_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED1_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000040), .num_counters = 1, .norm = { 1, 1 }, @@ -646,6 +1226,7 @@ sm21_inst_issued1_1 = static const struct 
nvc0_hw_sm_query_cfg sm21_inst_issued2_0 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_0, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000020), .num_counters = 1, .norm = { 1, 1 }, @@ -654,6 +1235,7 @@ sm21_inst_issued2_0 = static const struct nvc0_hw_sm_query_cfg sm21_inst_issued2_1 = { + .type = NVC0_HW_SM_QUERY_INST_ISSUED2_1, .ctr[0] = _C(0xaaaa, LOGOP, 0x7e, 0x000000ff, 0x00000050), .num_counters = 1, .norm = { 1, 1 }, @@ -662,6 +1244,7 @@ sm21_inst_issued2_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_0 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, .ctr[0] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa3, 0x000000ff, 0x00000020), @@ -675,6 +1258,7 @@ sm21_th_inst_executed_0 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_1 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, .ctr[0] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa5, 0x000000ff, 0x00000020), @@ -688,6 +1272,7 @@ sm21_th_inst_executed_1 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_2 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, .ctr[0] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa4, 0x000000ff, 0x00000020), @@ -701,6 +1286,7 @@ sm21_th_inst_executed_2 = static const struct nvc0_hw_sm_query_cfg sm21_th_inst_executed_3 = { + .type = NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, .ctr[0] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000000), .ctr[1] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000010), .ctr[2] = _C(0xaaaa, LOGOP, 0xa6, 0x000000ff, 0x00000020), @@ -713,41 +1299,39 @@ sm21_th_inst_executed_3 = static const struct nvc0_hw_sm_query_cfg *sm21_hw_sm_queries[] = { - _Q(ACTIVE_CYCLES, &sm20_active_cycles), - _Q(ACTIVE_WARPS, &sm20_active_warps), - _Q(ATOM_COUNT, &sm20_atom_count), - _Q(BRANCH, &sm20_branch), - _Q(DIVERGENT_BRANCH, &sm20_divergent_branch), - _Q(GLD_REQUEST, &sm20_gld_request), - _Q(GRED_COUNT, &sm20_gred_count), - _Q(GST_REQUEST, &sm20_gst_request), - _Q(INST_EXECUTED, &sm21_inst_executed), - _Q(INST_ISSUED, NULL), - _Q(INST_ISSUED1_0, &sm21_inst_issued1_0), - _Q(INST_ISSUED1_1, &sm21_inst_issued1_1), - _Q(INST_ISSUED2_0, &sm21_inst_issued2_0), - _Q(INST_ISSUED2_1, &sm21_inst_issued2_1), - _Q(LOCAL_LD, &sm20_local_ld), - _Q(LOCAL_ST, &sm20_local_st), - _Q(PROF_TRIGGER_0, &sm20_prof_trigger_0), - _Q(PROF_TRIGGER_1, &sm20_prof_trigger_1), - _Q(PROF_TRIGGER_2, &sm20_prof_trigger_2), - _Q(PROF_TRIGGER_3, &sm20_prof_trigger_3), - _Q(PROF_TRIGGER_4, &sm20_prof_trigger_4), - _Q(PROF_TRIGGER_5, &sm20_prof_trigger_5), - _Q(PROF_TRIGGER_6, &sm20_prof_trigger_6), - _Q(PROF_TRIGGER_7, &sm20_prof_trigger_7), - _Q(SHARED_LD, &sm20_shared_ld), - _Q(SHARED_ST, &sm20_shared_st), - _Q(THREADS_LAUNCHED, &sm20_threads_launched), - _Q(TH_INST_EXECUTED_0, &sm21_th_inst_executed_0), - _Q(TH_INST_EXECUTED_1, &sm21_th_inst_executed_1), - _Q(TH_INST_EXECUTED_2, &sm21_th_inst_executed_2), - _Q(TH_INST_EXECUTED_3, &sm21_th_inst_executed_3), - _Q(WARPS_LAUNCHED, &sm20_warps_launched), + &sm20_active_cycles, + &sm20_active_warps, + &sm20_atom_count, + &sm20_branch, + &sm20_divergent_branch, + &sm20_gld_request, + &sm20_gred_count, + &sm20_gst_request, + &sm21_inst_executed, + &sm21_inst_issued1_0, + &sm21_inst_issued1_1, + &sm21_inst_issued2_0, + &sm21_inst_issued2_1, + &sm20_local_ld, + 
&sm20_local_st, + &sm20_prof_trigger_0, + &sm20_prof_trigger_1, + &sm20_prof_trigger_2, + &sm20_prof_trigger_3, + &sm20_prof_trigger_4, + &sm20_prof_trigger_5, + &sm20_prof_trigger_6, + &sm20_prof_trigger_7, + &sm20_shared_ld, + &sm20_shared_st, + &sm20_threads_launched, + &sm21_th_inst_executed_0, + &sm21_th_inst_executed_1, + &sm21_th_inst_executed_2, + &sm21_th_inst_executed_3, + &sm20_warps_launched, }; -#undef _Q #undef _C static inline const struct nvc0_hw_sm_query_cfg ** @@ -755,26 +1339,55 @@ nvc0_hw_sm_get_queries(struct nvc0_screen *screen) { struct nouveau_device *dev = screen->base.device; - if (dev->chipset == 0xc0 || dev->chipset == 0xc8) - return sm20_hw_sm_queries; - return sm21_hw_sm_queries; + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return sm35_hw_sm_queries; + case NVE4_3D_CLASS: + return sm30_hw_sm_queries; + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return sm20_hw_sm_queries; + return sm21_hw_sm_queries; + } + assert(0); + return NULL; +} + +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *screen) +{ + struct nouveau_device *dev = screen->base.device; + + switch (screen->base.class_3d) { + case NVF0_3D_CLASS: + return ARRAY_SIZE(sm35_hw_sm_queries); + case NVE4_3D_CLASS: + return ARRAY_SIZE(sm30_hw_sm_queries); + default: + if (dev->chipset == 0xc0 || dev->chipset == 0xc8) + return ARRAY_SIZE(sm20_hw_sm_queries); + return ARRAY_SIZE(sm21_hw_sm_queries); + } + return 0; } static const struct nvc0_hw_sm_query_cfg * nvc0_hw_sm_query_get_cfg(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { + const struct nvc0_hw_sm_query_cfg **queries; struct nvc0_screen *screen = nvc0->screen; struct nvc0_query *q = &hq->base; + unsigned num_queries; + unsigned i; - if (screen->base.class_3d >= NVE4_3D_CLASS) - return &nve4_hw_sm_queries[q->type - PIPE_QUERY_DRIVER_SPECIFIC]; + num_queries = nvc0_hw_sm_get_num_queries(screen); + queries = nvc0_hw_sm_get_queries(screen); - if (q->type >= NVC0_HW_SM_QUERY(0) && q->type <= NVC0_HW_SM_QUERY_LAST) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - return queries[q->type - NVC0_HW_SM_QUERY(0)]; + for (i = 0; i < num_queries; i++) { + if (NVC0_HW_SM_QUERY(queries[i]->type) == q->type) + return queries[i]; } - debug_printf("invalid query type: %d\n", q->type); + assert(0); return NULL; } @@ -929,6 +1542,37 @@ nvc0_hw_sm_begin_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) return true; } +static inline struct nvc0_program * +nvc0_hw_sm_get_program(struct nvc0_screen *screen) +{ + struct nvc0_program *prog; + + prog = CALLOC_STRUCT(nvc0_program); + if (!prog) + return NULL; + + prog->type = PIPE_SHADER_COMPUTE; + prog->translated = true; + prog->parm_size = 12; + + if (screen->base.class_3d == NVE4_3D_CLASS || + screen->base.class_3d == NVF0_3D_CLASS) { + if (screen->base.class_3d == NVE4_3D_CLASS) { + prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; + prog->code_size = sizeof(nve4_read_hw_sm_counters_code); + } else { + prog->code = (uint32_t *)nvf0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvf0_read_hw_sm_counters_code); + } + prog->num_gprs = 14; + } else { + prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; + prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); + prog->num_gprs = 12; + } + return prog; +} + static void nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) { @@ -944,22 +1588,8 @@ nvc0_hw_sm_end_query(struct nvc0_context *nvc0, struct nvc0_hw_query *hq) const uint grid[3] = { 
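nvc0_hw_sm_get_program() above factors the one-time setup of the counter-readout compute program out of end_query: pick the precompiled blob for the chip class (GK110 gets its own encoding of the same kernel), set the GPR count, and cache the result on the screen, now with the CALLOC failure handled. A stand-alone sketch of the lazy-init shape, with placeholder types and blobs:

    #include <stdint.h>
    #include <stdlib.h>

    enum chip_class { CHIP_SM20, CHIP_SM30, CHIP_SM35 };

    struct program {
       const uint32_t *code;
       size_t          code_size;
       unsigned        num_gprs;
    };

    static const uint32_t sm20_blob[] = { 0 };  /* placeholder payloads */
    static const uint32_t sm30_blob[] = { 0 };

    struct screen {
       enum chip_class cls;
       struct program *pm_prog;   /* cached readout program */
    };

    static struct program *
    get_program(const struct screen *s)
    {
       struct program *prog = calloc(1, sizeof(*prog));
       if (!prog)
          return NULL;

       if (s->cls >= CHIP_SM30) {          /* newer chips: bigger kernel */
          prog->code = sm30_blob;
          prog->code_size = sizeof(sm30_blob);
          prog->num_gprs = 14;
       } else {
          prog->code = sm20_blob;
          prog->code_size = sizeof(sm20_blob);
          prog->num_gprs = 12;
       }
       return prog;
    }

    static void
    end_query(struct screen *s)
    {
       if (!s->pm_prog)                    /* create on first use only */
          s->pm_prog = get_program(s);
       /* ... launch s->pm_prog to read the counters ... */
    }

    int main(void)
    {
       struct screen s = { .cls = CHIP_SM35, .pm_prog = NULL };
       end_query(&s);
       end_query(&s);   /* second call reuses the cached program */
       free(s.pm_prog);
       return 0;
    }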
screen->mp_count, screen->gpc_count, 1 }; unsigned c, i; - if (unlikely(!screen->pm.prog)) { - struct nvc0_program *prog = CALLOC_STRUCT(nvc0_program); - prog->type = PIPE_SHADER_COMPUTE; - prog->translated = true; - prog->parm_size = 12; - if (is_nve4) { - prog->code = (uint32_t *)nve4_read_hw_sm_counters_code; - prog->code_size = sizeof(nve4_read_hw_sm_counters_code); - prog->num_gprs = 14; - } else { - prog->code = (uint32_t *)nvc0_read_hw_sm_counters_code; - prog->code_size = sizeof(nvc0_read_hw_sm_counters_code); - prog->num_gprs = 12; - } - screen->pm.prog = prog; - } + if (unlikely(!screen->pm.prog)) + screen->pm.prog = nvc0_hw_sm_get_program(screen); /* disable all counting */ PUSH_SPACE(push, 8); @@ -1132,8 +1762,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) if (nvc0->screen->base.drm->version < 0x01000101) return NULL; - if ((type < NVE4_HW_SM_QUERY(0) || type > NVE4_HW_SM_QUERY_LAST) && - (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST)) + if (type < NVC0_HW_SM_QUERY(0) || type > NVC0_HW_SM_QUERY_LAST) return NULL; hsq = CALLOC_STRUCT(nvc0_hw_sm_query); @@ -1201,23 +1830,6 @@ nvc0_hw_sm_create_query(struct nvc0_context *nvc0, unsigned type) return hq; } -static int -nvc0_hw_sm_get_next_query_id(const struct nvc0_hw_sm_query_cfg **queries, - unsigned id) -{ - unsigned i, next = 0; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (!queries[i]) { - next++; - } else - if (i >= id && queries[id + next]) { - break; - } - } - return id + next; -} - int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, struct pipe_driver_query_info *info) @@ -1225,21 +1837,8 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, int count = 0; if (screen->base.drm->version >= 0x01000101) { - if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - count += NVE4_HW_SM_QUERY_COUNT; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { - const struct nvc0_hw_sm_query_cfg **queries = - nvc0_hw_sm_get_queries(screen); - unsigned i; - - for (i = 0; i < NVC0_HW_SM_QUERY_COUNT; i++) { - if (queries[i]) - count++; - } - } - } + if (screen->compute) + count = nvc0_hw_sm_get_num_queries(screen); } if (!info) @@ -1247,19 +1846,12 @@ nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *screen, unsigned id, if (id < count) { if (screen->compute) { - if (screen->base.class_3d == NVE4_3D_CLASS) { - info->name = nve4_hw_sm_query_names[id]; - info->query_type = NVE4_HW_SM_QUERY(id); - info->group_id = NVC0_HW_SM_QUERY_GROUP; - return 1; - } else - if (screen->base.class_3d < NVE4_3D_CLASS) { + if (screen->base.class_3d <= NVF0_3D_CLASS) { const struct nvc0_hw_sm_query_cfg **queries = nvc0_hw_sm_get_queries(screen); - id = nvc0_hw_sm_get_next_query_id(queries, id); - info->name = nvc0_hw_sm_query_names[id]; - info->query_type = NVC0_HW_SM_QUERY(id); + info->name = nvc0_hw_sm_query_get_name(queries[id]->type); + info->query_type = NVC0_HW_SM_QUERY(queries[id]->type); info->group_id = NVC0_HW_SM_QUERY_GROUP; return 1; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h index 94d55a04ff8..65d6c8b3167 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_query_hw_sm.h @@ -17,78 +17,45 @@ nvc0_hw_sm_query(struct nvc0_hw_query *hq) /* * Performance counter queries: */ -#define NVE4_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) -#define NVE4_HW_SM_QUERY_LAST NVE4_HW_SM_QUERY(NVE4_HW_SM_QUERY_COUNT 
- 1) -enum nve4_hw_sm_queries -{ - NVE4_HW_SM_QUERY_ACTIVE_CYCLES = 0, - NVE4_HW_SM_QUERY_ACTIVE_WARPS, - NVE4_HW_SM_QUERY_ATOM_CAS_COUNT, - NVE4_HW_SM_QUERY_ATOM_COUNT, - NVE4_HW_SM_QUERY_BRANCH, - NVE4_HW_SM_QUERY_DIVERGENT_BRANCH, - NVE4_HW_SM_QUERY_GLD_REQUEST, - NVE4_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_GST_MEM_DIV_REPLAY, - NVE4_HW_SM_QUERY_GRED_COUNT, - NVE4_HW_SM_QUERY_GST_REQUEST, - NVE4_HW_SM_QUERY_INST_EXECUTED, - NVE4_HW_SM_QUERY_INST_ISSUED1, - NVE4_HW_SM_QUERY_INST_ISSUED2, - NVE4_HW_SM_QUERY_L1_GLD_HIT, - NVE4_HW_SM_QUERY_L1_GLD_MISS, - NVE4_HW_SM_QUERY_L1_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_GST_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_LD_MISS, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_HIT, - NVE4_HW_SM_QUERY_L1_LOCAL_ST_MISS, - NVE4_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_LD, - NVE4_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, - NVE4_HW_SM_QUERY_LOCAL_ST, - NVE4_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, - NVE4_HW_SM_QUERY_PROF_TRIGGER_0, - NVE4_HW_SM_QUERY_PROF_TRIGGER_1, - NVE4_HW_SM_QUERY_PROF_TRIGGER_2, - NVE4_HW_SM_QUERY_PROF_TRIGGER_3, - NVE4_HW_SM_QUERY_PROF_TRIGGER_4, - NVE4_HW_SM_QUERY_PROF_TRIGGER_5, - NVE4_HW_SM_QUERY_PROF_TRIGGER_6, - NVE4_HW_SM_QUERY_PROF_TRIGGER_7, - NVE4_HW_SM_QUERY_SHARED_LD, - NVE4_HW_SM_QUERY_SHARED_LD_REPLAY, - NVE4_HW_SM_QUERY_SHARED_ST, - NVE4_HW_SM_QUERY_SHARED_ST_REPLAY, - NVE4_HW_SM_QUERY_SM_CTA_LAUNCHED, - NVE4_HW_SM_QUERY_THREADS_LAUNCHED, - NVE4_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, - NVE4_HW_SM_QUERY_WARPS_LAUNCHED, - NVE4_HW_SM_QUERY_COUNT -}; - -#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + 2048 + (i)) +#define NVC0_HW_SM_QUERY(i) (PIPE_QUERY_DRIVER_SPECIFIC + (i)) #define NVC0_HW_SM_QUERY_LAST NVC0_HW_SM_QUERY(NVC0_HW_SM_QUERY_COUNT - 1) enum nvc0_hw_sm_queries { NVC0_HW_SM_QUERY_ACTIVE_CYCLES = 0, NVC0_HW_SM_QUERY_ACTIVE_WARPS, + NVC0_HW_SM_QUERY_ATOM_CAS_COUNT, NVC0_HW_SM_QUERY_ATOM_COUNT, NVC0_HW_SM_QUERY_BRANCH, NVC0_HW_SM_QUERY_DIVERGENT_BRANCH, NVC0_HW_SM_QUERY_GLD_REQUEST, + NVC0_HW_SM_QUERY_GLD_MEM_DIV_REPLAY, + NVC0_HW_SM_QUERY_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_GST_MEM_DIV_REPLAY, NVC0_HW_SM_QUERY_GRED_COUNT, NVC0_HW_SM_QUERY_GST_REQUEST, NVC0_HW_SM_QUERY_INST_EXECUTED, NVC0_HW_SM_QUERY_INST_ISSUED, + NVC0_HW_SM_QUERY_INST_ISSUED1, + NVC0_HW_SM_QUERY_INST_ISSUED2, NVC0_HW_SM_QUERY_INST_ISSUED1_0, NVC0_HW_SM_QUERY_INST_ISSUED1_1, NVC0_HW_SM_QUERY_INST_ISSUED2_0, NVC0_HW_SM_QUERY_INST_ISSUED2_1, + NVC0_HW_SM_QUERY_L1_GLD_HIT, + NVC0_HW_SM_QUERY_L1_GLD_MISS, + NVC0_HW_SM_QUERY_L1_GLD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_GST_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_LD_MISS, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_HIT, + NVC0_HW_SM_QUERY_L1_LOCAL_ST_MISS, + NVC0_HW_SM_QUERY_L1_SHARED_LD_TRANSACTIONS, + NVC0_HW_SM_QUERY_L1_SHARED_ST_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_LD, + NVC0_HW_SM_QUERY_LOCAL_LD_TRANSACTIONS, NVC0_HW_SM_QUERY_LOCAL_ST, + NVC0_HW_SM_QUERY_LOCAL_ST_TRANSACTIONS, + NVC0_HW_SM_QUERY_NOT_PRED_OFF_INST_EXECUTED, NVC0_HW_SM_QUERY_PROF_TRIGGER_0, NVC0_HW_SM_QUERY_PROF_TRIGGER_1, NVC0_HW_SM_QUERY_PROF_TRIGGER_2, @@ -98,12 +65,17 @@ enum nvc0_hw_sm_queries NVC0_HW_SM_QUERY_PROF_TRIGGER_6, NVC0_HW_SM_QUERY_PROF_TRIGGER_7, NVC0_HW_SM_QUERY_SHARED_LD, + NVC0_HW_SM_QUERY_SHARED_LD_REPLAY, NVC0_HW_SM_QUERY_SHARED_ST, + NVC0_HW_SM_QUERY_SHARED_ST_REPLAY, + NVC0_HW_SM_QUERY_SM_CTA_LAUNCHED, NVC0_HW_SM_QUERY_THREADS_LAUNCHED, + 
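/* the Kepler-only counters (formerly NVE4_*) are being folded into this shared enum; each chipset's query table exposes only the entries it actually implements */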
NVC0_HW_SM_QUERY_TH_INST_EXECUTED, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_0, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_1, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_2, NVC0_HW_SM_QUERY_TH_INST_EXECUTED_3, + NVC0_HW_SM_QUERY_UNCACHED_GLD_TRANSACTIONS, NVC0_HW_SM_QUERY_WARPS_LAUNCHED, NVC0_HW_SM_QUERY_COUNT }; @@ -113,4 +85,7 @@ nvc0_hw_sm_create_query(struct nvc0_context *, unsigned); int nvc0_hw_sm_get_driver_query_info(struct nvc0_screen *, unsigned, struct pipe_driver_query_info *); +unsigned +nvc0_hw_sm_get_num_queries(struct nvc0_screen *); + #endif diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c index 7fbc6e1fd8e..c034d0fd011 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_resource.c @@ -19,7 +19,8 @@ nvc0_resource_create(struct pipe_screen *screen, static struct pipe_resource * nvc0_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (templ->target == PIPE_BUFFER) { return NULL; diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c index 37620ea8ba6..3c5b1da2063 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_screen.c @@ -225,6 +225,10 @@ nvc0_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: @@ -324,7 +328,7 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, case PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED: return 1; case PIPE_SHADER_CAP_TGSI_SQRT_SUPPORTED: - return 0; + return 1; case PIPE_SHADER_CAP_SUBROUTINES: return 1; case PIPE_SHADER_CAP_INTEGERS: @@ -333,8 +337,9 @@ nvc0_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader, return 1; case PIPE_SHADER_CAP_TGSI_DROUND_SUPPORTED: return 1; - case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_FMA_SUPPORTED: + return 1; + case PIPE_SHADER_CAP_TGSI_DFRACEXP_DLDEXP_SUPPORTED: case PIPE_SHADER_CAP_TGSI_ANY_INOUT_DECL_RANGE: return 0; case PIPE_SHADER_CAP_MAX_SHADER_BUFFERS: diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c index 6b02ed5680a..01fe7ce9bfc 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_shader_state.c @@ -309,7 +309,6 @@ nvc0_tfb_validate(struct nvc0_context *nvc0) if (!(nvc0->dirty_3d & NVC0_NEW_3D_TFB_TARGETS)) return; - nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TFB); for (b = 0; b < nvc0->num_tfbbufs; ++b) { struct nvc0_so_target *targ = nvc0_so_target(nvc0->tfbbuf[b]); diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c index 7ccce9ff6bf..090a0395432 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state.c @@ -1184,8 +1184,10 @@ nvc0_set_transform_feedback_targets(struct pipe_context *pipe, } nvc0->num_tfbbufs = num_targets; - if (nvc0->tfbbuf_dirty) + if (nvc0->tfbbuf_dirty) { + nouveau_bufctx_reset(nvc0->bufctx_3d, NVC0_BIND_3D_TFB); nvc0->dirty_3d |= NVC0_NEW_3D_TFB_TARGETS; + } } static void @@ 
-1340,7 +1342,7 @@ nvc0_set_global_bindings(struct pipe_context *pipe, nouveau_bufctx_reset(nvc0->bufctx_cp, NVC0_BIND_CP_GLOBAL); - nvc0->dirty_cp = NVC0_NEW_CP_GLOBALS; + nvc0->dirty_cp |= NVC0_NEW_CP_GLOBALS; } void diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c index fbf45ceca2d..c0ed5c0043d 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_state_validate.c @@ -672,10 +672,8 @@ nvc0_switch_pipe_context(struct nvc0_context *ctx_to) ctx_to->screen->cur_ctx = ctx_to; } -static struct state_validate { - void (*func)(struct nvc0_context *); - uint32_t states; -} validate_list[] = { +static struct nvc0_state_validate +validate_list_3d[] = { { nvc0_validate_fb, NVC0_NEW_3D_FRAMEBUFFER }, { nvc0_validate_blend, NVC0_NEW_3D_BLEND }, { nvc0_validate_zsa, NVC0_NEW_3D_ZSA }, @@ -714,7 +712,9 @@ static struct state_validate { }; bool -nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask) +nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask, + struct nvc0_state_validate *validate_list, int size, + uint32_t *dirty, struct nouveau_bufctx *bufctx) { uint32_t state_mask; int ret; @@ -723,26 +723,38 @@ nvc0_state_validate(struct nvc0_context *nvc0, uint32_t mask) if (nvc0->screen->cur_ctx != nvc0) nvc0_switch_pipe_context(nvc0); - state_mask = nvc0->dirty_3d & mask; + state_mask = *dirty & mask; if (state_mask) { - for (i = 0; i < ARRAY_SIZE(validate_list); ++i) { - struct state_validate *validate = &validate_list[i]; + for (i = 0; i < size; ++i) { + struct nvc0_state_validate *validate = &validate_list[i]; if (state_mask & validate->states) validate->func(nvc0); } - nvc0->dirty_3d &= ~state_mask; + *dirty &= ~state_mask; - nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, false); + nvc0_bufctx_fence(nvc0, bufctx, false); } - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_3d); + nouveau_pushbuf_bufctx(nvc0->base.pushbuf, bufctx); ret = nouveau_pushbuf_validate(nvc0->base.pushbuf); + return !ret; +} + +bool +nvc0_state_validate_3d(struct nvc0_context *nvc0, uint32_t mask) +{ + bool ret; + + ret = nvc0_state_validate(nvc0, mask, validate_list_3d, + ARRAY_SIZE(validate_list_3d), &nvc0->dirty_3d, + nvc0->bufctx_3d); + if (unlikely(nvc0->state.flushed)) { nvc0->state.flushed = false; nvc0_bufctx_fence(nvc0, nvc0->bufctx_3d, true); } - return !ret; + return ret; } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c index 49577969d3d..e8b3a4d549a 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_surface.c @@ -693,7 +693,7 @@ nvc0_clear(struct pipe_context *pipe, unsigned buffers, uint32_t mode = 0; /* don't need NEW_BLEND, COLOR_MASK doesn't affect CLEAR_BUFFERS */ - if (!nvc0_state_validate(nvc0, NVC0_NEW_3D_FRAMEBUFFER)) + if (!nvc0_state_validate_3d(nvc0, NVC0_NEW_3D_FRAMEBUFFER)) return; if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { @@ -1195,7 +1195,7 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) nvc0_blitctx_prepare_state(blit); - nvc0_state_validate(nvc0, ~0); + nvc0_state_validate_3d(nvc0, ~0); x_range = (float)info->src.box.width / (float)info->dst.box.width; y_range = (float)info->src.box.height / (float)info->dst.box.height; @@ -1203,8 +1203,8 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) x0 = (float)info->src.box.x - x_range * (float)info->dst.box.x; y0 = 
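/* maps the dst rect origin into src space (src = origin + range * dst); the quad drawn below now spans 32768 texels per axis rather than 16384, so larger surfaces stay covered */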
(float)info->src.box.y - y_range * (float)info->dst.box.y; - x1 = x0 + 16384.0f * x_range; - y1 = y0 + 16384.0f * y_range; + x1 = x0 + 32768.0f * x_range; + y1 = y0 + 32768.0f * y_range; x0 *= (float)(1 << nv50_miptree(src)->ms_x); x1 *= (float)(1 << nv50_miptree(src)->ms_x); @@ -1315,14 +1315,14 @@ nvc0_blit_3d(struct nvc0_context *nvc0, const struct pipe_blit_info *info) *(vbuf++) = fui(y0); *(vbuf++) = fui(z); - *(vbuf++) = fui(16384 << nv50_miptree(dst)->ms_x); + *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_x); *(vbuf++) = fui(0.0f); *(vbuf++) = fui(x1); *(vbuf++) = fui(y0); *(vbuf++) = fui(z); *(vbuf++) = fui(0.0f); - *(vbuf++) = fui(16384 << nv50_miptree(dst)->ms_y); + *(vbuf++) = fui(32768 << nv50_miptree(dst)->ms_y); *(vbuf++) = fui(x0); *(vbuf++) = fui(y1); *(vbuf++) = fui(z); @@ -1644,6 +1644,7 @@ nvc0_blitter_destroy(struct nvc0_screen *screen) } } + pipe_mutex_destroy(blitter->mutex); FREE(blitter); } diff --git a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c index 647aa10ec35..e0e0ad2a0f7 100644 --- a/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c +++ b/src/gallium/drivers/nouveau/nvc0/nvc0_vbo.c @@ -969,7 +969,7 @@ nvc0_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) IMMED_NVC0(push, NVC0_3D(PATCH_VERTICES), nvc0->state.patch_vertices); } - nvc0_state_validate(nvc0, ~0); + nvc0_state_validate_3d(nvc0, ~0); if (nvc0->vertprog->vp.need_draw_parameters) { PUSH_SPACE(push, 9); diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c index 4a4e8367d28..b3d841461d6 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.c +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.c @@ -301,34 +301,31 @@ nve4_compute_set_tex_handles(struct nvc0_context *nvc0) nvc0->samplers_dirty[s] = 0; } +static struct nvc0_state_validate +validate_list_cp[] = { + { nvc0_compprog_validate, NVC0_NEW_CP_PROGRAM }, + { nve4_compute_validate_textures, NVC0_NEW_CP_TEXTURES }, + { nve4_compute_validate_samplers, NVC0_NEW_CP_SAMPLERS }, + { nve4_compute_set_tex_handles, NVC0_NEW_CP_TEXTURES | + NVC0_NEW_CP_SAMPLERS }, + { nve4_compute_validate_surfaces, NVC0_NEW_CP_SURFACES }, + { nvc0_compute_validate_globals, NVC0_NEW_CP_GLOBALS }, +}; static bool -nve4_compute_state_validate(struct nvc0_context *nvc0) +nve4_state_validate_cp(struct nvc0_context *nvc0, uint32_t mask) { - nvc0_compprog_validate(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_TEXTURES) - nve4_compute_validate_textures(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SAMPLERS) - nve4_compute_validate_samplers(nvc0); - if (nvc0->dirty_cp & (NVC0_NEW_CP_TEXTURES | NVC0_NEW_CP_SAMPLERS)) - nve4_compute_set_tex_handles(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_SURFACES) - nve4_compute_validate_surfaces(nvc0); - if (nvc0->dirty_cp & NVC0_NEW_CP_GLOBALS) - nvc0_compute_validate_globals(nvc0); - - nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, false); - - nouveau_pushbuf_bufctx(nvc0->base.pushbuf, nvc0->bufctx_cp); - if (unlikely(nouveau_pushbuf_validate(nvc0->base.pushbuf))) - return false; + bool ret; + + ret = nvc0_state_validate(nvc0, mask, validate_list_cp, + ARRAY_SIZE(validate_list_cp), &nvc0->dirty_cp, + nvc0->bufctx_cp); + if (unlikely(nvc0->state.flushed)) nvc0_bufctx_fence(nvc0, nvc0->bufctx_cp, true); - - return true; + return ret; } - static void nve4_compute_upload_input(struct nvc0_context *nvc0, const void *input, const uint *block_layout, @@ -447,7 +444,7 @@ nve4_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info 
*info) BCTX_REFN_bo(nvc0->bufctx_cp, CP_DESC, NOUVEAU_BO_GART | NOUVEAU_BO_RD, desc_bo); - ret = !nve4_compute_state_validate(nvc0); + ret = !nve4_state_validate_cp(nvc0, ~0); if (ret) goto out; diff --git a/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h b/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h index 3fff1122b8f..32018580579 100644 --- a/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h +++ b/src/gallium/drivers/nouveau/nvc0/nve4_compute.xml.h @@ -294,6 +294,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVE4_COMPUTE_MP_PM_A_SIGSEL_LAUNCH 0x00000003 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_EXEC 0x00000004 #define NVE4_COMPUTE_MP_PM_A_SIGSEL_ISSUE 0x00000005 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK11 0x00000011 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK14 0x00000014 +#define NVE4_COMPUTE_MP_PM_A_SIGSEL_UNK1A 0x0000001a #define NVE4_COMPUTE_MP_PM_A_SIGSEL_LDST 0x0000001b #define NVE4_COMPUTE_MP_PM_A_SIGSEL_BRANCH 0x0000001c @@ -307,6 +310,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. #define NVE4_COMPUTE_MP_PM_B_SIGSEL_UNK0F 0x0000000f #define NVE4_COMPUTE_MP_PM_B_SIGSEL_L1 0x00000010 #define NVE4_COMPUTE_MP_PM_B_SIGSEL_MEM 0x00000011 +#define NVE4_COMPUTE_MP_PM_B_SIGSEL_UNK13 0x00000013 #define NVE4_COMPUTE_MP_PM_SRCSEL(i0) (0x0000339c + 0x4*(i0)) #define NVE4_COMPUTE_MP_PM_SRCSEL__ESIZE 0x00000004 diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c index 7fad7ad6a43..1c3bb64f0e4 100644 --- a/src/gallium/drivers/r300/r300_screen.c +++ b/src/gallium/drivers/r300/r300_screen.c @@ -259,6 +259,14 @@ static int r300_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return r300screen->info.vram_size >> 20; case PIPE_CAP_UMA: return 0; + case PIPE_CAP_PCI_GROUP: + return r300screen->info.pci_domain; + case PIPE_CAP_PCI_BUS: + return r300screen->info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return r300screen->info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return r300screen->info.pci_func; } return 0; } diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c index e90e741a353..57456c6d867 100644 --- a/src/gallium/drivers/r300/r300_texture.c +++ b/src/gallium/drivers/r300/r300_texture.c @@ -971,7 +971,8 @@ static void r300_texture_destroy(struct pipe_screen *screen, boolean r300_resource_get_handle(struct pipe_screen* screen, struct pipe_resource *texture, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct radeon_winsys *rws = r300_screen(screen)->rws; struct r300_resource* tex = (struct r300_resource*)texture; @@ -1005,6 +1006,7 @@ r300_texture_create_object(struct r300_screen *rscreen, { struct radeon_winsys *rws = rscreen->rws; struct r300_resource *tex = NULL; + struct radeon_bo_metadata tiling = {}; tex = CALLOC_STRUCT(r300_resource); if (!tex) { @@ -1059,10 +1061,10 @@ r300_texture_create_object(struct r300_screen *rscreen, util_format_is_depth_or_stencil(base->format) ? 
"depth" : "color"); } - rws->buffer_set_tiling(tex->buf, NULL, - tex->tex.microtile, tex->tex.macrotile[0], - 0, 0, 0, 0, 0, 0, 0, - tex->tex.stride_in_bytes[0], false); + tiling.microtile = tex->tex.microtile; + tiling.macrotile = tex->tex.macrotile[0]; + tiling.stride = tex->tex.stride_in_bytes[0]; + rws->buffer_set_metadata(tex->buf, &tiling); return tex; @@ -1097,13 +1099,14 @@ struct pipe_resource *r300_texture_create(struct pipe_screen *screen, struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, const struct pipe_resource *base, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct r300_screen *rscreen = r300_screen(screen); struct radeon_winsys *rws = rscreen->rws; struct pb_buffer *buffer; - enum radeon_bo_layout microtile, macrotile; unsigned stride; + struct radeon_bo_metadata tiling = {}; /* Support only 2D textures without mipmaps */ if ((base->target != PIPE_TEXTURE_2D && @@ -1117,25 +1120,24 @@ struct pipe_resource *r300_texture_from_handle(struct pipe_screen *screen, if (!buffer) return NULL; - rws->buffer_get_tiling(buffer, &microtile, &macrotile, NULL, NULL, NULL, - NULL, NULL, NULL); + rws->buffer_get_metadata(buffer, &tiling); /* Enforce a microtiled zbuffer. */ if (util_format_is_depth_or_stencil(base->format) && - microtile == RADEON_LAYOUT_LINEAR) { + tiling.microtile == RADEON_LAYOUT_LINEAR) { switch (util_format_get_blocksize(base->format)) { case 4: - microtile = RADEON_LAYOUT_TILED; + tiling.microtile = RADEON_LAYOUT_TILED; break; case 2: - microtile = RADEON_LAYOUT_SQUARETILED; + tiling.microtile = RADEON_LAYOUT_SQUARETILED; break; } } return (struct pipe_resource*) - r300_texture_create_object(rscreen, base, microtile, macrotile, + r300_texture_create_object(rscreen, base, tiling.microtile, tiling.macrotile, stride, buffer); } diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h index 213bdffc2ed..4c339429eca 100644 --- a/src/gallium/drivers/r300/r300_texture.h +++ b/src/gallium/drivers/r300/r300_texture.h @@ -25,6 +25,7 @@ #include "pipe/p_compiler.h" #include "pipe/p_format.h" +#include "pipe/p_screen.h" struct pipe_screen; struct pipe_context; @@ -62,12 +63,14 @@ void r300_texture_setup_format_state(struct r300_screen *screen, boolean r300_resource_get_handle(struct pipe_screen* screen, struct pipe_resource *texture, - struct winsys_handle *whandle); + struct winsys_handle *whandle, + unsigned usage); struct pipe_resource* r300_texture_from_handle(struct pipe_screen* screen, const struct pipe_resource* base, - struct winsys_handle *whandle); + struct winsys_handle *whandle, + unsigned usage); struct pipe_resource* r300_texture_create(struct pipe_screen* screen, diff --git a/src/gallium/drivers/r600/evergreen_state.c b/src/gallium/drivers/r600/evergreen_state.c index a12638a5bdb..83313cb28cf 100644 --- a/src/gallium/drivers/r600/evergreen_state.c +++ b/src/gallium/drivers/r600/evergreen_state.c @@ -989,13 +989,6 @@ void evergreen_init_color_surface_rat(struct r600_context *rctx, MAX2(64, rctx->screen->b.info.pipe_interleave_bytes / block_size); unsigned pitch = align(pipe_buffer->width0, pitch_alignment); - /* XXX: This is copied from evergreen_init_color_surface(). I don't - * know why this is necessary. 
- */ - if (pipe_buffer->usage == PIPE_USAGE_STAGING) { - endian = ENDIAN_NONE; - } - surf->cb_color_base = r600_resource(pipe_buffer)->gpu_address >> 8; surf->cb_color_pitch = (pitch / 8) - 1; @@ -1146,11 +1139,7 @@ void evergreen_init_color_surface(struct r600_context *rctx, swap = r600_translate_colorswap(surf->base.format); assert(swap != ~0); - if (rtex->resource.b.b.usage == PIPE_USAGE_STAGING) { - endian = ENDIAN_NONE; - } else { - endian = r600_colorformat_endian_swap(format); - } + endian = r600_colorformat_endian_swap(format); /* blend clamp should be set for all NORM/SRGB types */ if (ntype == V_028C70_NUMBER_UNORM || ntype == V_028C70_NUMBER_SNORM || diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c index 5a6ce71414c..7018088d204 100644 --- a/src/gallium/drivers/r600/r600_pipe.c +++ b/src/gallium/drivers/r600/r600_pipe.c @@ -439,7 +439,7 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_VENDOR_ID: - return 0x1002; + return ATI_VENDOR_ID; case PIPE_CAP_DEVICE_ID: return rscreen->b.info.pci_id; case PIPE_CAP_ACCELERATED: @@ -450,6 +450,14 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return 0; case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: return rscreen->b.chip_class >= R700; + case PIPE_CAP_PCI_GROUP: + return rscreen->b.info.pci_domain; + case PIPE_CAP_PCI_BUS: + return rscreen->b.info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return rscreen->b.info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return rscreen->b.info.pci_func; } return 0; } diff --git a/src/gallium/drivers/r600/r600_state.c b/src/gallium/drivers/r600/r600_state.c index f60e30486a2..f9026197b26 100644 --- a/src/gallium/drivers/r600/r600_state.c +++ b/src/gallium/drivers/r600/r600_state.c @@ -930,11 +930,7 @@ static void r600_init_color_surface(struct r600_context *rctx, swap = r600_translate_colorswap(surf->base.format); assert(swap != ~0); - if (rtex->resource.b.b.usage == PIPE_USAGE_STAGING) { - endian = ENDIAN_NONE; - } else { - endian = r600_colorformat_endian_swap(format); - } + endian = r600_colorformat_endian_swap(format); /* set blend bypass according to docs if SINT/UINT or 8/24 COLOR variants */ diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c index aa3a085c6d2..2211e07ceba 100644 --- a/src/gallium/drivers/r600/r600_state_common.c +++ b/src/gallium/drivers/r600/r600_state_common.c @@ -645,21 +645,21 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader, if (rviews[i]) { struct r600_texture *rtex = (struct r600_texture*)rviews[i]->base.texture; + bool is_buffer = rviews[i]->base.texture->target == PIPE_BUFFER; - if (rviews[i]->base.texture->target != PIPE_BUFFER) { - if (rtex->is_depth && !rtex->is_flushing_texture) { - dst->views.compressed_depthtex_mask |= 1 << i; - } else { - dst->views.compressed_depthtex_mask &= ~(1 << i); - } + if (!is_buffer && rtex->is_depth && !rtex->is_flushing_texture) { + dst->views.compressed_depthtex_mask |= 1 << i; + } else { + dst->views.compressed_depthtex_mask &= ~(1 << i); + } - /* Track compressed colorbuffers. */ - if (rtex->cmask.size) { - dst->views.compressed_colortex_mask |= 1 << i; - } else { - dst->views.compressed_colortex_mask &= ~(1 << i); - } + /* Track compressed colorbuffers. 
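A view counts as compressed while its texture has CMASK storage; with the is_buffer check below, buffer targets now clear the bit too instead of leaving it stale.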
*/ + if (!is_buffer && rtex->cmask.size) { + dst->views.compressed_colortex_mask |= 1 << i; + } else { + dst->views.compressed_colortex_mask &= ~(1 << i); } + /* Changing from array to non-arrays textures and vice versa requires * updating TEX_ARRAY_OVERRIDE in sampler states on R6xx-R7xx. */ if (rctx->b.chip_class <= R700 && @@ -693,6 +693,26 @@ static void r600_set_sampler_views(struct pipe_context *pipe, unsigned shader, } } +static void r600_update_compressed_colortex_mask(struct r600_samplerview_state *views) +{ + uint32_t mask = views->enabled_mask; + + while (mask) { + unsigned i = u_bit_scan(&mask); + struct pipe_resource *res = views->views[i]->base.texture; + + if (res && res->target != PIPE_BUFFER) { + struct r600_texture *rtex = (struct r600_texture *)res; + + if (rtex->cmask.size) { + views->compressed_colortex_mask |= 1 << i; + } else { + views->compressed_colortex_mask &= ~(1 << i); + } + } + } +} + static void r600_set_viewport_states(struct pipe_context *ctx, unsigned start_slot, unsigned num_viewports, @@ -1457,6 +1477,16 @@ static bool r600_update_derived_state(struct r600_context *rctx) if (!rctx->blitter->running) { unsigned i; + unsigned counter; + + counter = p_atomic_read(&rctx->screen->b.compressed_colortex_counter); + if (counter != rctx->b.last_compressed_colortex_counter) { + rctx->b.last_compressed_colortex_counter = counter; + + for (i = 0; i < PIPE_SHADER_TYPES; ++i) { + r600_update_compressed_colortex_mask(&rctx->samplers[i].views); + } + } /* Decompress textures if needed. */ for (i = 0; i < PIPE_SHADER_TYPES; i++) { @@ -1672,7 +1702,7 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info struct radeon_winsys_cs *cs = rctx->b.gfx.cs; bool render_cond_bit = rctx->b.render_cond && !rctx->b.render_cond_force_off; uint64_t mask; - unsigned num_patches; + unsigned num_patches, dirty_fb_counter; if (!info.indirect && !info.count && (info.indexed || !info.count_from_stream_output)) { return; @@ -1688,6 +1718,13 @@ static void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info rctx->b.dma.flush(rctx, RADEON_FLUSH_ASYNC, NULL); } + /* Re-emit the framebuffer state if needed. */ + dirty_fb_counter = p_atomic_read(&rctx->b.screen->dirty_fb_counter); + if (dirty_fb_counter != rctx->b.last_dirty_fb_counter) { + rctx->b.last_dirty_fb_counter = dirty_fb_counter; + r600_mark_atom_dirty(rctx, &rctx->framebuffer.atom); + } + if (!r600_update_derived_state(rctx)) { /* useless to render because current rendering command * can't be achieved diff --git a/src/gallium/drivers/radeon/r600_buffer_common.c b/src/gallium/drivers/radeon/r600_buffer_common.c index b384baa9237..33ba0fbca9b 100644 --- a/src/gallium/drivers/radeon/r600_buffer_common.c +++ b/src/gallium/drivers/radeon/r600_buffer_common.c @@ -213,6 +213,10 @@ static bool r600_invalidate_buffer(struct r600_common_context *rctx, struct r600_resource *rbuffer) { + /* Shared buffers can't be reallocated. */ + if (rbuffer->is_shared) + return false; + /* In AMD_pinned_memory, the user pointer association only gets * broken when the buffer is explicitly re-allocated. */ @@ -294,6 +298,7 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, * in which case it can be mapped unsynchronized. 
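* Shared (exported) buffers are excluded below, since another process may have written the range even though this context did not.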
*/ if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && usage & PIPE_TRANSFER_WRITE && + !rbuffer->is_shared && !util_ranges_intersect(&rbuffer->valid_buffer_range, box->x, box->x + box->width)) { usage |= PIPE_TRANSFER_UNSYNCHRONIZED; } @@ -311,12 +316,17 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, if (r600_invalidate_buffer(rctx, rbuffer)) { /* At this point, the buffer is always idle. */ usage |= PIPE_TRANSFER_UNSYNCHRONIZED; + } else { + /* Fall back to a temporary buffer. */ + usage |= PIPE_TRANSFER_DISCARD_RANGE; } } - else if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && - !(usage & PIPE_TRANSFER_UNSYNCHRONIZED) && - !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && - r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { + + if ((usage & PIPE_TRANSFER_DISCARD_RANGE) && + !(usage & (PIPE_TRANSFER_UNSYNCHRONIZED | + PIPE_TRANSFER_PERSISTENT)) && + !(rscreen->debug_flags & DBG_NO_DISCARD_RANGE) && + r600_can_dma_copy_buffer(rctx, box->x, 0, box->width)) { assert(usage & PIPE_TRANSFER_WRITE); /* Check if mapping this buffer would cause waiting for the GPU. */ @@ -341,7 +351,8 @@ static void *r600_buffer_transfer_map(struct pipe_context *ctx, } /* Using a staging buffer in GTT for larger reads is much faster. */ else if ((usage & PIPE_TRANSFER_READ) && - !(usage & PIPE_TRANSFER_WRITE) && + !(usage & (PIPE_TRANSFER_WRITE | + PIPE_TRANSFER_PERSISTENT)) && rbuffer->domains == RADEON_DOMAIN_VRAM && r600_can_dma_copy_buffer(rctx, 0, box->x, box->width)) { struct r600_resource *staging; @@ -453,6 +464,7 @@ r600_alloc_buffer_struct(struct pipe_screen *screen, rbuffer->b.vtbl = &r600_buffer_vtbl; rbuffer->buf = NULL; rbuffer->TC_L2_dirty = false; + rbuffer->is_shared = false; util_range_init(&rbuffer->valid_buffer_range); return rbuffer; } diff --git a/src/gallium/drivers/radeon/r600_pipe_common.h b/src/gallium/drivers/radeon/r600_pipe_common.h index b77b1321d73..cf8dcf7ea88 100644 --- a/src/gallium/drivers/radeon/r600_pipe_common.h +++ b/src/gallium/drivers/radeon/r600_pipe_common.h @@ -43,6 +43,8 @@ #include "util/u_suballoc.h" #include "util/u_transfer.h" +#define ATI_VENDOR_ID 0x1002 + #define R600_RESOURCE_FLAG_TRANSFER (PIPE_RESOURCE_FLAG_DRV_PRIV << 0) #define R600_RESOURCE_FLAG_FLUSHED_DEPTH (PIPE_RESOURCE_FLAG_DRV_PRIV << 1) #define R600_RESOURCE_FLAG_FORCE_TILING (PIPE_RESOURCE_FLAG_DRV_PRIV << 2) @@ -166,6 +168,10 @@ struct r600_resource { * use TC L2. */ bool TC_L2_dirty; + + /* Whether the resource has been exported via resource_get_handle. */ + bool is_shared; + unsigned external_usage; /* PIPE_HANDLE_USAGE_* */ }; struct r600_transfer { @@ -218,7 +224,7 @@ struct r600_texture { struct r600_fmask_info fmask; struct r600_cmask_info cmask; struct r600_resource *cmask_buffer; - struct r600_resource *dcc_buffer; + unsigned dcc_offset; /* 0 = disabled */ unsigned cb_color_info; /* fast clear enable bit */ unsigned color_clear_value[2]; @@ -321,6 +327,23 @@ struct r600_common_screen { /* Performance counters. */ struct r600_perfcounters *perfcounters; + + /* If pipe_screen wants to re-emit the framebuffer state of all + * contexts, it should atomically increment this. Each context will + * compare this with its own last known value of the counter before + * drawing and re-emit the framebuffer state accordingly. + */ + unsigned dirty_fb_counter; + + /* Atomically increment this counter when an existing texture's + * metadata is enabled or disabled in a way that requires changing + * contexts' compressed texture binding masks. 
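+ * Contexts snapshot it as last_compressed_colortex_counter and rebuild their binding masks before the next draw when it has changed.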
+ */ + unsigned compressed_colortex_counter; + + void (*query_opaque_metadata)(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md); }; /* This encapsulates a state or an operation which can emitted into the GPU @@ -388,6 +411,8 @@ struct r600_common_context { struct pipe_fence_handle *last_sdma_fence; unsigned initial_gfx_cs_size; unsigned gpu_reset_counter; + unsigned last_dirty_fb_counter; + unsigned last_compressed_colortex_counter; struct u_upload_mgr *uploader; struct u_suballocator *allocator_so_filled_size; @@ -464,6 +489,9 @@ struct r600_common_context { unsigned first_layer, unsigned last_layer, unsigned first_sample, unsigned last_sample); + void (*decompress_dcc)(struct pipe_context *ctx, + struct r600_texture *rtex); + /* Reallocate the buffer and update all resource bindings where * the buffer is bound, including all resource descriptors. */ void (*invalidate_buffer)(struct pipe_context *ctx, struct pipe_resource *buf); diff --git a/src/gallium/drivers/radeon/r600_texture.c b/src/gallium/drivers/radeon/r600_texture.c index 0b31d0a1f01..115c7289c4c 100644 --- a/src/gallium/drivers/radeon/r600_texture.c +++ b/src/gallium/drivers/radeon/r600_texture.c @@ -228,31 +228,145 @@ static int r600_setup_surface(struct pipe_screen *screen, return 0; } -static boolean r600_texture_get_handle(struct pipe_screen* screen, - struct pipe_resource *ptex, - struct winsys_handle *whandle) +static void r600_texture_init_metadata(struct r600_texture *rtex, + struct radeon_bo_metadata *metadata) { - struct r600_texture *rtex = (struct r600_texture*)ptex; - struct r600_resource *resource = &rtex->resource; struct radeon_surf *surface = &rtex->surface; + + memset(metadata, 0, sizeof(*metadata)); + metadata->microtile = surface->level[0].mode >= RADEON_SURF_MODE_1D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->macrotile = surface->level[0].mode >= RADEON_SURF_MODE_2D ? + RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR; + metadata->pipe_config = surface->pipe_config; + metadata->bankw = surface->bankw; + metadata->bankh = surface->bankh; + metadata->tile_split = surface->tile_split; + metadata->stencil_tile_split = surface->stencil_tile_split; + metadata->mtilea = surface->mtilea; + metadata->num_banks = surface->num_banks; + metadata->stride = surface->level[0].pitch_bytes; + metadata->scanout = (surface->flags & RADEON_SURF_SCANOUT) != 0; +} + +static void r600_dirty_all_framebuffer_states(struct r600_common_screen *rscreen) +{ + p_atomic_inc(&rscreen->dirty_fb_counter); +} + +static void r600_eliminate_fast_color_clear(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + struct pipe_context *ctx = rscreen->aux_context; + + pipe_mutex_lock(rscreen->aux_context_lock); + ctx->flush_resource(ctx, &rtex->resource.b.b); + ctx->flush(ctx, NULL, 0); + pipe_mutex_unlock(rscreen->aux_context_lock); +} + +static void r600_texture_disable_cmask(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + if (!rtex->cmask.size) + return; + + assert(rtex->resource.b.b.nr_samples <= 1); + + /* Disable CMASK. */ + memset(&rtex->cmask, 0, sizeof(rtex->cmask)); + rtex->cmask.base_address_reg = rtex->resource.gpu_address >> 8; + + if (rscreen->chip_class >= SI) + rtex->cb_color_info &= ~SI_S_028C70_FAST_CLEAR(1); + else + rtex->cb_color_info &= ~EG_S_028C70_FAST_CLEAR(1); + + if (rtex->cmask_buffer != &rtex->resource) + pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL); + + /* Notify all contexts about the change. 
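+ * Both counters get bumped here: the framebuffer one so CB state is re-emitted, and the colortex one so sampler views drop the now-stale compressed bits.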
*/ + r600_dirty_all_framebuffer_states(rscreen); + p_atomic_inc(&rscreen->compressed_colortex_counter); +} + +static void r600_texture_disable_dcc(struct r600_common_screen *rscreen, + struct r600_texture *rtex) +{ + struct r600_common_context *rctx = + (struct r600_common_context *)rscreen->aux_context; + + if (!rtex->dcc_offset) + return; + + /* Decompress DCC. */ + pipe_mutex_lock(rscreen->aux_context_lock); + rctx->decompress_dcc(&rctx->b, rtex); + rctx->b.flush(&rctx->b, NULL, 0); + pipe_mutex_unlock(rscreen->aux_context_lock); + + /* Disable DCC. */ + rtex->dcc_offset = 0; + rtex->cb_color_info &= ~VI_S_028C70_DCC_ENABLE(1); + + /* Notify all contexts about the change. */ + r600_dirty_all_framebuffer_states(rscreen); +} + +static boolean r600_texture_get_handle(struct pipe_screen* screen, + struct pipe_resource *resource, + struct winsys_handle *whandle, + unsigned usage) +{ struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; + struct r600_resource *res = (struct r600_resource*)resource; + struct r600_texture *rtex = (struct r600_texture*)resource; + struct radeon_bo_metadata metadata; - rscreen->ws->buffer_set_tiling(resource->buf, - NULL, - surface->level[0].mode >= RADEON_SURF_MODE_1D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->level[0].mode >= RADEON_SURF_MODE_2D ? - RADEON_LAYOUT_TILED : RADEON_LAYOUT_LINEAR, - surface->pipe_config, - surface->bankw, surface->bankh, - surface->tile_split, - surface->stencil_tile_split, - surface->mtilea, surface->num_banks, - surface->level[0].pitch_bytes, - (surface->flags & RADEON_SURF_SCANOUT) != 0); - - return rscreen->ws->buffer_get_handle(resource->buf, - surface->level[0].pitch_bytes, whandle); + /* This is not supported now, but it might be required for OpenCL + * interop in the future. + */ + if (resource->target != PIPE_BUFFER && + (resource->nr_samples > 1 || rtex->is_depth)) + return NULL; + + if (!res->is_shared) { + res->is_shared = true; + res->external_usage = usage; + + if (resource->target != PIPE_BUFFER) { + /* Since shader image stores don't support DCC on VI, + * disable it for external clients that want write + * access. + */ + if (usage & PIPE_HANDLE_USAGE_WRITE) + r600_texture_disable_dcc(rscreen, rtex); + + if (!(usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) { + /* Eliminate fast clear (both CMASK and DCC) */ + r600_eliminate_fast_color_clear(rscreen, rtex); + + /* Disable CMASK if flush_resource isn't going + * to be called. + */ + r600_texture_disable_cmask(rscreen, rtex); + } + + /* Set metadata. 
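+ * The tiling layout (plus any driver-opaque blob from query_opaque_metadata) travels with the BO, so the importing process can reconstruct the surface without side-channel information.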
*/ + r600_texture_init_metadata(rtex, &metadata); + if (rscreen->query_opaque_metadata) + rscreen->query_opaque_metadata(rscreen, rtex, + &metadata); + + rscreen->ws->buffer_set_metadata(res->buf, &metadata); + } + } else { + assert(res->external_usage == usage); + } + + return rscreen->ws->buffer_get_handle(res->buf, + rtex->surface.level[0].pitch_bytes, + whandle); } static void r600_texture_destroy(struct pipe_screen *screen, @@ -268,7 +382,6 @@ static void r600_texture_destroy(struct pipe_screen *screen, if (rtex->cmask_buffer != &rtex->resource) { pipe_resource_reference((struct pipe_resource**)&rtex->cmask_buffer, NULL); } - pipe_resource_reference((struct pipe_resource**)&rtex->dcc_buffer, NULL); pb_reference(&resource->buf, NULL); FREE(rtex); } @@ -489,25 +602,8 @@ static void r600_texture_alloc_cmask_separate(struct r600_common_screen *rscreen rtex->cb_color_info |= SI_S_028C70_FAST_CLEAR(1); else rtex->cb_color_info |= EG_S_028C70_FAST_CLEAR(1); -} -static void vi_texture_alloc_dcc_separate(struct r600_common_screen *rscreen, - struct r600_texture *rtex) -{ - if (rscreen->debug_flags & DBG_NO_DCC) - return; - - rtex->dcc_buffer = (struct r600_resource *) - r600_aligned_buffer_create(&rscreen->b, PIPE_BIND_CUSTOM, - PIPE_USAGE_DEFAULT, rtex->surface.dcc_size, rtex->surface.dcc_alignment); - if (rtex->dcc_buffer == NULL) { - return; - } - - r600_screen_clear_buffer(rscreen, &rtex->dcc_buffer->b.b, 0, rtex->surface.dcc_size, - 0xFFFFFFFF, true); - - rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1); + p_atomic_inc(&rscreen->compressed_colortex_counter); } static unsigned r600_texture_get_htile_size(struct r600_common_screen *rscreen, @@ -644,10 +740,10 @@ void r600_print_texture_info(struct r600_texture *rtex, FILE *f) rtex->htile_buffer->buf->alignment, rtex->htile.pitch, rtex->htile.height, rtex->htile.xalign, rtex->htile.yalign); - if (rtex->dcc_buffer) { - fprintf(f, " DCC: size=%u, alignment=%u\n", - rtex->dcc_buffer->b.b.width0, - rtex->dcc_buffer->buf->alignment); + if (rtex->dcc_offset) { + fprintf(f, " DCC: offset=%u, size=%"PRIu64", alignment=%"PRIu64"\n", + rtex->dcc_offset, rtex->surface.dcc_size, + rtex->surface.dcc_alignment); for (i = 0; i <= rtex->surface.last_level; i++) fprintf(f, " DCCLevel[%i]: offset=%"PRIu64"\n", i, rtex->surface.level[i].dcc_offset); @@ -745,8 +841,14 @@ r600_texture_create_object(struct pipe_screen *screen, return NULL; } } - if (rtex->surface.dcc_size) - vi_texture_alloc_dcc_separate(rscreen, rtex); + + if (!buf && rtex->surface.dcc_size && + !(rscreen->debug_flags & DBG_NO_DCC)) { + /* Reserve space for the DCC buffer. */ + rtex->dcc_offset = align(rtex->size, rtex->surface.dcc_alignment); + rtex->size = rtex->dcc_offset + rtex->surface.dcc_size; + rtex->cb_color_info |= VI_S_028C70_DCC_ENABLE(1); + } } /* Now create the backing buffer. */ @@ -768,6 +870,12 @@ r600_texture_create_object(struct pipe_screen *screen, rtex->cmask.offset, rtex->cmask.size, 0xCCCCCCCC, true); } + if (rtex->dcc_offset) { + r600_screen_clear_buffer(rscreen, &rtex->resource.b.b, + rtex->dcc_offset, + rtex->surface.dcc_size, + 0xFFFFFFFF, true); + } /* Initialize the CMASK base register value. 
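The register holds the address in 256-byte units, hence the gpu_address >> 8 seen in r600_texture_disable_cmask above.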
*/ rtex->cmask.base_address_reg = @@ -877,16 +985,17 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen, static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct r600_common_screen *rscreen = (struct r600_common_screen*)screen; struct pb_buffer *buf = NULL; unsigned stride = 0; unsigned array_mode; - enum radeon_bo_layout micro, macro; struct radeon_surf surface; - bool scanout; int r; + struct radeon_bo_metadata metadata = {}; + struct r600_texture *rtex; /* Support only 2D textures without mipmaps */ if ((templ->target != PIPE_TEXTURE_2D && templ->target != PIPE_TEXTURE_RECT) || @@ -897,15 +1006,17 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen if (!buf) return NULL; - rscreen->ws->buffer_get_tiling(buf, &micro, &macro, - &surface.bankw, &surface.bankh, - &surface.tile_split, - &surface.stencil_tile_split, - &surface.mtilea, &scanout); + rscreen->ws->buffer_get_metadata(buf, &metadata); - if (macro == RADEON_LAYOUT_TILED) + surface.bankw = metadata.bankw; + surface.bankh = metadata.bankh; + surface.tile_split = metadata.tile_split; + surface.stencil_tile_split = metadata.stencil_tile_split; + surface.mtilea = metadata.mtilea; + + if (metadata.macrotile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_2D; - else if (micro == RADEON_LAYOUT_TILED) + else if (metadata.microtile == RADEON_LAYOUT_TILED) array_mode = RADEON_SURF_MODE_1D; else array_mode = RADEON_SURF_MODE_LINEAR_ALIGNED; @@ -915,11 +1026,17 @@ static struct pipe_resource *r600_texture_from_handle(struct pipe_screen *screen return NULL; } - if (scanout) + if (metadata.scanout) surface.flags |= RADEON_SURF_SCANOUT; - return (struct pipe_resource *)r600_texture_create_object(screen, templ, - stride, buf, &surface); + rtex = r600_texture_create_object(screen, templ, + stride, buf, &surface); + if (!rtex) + return NULL; + + rtex->resource.is_shared = true; + rtex->resource.external_usage = usage; + return &rtex->resource.b.b; } bool r600_init_flushed_depth_texture(struct pipe_context *ctx, @@ -1450,6 +1567,14 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } + /* shared textures can't use fast clear without an explicit flush, * because there is no way to communicate the clear color among * all clients */ + if (tex->resource.is_shared && + !(tex->resource.external_usage & PIPE_HANDLE_USAGE_EXPLICIT_FLUSH)) + continue; + /* fast color clear with 1D tiling doesn't work on old kernels and CIK */ if (tex->surface.level[0].mode == RADEON_SURF_MODE_1D && rctx->chip_class >= CIK && @@ -1458,7 +1583,7 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, continue; } - if (tex->dcc_buffer) { + if (tex->dcc_offset) { uint32_t reset_value; bool clear_words_needed; @@ -1467,8 +1592,9 @@ void evergreen_do_fast_color_clear(struct r600_common_context *rctx, vi_get_fast_clear_parameters(fb->cbufs[i]->format, color, &reset_value, &clear_words_needed); - rctx->clear_buffer(&rctx->b, &tex->dcc_buffer->b.b, - 0, tex->surface.dcc_size, reset_value, true); + rctx->clear_buffer(&rctx->b, &tex->resource.b.b, + tex->dcc_offset, tex->surface.dcc_size, + reset_value, true); if (clear_words_needed) tex->dirty_level_mask |= 1 << fb->cbufs[i]->u.tex.level; diff --git a/src/gallium/drivers/radeon/radeon_uvd.c b/src/gallium/drivers/radeon/radeon_uvd.c index 367aabc7a18..b8efc58eaab 100644 --- 
a/src/gallium/drivers/radeon/radeon_uvd.c +++ b/src/gallium/drivers/radeon/radeon_uvd.c @@ -207,7 +207,7 @@ static uint32_t profile2stream_type(struct ruvd_decoder *dec, unsigned family) } } -static unsigned calc_ctx_size(struct ruvd_decoder *dec) +static unsigned calc_ctx_size_h265_main(struct ruvd_decoder *dec) { unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); @@ -224,6 +224,39 @@ static unsigned calc_ctx_size(struct ruvd_decoder *dec) return ((width + 255) / 16) * ((height + 255) / 16) * 16 * max_references + 52 * 1024; } +static unsigned calc_ctx_size_h265_main10(struct ruvd_decoder *dec, struct pipe_h265_picture_desc *pic) +{ + unsigned block_size, log2_ctb_size, width_in_ctb, height_in_ctb, num_16x16_block_per_ctb; + unsigned context_buffer_size_per_ctb_row, cm_buffer_size, max_mb_address, db_left_tile_pxl_size; + unsigned db_left_tile_ctx_size = 4096 / 16 * (32 + 16 * 4); + + unsigned width = align(dec->base.width, VL_MACROBLOCK_WIDTH); + unsigned height = align(dec->base.height, VL_MACROBLOCK_HEIGHT); + unsigned coeff_10bit = (pic->pps->sps->bit_depth_luma_minus8 || pic->pps->sps->bit_depth_chroma_minus8) ? 2 : 1; + + unsigned max_references = dec->base.max_references + 1; + + if (dec->base.width * dec->base.height >= 4096*2000) + max_references = MAX2(max_references, 8); + else + max_references = MAX2(max_references, 17); + + block_size = (1 << (pic->pps->sps->log2_min_luma_coding_block_size_minus3 + 3)); + log2_ctb_size = block_size + pic->pps->sps->log2_diff_max_min_luma_coding_block_size; + + width_in_ctb = (width + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + height_in_ctb = (height + ((1 << log2_ctb_size) - 1)) >> log2_ctb_size; + + num_16x16_block_per_ctb = ((1 << log2_ctb_size) >> 4) * ((1 << log2_ctb_size) >> 4); + context_buffer_size_per_ctb_row = align(width_in_ctb * num_16x16_block_per_ctb * 16, 256); + max_mb_address = (unsigned) ceil(height * 8 / 2048.0); + + cm_buffer_size = max_references * context_buffer_size_per_ctb_row * height_in_ctb; + db_left_tile_pxl_size = coeff_10bit * (max_mb_address * 2 * 2048 + 1024); + + return cm_buffer_size + db_left_tile_ctx_size + db_left_tile_pxl_size; +} + /* calculate size of reference picture buffer */ static unsigned calc_dpb_size(struct ruvd_decoder *dec) { @@ -305,7 +338,10 @@ static unsigned calc_dpb_size(struct ruvd_decoder *dec) width = align (width, 16); height = align (height, 16); - dpb_size = align((width * height * 3) / 2, 256) * max_references; + if (dec->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + dpb_size = align((width * height * 9) / 4, 256) * max_references; + else + dpb_size = align((width * height * 3) / 2, 256) * max_references; break; case PIPE_VIDEO_FORMAT_VC1: @@ -596,6 +632,15 @@ static struct ruvd_h265 get_h265_msg(struct ruvd_decoder *dec, struct pipe_video result.direct_reflist[i][j] = pic->RefPicList[i][j]; } + if ((pic->base.profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10) && + (target->buffer_format == PIPE_FORMAT_NV12)) { + result.p010_mode = 0; + result.luma_10to8 = 5; + result.chroma_10to8 = 5; + result.sclr_luma10to8 = 4; + result.sclr_chroma10to8 = 4; + } + /* TODO result.highestTid; result.isNonRef; @@ -971,6 +1016,17 @@ static void ruvd_end_frame(struct pipe_video_codec *decoder, case PIPE_VIDEO_FORMAT_HEVC: dec->msg->body.decode.codec.h265 = get_h265_msg(dec, target, (struct pipe_h265_picture_desc*)picture); + if (dec->ctx.res == NULL) { + unsigned ctx_size; + if (dec->base.profile == 
PIPE_VIDEO_PROFILE_HEVC_MAIN_10) + ctx_size = calc_ctx_size_h265_main10(dec, (struct pipe_h265_picture_desc*)picture); + else + ctx_size = calc_ctx_size_h265_main(dec); + if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { + RVID_ERR("Can't allocated context buffer.\n"); + } + rvid_clear_buffer(decoder->context, &dec->ctx); + } break; case PIPE_VIDEO_FORMAT_VC1: @@ -1123,15 +1179,6 @@ struct pipe_video_codec *ruvd_create_decoder(struct pipe_context *context, rvid_clear_buffer(context, &dec->dpb); - if (u_reduce_video_profile(dec->base.profile) == PIPE_VIDEO_FORMAT_HEVC) { - unsigned ctx_size = calc_ctx_size(dec); - if (!rvid_create_buffer(dec->screen, &dec->ctx, ctx_size, PIPE_USAGE_DEFAULT)) { - RVID_ERR("Can't allocated context buffer.\n"); - goto error; - } - rvid_clear_buffer(context, &dec->ctx); - } - map_msg_fb_it_buf(dec); dec->msg->size = sizeof(*dec->msg); dec->msg->msg_type = RUVD_MSG_CREATE; diff --git a/src/gallium/drivers/radeon/radeon_vce.c b/src/gallium/drivers/radeon/radeon_vce.c index 41603b32403..087d9422c04 100644 --- a/src/gallium/drivers/radeon/radeon_vce.c +++ b/src/gallium/drivers/radeon/radeon_vce.c @@ -404,7 +404,8 @@ struct pipe_video_codec *rvce_create_encoder(struct pipe_context *context, if (rscreen->info.drm_major == 3) enc->use_vm = true; - if ((rscreen->info.drm_major > 2) || (rscreen->info.drm_minor >= 42)) + if ((rscreen->info.drm_major == 2 && rscreen->info.drm_minor >= 42) || + rscreen->info.drm_major == 3) enc->use_vui = true; if (rscreen->info.family >= CHIP_TONGA && rscreen->info.family != CHIP_STONEY) diff --git a/src/gallium/drivers/radeon/radeon_video.c b/src/gallium/drivers/radeon/radeon_video.c index ec29d8cb754..24b0eed51d2 100644 --- a/src/gallium/drivers/radeon/radeon_video.c +++ b/src/gallium/drivers/radeon/radeon_video.c @@ -237,6 +237,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_SUPPORTED: switch (codec) { case PIPE_VIDEO_FORMAT_MPEG12: + return profile != PIPE_VIDEO_PROFILE_MPEG1; case PIPE_VIDEO_FORMAT_MPEG4: case PIPE_VIDEO_FORMAT_MPEG4_AVC: if (rscreen->family < CHIP_PALM) @@ -247,8 +248,11 @@ int rvid_get_video_param(struct pipe_screen *screen, return true; case PIPE_VIDEO_FORMAT_HEVC: /* Carrizo only supports HEVC Main */ - return rscreen->family >= CHIP_CARRIZO && - profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; + if (rscreen->family >= CHIP_STONEY) + return (profile == PIPE_VIDEO_PROFILE_HEVC_MAIN || + profile == PIPE_VIDEO_PROFILE_HEVC_MAIN_10); + else if (rscreen->family >= CHIP_CARRIZO) + return profile == PIPE_VIDEO_PROFILE_HEVC_MAIN; default: return false; } @@ -257,7 +261,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_CAP_MAX_WIDTH: return (rscreen->family < CHIP_TONGA) ? 2048 : 4096; case PIPE_VIDEO_CAP_MAX_HEIGHT: - return (rscreen->family < CHIP_TONGA) ? 1152 : 2304; + return (rscreen->family < CHIP_TONGA) ? 
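/* pre-Tonga UVD keeps its 1152-line cap; Tonga and newer are raised from 2304 to 4096 to allow 4K decode */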
1152 : 4096; case PIPE_VIDEO_CAP_PREFERED_FORMAT: return PIPE_FORMAT_NV12; case PIPE_VIDEO_CAP_PREFERS_INTERLACED: @@ -296,6 +300,7 @@ int rvid_get_video_param(struct pipe_screen *screen, case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH: return 41; case PIPE_VIDEO_PROFILE_HEVC_MAIN: + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: return 186; default: return 0; diff --git a/src/gallium/drivers/radeon/radeon_winsys.h b/src/gallium/drivers/radeon/radeon_winsys.h index 1160d235062..b8a065957a7 100644 --- a/src/gallium/drivers/radeon/radeon_winsys.h +++ b/src/gallium/drivers/radeon/radeon_winsys.h @@ -231,6 +231,12 @@ struct radeon_winsys_cs { }; struct radeon_info { + /* PCI info: domain:bus:dev:func */ + uint32_t pci_domain; + uint32_t pci_bus; + uint32_t pci_dev; + uint32_t pci_func; + /* Device info. */ uint32_t pci_id; enum radeon_family family; @@ -276,6 +282,31 @@ struct radeon_info { uint32_t cik_macrotile_mode_array[16]; }; +/* Tiling info for display code, DRI sharing, and other data. */ +struct radeon_bo_metadata { + /* Tiling flags describing the texture layout for display code + * and DRI sharing. + */ + enum radeon_bo_layout microtile; + enum radeon_bo_layout macrotile; + unsigned pipe_config; + unsigned bankw; + unsigned bankh; + unsigned tile_split; + unsigned stencil_tile_split; + unsigned mtilea; + unsigned num_banks; + unsigned stride; + bool scanout; + + /* Additional metadata associated with the buffer, in bytes. + * The maximum size is 64 * 4. This is opaque for the winsys & kernel. + * Supported by amdgpu only. + */ + uint32_t size_metadata; + uint32_t metadata[64]; +}; + enum radeon_feature_id { RADEON_FID_R300_HYPERZ_ACCESS, /* ZMask + HiZ */ RADEON_FID_R300_CMASK_ACCESS, @@ -454,45 +485,24 @@ struct radeon_winsys { enum radeon_bo_usage usage); /** - * Return tiling flags describing a memory layout of a buffer object. + * Return buffer metadata. + * (tiling info for display code, DRI sharing, and other data) * * \param buf A winsys buffer object to get the flags from. - * \param macrotile A pointer to the return value of the microtile flag. - * \param microtile A pointer to the return value of the macrotile flag. - * - * \note microtile and macrotile are not bitmasks! + * \param md Metadata */ - void (*buffer_get_tiling)(struct pb_buffer *buf, - enum radeon_bo_layout *microtile, - enum radeon_bo_layout *macrotile, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout); + void (*buffer_get_metadata)(struct pb_buffer *buf, + struct radeon_bo_metadata *md); /** - * Set tiling flags describing a memory layout of a buffer object. + * Set buffer metadata. + * (tiling info for display code, DRI sharing, and other data) * * \param buf A winsys buffer object to set the flags for. - * \param cs A command stream to flush if the buffer is referenced by it. - * \param macrotile A macrotile flag. - * \param microtile A microtile flag. - * \param stride A stride of the buffer in bytes, for texturing. - * - * \note microtile and macrotile are not bitmasks! 
- */ - void (*buffer_set_tiling)(struct pb_buffer *buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtile, - enum radeon_bo_layout macrotile, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - unsigned stride, - bool scanout); + * \param md Metadata + */ + void (*buffer_set_metadata)(struct pb_buffer *buf, + struct radeon_bo_metadata *md); /** * Get a winsys buffer from a winsys handle. The internal structure diff --git a/src/gallium/drivers/radeonsi/cik_sdma.c b/src/gallium/drivers/radeonsi/cik_sdma.c index 76913914b38..6eb62dcc890 100644 --- a/src/gallium/drivers/radeonsi/cik_sdma.c +++ b/src/gallium/drivers/radeonsi/cik_sdma.c @@ -243,7 +243,7 @@ void cik_sdma_copy(struct pipe_context *ctx, if (src->format != dst->format || rdst->surface.nsamples > 1 || rsrc->surface.nsamples > 1 || (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level) || - rdst->dcc_buffer || rsrc->dcc_buffer) { + rdst->dcc_offset || rsrc->dcc_offset) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_blit.c b/src/gallium/drivers/radeonsi/si_blit.c index 115877060ba..f9a6de48f6b 100644 --- a/src/gallium/drivers/radeonsi/si_blit.c +++ b/src/gallium/drivers/radeonsi/si_blit.c @@ -241,8 +241,9 @@ static void si_blit_decompress_depth_in_place(struct si_context *sctx, si_mark_atom_dirty(sctx, &sctx->db_render_state); } -void si_flush_depth_textures(struct si_context *sctx, - struct si_textures_info *textures) +static void +si_flush_depth_textures(struct si_context *sctx, + struct si_textures_info *textures) { unsigned i; unsigned mask = textures->depth_texture_mask; @@ -271,18 +272,29 @@ void si_flush_depth_textures(struct si_context *sctx, static void si_blit_decompress_color(struct pipe_context *ctx, struct r600_texture *rtex, unsigned first_level, unsigned last_level, - unsigned first_layer, unsigned last_layer) + unsigned first_layer, unsigned last_layer, + bool need_dcc_decompress) { struct si_context *sctx = (struct si_context *)ctx; unsigned layer, level, checked_last_layer, max_layer; - if (!rtex->dirty_level_mask) + if (!rtex->dirty_level_mask && !need_dcc_decompress) return; for (level = first_level; level <= last_level; level++) { - if (!(rtex->dirty_level_mask & (1 << level))) + void* custom_blend; + + if (!(rtex->dirty_level_mask & (1 << level)) && !need_dcc_decompress) continue; + if (rtex->dcc_offset && need_dcc_decompress) { + custom_blend = sctx->custom_blend_dcc_decompress; + } else if (rtex->fmask.size) { + custom_blend = sctx->custom_blend_decompress; + } else { + custom_blend = sctx->custom_blend_fastclear; + } + /* The smaller the mipmap level, the less layers there are * as far as 3D textures are concerned. */ max_layer = util_max_layer(&rtex->resource.b.b, level); @@ -298,9 +310,7 @@ static void si_blit_decompress_color(struct pipe_context *ctx, cbsurf = ctx->create_surface(ctx, &rtex->resource.b.b, &surf_tmpl); si_blitter_begin(ctx, SI_DECOMPRESS); - util_blitter_custom_color(sctx->blitter, cbsurf, - rtex->fmask.size ? 
sctx->custom_blend_decompress : - sctx->custom_blend_fastclear); + util_blitter_custom_color(sctx->blitter, cbsurf, custom_blend); si_blitter_end(ctx); pipe_surface_reference(&cbsurf, NULL); @@ -314,8 +324,9 @@ static void si_blit_decompress_color(struct pipe_context *ctx, } } -void si_decompress_color_textures(struct si_context *sctx, - struct si_textures_info *textures) +static void +si_decompress_color_textures(struct si_context *sctx, + struct si_textures_info *textures) { unsigned i; unsigned mask = textures->compressed_colortex_mask; @@ -330,11 +341,37 @@ void si_decompress_color_textures(struct si_context *sctx, assert(view); tex = (struct r600_texture *)view->texture; - assert(tex->cmask.size || tex->fmask.size || tex->dcc_buffer); + assert(tex->cmask.size || tex->fmask.size || tex->dcc_offset); si_blit_decompress_color(&sctx->b.b, tex, view->u.tex.first_level, view->u.tex.last_level, - 0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level)); + 0, util_max_layer(&tex->resource.b.b, view->u.tex.first_level), + false); + } +} + +void si_decompress_textures(struct si_context *sctx) +{ + unsigned compressed_colortex_counter; + + if (sctx->blitter->running) + return; + + /* Update the compressed_colortex_mask if necessary. */ + compressed_colortex_counter = p_atomic_read(&sctx->screen->b.compressed_colortex_counter); + if (compressed_colortex_counter != sctx->b.last_compressed_colortex_counter) { + sctx->b.last_compressed_colortex_counter = compressed_colortex_counter; + si_update_compressed_colortex_masks(sctx); + } + + /* Flush depth textures which need to be flushed. */ + for (int i = 0; i < SI_NUM_SHADERS; i++) { + if (sctx->samplers[i].depth_texture_mask) { + si_flush_depth_textures(sctx, &sctx->samplers[i]); + } + if (sctx->samplers[i].compressed_colortex_mask) { + si_decompress_color_textures(sctx, &sctx->samplers[i]); + } } } @@ -483,9 +520,9 @@ static void si_decompress_subresource(struct pipe_context *ctx, si_blit_decompress_depth_in_place(sctx, rtex, true, level, level, first_layer, last_layer); - } else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_buffer) { + } else if (rtex->fmask.size || rtex->cmask.size || rtex->dcc_offset) { si_blit_decompress_color(ctx, rtex, level, level, - first_layer, last_layer); + first_layer, last_layer, false); } } @@ -712,7 +749,7 @@ static bool do_hardware_msaa_resolve(struct pipe_context *ctx, dst->surface.level[info->dst.level].mode >= RADEON_SURF_MODE_1D && !(dst->surface.flags & RADEON_SURF_SCANOUT) && (!dst->cmask.size || !dst->dirty_level_mask) && /* dst cannot be fast-cleared */ - !dst->dcc_buffer) { + !dst->dcc_offset) { si_blitter_begin(ctx, SI_COLOR_RESOLVE | (info->render_condition_enable ? 
0 : SI_DISABLE_RENDER_COND)); util_blitter_custom_resolve_color(sctx->blitter, @@ -761,12 +798,23 @@ static void si_flush_resource(struct pipe_context *ctx, assert(res->target != PIPE_BUFFER); - if (!rtex->is_depth && rtex->cmask.size) { + if (!rtex->is_depth && (rtex->cmask.size || rtex->dcc_offset)) { si_blit_decompress_color(ctx, rtex, 0, res->last_level, - 0, util_max_layer(res, 0)); + 0, util_max_layer(res, 0), false); } } +static void si_decompress_dcc(struct pipe_context *ctx, + struct r600_texture *rtex) +{ + if (!rtex->dcc_offset) + return; + + si_blit_decompress_color(ctx, rtex, 0, rtex->resource.b.b.last_level, + 0, util_max_layer(&rtex->resource.b.b, 0), + true); +} + static void si_pipe_clear_buffer(struct pipe_context *ctx, struct pipe_resource *dst, unsigned offset, unsigned size, @@ -836,4 +884,5 @@ void si_init_blit_functions(struct si_context *sctx) sctx->b.b.blit = si_blit; sctx->b.b.flush_resource = si_flush_resource; sctx->b.blit_decompress_depth = si_blit_decompress_depth; + sctx->b.decompress_dcc = si_decompress_dcc; } diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c index 345f2bbc381..d12b3e6b28a 100644 --- a/src/gallium/drivers/radeonsi/si_descriptors.c +++ b/src/gallium/drivers/radeonsi/si_descriptors.c @@ -150,20 +150,17 @@ static void si_release_sampler_views(struct si_sampler_views *views) si_release_descriptors(&views->desc); } -static void si_sampler_view_add_buffers(struct si_context *sctx, - struct si_sampler_view *rview) +static void si_sampler_view_add_buffer(struct si_context *sctx, + struct pipe_resource *resource) { - if (rview->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->resource, RADEON_USAGE_READ, - r600_get_sampler_view_priority(rview->resource)); - } + struct r600_resource *rres = (struct r600_resource*)resource; - if (rview->dcc_buffer && rview->dcc_buffer != rview->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - rview->dcc_buffer, RADEON_USAGE_READ, - RADEON_PRIO_DCC); - } + if (!resource) + return; + + radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, rres, + RADEON_USAGE_READ, + r600_get_sampler_view_priority(rres)); } static void si_sampler_views_begin_new_cs(struct si_context *sctx, @@ -174,10 +171,8 @@ static void si_sampler_views_begin_new_cs(struct si_context *sctx, /* Add buffers to the CS. 
*/ while (mask) { int i = u_bit_scan64(&mask); - struct si_sampler_view *rview = - (struct si_sampler_view*)views->views[i]; - si_sampler_view_add_buffers(sctx, rview); + si_sampler_view_add_buffer(sctx, views->views[i]->texture); } if (!views->desc.buffer) @@ -190,15 +185,20 @@ static void si_set_sampler_view(struct si_context *sctx, struct si_sampler_views *views, unsigned slot, struct pipe_sampler_view *view) { - if (views->views[slot] == view) + struct si_sampler_view *rview = (struct si_sampler_view*)view; + + if (view && view->texture && view->texture->target != PIPE_BUFFER && + G_008F28_COMPRESSION_EN(rview->state[6]) && + ((struct r600_texture*)view->texture)->dcc_offset == 0) { + rview->state[6] &= C_008F28_COMPRESSION_EN & + C_008F28_ALPHA_IS_ON_MSB; + } else if (views->views[slot] == view) return; if (view) { - struct si_sampler_view *rview = - (struct si_sampler_view*)view; - struct r600_texture *rtex = (struct r600_texture*)view->texture; + struct r600_texture *rtex = (struct r600_texture *)view->texture; - si_sampler_view_add_buffers(sctx, rview); + si_sampler_view_add_buffer(sctx, view->texture); pipe_sampler_view_reference(&views->views[slot], view); memcpy(views->desc.list + slot * 16, rview->state, 8*4); @@ -229,6 +229,12 @@ static void si_set_sampler_view(struct si_context *sctx, views->desc.list_dirty = true; } +static bool is_compressed_colortex(struct r600_texture *rtex) +{ + return rtex->cmask.size || rtex->fmask.size || + (rtex->dcc_offset && rtex->dirty_level_mask); +} + static void si_set_sampler_views(struct pipe_context *ctx, unsigned shader, unsigned start, unsigned count, @@ -262,8 +268,7 @@ static void si_set_sampler_views(struct pipe_context *ctx, } else { samplers->depth_texture_mask &= ~(1 << slot); } - if (rtex->cmask.size || rtex->fmask.size || - (rtex->dcc_buffer && rtex->dirty_level_mask)) { + if (is_compressed_colortex(rtex)) { samplers->compressed_colortex_mask |= 1 << slot; } else { samplers->compressed_colortex_mask &= ~(1 << slot); @@ -275,6 +280,27 @@ static void si_set_sampler_views(struct pipe_context *ctx, } } +static void +si_samplers_update_compressed_colortex_mask(struct si_textures_info *samplers) +{ + uint64_t mask = samplers->views.desc.enabled_mask; + + while (mask) { + int i = u_bit_scan64(&mask); + struct pipe_resource *res = samplers->views.views[i]->texture; + + if (res && res->target != PIPE_BUFFER) { + struct r600_texture *rtex = (struct r600_texture *)res; + + if (is_compressed_colortex(rtex)) { + samplers->compressed_colortex_mask |= 1 << i; + } else { + samplers->compressed_colortex_mask &= ~(1 << i); + } + } + } +} + /* SAMPLER STATES */ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader, @@ -303,6 +329,7 @@ static void si_bind_sampler_states(struct pipe_context *ctx, unsigned shader, */ if (samplers->views.views[i] && samplers->views.views[i]->texture && + samplers->views.views[i]->texture->target != PIPE_BUFFER && ((struct r600_texture*)samplers->views.views[i]->texture)->fmask.size) continue; @@ -767,6 +794,19 @@ static void si_desc_reset_buffer_offset(struct pipe_context *ctx, S_008F04_BASE_ADDRESS_HI(va >> 32); } +/* TEXTURE METADATA ENABLE/DISABLE */ + +/* CMASK can be enabled (for fast clear) and disabled (for texture export) + * while the texture is bound, possibly by a different context. In that case, + * call this function to update compressed_colortex_masks. 
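
The comment above describes a cross-context hazard: another context sharing the screen can fast-clear or decompress a texture while this one has it bound. si_decompress_textures (earlier in this patch) closes the race by polling a screen-wide atomic counter once per draw and rebuilding the masks only when the counter has moved. The same invalidation idiom in isolation (the counter and rebuild names here are illustrative):

   /* Screen-level generation counter, bumped whenever CMASK/DCC state
    * changes behind a context's back (compare compressed_colortex_counter
    * in the hunks above).
    */
   unsigned gen = p_atomic_read(&screen->state_generation);

   if (gen != ctx->last_seen_generation) {
      ctx->last_seen_generation = gen;
      rebuild_cached_masks(ctx);   /* rare, so draws stay cheap */
   }
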
+ */ +void si_update_compressed_colortex_masks(struct si_context *sctx) +{ + for (int i = 0; i < SI_NUM_SHADERS; ++i) { + si_samplers_update_compressed_colortex_mask(&sctx->samplers[i]); + } +} + /* BUFFER DISCARD/INVALIDATION */ /* Reallocate a buffer and update all resource bindings where the buffer is diff --git a/src/gallium/drivers/radeonsi/si_dma.c b/src/gallium/drivers/radeonsi/si_dma.c index 240d96190a9..0efca193951 100644 --- a/src/gallium/drivers/radeonsi/si_dma.c +++ b/src/gallium/drivers/radeonsi/si_dma.c @@ -249,7 +249,7 @@ void si_dma_copy(struct pipe_context *ctx, (rdst->dirty_level_mask | rdst->stencil_dirty_level_mask) & (1 << dst_level) || rdst->cmask.size || rdst->fmask.size || rsrc->cmask.size || rsrc->fmask.size || - rdst->dcc_buffer || rsrc->dcc_buffer) { + rdst->dcc_offset || rsrc->dcc_offset) { goto fallback; } diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c index 37fd4a25d59..8b50a49cba0 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.c +++ b/src/gallium/drivers/radeonsi/si_pipe.c @@ -68,6 +68,8 @@ static void si_destroy_context(struct pipe_context *context) sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_decompress); if (sctx->custom_blend_fastclear) sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_fastclear); + if (sctx->custom_blend_dcc_decompress) + sctx->b.b.delete_blend_state(&sctx->b.b, sctx->custom_blend_dcc_decompress); util_unreference_framebuffer_state(&sctx->framebuffer.state); if (sctx->blitter) @@ -418,7 +420,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return PIPE_ENDIAN_LITTLE; case PIPE_CAP_VENDOR_ID: - return 0x1002; + return ATI_VENDOR_ID; case PIPE_CAP_DEVICE_ID: return sscreen->b.info.pci_id; case PIPE_CAP_ACCELERATED: @@ -427,6 +429,14 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param) return sscreen->b.info.vram_size >> 20; case PIPE_CAP_UMA: return 0; + case PIPE_CAP_PCI_GROUP: + return sscreen->b.info.pci_domain; + case PIPE_CAP_PCI_BUS: + return sscreen->b.info.pci_bus; + case PIPE_CAP_PCI_DEVICE: + return sscreen->b.info.pci_dev; + case PIPE_CAP_PCI_FUNCTION: + return sscreen->b.info.pci_func; } return 0; } @@ -611,6 +621,8 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws) sscreen->b.b.is_format_supported = si_is_format_supported; sscreen->b.b.resource_create = r600_resource_create_common; + si_init_screen_state_functions(sscreen); + if (!r600_common_screen_init(&sscreen->b, ws) || !si_init_gs_info(sscreen) || !si_init_shader_cache(sscreen)) { diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h index ef860a58b83..0fef5f72098 100644 --- a/src/gallium/drivers/radeonsi/si_pipe.h +++ b/src/gallium/drivers/radeonsi/si_pipe.h @@ -120,8 +120,6 @@ struct si_blend_color { struct si_sampler_view { struct pipe_sampler_view base; struct list_head list; - struct r600_resource *resource; - struct r600_resource *dcc_buffer; /* [0..7] = image descriptor * [4..7] = buffer descriptor */ uint32_t state[8]; @@ -197,6 +195,7 @@ struct si_context { void *custom_blend_resolve; void *custom_blend_decompress; void *custom_blend_fastclear; + void *custom_blend_dcc_decompress; void *pstipple_sampler_state; struct si_screen *screen; struct pipe_fence_handle *last_gfx_fence; @@ -334,10 +333,7 @@ void cik_sdma_copy(struct pipe_context *ctx, /* si_blit.c */ void si_init_blit_functions(struct si_context *sctx); -void si_flush_depth_textures(struct si_context *sctx, - struct
si_textures_info *textures); -void si_decompress_color_textures(struct si_context *sctx, - struct si_textures_info *textures); +void si_decompress_textures(struct si_context *sctx); void si_resource_copy_region(struct pipe_context *ctx, struct pipe_resource *dst, unsigned dst_level, diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c index b23b17ad77b..f823af188c7 100644 --- a/src/gallium/drivers/radeonsi/si_state.c +++ b/src/gallium/drivers/radeonsi/si_state.c @@ -34,6 +34,7 @@ #include "util/u_format_s3tc.h" #include "util/u_memory.h" #include "util/u_pstipple.h" +#include "util/u_resource.h" /* Initialize an external atom (owned by ../radeon). */ static void @@ -2250,11 +2251,7 @@ static void si_initialize_color_surface(struct si_context *sctx, } assert(format != V_028C70_COLOR_INVALID); swap = r600_translate_colorswap(surf->base.format); - if (rtex->resource.b.b.usage == PIPE_USAGE_STAGING) { - endian = V_028C70_ENDIAN_NONE; - } else { - endian = si_colorformat_endian_swap(format); - } + endian = si_colorformat_endian_swap(format); /* blend clamp should be set for all NORM/SRGB types */ if (ntype == V_028C70_NUMBER_UNORM || @@ -2322,9 +2319,8 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_color_info = color_info; surf->cb_color_attrib = color_attrib; - if (sctx->b.chip_class >= VI && rtex->dcc_buffer) { + if (sctx->b.chip_class >= VI && rtex->dcc_offset) { unsigned max_uncompressed_block_size = 2; - uint64_t dcc_offset = rtex->surface.level[level].dcc_offset; if (rtex->surface.nsamples > 1) { if (rtex->surface.bpe == 1) @@ -2335,7 +2331,9 @@ static void si_initialize_color_surface(struct si_context *sctx, surf->cb_dcc_control = S_028C78_MAX_UNCOMPRESSED_BLOCK_SIZE(max_uncompressed_block_size) | S_028C78_INDEPENDENT_64B_BLOCKS(1); - surf->cb_dcc_base = (rtex->dcc_buffer->gpu_address + dcc_offset) >> 8; + surf->cb_dcc_base = (rtex->resource.gpu_address + + rtex->dcc_offset + + rtex->surface.level[level].dcc_offset) >> 8; } if (rtex->fmask.size) { @@ -2674,12 +2672,6 @@ static void si_emit_framebuffer_state(struct si_context *sctx, struct r600_atom RADEON_PRIO_CMASK); } - if (tex->dcc_buffer && tex->dcc_buffer != &tex->resource) { - radeon_add_to_buffer_list(&sctx->b, &sctx->b.gfx, - tex->dcc_buffer, RADEON_USAGE_READWRITE, - RADEON_PRIO_DCC); - } - radeon_set_context_reg_seq(cs, R_028C60_CB_COLOR0_BASE + i * 0x3C, sctx->b.chip_class >= VI ? 14 : 13); radeon_emit(cs, cb->cb_color_base); /* R_028C60_CB_COLOR0_BASE */ @@ -2802,105 +2794,73 @@ static void si_set_min_samples(struct pipe_context *ctx, unsigned min_samples) */ /** - * Create a sampler view. - * - * @param ctx context - * @param texture texture - * @param state sampler view template - * @param width0 width0 override (for compressed textures as int) - * @param height0 height0 override (for compressed textures as int) - * @param force_level set the base address to the level (for compressed textures) + * Build the sampler view descriptor for a buffer texture. 
+ * @param state 256-bit descriptor; only the high 128 bits are filled in */ -struct pipe_sampler_view * -si_create_sampler_view_custom(struct pipe_context *ctx, - struct pipe_resource *texture, - const struct pipe_sampler_view *state, - unsigned width0, unsigned height0, - unsigned force_level) +static void +si_make_buffer_descriptor(struct si_screen *screen, struct r600_resource *buf, + enum pipe_format format, + unsigned first_element, unsigned last_element, + uint32_t *state) { - struct si_context *sctx = (struct si_context*)ctx; - struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); - struct r600_texture *tmp = (struct r600_texture*)texture; const struct util_format_description *desc; - unsigned format, num_format, base_level, first_level, last_level; - uint32_t pitch = 0; - unsigned char state_swizzle[4], swizzle[4]; - unsigned height, depth, width; - enum pipe_format pipe_format = state->format; - struct radeon_surf_level *surflevel; int first_non_void; uint64_t va; - unsigned last_layer = state->u.tex.last_layer; + unsigned stride; + unsigned num_records; + unsigned num_format, data_format; - if (!view) - return NULL; - - /* initialize base object */ - view->base = *state; - view->base.texture = NULL; - view->base.reference.count = 1; - view->base.context = ctx; - - /* NULL resource, obey swizzle (only ZERO and ONE make sense). */ - if (!texture) { - view->state[3] = S_008F1C_DST_SEL_X(si_map_swizzle(state->swizzle_r)) | - S_008F1C_DST_SEL_Y(si_map_swizzle(state->swizzle_g)) | - S_008F1C_DST_SEL_Z(si_map_swizzle(state->swizzle_b)) | - S_008F1C_DST_SEL_W(si_map_swizzle(state->swizzle_a)) | - S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D); - return &view->base; - } - - pipe_resource_reference(&view->base.texture, texture); - view->resource = &tmp->resource; - - if (state->format == PIPE_FORMAT_X24S8_UINT || - state->format == PIPE_FORMAT_S8X24_UINT || - state->format == PIPE_FORMAT_X32_S8X24_UINT || - state->format == PIPE_FORMAT_S8_UINT) - view->is_stencil_sampler = true; - - /* Buffer resource. 
*/ - if (texture->target == PIPE_BUFFER) { - unsigned stride, num_records; - - desc = util_format_description(state->format); - first_non_void = util_format_get_first_non_void_channel(state->format); - stride = desc->block.bits / 8; - va = tmp->resource.gpu_address + state->u.buf.first_element*stride; - format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void); - num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void); + desc = util_format_description(format); + first_non_void = util_format_get_first_non_void_channel(format); + stride = desc->block.bits / 8; + va = buf->gpu_address + first_element * stride; + num_format = si_translate_buffer_numformat(&screen->b.b, desc, first_non_void); + data_format = si_translate_buffer_dataformat(&screen->b.b, desc, first_non_void); - num_records = state->u.buf.last_element + 1 - state->u.buf.first_element; - num_records = MIN2(num_records, texture->width0 / stride); + num_records = last_element + 1 - first_element; + num_records = MIN2(num_records, buf->b.b.width0 / stride); - if (sctx->b.chip_class >= VI) - num_records *= stride; - - view->state[4] = va; - view->state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) | - S_008F04_STRIDE(stride); - view->state[6] = num_records; - view->state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | - S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | - S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | - S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) | - S_008F0C_NUM_FORMAT(num_format) | - S_008F0C_DATA_FORMAT(format); + if (screen->b.chip_class >= VI) + num_records *= stride; - LIST_ADDTAIL(&view->list, &sctx->b.texture_buffers); - return &view->base; - } - - state_swizzle[0] = state->swizzle_r; - state_swizzle[1] = state->swizzle_g; - state_swizzle[2] = state->swizzle_b; - state_swizzle[3] = state->swizzle_a; + state[4] = va; + state[5] = S_008F04_BASE_ADDRESS_HI(va >> 32) | + S_008F04_STRIDE(stride); + state[6] = num_records; + state[7] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) | + S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) | + S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) | + S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) | + S_008F0C_NUM_FORMAT(num_format) | + S_008F0C_DATA_FORMAT(data_format); +} - surflevel = tmp->surface.level; +/** + * Build the sampler view descriptor for a texture. + */ +static void +si_make_texture_descriptor(struct si_screen *screen, + struct r600_texture *tex, + enum pipe_texture_target target, + enum pipe_format pipe_format, + const unsigned char state_swizzle[4], + unsigned base_level, unsigned first_level, unsigned last_level, + unsigned first_layer, unsigned last_layer, + unsigned width, unsigned height, unsigned depth, + uint32_t *state, + uint32_t *fmask_state) +{ + struct pipe_resource *res = &tex->resource.b.b; + const struct radeon_surf_level *surflevel = tex->surface.level; + const struct util_format_description *desc; + unsigned char swizzle[4]; + int first_non_void; + unsigned num_format, data_format; + uint32_t pitch; + uint64_t va; /* Texturing with separate depth and stencil. 
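
One subtlety in si_make_buffer_descriptor above: NUM_RECORDS counts elements on SI/CIK but bytes from VI onward, hence the trailing multiply by the stride. A worked example of the arithmetic, assuming a hypothetical 4096-byte buffer of 16-byte elements viewed from element 4 through 63:

   stride      = 16;                            /* desc->block.bits / 8   */
   va          = buf_va + 4 * 16;               /* first_element * stride */
   num_records = 63 + 1 - 4;                    /* 60 elements            */
   num_records = MIN2(num_records, 4096 / 16);  /* clamp to buffer: 60    */
   if (chip_class >= VI)
      num_records *= 16;                        /* VI counts bytes: 960   */
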
*/ - if (tmp->is_depth && !tmp->is_flushing_texture) { + if (tex->is_depth && !tex->is_flushing_texture) { switch (pipe_format) { case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: pipe_format = PIPE_FORMAT_Z32_FLOAT; @@ -2914,7 +2874,7 @@ si_create_sampler_view_custom(struct pipe_context *ctx, case PIPE_FORMAT_S8X24_UINT: case PIPE_FORMAT_X32_S8X24_UINT: pipe_format = PIPE_FORMAT_S8_UINT; - surflevel = tmp->surface.stencil_level; + surflevel = tex->surface.stencil_level; break; default:; } @@ -3008,89 +2968,63 @@ si_create_sampler_view_custom(struct pipe_context *ctx, } } - format = si_translate_texformat(ctx->screen, pipe_format, desc, first_non_void); - if (format == ~0) { - format = 0; - } - - base_level = 0; - first_level = state->u.tex.first_level; - last_level = state->u.tex.last_level; - width = width0; - height = height0; - depth = texture->depth0; - - if (force_level) { - assert(force_level == first_level && - force_level == last_level); - base_level = force_level; - first_level = 0; - last_level = 0; - width = u_minify(width, force_level); - height = u_minify(height, force_level); - depth = u_minify(depth, force_level); + data_format = si_translate_texformat(&screen->b.b, pipe_format, desc, first_non_void); + if (data_format == ~0) { + data_format = 0; } - pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); - - if (texture->target == PIPE_TEXTURE_1D_ARRAY) { + if (res->target == PIPE_TEXTURE_1D_ARRAY) { height = 1; - depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_2D_ARRAY) { - depth = texture->array_size; - } else if (texture->target == PIPE_TEXTURE_CUBE_ARRAY) - depth = texture->array_size / 6; + depth = res->array_size; + } else if (res->target == PIPE_TEXTURE_2D_ARRAY) { + depth = res->array_size; + } else if (res->target == PIPE_TEXTURE_CUBE_ARRAY) + depth = res->array_size / 6; - /* This is not needed if state trackers set last_layer correctly. */ - if (state->target == PIPE_TEXTURE_1D || - state->target == PIPE_TEXTURE_2D || - state->target == PIPE_TEXTURE_RECT || - state->target == PIPE_TEXTURE_CUBE) - last_layer = state->u.tex.first_layer; - - va = tmp->resource.gpu_address + surflevel[base_level].offset; - - view->state[0] = va >> 8; - view->state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(format) | - S_008F14_NUM_FORMAT(num_format)); - view->state[2] = (S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1)); - view->state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | - S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | - S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | - S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | - S_008F1C_BASE_LEVEL(texture->nr_samples > 1 ? - 0 : first_level) | - S_008F1C_LAST_LEVEL(texture->nr_samples > 1 ? 
- util_logbase2(texture->nr_samples) : - last_level) | - S_008F1C_TILING_INDEX(si_tile_mode_index(tmp, base_level, false)) | - S_008F1C_POW2_PAD(texture->last_level > 0) | - S_008F1C_TYPE(si_tex_dim(texture->target, state->target, - texture->nr_samples))); - view->state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); - view->state[5] = (S_008F24_BASE_ARRAY(state->u.tex.first_layer) | - S_008F24_LAST_ARRAY(last_layer)); - - if (tmp->dcc_buffer) { - uint64_t dcc_offset = surflevel[base_level].dcc_offset; + pitch = surflevel[base_level].nblk_x * util_format_get_blockwidth(pipe_format); + va = tex->resource.gpu_address + surflevel[base_level].offset; + + state[0] = va >> 8; + state[1] = (S_008F14_BASE_ADDRESS_HI(va >> 40) | + S_008F14_DATA_FORMAT(data_format) | + S_008F14_NUM_FORMAT(num_format)); + state[2] = (S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1)); + state[3] = (S_008F1C_DST_SEL_X(si_map_swizzle(swizzle[0])) | + S_008F1C_DST_SEL_Y(si_map_swizzle(swizzle[1])) | + S_008F1C_DST_SEL_Z(si_map_swizzle(swizzle[2])) | + S_008F1C_DST_SEL_W(si_map_swizzle(swizzle[3])) | + S_008F1C_BASE_LEVEL(res->nr_samples > 1 ? + 0 : first_level) | + S_008F1C_LAST_LEVEL(res->nr_samples > 1 ? + util_logbase2(res->nr_samples) : + last_level) | + S_008F1C_TILING_INDEX(si_tile_mode_index(tex, base_level, false)) | + S_008F1C_POW2_PAD(res->last_level > 0) | + S_008F1C_TYPE(si_tex_dim(res->target, target, res->nr_samples))); + state[4] = (S_008F20_DEPTH(depth - 1) | S_008F20_PITCH(pitch - 1)); + state[5] = (S_008F24_BASE_ARRAY(first_layer) | + S_008F24_LAST_ARRAY(last_layer)); + + if (tex->dcc_offset) { unsigned swap = r600_translate_colorswap(pipe_format); - view->state[6] = S_008F28_COMPRESSION_EN(1) | S_008F28_ALPHA_IS_ON_MSB(swap <= 1); - view->state[7] = (tmp->dcc_buffer->gpu_address + dcc_offset) >> 8; - view->dcc_buffer = tmp->dcc_buffer; + state[6] = S_008F28_COMPRESSION_EN(1) | S_008F28_ALPHA_IS_ON_MSB(swap <= 1); + state[7] = (tex->resource.gpu_address + + tex->dcc_offset + + surflevel[base_level].dcc_offset) >> 8; } else { - view->state[6] = 0; - view->state[7] = 0; + state[6] = 0; + state[7] = 0; } /* Initialize the sampler view for FMASK. 
*/ - if (tmp->fmask.size) { - uint64_t va = tmp->resource.gpu_address + tmp->fmask.offset; + if (tex->fmask.size) { uint32_t fmask_format; - switch (texture->nr_samples) { + va = tex->resource.gpu_address + tex->fmask.offset; + + switch (res->nr_samples) { case 2: fmask_format = V_008F14_IMG_DATA_FORMAT_FMASK8_S2_F2; break; @@ -3105,27 +3039,129 @@ si_create_sampler_view_custom(struct pipe_context *ctx, fmask_format = V_008F14_IMG_DATA_FORMAT_INVALID; } - view->fmask_state[0] = va >> 8; - view->fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | - S_008F14_DATA_FORMAT(fmask_format) | - S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_UINT); - view->fmask_state[2] = S_008F18_WIDTH(width - 1) | - S_008F18_HEIGHT(height - 1); - view->fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | - S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | - S_008F1C_TILING_INDEX(tmp->fmask.tile_mode_index) | - S_008F1C_TYPE(si_tex_dim(texture->target, - state->target, 0)); - view->fmask_state[4] = S_008F20_DEPTH(depth - 1) | - S_008F20_PITCH(tmp->fmask.pitch_in_pixels - 1); - view->fmask_state[5] = S_008F24_BASE_ARRAY(state->u.tex.first_layer) | - S_008F24_LAST_ARRAY(last_layer); - view->fmask_state[6] = 0; - view->fmask_state[7] = 0; + fmask_state[0] = va >> 8; + fmask_state[1] = S_008F14_BASE_ADDRESS_HI(va >> 40) | + S_008F14_DATA_FORMAT(fmask_format) | + S_008F14_NUM_FORMAT(V_008F14_IMG_NUM_FORMAT_UINT); + fmask_state[2] = S_008F18_WIDTH(width - 1) | + S_008F18_HEIGHT(height - 1); + fmask_state[3] = S_008F1C_DST_SEL_X(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Y(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_Z(V_008F1C_SQ_SEL_X) | + S_008F1C_DST_SEL_W(V_008F1C_SQ_SEL_X) | + S_008F1C_TILING_INDEX(tex->fmask.tile_mode_index) | + S_008F1C_TYPE(si_tex_dim(res->target, target, 0)); + fmask_state[4] = S_008F20_DEPTH(depth - 1) | + S_008F20_PITCH(tex->fmask.pitch_in_pixels - 1); + fmask_state[5] = S_008F24_BASE_ARRAY(first_layer) | + S_008F24_LAST_ARRAY(last_layer); + fmask_state[6] = 0; + fmask_state[7] = 0; + } +} + +/** + * Create a sampler view. + * + * @param ctx context + * @param texture texture + * @param state sampler view template + * @param width0 width0 override (for compressed textures as int) + * @param height0 height0 override (for compressed textures as int) + * @param force_level set the base address to the level (for compressed textures) + */ +struct pipe_sampler_view * +si_create_sampler_view_custom(struct pipe_context *ctx, + struct pipe_resource *texture, + const struct pipe_sampler_view *state, + unsigned width0, unsigned height0, + unsigned force_level) +{ + struct si_context *sctx = (struct si_context*)ctx; + struct si_sampler_view *view = CALLOC_STRUCT(si_sampler_view); + struct r600_texture *tmp = (struct r600_texture*)texture; + unsigned base_level, first_level, last_level; + unsigned char state_swizzle[4]; + unsigned height, depth, width; + unsigned last_layer = state->u.tex.last_layer; + + if (!view) + return NULL; + + /* initialize base object */ + view->base = *state; + view->base.texture = NULL; + view->base.reference.count = 1; + view->base.context = ctx; + + /* NULL resource, obey swizzle (only ZERO and ONE make sense). 
*/ + if (!texture) { + view->state[3] = S_008F1C_DST_SEL_X(si_map_swizzle(state->swizzle_r)) | + S_008F1C_DST_SEL_Y(si_map_swizzle(state->swizzle_g)) | + S_008F1C_DST_SEL_Z(si_map_swizzle(state->swizzle_b)) | + S_008F1C_DST_SEL_W(si_map_swizzle(state->swizzle_a)) | + S_008F1C_TYPE(V_008F1C_SQ_RSRC_IMG_1D); + return &view->base; } + pipe_resource_reference(&view->base.texture, texture); + + if (state->format == PIPE_FORMAT_X24S8_UINT || + state->format == PIPE_FORMAT_S8X24_UINT || + state->format == PIPE_FORMAT_X32_S8X24_UINT || + state->format == PIPE_FORMAT_S8_UINT) + view->is_stencil_sampler = true; + + /* Buffer resource. */ + if (texture->target == PIPE_BUFFER) { + si_make_buffer_descriptor(sctx->screen, + (struct r600_resource *)texture, + state->format, + state->u.buf.first_element, + state->u.buf.last_element, + view->state); + + LIST_ADDTAIL(&view->list, &sctx->b.texture_buffers); + return &view->base; + } + + state_swizzle[0] = state->swizzle_r; + state_swizzle[1] = state->swizzle_g; + state_swizzle[2] = state->swizzle_b; + state_swizzle[3] = state->swizzle_a; + + base_level = 0; + first_level = state->u.tex.first_level; + last_level = state->u.tex.last_level; + width = width0; + height = height0; + depth = texture->depth0; + + if (force_level) { + assert(force_level == first_level && + force_level == last_level); + base_level = force_level; + first_level = 0; + last_level = 0; + width = u_minify(width, force_level); + height = u_minify(height, force_level); + depth = u_minify(depth, force_level); + } + + /* This is not needed if state trackers set last_layer correctly. */ + if (state->target == PIPE_TEXTURE_1D || + state->target == PIPE_TEXTURE_2D || + state->target == PIPE_TEXTURE_RECT || + state->target == PIPE_TEXTURE_CUBE) + last_layer = state->u.tex.first_layer; + + si_make_texture_descriptor(sctx->screen, tmp, state->target, + state->format, state_swizzle, + base_level, first_level, last_level, + state->u.tex.first_layer, last_layer, + width, height, depth, + view->state, view->fmask_state); + return &view->base; } @@ -3144,7 +3180,7 @@ static void si_sampler_view_destroy(struct pipe_context *ctx, { struct si_sampler_view *view = (struct si_sampler_view *)state; - if (view->resource && view->resource->b.b.target == PIPE_BUFFER) + if (state->texture && state->texture->target == PIPE_BUFFER) LIST_DELINIT(&view->list); pipe_resource_reference(&state->texture, NULL); @@ -3522,6 +3558,7 @@ void si_init_state_functions(struct si_context *sctx) sctx->custom_blend_resolve = si_create_blend_custom(sctx, V_028808_CB_RESOLVE); sctx->custom_blend_decompress = si_create_blend_custom(sctx, V_028808_CB_FMASK_DECOMPRESS); sctx->custom_blend_fastclear = si_create_blend_custom(sctx, V_028808_CB_ELIMINATE_FAST_CLEAR); + sctx->custom_blend_dcc_decompress = si_create_blend_custom(sctx, V_028808_CB_DCC_DECOMPRESS); sctx->b.b.set_clip_state = si_set_clip_state; sctx->b.b.set_scissor_states = si_set_scissor_states; @@ -3564,6 +3601,68 @@ void si_init_state_functions(struct si_context *sctx) si_init_config(sctx); } +static void si_query_opaque_metadata(struct r600_common_screen *rscreen, + struct r600_texture *rtex, + struct radeon_bo_metadata *md) +{ + struct si_screen *sscreen = (struct si_screen*)rscreen; + struct pipe_resource *res = &rtex->resource.b.b; + static const unsigned char swizzle[] = { + PIPE_SWIZZLE_RED, + PIPE_SWIZZLE_GREEN, + PIPE_SWIZZLE_BLUE, + PIPE_SWIZZLE_ALPHA + }; + uint32_t desc[8], i; + bool is_array = util_resource_is_array_texture(res); + + /* DRM 2.x.x doesn't support 
this. */ if (rscreen->info.drm_major != 3) return; + + assert(rtex->fmask.size == 0); + + /* Metadata image format version 1: + * [0] = 1 (metadata format identifier) + * [1] = (VENDOR_ID << 16) | PCI_ID + * [2:9] = image descriptor for the whole resource + * [2] is always 0, because the base address is cleared + * [9] is the DCC offset bits [39:8] from the beginning of + * the buffer + * [10:10+LAST_LEVEL] = mipmap level offset bits [39:8] for each level + */ + + md->metadata[0] = 1; /* metadata image format version 1 */ + + /* TILE_MODE_INDEX is ambiguous without a PCI ID. */ + md->metadata[1] = (ATI_VENDOR_ID << 16) | rscreen->info.pci_id; + + si_make_texture_descriptor(sscreen, rtex, res->target, res->format, + swizzle, 0, 0, res->last_level, 0, + is_array ? res->array_size - 1 : 0, + res->width0, res->height0, res->depth0, + desc, NULL); + + /* Clear the base address and set the relative DCC offset. */ + desc[0] = 0; + desc[1] &= C_008F14_BASE_ADDRESS_HI; + desc[7] = rtex->dcc_offset >> 8; + + /* Dwords [2:9] contain the image descriptor. */ + memcpy(&md->metadata[2], desc, sizeof(desc)); + + /* Dwords [10:..] contain the mipmap level offsets. */ + for (i = 0; i <= res->last_level; i++) + md->metadata[10+i] = rtex->surface.level[i].offset >> 8; + + md->size_metadata = (11 + res->last_level) * 4; +} + +void si_init_screen_state_functions(struct si_screen *sscreen) +{ + sscreen->b.query_opaque_metadata = si_query_opaque_metadata; +} + static void si_write_harvested_raster_configs(struct si_context *sctx, struct si_pm4_state *pm4, diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h index 40792cbc1d5..60c34f19e55 100644 --- a/src/gallium/drivers/radeonsi/si_state.h +++ b/src/gallium/drivers/radeonsi/si_state.h @@ -249,6 +249,7 @@ void si_all_descriptors_begin_new_cs(struct si_context *sctx); void si_upload_const_buffer(struct si_context *sctx, struct r600_resource **rbuffer, const uint8_t *ptr, unsigned size, uint32_t *const_offset); void si_shader_change_notify(struct si_context *sctx); +void si_update_compressed_colortex_masks(struct si_context *sctx); void si_emit_shader_userdata(struct si_context *sctx, struct r600_atom *atom); /* si_state.c */ @@ -263,6 +264,7 @@ boolean si_is_format_supported(struct pipe_screen *screen, unsigned sample_count, unsigned usage); void si_init_state_functions(struct si_context *sctx); +void si_init_screen_state_functions(struct si_screen *sscreen); unsigned cik_bank_wh(unsigned bankwh); unsigned cik_db_pipe_config(struct si_screen *sscreen, unsigned tile_mode); unsigned cik_macro_tile_aspect(unsigned macro_tile_aspect); diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c index 91ccd073267..84b850a2992 100644 --- a/src/gallium/drivers/radeonsi/si_state_draw.c +++ b/src/gallium/drivers/radeonsi/si_state_draw.c @@ -33,21 +33,6 @@ #include "util/u_upload_mgr.h" #include "util/u_prim.h" -static void si_decompress_textures(struct si_context *sctx) -{ - if (!sctx->blitter->running) { - /* Flush depth textures which need to be flushed.
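
Reading the layout that si_query_opaque_metadata documents above is symmetric: every offset was stored shifted right by 8, so an importer shifts left by 8. A sketch of the consumer side, under the assumption that the blob really is format version 1 from a matching device (the level_offset array and last_level variable are illustrative):

   if (md->size_metadata >= 11 * 4 &&
       md->metadata[0] == 1 &&
       (md->metadata[1] >> 16) == ATI_VENDOR_ID) {
      /* metadata[2..9] hold the image descriptor; the base address is 0,
       * and dword [9] (= desc[7]) is the relative DCC offset.
       */
      uint64_t dcc_offset = (uint64_t)md->metadata[9] << 8;

      /* metadata[10..10+last_level] hold the per-level offsets. */
      for (unsigned i = 0; i <= last_level; i++)
         level_offset[i] = (uint64_t)md->metadata[10 + i] << 8;
   }
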
*/ - for (int i = 0; i < SI_NUM_SHADERS; i++) { - if (sctx->samplers[i].depth_texture_mask) { - si_flush_depth_textures(sctx, &sctx->samplers[i]); - } - if (sctx->samplers[i].compressed_colortex_mask) { - si_decompress_color_textures(sctx, &sctx->samplers[i]); - } - } - } -} - static unsigned si_conv_pipe_prim(unsigned mode) { static const unsigned prim_conv[] = { @@ -763,7 +748,7 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) struct si_context *sctx = (struct si_context *)ctx; struct si_state_rasterizer *rs = sctx->queued.named.rasterizer; struct pipe_index_buffer ib = {}; - unsigned mask; + unsigned mask, dirty_fb_counter; if (!info->count && !info->indirect && (info->indexed || !info->count_from_stream_output)) @@ -782,6 +767,16 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info) return; } + /* Re-emit the framebuffer state if needed. */ + dirty_fb_counter = p_atomic_read(&sctx->b.screen->dirty_fb_counter); + if (dirty_fb_counter != sctx->b.last_dirty_fb_counter) { + sctx->b.last_dirty_fb_counter = dirty_fb_counter; + sctx->framebuffer.dirty_cbufs |= + ((1 << sctx->framebuffer.state.nr_cbufs) - 1); + sctx->framebuffer.dirty_zsbuf = true; + si_mark_atom_dirty(sctx, &sctx->framebuffer.atom); + } + si_decompress_textures(sctx); /* Set the rasterization primitive type. diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c index 321b87d80a6..5fe1f7960f3 100644 --- a/src/gallium/drivers/radeonsi/si_state_shaders.c +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c @@ -1154,6 +1154,9 @@ static void *si_create_shader_selector(struct pipe_context *ctx, break; } + if (sel->info.properties[TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL]) + sel->db_shader_control |= S_02880C_DEPTH_BEFORE_SHADER(1); + /* Compile the main shader part for use with a prolog and/or epilog. 
*/ if (sel->type != PIPE_SHADER_GEOMETRY && !sscreen->use_monolithic_shaders) { diff --git a/src/gallium/drivers/rbug/rbug_screen.c b/src/gallium/drivers/rbug/rbug_screen.c index ac764029a2f..c2950e4a703 100644 --- a/src/gallium/drivers/rbug/rbug_screen.c +++ b/src/gallium/drivers/rbug/rbug_screen.c @@ -160,13 +160,14 @@ rbug_screen_resource_create(struct pipe_screen *_screen, static struct pipe_resource * rbug_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct rbug_screen *rb_screen = rbug_screen(_screen); struct pipe_screen *screen = rb_screen->screen; struct pipe_resource *result; - result = screen->resource_from_handle(screen, templ, handle); + result = screen->resource_from_handle(screen, templ, handle, usage); result = rbug_resource_create(rbug_screen(_screen), result); @@ -176,14 +177,15 @@ rbug_screen_resource_from_handle(struct pipe_screen *_screen, static boolean rbug_screen_resource_get_handle(struct pipe_screen *_screen, struct pipe_resource *_resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct rbug_screen *rb_screen = rbug_screen(_screen); struct rbug_resource *rb_resource = rbug_resource(_resource); struct pipe_screen *screen = rb_screen->screen; struct pipe_resource *resource = rb_resource->resource; - return screen->resource_get_handle(screen, resource, handle); + return screen->resource_get_handle(screen, resource, handle, usage); } diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c index 097ffe6f920..bfd3598fc57 100644 --- a/src/gallium/drivers/softpipe/sp_screen.c +++ b/src/gallium/drivers/softpipe/sp_screen.c @@ -266,6 +266,10 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; } /* should only get here on unhandled cases */ diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c index e3e28a3ef32..5703ca2dedb 100644 --- a/src/gallium/drivers/softpipe/sp_tex_sample.c +++ b/src/gallium/drivers/softpipe/sp_tex_sample.c @@ -1047,7 +1047,7 @@ img_filter_2d_linear_repeat_POT(const struct sp_sampler_view *sp_sview, } /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) { + for (c = 0; c < TGSI_NUM_CHANNELS; c++) { rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1063,7 +1063,7 @@ static inline void img_filter_2d_nearest_repeat_POT(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, const struct img_filter_args *args, - float rgba[TGSI_QUAD_SIZE]) + float *rgba) { const unsigned xpot = pot_level_size(sp_sview->xpot, args->level); const unsigned ypot = pot_level_size(sp_sview->ypot, args->level); @@ -1085,7 +1085,7 @@ img_filter_2d_nearest_repeat_POT(const struct sp_sampler_view *sp_sview, addr.bits.z = sp_sview->base.u.tex.first_layer; out = get_texel_2d_no_border(sp_sview, addr, x0, y0); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1098,7 +1098,7 @@ static inline void img_filter_2d_nearest_clamp_POT(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, const struct 
img_filter_args *args, - float rgba[TGSI_QUAD_SIZE]) + float *rgba) { const unsigned xpot = pot_level_size(sp_sview->xpot, args->level); const unsigned ypot = pot_level_size(sp_sview->ypot, args->level); @@ -1128,7 +1128,7 @@ img_filter_2d_nearest_clamp_POT(const struct sp_sampler_view *sp_sview, y0 = ypot - 1; out = get_texel_2d_no_border(sp_sview, addr, x0, y0); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1141,7 +1141,7 @@ static void img_filter_1d_nearest(const struct sp_sampler_view *sp_sview, const struct sp_sampler *sp_samp, const struct img_filter_args *args, - float rgba[TGSI_QUAD_SIZE]) + float *rgba) { const struct pipe_resource *texture = sp_sview->base.texture; const int width = u_minify(texture->width0, args->level); @@ -1159,7 +1159,7 @@ img_filter_1d_nearest(const struct sp_sampler_view *sp_sview, out = get_texel_1d_array(sp_sview, sp_samp, addr, x, sp_sview->base.u.tex.first_layer); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1191,7 +1191,7 @@ img_filter_1d_array_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_s(args->s, width, args->offset[0], &x); out = get_texel_1d_array(sp_sview, sp_samp, addr, x, layer); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1225,7 +1225,7 @@ img_filter_2d_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); out = get_texel_2d(sp_sview, sp_samp, addr, x, y); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1260,7 +1260,7 @@ img_filter_2d_array_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); out = get_texel_2d_array(sp_sview, sp_samp, addr, x, y, layer); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1304,7 +1304,7 @@ img_filter_cube_nearest(const struct sp_sampler_view *sp_sview, } out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1340,7 +1340,7 @@ img_filter_cube_array_nearest(const struct sp_sampler_view *sp_sview, sp_samp->nearest_texcoord_t(args->t, height, args->offset[1], &y); out = get_texel_cube_array(sp_sview, sp_samp, addr, x, y, layerface); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; if (DEBUG_TEX) { @@ -1375,7 +1375,7 @@ img_filter_3d_nearest(const struct sp_sampler_view *sp_sview, addr.bits.level = args->level; out = get_texel_3d(sp_sview, sp_samp, addr, x, y, z); - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = out[c]; } @@ -1407,7 +1407,7 @@ img_filter_1d_linear(const struct sp_sampler_view *sp_sview, sp_sview->base.u.tex.first_layer); /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]); } @@ -1439,7 +1439,7 @@ img_filter_1d_array_linear(const struct sp_sampler_view *sp_sview, tx1 = get_texel_1d_array(sp_sview, sp_samp, addr, x1, 
layer); /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp(xw, tx0[c], tx1[c]); } @@ -1541,13 +1541,13 @@ img_filter_2d_linear(const struct sp_sampler_view *sp_sview, tx[3] = get_texel_2d(sp_sview, sp_samp, addr, x1, y1); if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1587,13 +1587,13 @@ img_filter_2d_array_linear(const struct sp_sampler_view *sp_sview, tx[3] = get_texel_2d_array(sp_sview, sp_samp, addr, x1, y1, layer); if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1652,13 +1652,13 @@ img_filter_cube_linear(const struct sp_sampler_view *sp_sview, } if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1720,13 +1720,13 @@ img_filter_cube_array_linear(const struct sp_sampler_view *sp_sview, } if (args->gather_only) { - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = get_gather_value(sp_sview, c, args->gather_comp, tx); } else { /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_2d(xw, yw, tx[0][c], tx[1][c], tx[2][c], tx[3][c]); @@ -1771,7 +1771,7 @@ img_filter_3d_linear(const struct sp_sampler_view *sp_sview, tx13 = get_texel_3d(sp_sview, sp_samp, addr, x1, y1, z1); /* interpolate R, G, B, A */ - for (c = 0; c < TGSI_QUAD_SIZE; c++) + for (c = 0; c < TGSI_NUM_CHANNELS; c++) rgba[TGSI_NUM_CHANNELS*c] = lerp_3d(xw, yw, zw, tx00[c], tx01[c], tx02[c], tx03[c], @@ -2209,6 +2209,7 @@ img_filter_2d_ewa(const struct sp_sampler_view *sp_sview, const float t[TGSI_QUAD_SIZE], const float p[TGSI_QUAD_SIZE], const uint faces[TGSI_QUAD_SIZE], + const int8_t *offset, unsigned level, const float dudx, const float dvdx, const float dudy, const float dvdy, @@ -2268,6 +2269,8 @@ img_filter_2d_ewa(const struct sp_sampler_view *sp_sview, /* F *= formScale; */ /* no need to scale F as we don't use it below here */ args.level = level; + args.offset = offset; + for (j = 0; j < TGSI_QUAD_SIZE; j++) { /* Heckbert MS thesis, p. 
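
All of the linear paths above funnel into lerp_2d, which is plain bilinear blending: two lerps along x, then one lerp along y between the results. A self-contained equivalent of what those calls compute (argument naming may differ slightly from softpipe's helper):

   static float
   lerp(float x, float v0, float v1)
   {
      return v0 + x * (v1 - v0);
   }

   /* v00 and v10 differ by one step in x; v01 and v11 are one step down in y. */
   static float
   lerp_2d(float xw, float yw,
           float v00, float v10, float v01, float v11)
   {
      const float top = lerp(xw, v00, v10);
      const float bot = lerp(xw, v01, v11);
      return lerp(yw, top, bot);
   }
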
59; scan over the bounding box of the ellipse * and incrementally update the value of Ax^2+Bxy+Cy^2; when this @@ -2431,6 +2434,8 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview, const float dvdy = (t[QUAD_TOP_LEFT] - t[QUAD_BOTTOM_LEFT]) * t_to_v; struct img_filter_args args; + args.offset = filt_args->offset; + if (filt_args->control == TGSI_SAMPLER_LOD_BIAS || filt_args->control == TGSI_SAMPLER_LOD_NONE || /* XXX FIXME */ @@ -2495,6 +2500,11 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview, args.p = p[j]; args.level = psview->u.tex.last_level; args.face_id = filt_args->faces[j]; + /* + * XXX: we overwrote any linear filter with nearest, so this + * isn't right (albeit if last level is 1x1 and no border it + * will work just the same). + */ min_filter(sp_sview, sp_samp, &args, &rgba[0][j]); } } @@ -2503,8 +2513,8 @@ mip_filter_linear_aniso(const struct sp_sampler_view *sp_sview, * seem to be worth the extra running time. */ img_filter_2d_ewa(sp_sview, sp_samp, min_filter, mag_filter, - s, t, p, filt_args->faces, level0, - dudx, dvdx, dudy, dvdy, rgba); + s, t, p, filt_args->faces, filt_args->offset, + level0, dudx, dvdx, dudy, dvdy, rgba); } if (DEBUG_TEX) { diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c index 52df89504b8..52ec373f8f2 100644 --- a/src/gallium/drivers/softpipe/sp_texture.c +++ b/src/gallium/drivers/softpipe/sp_texture.c @@ -218,7 +218,8 @@ softpipe_resource_destroy(struct pipe_screen *pscreen, static struct pipe_resource * softpipe_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templat, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = softpipe_screen(screen)->winsys; struct softpipe_resource *spr = CALLOC_STRUCT(softpipe_resource); @@ -251,7 +252,8 @@ softpipe_resource_from_handle(struct pipe_screen *screen, static boolean softpipe_resource_get_handle(struct pipe_screen *screen, struct pipe_resource *pt, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct sw_winsys *winsys = softpipe_screen(screen)->winsys; struct softpipe_resource *spr = softpipe_resource(pt); diff --git a/src/gallium/drivers/svga/svga_context.c b/src/gallium/drivers/svga/svga_context.c index b10eb45e548..da4281490ae 100644 --- a/src/gallium/drivers/svga/svga_context.c +++ b/src/gallium/drivers/svga/svga_context.c @@ -31,6 +31,7 @@ #include "util/u_memory.h" #include "util/u_bitmask.h" #include "util/u_upload_mgr.h" +#include "os/os_time.h" #include "svga_context.h" #include "svga_screen.h" @@ -299,6 +300,7 @@ void svga_context_flush( struct svga_context *svga, { struct svga_screen *svgascreen = svga_screen(svga->pipe.screen); struct pipe_fence_handle *fence = NULL; + uint64_t t0; svga->curr.nr_fbs = 0; @@ -307,9 +309,14 @@ void svga_context_flush( struct svga_context *svga, */ svga_context_flush_buffers(svga); + svga->hud.command_buffer_size += + svga->swc->get_command_buffer_size(svga->swc); + /* Flush pending commands to hardware: */ + t0 = os_time_get(); svga->swc->flush(svga->swc, &fence); + svga->hud.flush_time += (os_time_get() - t0); svga->hud.num_flushes++; diff --git a/src/gallium/drivers/svga/svga_context.h b/src/gallium/drivers/svga/svga_context.h index f1a2041b6cf..1976f98e5c1 100644 --- a/src/gallium/drivers/svga/svga_context.h +++ b/src/gallium/drivers/svga/svga_context.h @@ -52,16 +52,19 @@ #define SVGA_QUERY_MAP_BUFFER_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 4) #define
SVGA_QUERY_NUM_RESOURCES_MAPPED (PIPE_QUERY_DRIVER_SPECIFIC + 5) #define SVGA_QUERY_NUM_BYTES_UPLOADED (PIPE_QUERY_DRIVER_SPECIFIC + 6) +#define SVGA_QUERY_COMMAND_BUFFER_SIZE (PIPE_QUERY_DRIVER_SPECIFIC + 7) +#define SVGA_QUERY_FLUSH_TIME (PIPE_QUERY_DRIVER_SPECIFIC + 8) +#define SVGA_QUERY_SURFACE_WRITE_FLUSHES (PIPE_QUERY_DRIVER_SPECIFIC + 9) /* running total counters */ -#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 7) -#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 8) -#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 9) -#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 10) -#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 11) -#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define SVGA_QUERY_MEMORY_USED (PIPE_QUERY_DRIVER_SPECIFIC + 10) +#define SVGA_QUERY_NUM_SHADERS (PIPE_QUERY_DRIVER_SPECIFIC + 11) +#define SVGA_QUERY_NUM_RESOURCES (PIPE_QUERY_DRIVER_SPECIFIC + 12) +#define SVGA_QUERY_NUM_STATE_OBJECTS (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define SVGA_QUERY_NUM_SURFACE_VIEWS (PIPE_QUERY_DRIVER_SPECIFIC + 14) +#define SVGA_QUERY_NUM_GENERATE_MIPMAP (PIPE_QUERY_DRIVER_SPECIFIC + 15) /*SVGA_QUERY_MAX has to be last because it is size of an array*/ -#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 13) +#define SVGA_QUERY_MAX (PIPE_QUERY_DRIVER_SPECIFIC + 16) /** * Maximum supported number of constant buffers per shader @@ -502,6 +505,9 @@ struct svga_context uint64_t num_validations; /**< SVGA_QUERY_NUM_VALIDATIONS */ uint64_t map_buffer_time; /**< SVGA_QUERY_MAP_BUFFER_TIME */ uint64_t num_resources_mapped; /**< SVGA_QUERY_NUM_RESOURCES_MAPPED */ + uint64_t command_buffer_size; /**< SVGA_QUERY_COMMAND_BUFFER_SIZE */ + uint64_t flush_time; /**< SVGA_QUERY_FLUSH_TIME */ + uint64_t surface_write_flushes; /**< SVGA_QUERY_SURFACE_WRITE_FLUSHES */ uint64_t num_shaders; /**< SVGA_QUERY_NUM_SHADERS */ uint64_t num_state_objects; /**< SVGA_QUERY_NUM_STATE_OBJECTS */ uint64_t num_surface_views; /**< SVGA_QUERY_NUM_SURFACE_VIEWS */ diff --git a/src/gallium/drivers/svga/svga_pipe_query.c b/src/gallium/drivers/svga/svga_pipe_query.c index 255494a5de7..845f4ef3a1c 100644 --- a/src/gallium/drivers/svga/svga_pipe_query.c +++ b/src/gallium/drivers/svga/svga_pipe_query.c @@ -723,15 +723,18 @@ svga_create_query(struct pipe_context *pipe, case SVGA_QUERY_NUM_DRAW_CALLS: case SVGA_QUERY_NUM_FALLBACKS: case SVGA_QUERY_NUM_FLUSHES: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + case SVGA_QUERY_NUM_BYTES_UPLOADED: + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + case SVGA_QUERY_FLUSH_TIME: + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: case SVGA_QUERY_NUM_STATE_OBJECTS: - case SVGA_QUERY_NUM_VALIDATIONS: - case SVGA_QUERY_MAP_BUFFER_TIME: case SVGA_QUERY_NUM_SURFACE_VIEWS: - case SVGA_QUERY_NUM_RESOURCES_MAPPED: - case SVGA_QUERY_NUM_BYTES_UPLOADED: case SVGA_QUERY_NUM_GENERATE_MIPMAP: break; default: @@ -792,15 +795,18 @@ svga_destroy_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_DRAW_CALLS: case SVGA_QUERY_NUM_FALLBACKS: case SVGA_QUERY_NUM_FLUSHES: + case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_NUM_RESOURCES_MAPPED: + case SVGA_QUERY_NUM_BYTES_UPLOADED: + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + case SVGA_QUERY_FLUSH_TIME: + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: case SVGA_QUERY_MEMORY_USED: 
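
The three new queries (command-buffer-size, flush-time, surface-write-flushes) follow the SVGA HUD convention for cumulative counters: the begin/end hooks in the hunks just below snapshot a running total, and the reported result is the growth over that interval rather than an absolute value. Schematically, using flush_time as the example:

   /* begin_query */  sq->begin_count = svga->hud.flush_time;
   /* ... frame ... */
   /* end_query   */  sq->end_count   = svga->hud.flush_time;
   /* get_result  */  vresult->u64    = sq->end_count - sq->begin_count;
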
case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: case SVGA_QUERY_NUM_STATE_OBJECTS: - case SVGA_QUERY_NUM_VALIDATIONS: - case SVGA_QUERY_MAP_BUFFER_TIME: case SVGA_QUERY_NUM_SURFACE_VIEWS: - case SVGA_QUERY_NUM_RESOURCES_MAPPED: - case SVGA_QUERY_NUM_BYTES_UPLOADED: case SVGA_QUERY_NUM_GENERATE_MIPMAP: /* nothing */ break; @@ -884,6 +890,15 @@ svga_begin_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_BYTES_UPLOADED: sq->begin_count = svga->hud.num_bytes_uploaded; break; + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + sq->begin_count = svga->hud.command_buffer_size; + break; + case SVGA_QUERY_FLUSH_TIME: + sq->begin_count = svga->hud.flush_time; + break; + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: + sq->begin_count = svga->hud.surface_write_flushes; + break; case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: @@ -978,6 +993,15 @@ svga_end_query(struct pipe_context *pipe, struct pipe_query *q) case SVGA_QUERY_NUM_BYTES_UPLOADED: sq->end_count = svga->hud.num_bytes_uploaded; break; + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + sq->end_count = svga->hud.command_buffer_size; + break; + case SVGA_QUERY_FLUSH_TIME: + sq->end_count = svga->hud.flush_time; + break; + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: + sq->end_count = svga->hud.surface_write_flushes; + break; case SVGA_QUERY_MEMORY_USED: case SVGA_QUERY_NUM_SHADERS: case SVGA_QUERY_NUM_RESOURCES: @@ -1073,9 +1097,12 @@ svga_get_query_result(struct pipe_context *pipe, case SVGA_QUERY_NUM_FALLBACKS: case SVGA_QUERY_NUM_FLUSHES: case SVGA_QUERY_NUM_VALIDATIONS: + case SVGA_QUERY_MAP_BUFFER_TIME: case SVGA_QUERY_NUM_RESOURCES_MAPPED: case SVGA_QUERY_NUM_BYTES_UPLOADED: - case SVGA_QUERY_MAP_BUFFER_TIME: + case SVGA_QUERY_COMMAND_BUFFER_SIZE: + case SVGA_QUERY_FLUSH_TIME: + case SVGA_QUERY_SURFACE_WRITE_FLUSHES: vresult->u64 = sq->end_count - sq->begin_count; break; /* These are running total counters */ diff --git a/src/gallium/drivers/svga/svga_resource.c b/src/gallium/drivers/svga/svga_resource.c index 1c3bcd67afa..264ac335405 100644 --- a/src/gallium/drivers/svga/svga_resource.c +++ b/src/gallium/drivers/svga/svga_resource.c @@ -47,7 +47,8 @@ svga_resource_create(struct pipe_screen *screen, static struct pipe_resource * svga_resource_from_handle(struct pipe_screen * screen, const struct pipe_resource *template, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { if (template->target == PIPE_BUFFER) return NULL; diff --git a/src/gallium/drivers/svga/svga_resource_texture.c b/src/gallium/drivers/svga/svga_resource_texture.c index 3f754c4d53e..1edb41dabee 100644 --- a/src/gallium/drivers/svga/svga_resource_texture.c +++ b/src/gallium/drivers/svga/svga_resource_texture.c @@ -463,8 +463,10 @@ svga_texture_transfer_map(struct pipe_context *pipe, assert(transfer->usage & PIPE_TRANSFER_WRITE); if ((transfer->usage & PIPE_TRANSFER_UNSYNCHRONIZED) == 0) { svga_surfaces_flush(svga); - if (!sws->surface_is_flushed(sws, surf)) + if (!sws->surface_is_flushed(sws, surf)) { + svga->hud.surface_write_flushes++; svga_context_flush(svga, NULL); + } } } } @@ -1038,7 +1040,12 @@ svga_texture_generate_mipmap(struct pipe_context *pipe, return FALSE; sv = svga_pipe_sampler_view(psv); - svga_validate_pipe_sampler_view(svga, sv); + ret = svga_validate_pipe_sampler_view(svga, sv); + if (ret != PIPE_OK) { + svga_context_flush(svga, NULL); + ret = svga_validate_pipe_sampler_view(svga, sv); + assert(ret == PIPE_OK); + } ret = SVGA3D_vgpu10_GenMips(svga->swc, sv->id, tex->handle); 
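
The generate_mipmap fix above applies the driver's standard command-buffer idiom: when an emit returns anything other than PIPE_OK the buffer is presumed full, so flush once and retry, asserting that the retry fits in an empty buffer. The shape of the pattern, with an illustrative emit function name:

   ret = emit_something(svga->swc /* , args */);
   if (ret != PIPE_OK) {
      svga_context_flush(svga, NULL);   /* drain the command buffer */
      ret = emit_something(svga->swc /* , args */);
      assert(ret == PIPE_OK);           /* must succeed when empty */
   }
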
if (ret != PIPE_OK) { diff --git a/src/gallium/drivers/svga/svga_screen.c b/src/gallium/drivers/svga/svga_screen.c index c9abd49ec1e..bcc512041f7 100644 --- a/src/gallium/drivers/svga/svga_screen.c +++ b/src/gallium/drivers/svga/svga_screen.c @@ -360,6 +360,10 @@ svga_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_STRING_MARKER: case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: return 64; @@ -827,6 +831,12 @@ svga_get_driver_query_info(struct pipe_screen *screen, PIPE_DRIVER_QUERY_TYPE_UINT64), QUERY("num-bytes-uploaded", SVGA_QUERY_NUM_BYTES_UPLOADED, PIPE_DRIVER_QUERY_TYPE_BYTES), + QUERY("command-buffer-size", SVGA_QUERY_COMMAND_BUFFER_SIZE, + PIPE_DRIVER_QUERY_TYPE_BYTES), + QUERY("flush-time", SVGA_QUERY_FLUSH_TIME, + PIPE_DRIVER_QUERY_TYPE_MICROSECONDS), + QUERY("surface-write-flushes", SVGA_QUERY_SURFACE_WRITE_FLUSHES, + PIPE_DRIVER_QUERY_TYPE_UINT64), /* running total counters */ QUERY("memory-used", SVGA_QUERY_MEMORY_USED, diff --git a/src/gallium/drivers/svga/svga_screen_cache.c b/src/gallium/drivers/svga/svga_screen_cache.c index 5b441295715..321c564e7f5 100644 --- a/src/gallium/drivers/svga/svga_screen_cache.c +++ b/src/gallium/drivers/svga/svga_screen_cache.c @@ -563,8 +563,14 @@ svga_screen_cache_dump(const struct svga_screen *svgascreen) struct svga_host_surface_cache_entry *entry = LIST_ENTRY(struct svga_host_surface_cache_entry, curr, bucket_head); - if (entry->key.format != 37) { - debug_printf(" %u x %u x %u format %u\n", + if (entry->key.format == SVGA3D_BUFFER) { + debug_printf(" %p: buffer %u bytes\n", + entry->handle, + entry->key.size.width); + } + else { + debug_printf(" %p: %u x %u x %u format %u\n", + entry->handle, entry->key.size.width, entry->key.size.height, entry->key.size.depth, diff --git a/src/gallium/drivers/svga/svga_state_sampler.c b/src/gallium/drivers/svga/svga_state_sampler.c index e7b540cc707..6179a797fa2 100644 --- a/src/gallium/drivers/svga/svga_state_sampler.c +++ b/src/gallium/drivers/svga/svga_state_sampler.c @@ -103,8 +103,17 @@ svga_validate_pipe_sampler_view(struct svga_context *svga, SVGA3dSurfaceFormat format; SVGA3dResourceType resourceDim; SVGA3dShaderResourceViewDesc viewDesc; + enum pipe_format pformat = sv->base.format; - format = svga_translate_format(ss, sv->base.format, + /* vgpu10 cannot create a BGRX view for a BGRA resource, so force it to + * create a BGRA view. + */ + if (pformat == PIPE_FORMAT_B8G8R8X8_UNORM && + sv->base.texture->format == PIPE_FORMAT_B8G8R8A8_UNORM) { + pformat = PIPE_FORMAT_B8G8R8A8_UNORM; + } + + format = svga_translate_format(ss, pformat, PIPE_BIND_SAMPLER_VIEW); assert(format != SVGA3D_FORMAT_INVALID); diff --git a/src/gallium/drivers/svga/svga_winsys.h b/src/gallium/drivers/svga/svga_winsys.h index 562c6690fc1..0ad6b5e6c76 100644 --- a/src/gallium/drivers/svga/svga_winsys.h +++ b/src/gallium/drivers/svga/svga_winsys.h @@ -108,6 +108,12 @@ struct svga_winsys_context uint32_t nr_bytes, uint32_t nr_relocs ); /** + * Returns current size of command buffer, in bytes. + */ + unsigned + (*get_command_buffer_size)(struct svga_winsys_context *swc); + + /** * Emit a relocation for a host surface. 
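* A relocation records that the command being built references the given
* surface, so the winsys can track that reference and synchronize access
* to the surface when the command buffer is flushed.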
* * @param flags bitmask of SVGA_RELOC_* flags diff --git a/src/gallium/drivers/swr/.clang-format b/src/gallium/drivers/swr/.clang-format new file mode 100644 index 00000000000..0ec65a5de88 --- /dev/null +++ b/src/gallium/drivers/swr/.clang-format @@ -0,0 +1,64 @@ +--- +Language: Cpp +AccessModifierOffset: -3 +AlignAfterOpenBracket: true +AlignEscapedNewlinesLeft: false +AlignOperands: false +AlignTrailingComments: false +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortBlocksOnASingleLine: false +AllowShortCaseLabelsOnASingleLine: false +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All +AlwaysBreakAfterDefinitionReturnType: true +AlwaysBreakTemplateDeclarations: false +AlwaysBreakBeforeMultilineStrings: false +BreakBeforeBinaryOperators: NonAssignment +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: true +BinPackParameters: false +BinPackArguments: false +ColumnLimit: 78 +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 3 +DerivePointerAlignment: false +ExperimentalAutoDetectBinPacking: false +IndentCaseLabels: false +IndentWrappedFunctionNames: false +IndentFunctionDeclarationAfterType: false +MaxEmptyLinesToKeep: 2 +KeepEmptyLinesAtTheStartOfBlocks: true +NamespaceIndentation: Inner +ObjCBlockIndentWidth: 3 +ObjCSpaceAfterProperty: true +ObjCSpaceBeforeProtocolList: true +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakString: 1000 +PenaltyBreakFirstLessLess: 120 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 0 +PointerAlignment: Right +SpacesBeforeTrailingComments: 1 +Cpp11BracedListStyle: true +Standard: Cpp11 +IndentWidth: 3 +TabWidth: 8 +UseTab: Never +BreakBeforeBraces: Linux +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpacesInAngles: false +SpaceInEmptyParentheses: false +SpacesInCStyleCastParentheses: false +SpaceAfterCStyleCast: false +SpacesInContainerLiterals: true +SpaceBeforeAssignmentOperators: true +ContinuationIndentWidth: 3 +CommentPragmas: '^ IWYU pragma:' +ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] +SpaceBeforeParens: ControlStatements +DisableFormat: false +... + diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am new file mode 100644 index 00000000000..f08806aaf77 --- /dev/null +++ b/src/gallium/drivers/swr/Makefile.am @@ -0,0 +1,31 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include Makefile.sources +include $(top_srcdir)/src/gallium/Automake.inc + +AM_CXXFLAGS = $(GALLIUM_DRIVER_CFLAGS) + +noinst_LTLIBRARIES = libmesaswr.la + +libmesaswr_la_SOURCES = $(LOADER_SOURCES) + +EXTRA_DIST = Makefile.sources-arch diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources new file mode 100644 index 00000000000..72247211184 --- /dev/null +++ b/src/gallium/drivers/swr/Makefile.sources @@ -0,0 +1,23 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +LOADER_SOURCES := \ + swr_loader.cpp diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch new file mode 100644 index 00000000000..6c105f46199 --- /dev/null +++ b/src/gallium/drivers/swr/Makefile.sources-arch @@ -0,0 +1,111 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. 
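+#
+# These source lists are shared by the per-architecture builds: the avx/
+# and avx2/ Makefile.am files below each include this file (as
+# ../Makefile.sources-arch) and compile the same sources with different
+# -march and KNOB_ARCH settings.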
+ +CXX_SOURCES := \ + swr_clear.cpp \ + swr_context.cpp \ + swr_context.h \ + swr_context_llvm.h \ + swr_draw.cpp \ + swr_public.h \ + swr_resource.h \ + swr_screen.cpp \ + swr_screen.h \ + swr_state.cpp \ + swr_state.h \ + swr_tex_sample.cpp \ + swr_tex_sample.h \ + swr_scratch.h \ + swr_scratch.cpp \ + swr_shader.cpp \ + swr_memory.h \ + swr_fence.h \ + swr_fence.cpp \ + swr_query.h \ + swr_query.cpp + +COMMON_CXX_SOURCES := \ + rasterizer/common/containers.hpp \ + rasterizer/common/formats.cpp \ + rasterizer/common/formats.h \ + rasterizer/common/isa.hpp \ + rasterizer/common/os.h \ + rasterizer/common/rdtsc_buckets.cpp \ + rasterizer/common/rdtsc_buckets.h \ + rasterizer/common/rdtsc_buckets_shared.h \ + rasterizer/common/simdintrin.h \ + rasterizer/common/swr_assert.cpp \ + rasterizer/common/swr_assert.h + +CORE_CXX_SOURCES := \ + rasterizer/core/api.cpp \ + rasterizer/core/api.h \ + rasterizer/core/arena.cpp \ + rasterizer/core/arena.h \ + rasterizer/core/backend.cpp \ + rasterizer/core/backend.h \ + rasterizer/core/blend.h \ + rasterizer/core/clip.cpp \ + rasterizer/core/clip.h \ + rasterizer/core/context.h \ + rasterizer/core/depthstencil.h \ + rasterizer/core/fifo.hpp \ + rasterizer/core/format_traits.h \ + rasterizer/core/format_types.h \ + rasterizer/core/frontend.cpp \ + rasterizer/core/frontend.h \ + rasterizer/core/knobs.h \ + rasterizer/core/knobs_init.h \ + rasterizer/core/multisample.cpp \ + rasterizer/core/multisample.h \ + rasterizer/core/pa_avx.cpp \ + rasterizer/core/pa.h \ + rasterizer/core/rasterizer.cpp \ + rasterizer/core/rasterizer.h \ + rasterizer/core/rdtsc_core.cpp \ + rasterizer/core/rdtsc_core.h \ + rasterizer/core/state.h \ + rasterizer/core/threads.cpp \ + rasterizer/core/threads.h \ + rasterizer/core/tilemgr.cpp \ + rasterizer/core/tilemgr.h \ + rasterizer/core/utils.cpp \ + rasterizer/core/utils.h + +JITTER_CXX_SOURCES := \ + rasterizer/jitter/blend_jit.cpp \ + rasterizer/jitter/blend_jit.h \ + rasterizer/jitter/builder.cpp \ + rasterizer/jitter/builder.h \ + rasterizer/jitter/builder_misc.cpp \ + rasterizer/jitter/builder_misc.h \ + rasterizer/jitter/fetch_jit.cpp \ + rasterizer/jitter/fetch_jit.h \ + rasterizer/jitter/JitManager.cpp \ + rasterizer/jitter/JitManager.h \ + rasterizer/jitter/streamout_jit.cpp \ + rasterizer/jitter/streamout_jit.h + +MEMORY_CXX_SOURCES := \ + rasterizer/memory/ClearTile.cpp \ + rasterizer/memory/LoadTile.cpp \ + rasterizer/memory/StoreTile.cpp diff --git a/src/gallium/drivers/swr/avx/Makefile.am b/src/gallium/drivers/swr/avx/Makefile.am new file mode 100644 index 00000000000..384f1a7eecf --- /dev/null +++ b/src/gallium/drivers/swr/avx/Makefile.am @@ -0,0 +1,99 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include ../Makefile.sources-arch +include $(top_srcdir)/src/gallium/Automake.inc + +VPATH = $(srcdir) $(srcdir)/.. + +AM_CXXFLAGS = \ + -march=core-avx-i \ + -DKNOB_ARCH=KNOB_ARCH_AVX \ + $(GALLIUM_DRIVER_CFLAGS) \ + $(LLVM_CFLAGS) \ + -I$(builddir)/rasterizer/scripts \ + -I$(builddir)/rasterizer/jitter \ + -I$(srcdir)/../rasterizer \ + -I$(srcdir)/../rasterizer/core \ + -I$(srcdir)/../rasterizer/jitter + +lib_LTLIBRARIES = libswrAVX.la + +BUILT_SOURCES = \ + rasterizer/scripts/gen_knobs.cpp \ + rasterizer/scripts/gen_knobs.h \ + rasterizer/jitter/state_llvm.h \ + rasterizer/jitter/builder_gen.h \ + rasterizer/jitter/builder_gen.cpp \ + rasterizer/jitter/builder_x86.h \ + rasterizer/jitter/builder_x86.cpp + +libswrAVX_la_SOURCES = \ + $(CXX_SOURCES) \ + $(COMMON_CXX_SOURCES) \ + $(CORE_CXX_SOURCES) \ + $(JITTER_CXX_SOURCES) \ + $(MEMORY_CXX_SOURCES) \ + $(BUILT_SOURCES) + +rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/scripts/gen_knobs.py \ + rasterizer/scripts + +rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_types.py \ + --input $(srcdir)/../rasterizer/core/state.h \ + --output rasterizer/jitter/state_llvm.h + +rasterizer/jitter/builder_gen.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.h \ + --gen_h + +rasterizer/jitter/builder_gen.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.cpp \ + --gen_cpp + +rasterizer/jitter/builder_x86.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.h \ + --gen_x86_h + +rasterizer/jitter/builder_x86.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.cpp \ + --gen_x86_cpp + + +libswrAVX_la_LIBADD = \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/mesa/libmesagallium.la + +include $(top_srcdir)/install-gallium-links.mk diff --git a/src/gallium/drivers/swr/avx2/Makefile.am b/src/gallium/drivers/swr/avx2/Makefile.am new file mode 100644 index 00000000000..a3968ecd95e --- /dev/null +++ b/src/gallium/drivers/swr/avx2/Makefile.am @@ -0,0 +1,99 @@ +# Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +include ../Makefile.sources-arch +include $(top_srcdir)/src/gallium/Automake.inc + +VPATH = $(srcdir) $(srcdir)/.. + +AM_CXXFLAGS = \ + -march=core-avx2 \ + -DKNOB_ARCH=KNOB_ARCH_AVX2 \ + $(GALLIUM_DRIVER_CFLAGS) \ + $(LLVM_CFLAGS) \ + -I$(builddir)/rasterizer/scripts \ + -I$(builddir)/rasterizer/jitter \ + -I$(srcdir)/../rasterizer \ + -I$(srcdir)/../rasterizer/core \ + -I$(srcdir)/../rasterizer/jitter + +lib_LTLIBRARIES = libswrAVX2.la + +BUILT_SOURCES = \ + rasterizer/scripts/gen_knobs.cpp \ + rasterizer/scripts/gen_knobs.h \ + rasterizer/jitter/state_llvm.h \ + rasterizer/jitter/builder_gen.h \ + rasterizer/jitter/builder_gen.cpp \ + rasterizer/jitter/builder_x86.h \ + rasterizer/jitter/builder_x86.cpp + +libswrAVX2_la_SOURCES = \ + $(CXX_SOURCES) \ + $(COMMON_CXX_SOURCES) \ + $(CORE_CXX_SOURCES) \ + $(JITTER_CXX_SOURCES) \ + $(MEMORY_CXX_SOURCES) \ + $(BUILT_SOURCES) + +rasterizer/scripts/gen_knobs.cpp rasterizer/scripts/gen_knobs.h: rasterizer/scripts/gen_knobs.py rasterizer/scripts/knob_defs.py rasterizer/scripts/templates/knobs.template + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/scripts/gen_knobs.py \ + rasterizer/scripts + +rasterizer/jitter/state_llvm.h: rasterizer/jitter/scripts/gen_llvm_types.py rasterizer/core/state.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_types.py \ + --input $(srcdir)/../rasterizer/core/state.h \ + --output rasterizer/jitter/state_llvm.h + +rasterizer/jitter/builder_gen.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.h \ + --gen_h + +rasterizer/jitter/builder_gen.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --input $(LLVM_INCLUDEDIR)/llvm/IR/IRBuilder.h \ + --output rasterizer/jitter/builder_gen.cpp \ + --gen_cpp + +rasterizer/jitter/builder_x86.h: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) $(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.h \ + --gen_x86_h + +rasterizer/jitter/builder_x86.cpp: rasterizer/jitter/scripts/gen_llvm_ir_macros.py + $(PYTHON2) 
$(PYTHON_FLAGS) \ + $(srcdir)/../rasterizer/jitter/scripts/gen_llvm_ir_macros.py \ + --output rasterizer/jitter/builder_x86.cpp \ + --gen_x86_cpp + + +libswrAVX2_la_LIBADD = \ + $(top_builddir)/src/gallium/auxiliary/libgallium.la \ + $(top_builddir)/src/mesa/libmesagallium.la + +include $(top_srcdir)/install-gallium-links.mk diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp new file mode 100644 index 00000000000..bc96c5f62fd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp @@ -0,0 +1,208 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+****************************************************************************/ + +#ifndef SWRLIB_CONTAINERS_HPP__ +#define SWRLIB_CONTAINERS_HPP__ + +#include <functional> +#include "common/os.h" + +namespace SWRL +{ + +template <typename T, int NUM_ELEMENTS> +struct UncheckedFixedVector +{ + UncheckedFixedVector() : mSize(0) + { + } + + UncheckedFixedVector(std::size_t size, T const& exemplar) + { + this->mSize = 0; + for (std::size_t i = 0; i < size; ++i) + this->push_back(exemplar); + } + + template <typename Iter> + UncheckedFixedVector(Iter fst, Iter lst) + { + this->mSize = 0; + for ( ; fst != lst; ++fst) + this->push_back(*fst); + } + + UncheckedFixedVector(UncheckedFixedVector const& UFV) + { + this->mSize = 0; + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + } + + UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) + { + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + return *this; + } + + T* begin() { return &this->mElements[0]; } + T* end() { return &this->mElements[0] + this->mSize; } + T const* begin() const { return &this->mElements[0]; } + T const* end() const { return &this->mElements[0] + this->mSize; } + + friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return false; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return false; + } + return true; + } + + friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return true; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return true; + } + return false; + } + + T& operator[](std::size_t idx) + { + return this->mElements[idx]; + } + T const& operator[](std::size_t idx) const + { + return this->mElements[idx]; + } + void push_back(T const& t) + { + this->mElements[this->mSize] = t; + ++this->mSize; + } + void pop_back() + { + SWR_ASSERT(this->mSize > 0); + --this->mSize; + } + T& back() + { + return this->mElements[this->mSize-1]; + } + T const& back() const + { + return this->mElements[this->mSize-1]; + } + bool empty() const + { + return this->mSize == 0; + } + std::size_t size() const + { + return this->mSize; + } + void resize(std::size_t sz) + { + this->mSize = sz; + } + void clear() + { + this->resize(0); + } +private: + std::size_t mSize; + T mElements[NUM_ELEMENTS]; +}; + +template <typename T, int NUM_ELEMENTS> +struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS> +{ + FixedStack() {} + + void push(T const& t) + { + this->push_back(t); + } + + void pop() + { + this->pop_back(); + } + + T& top() + { + return this->back(); + } + + T const& top() const + { + return this->back(); + } +}; + +template <typename T> +struct CRCHash +{ + static_assert((sizeof(T) % sizeof(UINT)) == 0, "CRCHash expects templated type size is even multiple of 4B"); + UINT operator()(const T& k) const + { + UINT *pData = (UINT*)&k; + UINT crc = 0; + for (UINT i = 0; i < sizeof(T) / sizeof(UINT); ++i) + { + crc = _mm_crc32_u32(crc, pData[i]); + } + return crc; + } +}; + +}// end SWRL + +namespace std +{ + +template <typename T, int N> +struct hash<SWRL::UncheckedFixedVector<T, N>> +{ + size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const + { + if (v.size() == 0) return 0; + std::hash<T> H; + size_t x = H(v[0]); + if (v.size() == 1) return x; + for (size_t i = 1; i < v.size(); ++i) + x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + 
(x>>2); + return x; + } +}; + + +}// end std. + +#endif//SWRLIB_CONTAINERS_HPP__ diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.cpp b/src/gallium/drivers/swr/rasterizer/common/formats.cpp new file mode 100644 index 00000000000..ed8ce7e5b0f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/formats.cpp @@ -0,0 +1,5469 @@ + +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file formats.cpp +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#include "formats.h" + +// lookup table for unorm8 srgb -> float conversion +const uint32_t srgb8Table[256] = { + 0x00000000, 0x399f22b4, 0x3a1f22b4, 0x3a6eb40f, 0x3a9f22b4, 0x3ac6eb61, 0x3aeeb40f, 0x3b0b3e5e, 0x3b1f22b4, 0x3b33070b, 0x3b46eb61, 0x3b5b518d, 0x3b70f18d, 0x3b83e1c6, 0x3b8fe616, 0x3b9c87fd, + 0x3ba9c9b5, 0x3bb7ad6f, 0x3bc63549, 0x3bd5635f, 0x3be539c1, 0x3bf5ba70, 0x3c0373b5, 0x3c0c6152, 0x3c15a703, 0x3c1f45be, 0x3c293e6b, 0x3c3391f7, 0x3c3e4149, 0x3c494d43, 0x3c54b6c7, 0x3c607eb1, + 0x3c6ca5dc, 0x3c792d22, 0x3c830aa8, 0x3c89af9f, 0x3c9085db, 0x3c978dc5, 0x3c9ec7c0, 0x3ca63431, 0x3cadd37d, 0x3cb5a601, 0x3cbdac20, 0x3cc5e639, 0x3cce54ab, 0x3cd6f7d3, 0x3cdfd00e, 0x3ce8ddb9, + 0x3cf22131, 0x3cfb9ac6, 0x3d02a56c, 0x3d0798df, 0x3d0ca7e7, 0x3d11d2b0, 0x3d171965, 0x3d1c7c31, 0x3d21fb3c, 0x3d2796b2, 0x3d2d4ebe, 0x3d332384, 0x3d39152e, 0x3d3f23e6, 0x3d454fd4, 0x3d4b991f, + 0x3d51ffef, 0x3d58846a, 0x3d5f26b7, 0x3d65e6fe, 0x3d6cc564, 0x3d73c20f, 0x3d7add25, 0x3d810b66, 0x3d84b795, 0x3d887330, 0x3d8c3e4a, 0x3d9018f6, 0x3d940345, 0x3d97fd4a, 0x3d9c0716, 0x3da020bb, + 0x3da44a4b, 0x3da883d7, 0x3daccd70, 0x3db12728, 0x3db59110, 0x3dba0b38, 0x3dbe95b5, 0x3dc33092, 0x3dc7dbe2, 0x3dcc97b6, 0x3dd1641f, 0x3dd6412c, 0x3ddb2eef, 0x3de02d77, 0x3de53cd5, 0x3dea5d19, + 0x3def8e55, 0x3df4d093, 0x3dfa23e8, 0x3dff8861, 0x3e027f07, 0x3e054282, 0x3e080ea5, 0x3e0ae379, 0x3e0dc107, 0x3e10a755, 0x3e13966c, 0x3e168e53, 0x3e198f11, 0x3e1c98ae, 0x3e1fab32, 0x3e22c6a3, + 0x3e25eb09, 0x3e29186c, 0x3e2c4ed2, 0x3e2f8e45, 0x3e32d6c8, 0x3e362865, 0x3e398322, 0x3e3ce706, 0x3e405419, 0x3e43ca62, 0x3e4749e8, 0x3e4ad2b1, 0x3e4e64c6, 0x3e52002b, 0x3e55a4e9, 0x3e595307, + 0x3e5d0a8b, 0x3e60cb7c, 0x3e6495e0, 0x3e6869bf, 0x3e6c4720, 0x3e702e08, 0x3e741e7f, 0x3e78188c, 0x3e7c1c38, 0x3e8014c2, 
0x3e82203c, 0x3e84308d, 0x3e8645ba, 0x3e885fc5, 0x3e8a7eb2, 0x3e8ca283, + 0x3e8ecb3d, 0x3e90f8e1, 0x3e932b74, 0x3e9562f8, 0x3e979f71, 0x3e99e0e2, 0x3e9c274e, 0x3e9e72b7, 0x3ea0c322, 0x3ea31892, 0x3ea57308, 0x3ea7d289, 0x3eaa3718, 0x3eaca0b7, 0x3eaf0f69, 0x3eb18333, + 0x3eb3fc16, 0x3eb67a15, 0x3eb8fd34, 0x3ebb8576, 0x3ebe12e1, 0x3ec0a571, 0x3ec33d2d, 0x3ec5da17, 0x3ec87c33, 0x3ecb2383, 0x3ecdd00b, 0x3ed081cd, 0x3ed338cc, 0x3ed5f50b, 0x3ed8b68d, 0x3edb7d54, + 0x3ede4965, 0x3ee11ac1, 0x3ee3f16b, 0x3ee6cd67, 0x3ee9aeb6, 0x3eec955d, 0x3eef815d, 0x3ef272ba, 0x3ef56976, 0x3ef86594, 0x3efb6717, 0x3efe6e02, 0x3f00bd2b, 0x3f02460c, 0x3f03d1a5, 0x3f055ff8, + 0x3f06f106, 0x3f0884cf, 0x3f0a1b57, 0x3f0bb49d, 0x3f0d50a2, 0x3f0eef69, 0x3f1090f2, 0x3f123540, 0x3f13dc53, 0x3f15862d, 0x3f1732cf, 0x3f18e23b, 0x3f1a9471, 0x3f1c4973, 0x3f1e0143, 0x3f1fbbe1, + 0x3f217950, 0x3f23398f, 0x3f24fca2, 0x3f26c288, 0x3f288b43, 0x3f2a56d5, 0x3f2c253f, 0x3f2df681, 0x3f2fca9e, 0x3f31a197, 0x3f337b6c, 0x3f355820, 0x3f3737b3, 0x3f391a26, 0x3f3aff7e, 0x3f3ce7b7, + 0x3f3ed2d4, 0x3f40c0d6, 0x3f42b1c0, 0x3f44a592, 0x3f469c4d, 0x3f4895f3, 0x3f4a9284, 0x3f4c9203, 0x3f4e9470, 0x3f5099cd, 0x3f52a21a, 0x3f54ad59, 0x3f56bb8c, 0x3f58ccb3, 0x3f5ae0cf, 0x3f5cf7e2, + 0x3f5f11ee, 0x3f612ef2, 0x3f634eef, 0x3f6571ec, 0x3f6797e1, 0x3f69c0d8, 0x3f6beccb, 0x3f6e1bc2, 0x3f704db6, 0x3f7282b1, 0x3f74baae, 0x3f76f5b3, 0x3f7933b9, 0x3f7b74cb, 0x3f7db8e0, 0x3f800000, +}; + +// order must match SWR_FORMAT +const SWR_FORMAT_INFO gFormatInfo[] = { + // R32G32B32A32_FLOAT (0x0) + { + "R32G32B32A32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_SINT (0x1) + { + "R32G32B32A32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_UINT (0x2) + { + "R32G32B32A32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32G32B32X32_FLOAT (0x6) + { + "R32G32B32X32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_SSCALED (0x7) + { + "R32G32B32A32_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32A32_USCALED (0x8) + { + "R32G32B32A32_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 32, 32, 32, 32 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xc (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xd (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xe (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x10 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x11 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x12 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x13 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x14 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x15 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, 
{ 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x16 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x17 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x18 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x19 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x20 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x21 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x22 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x23 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x24 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x25 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x26 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x27 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x28 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x29 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x2f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x30 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x31 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x32 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x33 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x34 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x35 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x36 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x37 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x38 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x39 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3c (Padding) + { + 
"UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x3f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32G32B32_FLOAT (0x40) + { + "R32G32B32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32_SINT (0x41) + { + "R32G32B32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32_UINT (0x42) + { + "R32G32B32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x43 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x44 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32G32B32_SSCALED (0x45) + { + "R32G32B32_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32B32_USCALED (0x46) + { + "R32G32B32_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 32, 32, 32, 0 }, // Bits per component + 96, // Bits per element + 12, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x47 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x48 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x49 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x4f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x50 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x51 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x52 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x53 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x54 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x55 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x56 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x57 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x58 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x59 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5a (Padding) + { + "UNKNOWN", + { 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x5f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x60 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x61 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x62 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x63 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x64 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x65 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x66 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 
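// Why the long runs of "UNKNOWN" rows: the table is laid out so it can be
// indexed directly by the hardware format enum value (note the hex indices
// in the comments), so every unassigned encoding still needs a placeholder
// row to keep the indices dense. A minimal accessor sketch; the names
// gFormatInfo and NUM_SWR_FORMATS are illustrative assumptions, not
// necessarily the driver's own (assert needs <cassert>):
inline const SWR_FORMAT_INFO &LookupFormatInfo(uint32_t format)
{
    assert(format < NUM_SWR_FORMATS); // padding rows make this a direct index
    return gFormatInfo[format];
}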
0x67 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x68 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x69 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x6f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x70 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x71 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x72 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x73 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 
0.0f, 0.0f }, + 1, 1, false }, + // 0x74 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x75 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x76 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x77 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x78 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x79 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x7f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R16G16B16A16_UNORM (0x80) + { + "R16G16B16A16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // 
Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_SNORM (0x81) + { + "R16G16B16A16_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_SINT (0x82) + { + "R16G16B16A16_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_UINT (0x83) + { + "R16G16B16A16_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_FLOAT (0x84) + { + "R16G16B16A16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_FLOAT (0x85) + { + "R32G32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
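// How the "To float scale factor" column is used for normalized types: the
// raw integer channel is multiplied by the per-channel scale. A minimal
// sketch for the 16-bit cases above (types from <cstdint>); SNORM
// additionally clamps, since -32768 * (1/32767) undershoots -1.0:
static inline float Unorm16ToFloat(uint16_t raw) { return raw * (1.0f / 65535.0f); }
static inline float Snorm16ToFloat(int16_t raw)
{
    float f = raw * (1.0f / 32767.0f);
    return (f < -1.0f) ? -1.0f : f; // clamp per the usual SNORM rule
}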
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_SINT (0x86) + { + "R32G32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_UINT (0x87) + { + "R32G32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32_FLOAT_X8X24_TYPELESS (0x88) + { + "R32_FLOAT_X8X24_TYPELESS", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // X32_TYPELESS_G8X24_UINT (0x89) + { + "X32_TYPELESS_G8X24_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // L32A32_FLOAT (0x8a) + { + "L32A32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
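// The "Defaults for missing components" column stores raw bit patterns, not
// typed values: 0x3f800000 is the IEEE-754 encoding of 1.0f (so a missing
// alpha reads as one), while the integer formats use 0x1. A sketch of
// reinterpreting such a default; memcpy is the portable pre-C++20 bit cast
// (needs <cstring>):
static inline float DefaultBitsAsFloat(uint32_t bits)
{
    float f;
    memcpy(&f, &bits, sizeof f); // 0x3f800000 -> 1.0f
    return f;
}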
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0x8b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x8c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x8d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R16G16B16X16_UNORM (0x8e) + { + "R16G16B16X16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16X16_FLOAT (0x8f) + { + "R16G16B16X16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x90 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // L32X32_FLOAT (0x91) + { + "L32X32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // I32X32_FLOAT (0x92) + { + "I32X32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
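// For the luminance/alpha formats above, the swizzle { 0, 3, 0, 0 } routes
// stored channel 0 into R and stored channel 1 into A, and isLuminance
// signals that R should be replicated across G and B further down the
// pipeline. An illustrative expansion, not the driver's actual fetch path:
static inline void ExpandLuminanceAlpha(float lum, float alpha, float rgba[4])
{
    rgba[0] = rgba[1] = rgba[2] = lum; // L replicated to R, G, B
    rgba[3] = alpha;
}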
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // R16G16B16A16_SSCALED (0x93) + { + "R16G16B16A16_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16A16_USCALED (0x94) + { + "R16G16B16A16_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 16, 16, 16, 16 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_SSCALED (0x95) + { + "R32G32_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32G32_USCALED (0x96) + { + "R32G32_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x97 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32_FLOAT_X8X24_TYPELESS_LD (0x98) + { + "R32_FLOAT_X8X24_TYPELESS_LD", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 32, 32, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
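// SSCALED/USCALED mark vertex formats whose integer bits are converted to
// float without normalization, which is why their scale factors are all
// 1.0f and "Is normalized?" stays false. In essence the conversion is just
// a cast (a simplification; very large 32-bit values lose float precision):
static inline float SscaledToFloat(int32_t v) { return (float)v; }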
+ { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x99 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9a (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9b (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9c (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9d (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x9f (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 
}, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa8 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xa9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xaa (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xab (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xac (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xad (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xae (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xaf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb8 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xb9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xba (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbb (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbc (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbd (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbe (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xbf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, 
SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // B8G8R8A8_UNORM (0xc0) + { + "B8G8R8A8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B8G8R8A8_UNORM_SRGB (0xc1) + { + "B8G8R8A8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_UNORM (0xc2) + { + "R10G10B10A2_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_UNORM_SRGB (0xc3) + { + "R10G10B10A2_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_UINT (0xc4) + { + "R10G10B10A2_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
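// Reading the swizzle column: swizzle[i] appears to be the RGBA slot that
// stored component i lands in — B8G8R8A8 stores bytes as B, G, R, A, hence
// { 2, 1, 0, 3 } (consistent with A32_FLOAT's { 3, 0, 0, 0 } and L16A16's
// { 0, 3, 0, 0 }). A sketch of applying it after per-component decode:
static inline void PlaceComponents(const float decoded[4], const uint32_t swizzle[4],
                                   uint32_t numComps, float rgba[4])
{
    for (uint32_t i = 0; i < numComps; ++i)
        rgba[swizzle[i]] = decoded[i]; // e.g. stored B lands in rgba[2]
}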
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xc5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xc6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8A8_UNORM (0xc7) + { + "R8G8B8A8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_UNORM_SRGB (0xc8) + { + "R8G8B8A8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_SNORM (0xc9) + { + "R8G8B8A8_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_SINT (0xca) + { + "R8G8B8A8_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8A8_UINT (0xcb) + { + "R8G8B8A8_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
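// isSRGB flags formats whose color channels need sRGB-to-linear decoding on
// top of the UNORM scale (alpha stays linear). The standard piecewise
// curve, shown for reference (needs <cmath>):
static inline float SrgbToLinear(float c)
{
    return (c <= 0.04045f) ? (c / 12.92f)
                           : powf((c + 0.055f) / 1.055f, 2.4f);
}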
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_UNORM (0xcc) + { + "R16G16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_SNORM (0xcd) + { + "R16G16_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 32767.0f, 1.0f / 32767.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_SINT (0xce) + { + "R16G16_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_UINT (0xcf) + { + "R16G16_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16_FLOAT (0xd0) + { + "R16G16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_UNORM (0xd1) + { + "B10G10R10A2_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
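// For packed layouts like R10G10B10A2, "Bits per component" gives field
// widths inside a single 32-bit element. An illustrative unpack, assuming
// the first-named component occupies the least-significant bits (the usual
// convention for these format names):
static inline uint32_t ExtractField(uint32_t elem, uint32_t shift, uint32_t bits)
{
    return (elem >> shift) & ((1u << bits) - 1u);
}
// r = ExtractField(e,  0, 10);  g = ExtractField(e, 10, 10);
// b = ExtractField(e, 20, 10);  a = ExtractField(e, 30,  2);
// Each field is then scaled by the matching factor: 1/1023 or 1/3.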
+ { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_UNORM_SRGB (0xd2) + { + "B10G10R10A2_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R11G11B10_FLOAT (0xd3) + { + "R11G11B10_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 11, 11, 10, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xd4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xd5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R32_SINT (0xd6) + { + "R32_SINT", + { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32_UINT (0xd7) + { + "R32_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R32_FLOAT (0xd8) + { + "R32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R24_UNORM_X8_TYPELESS (0xd9) + { + "R24_UNORM_X8_TYPELESS", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 24, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xda (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xdb (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R24_UNORM_X8_TYPELESS_LD (0xdc) + { + "R24_UNORM_X8_TYPELESS_LD", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 24, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // L32_UNORM (0xdd) + { + "L32_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 4294967295.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0xde (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // L16A16_UNORM (0xdf) + { + "L16A16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
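// R24_UNORM_X8_TYPELESS is the classic 24-bit depth layout: 24 normalized
// bits in a 32-bit element with 8 typeless padding bits on top, matching
// the single 1/16777215 scale factor above. A sketch of the decode:
static inline float DecodeDepth24(uint32_t elem)
{
    return (elem & 0x00FFFFFFu) * (1.0f / 16777215.0f);
}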
+ { 1.0f / 65535.0f, 1.0f / 65535.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // I24X8_UNORM (0xe0) + { + "I24X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 24, 8, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // L24X8_UNORM (0xe1) + { + "L24X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 24, 8, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 16777215.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0xe2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // I32_FLOAT (0xe3) + { + "I32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // L32_FLOAT (0xe4) + { + "L32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // A32_FLOAT (0xe5) + { + "A32_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 3, 0, 0, 0 }, // Swizzle + { 32, 0, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 1, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 0, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xe6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xe7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xe8 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // B8G8R8X8_UNORM (0xe9) + { + "B8G8R8X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B8G8R8X8_UNORM_SRGB (0xea) + { + "B8G8R8X8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8X8_UNORM (0xeb) + { + "R8G8B8X8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8X8_UNORM_SRGB (0xec) + { + "R8G8B8X8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R9G9B9E5_SHAREDEXP (0xed) + { + "R9G9B9E5_SHAREDEXP", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 9, 9, 9, 5 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10X2_UNORM (0xee) + { + "B10G10R10X2_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 1023.0f, 1.0f / 3.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0xef (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // L16A16_FLOAT (0xf0) + { + "L16A16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 3, 0, 0 }, // Swizzle + { 16, 16, 0, 0 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 2, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 0, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + true, // isLuminance + }, + // 0xf1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0xf2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R10G10B10X2_USCALED (0xf3) + { + "R10G10B10X2_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNUSED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
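// R9G9B9E5_SHAREDEXP packs three 9-bit mantissas with one shared 5-bit
// exponent; the table models the raw fields as UINT and leaves the float
// decode elsewhere. The standard decode (bias 15, 9 mantissa bits, as in
// GL_EXT_texture_shared_exponent), for reference (ldexpf needs <cmath>):
static inline void DecodeRgb9e5(uint32_t e, float rgb[3])
{
    const float scale = ldexpf(1.0f, (int)((e >> 27) & 0x1Fu) - 15 - 9);
    rgb[0] = ((e >>  0) & 0x1FFu) * scale;
    rgb[1] = ((e >>  9) & 0x1FFu) * scale;
    rgb[2] = ((e >> 18) & 0x1FFu) * scale;
}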
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8B8A8_SSCALED (0xf4)
+    {
+        "R8G8B8A8_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8B8A8_USCALED (0xf5)
+    {
+        "R8G8B8A8_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16G16_SSCALED (0xf6)
+    {
+        "R16G16_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 16, 16, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16G16_USCALED (0xf7)
+    {
+        "R16G16_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 16, 16, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R32_SSCALED (0xf8)
+    {
+        "R32_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 32, 0, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R32_USCALED (0xf9)
+    {
+        "R32_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 32, 0, 0, 0 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0xfa (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfb (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfc (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfd (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xfe (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0xff (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // B5G6R5_UNORM (0x100)
+    {
+        "B5G6R5_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 0 }, // Swizzle
+        { 5, 6, 5, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        3, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G6R5_UNORM_SRGB (0x101)
+    {
+        "B5G6R5_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 0 }, // Swizzle
+        { 5, 6, 5, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        3, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 63.0f, 1.0f / 31.0f, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G5R5A1_UNORM (0x102)
+    {
+        "B5G5R5A1_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G5R5A1_UNORM_SRGB (0x103)
+    {
+        "B5G5R5A1_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B4G4R4A4_UNORM (0x104)
+    {
+        "B4G4R4A4_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 4, 4, 4, 4 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B4G4R4A4_UNORM_SRGB (0x105)
+    {
+        "B4G4R4A4_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 4, 4, 4, 4 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f, 1.0f / 15.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_UNORM (0x106)
+    {
+        "R8G8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_SNORM (0x107)
+    {
+        "R8G8_SNORM",
+        { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 127.0f, 1.0f / 127.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_SINT (0x108)
+    {
+        "R8G8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_UINT (0x109)
+    {
+        "R8G8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_UNORM (0x10a)
+    {
+        "R16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_SNORM (0x10b)
+    {
+        "R16_SNORM",
+        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 32767.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_SINT (0x10c)
+    {
+        "R16_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_UINT (0x10d)
+    {
+        "R16_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_FLOAT (0x10e)
+    {
+        "R16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x10f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x110 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // I16_UNORM (0x111)
+    {
+        "I16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L16_UNORM (0x112)
+    {
+        "L16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // A16_UNORM (0x113)
+    {
+        "A16_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 3, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 65535.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // L8A8_UNORM (0x114)
+    {
+        "L8A8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // I16_FLOAT (0x115)
+    {
+        "I16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L16_FLOAT (0x116)
+    {
+        "L16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // A16_FLOAT (0x117)
+    {
+        "A16_FLOAT",
+        { SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 3, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // L8A8_UNORM_SRGB (0x118)
+    {
+        "L8A8_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 1.0f / 255.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x119 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // B5G5R5X1_UNORM (0x11a)
+    {
+        "B5G5R5X1_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // B5G5R5X1_UNORM_SRGB (0x11b)
+    {
+        "B5G5R5X1_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNUSED },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 2, 1, 0, 3 }, // Swizzle
+        { 5, 5, 5, 1 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        4, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 31.0f, 1.0f / 1.0f }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_SSCALED (0x11c)
+    {
+        "R8G8_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8G8_USCALED (0x11d)
+    {
+        "R8G8_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 1, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_SSCALED (0x11e)
+    {
+        "R16_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R16_USCALED (0x11f)
+    {
+        "R16_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 16, 0, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x120 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x121 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x122 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x123 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x124 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x125 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // L8A8_UINT (0x126)
+    {
+        "L8A8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L8A8_SINT (0x127)
+    {
+        "L8A8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 3, 0, 0 }, // Swizzle
+        { 8, 8, 0, 0 }, // Bits per component
+        16, // Bits per element
+        2, // Bytes per element
+        2, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x128 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x129 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x12f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x130 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x131 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x132 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x133 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x134 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x135 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x136 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x137 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x138 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x139 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x13f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // R8_UNORM (0x140)
+    {
+        "R8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_SNORM (0x141)
+    {
+        "R8_SNORM",
+        { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_SINT (0x142)
+    {
+        "R8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_UINT (0x143)
+    {
+        "R8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // A8_UNORM (0x144)
+    {
+        "A8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 3, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // I8_UNORM (0x145)
+    {
+        "I8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L8_UNORM (0x146)
+    {
+        "L8_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x147 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x148 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // R8_SSCALED (0x149)
+    {
+        "R8_SSCALED",
+        { SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // R8_USCALED (0x14a)
+    {
+        "R8_USCALED",
+        { SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x14b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // L8_UNORM_SRGB (0x14c)
+    {
+        "L8_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        true, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x14d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x14e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x14f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x150 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x151 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // L8_UINT (0x152)
+    {
+        "L8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // L8_SINT (0x153)
+    {
+        "L8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // I8_UINT (0x154)
+    {
+        "I8_UINT",
+        { SWR_TYPE_UINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // I8_SINT (0x155)
+    {
+        "I8_SINT",
+        { SWR_TYPE_SINT, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        8, // Bits per element
+        1, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        false, // isBC
+        false, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 0, 0, 0 }, // To float scale factor
+        1, // bcWidth
+        1, // bcHeight
+        true, // isLuminance
+    },
+    // 0x156 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x157 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x158 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x159 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x15f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x160 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x161 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x162 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x163 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x164 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x165 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x166 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x167 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x168 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x169 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x16f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x170 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x171 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x172 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x173 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x174 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x175 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x176 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x177 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x178 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x179 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17a (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17b (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17c (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17d (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17e (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x17f (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x180 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x181 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x182 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // YCRCB_SWAPUVY (0x183)
+    {
+        "YCRCB_SWAPUVY",
+        { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT },
+        { 0, 0, 0, 0x1 }, // Defaults for missing components
+        { 0, 1, 2, 3 }, // Swizzle
+        { 8, 8, 8, 8 }, // Bits per component
+        32, // Bits per element
+        4, // Bytes per element
+        4, // Num components
+        false, // isSRGB
+        false, // isBC
+        true, // isSubsampled
+        { false, false, false, false }, // Is normalized?
+        { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor
+        2, // bcWidth
+        1, // bcHeight
+        false, // isLuminance
+    },
+    // 0x184 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // 0x185 (Padding)
+    {
+        "UNKNOWN",
+        { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false,
+        { false, false, false, false },
+        { 0.0f, 0.0f, 0.0f, 0.0f },
+        1, 1, false },
+    // BC1_UNORM (0x186)
+    {
+        "BC1_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC2_UNORM (0x187)
+    {
+        "BC2_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC3_UNORM (0x188)
+    {
+        "BC3_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC4_UNORM (0x189)
+    {
+        "BC4_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC5_UNORM (0x18a)
+    {
+        "BC5_UNORM",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        false, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC1_UNORM_SRGB (0x18b)
+    {
+        "BC1_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        64, // Bits per element
+        8, // Bytes per element
+        1, // Num components
+        true, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+        { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor
+        4, // bcWidth
+        4, // bcHeight
+        false, // isLuminance
+    },
+    // BC2_UNORM_SRGB (0x18c)
+    {
+        "BC2_UNORM_SRGB",
+        { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN },
+        { 0, 0, 0, 0x3f800000 }, // Defaults for missing components
+        { 0, 0, 0, 0 }, // Swizzle
+        { 8, 0, 0, 0 }, // Bits per component
+        128, // Bits per element
+        16, // Bytes per element
+        1, // Num components
+        true, // isSRGB
+        true, // isBC
+        false, // isSubsampled
+        { true, true, true, true }, // Is normalized?
+ { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC3_UNORM_SRGB (0x18d) + { + "BC3_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // 0x18e (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // YCRCB_SWAPUV (0x18f) + { + "YCRCB_SWAPUV", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 8, 8, 8, 8 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + true, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 2, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x190 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x191 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x192 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8_UNORM (0x193) + { + "R8G8B8_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_SNORM (0x194) + { + "R8G8B8_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 127.0f, 1.0f / 127.0f, 1.0f / 127.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_SSCALED (0x195) + { + "R8G8B8_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_USCALED (0x196) + { + "R8G8B8_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x197 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x198 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // BC4_SNORM (0x199) + { + "BC4_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 64, // Bits per element + 8, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC5_SNORM (0x19a) + { + "BC5_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // R16G16B16_FLOAT (0x19b) + { + "R16G16B16_FLOAT", + { SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_FLOAT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_UNORM (0x19c) + { + "R16G16B16_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 65535.0f, 1.0f / 65535.0f, 1.0f / 65535.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_SNORM (0x19d) + { + "R16G16B16_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 32767.0f, 1.0f / 32767.0f, 1.0f / 32767.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_SSCALED (0x19e) + { + "R16G16B16_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_USCALED (0x19f) + { + "R16G16B16_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1a0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // BC6H_SF16 (0x1a1) + { + "BC6H_SF16", + { SWR_TYPE_SNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 127.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC7_UNORM (0x1a2) + { + "BC7_UNORM", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC7_UNORM_SRGB (0x1a3) + { + "BC7_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + true, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // BC6H_UF16 (0x1a4) + { + "BC6H_UF16", + { SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 0, 0, 0 }, // Swizzle + { 8, 0, 0, 0 }, // Bits per component + 128, // Bits per element + 16, // Bytes per element + 1, // Num components + false, // isSRGB + true, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 255.0f, 0, 0, 0 }, // To float scale factor + 4, // bcWidth + 4, // bcHeight + false, // isLuminance + }, + // 0x1a5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1a6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1a7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8_UNORM_SRGB (0x1a8) + { + "R8G8B8_UNORM_SRGB", + { SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNORM, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + true, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 255.0f, 1.0f / 255.0f, 1.0f / 255.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1a9 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1aa (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ab (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ac (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ad (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1ae (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1af (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R16G16B16_UINT (0x1b0) + { + "R16G16B16_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R16G16B16_SINT (0x1b1) + { + "R16G16B16_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 16, 16, 16, 0 }, // Bits per component + 48, // Bits per element + 6, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1b2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R10G10B10A2_SNORM (0x1b3) + { + "R10G10B10A2_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? + { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_USCALED (0x1b4) + { + "R10G10B10A2_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_SSCALED (0x1b5) + { + "R10G10B10A2_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R10G10B10A2_SINT (0x1b6) + { + "R10G10B10A2_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_SNORM (0x1b7) + { + "B10G10R10A2_SNORM", + { SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM, SWR_TYPE_SNORM }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { true, true, true, true }, // Is normalized? 
+ { 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 511.0f, 1.0f / 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_USCALED (0x1b8) + { + "B10G10R10A2_USCALED", + { SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED, SWR_TYPE_USCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_SSCALED (0x1b9) + { + "B10G10R10A2_SSCALED", + { SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED, SWR_TYPE_SSCALED }, + { 0, 0, 0, 0x3f800000 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_UINT (0x1ba) + { + "B10G10R10A2_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // B10G10R10A2_SINT (0x1bb) + { + "B10G10R10A2_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 2, 1, 0, 3 }, // Swizzle + { 10, 10, 10, 2 }, // Bits per component + 32, // Bits per element + 4, // Bytes per element + 4, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? 
+ { 1.0f, 1.0f, 1.0f, 1.0f }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // 0x1bc (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1bd (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1be (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1bf (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c0 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c1 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c2 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c3 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c4 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c5 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c6 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // 0x1c7 (Padding) + { + "UNKNOWN", + { SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, { 0, 0, 0, 0 }, 0, 0, 0, false, false, false, + { false, false, false, false }, + { 0.0f, 0.0f, 0.0f, 0.0f }, + 1, 1, false }, + // R8G8B8_UINT (0x1c8) + { + "R8G8B8_UINT", + { SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UINT, SWR_TYPE_UNKNOWN 
}, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, + // R8G8B8_SINT (0x1c9) + { + "R8G8B8_SINT", + { SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_SINT, SWR_TYPE_UNKNOWN }, + { 0, 0, 0, 0x1 }, // Defaults for missing components + { 0, 1, 2, 0 }, // Swizzle + { 8, 8, 8, 0 }, // Bits per component + 24, // Bits per element + 3, // Bytes per element + 3, // Num components + false, // isSRGB + false, // isBC + false, // isSubsampled + { false, false, false, false }, // Is normalized? + { 1.0f, 1.0f, 1.0f, 0 }, // To float scale factor + 1, // bcWidth + 1, // bcHeight + false, // isLuminance + }, +}; diff --git a/src/gallium/drivers/swr/rasterizer/common/formats.h b/src/gallium/drivers/swr/rasterizer/common/formats.h new file mode 100644 index 00000000000..b9dd53ebaa4 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/formats.h @@ -0,0 +1,251 @@ + +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
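// Usage sketch (illustrative only, not part of the generated table): a
// consumer looks entries up through GetFormatInfo(), declared further below
// in formats.h. 'width' here is a hypothetical mip-level width in pixels.
//
//   const SWR_FORMAT_INFO &info = GetFormatInfo(BC1_UNORM);
//   // BC formats advertise a 4x4 block size, so the pitch is computed in
//   // blocks of Bpp bytes, not in pixels:
//   uint32_t blocksWide = (width + info.bcWidth - 1) / info.bcWidth;
//   uint32_t pitchBytes = blocksWide * info.Bpp;   // 8 bytes per BC1 block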
+* +* @file formats.h +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#pragma once + +#include "common/os.h" + +////////////////////////////////////////////////////////////////////////// +/// SWR_TYPE - Format component type +////////////////////////////////////////////////////////////////////////// +enum SWR_TYPE +{ + SWR_TYPE_UNKNOWN, + SWR_TYPE_UNUSED, + SWR_TYPE_UNORM, + SWR_TYPE_SNORM, + SWR_TYPE_UINT, + SWR_TYPE_SINT, + SWR_TYPE_FLOAT, + SWR_TYPE_SSCALED, + SWR_TYPE_USCALED, +}; +////////////////////////////////////////////////////////////////////////// +/// SWR_FORMAT +////////////////////////////////////////////////////////////////////////// +enum SWR_FORMAT +{ + R32G32B32A32_FLOAT = 0x0, + R32G32B32A32_SINT = 0x1, + R32G32B32A32_UINT = 0x2, + R32G32B32X32_FLOAT = 0x6, + R32G32B32A32_SSCALED = 0x7, + R32G32B32A32_USCALED = 0x8, + R32G32B32_FLOAT = 0x40, + R32G32B32_SINT = 0x41, + R32G32B32_UINT = 0x42, + R32G32B32_SSCALED = 0x45, + R32G32B32_USCALED = 0x46, + R16G16B16A16_UNORM = 0x80, + R16G16B16A16_SNORM = 0x81, + R16G16B16A16_SINT = 0x82, + R16G16B16A16_UINT = 0x83, + R16G16B16A16_FLOAT = 0x84, + R32G32_FLOAT = 0x85, + R32G32_SINT = 0x86, + R32G32_UINT = 0x87, + R32_FLOAT_X8X24_TYPELESS = 0x88, + X32_TYPELESS_G8X24_UINT = 0x89, + L32A32_FLOAT = 0x8A, + R16G16B16X16_UNORM = 0x8E, + R16G16B16X16_FLOAT = 0x8F, + L32X32_FLOAT = 0x91, + I32X32_FLOAT = 0x92, + R16G16B16A16_SSCALED = 0x93, + R16G16B16A16_USCALED = 0x94, + R32G32_SSCALED = 0x95, + R32G32_USCALED = 0x96, + R32_FLOAT_X8X24_TYPELESS_LD = 0x98, + B8G8R8A8_UNORM = 0xC0, + B8G8R8A8_UNORM_SRGB = 0xC1, + R10G10B10A2_UNORM = 0xC2, + R10G10B10A2_UNORM_SRGB = 0xC3, + R10G10B10A2_UINT = 0xC4, + R8G8B8A8_UNORM = 0xC7, + R8G8B8A8_UNORM_SRGB = 0xC8, + R8G8B8A8_SNORM = 0xC9, + R8G8B8A8_SINT = 0xCA, + R8G8B8A8_UINT = 0xCB, + R16G16_UNORM = 0xCC, + R16G16_SNORM = 0xCD, + R16G16_SINT = 0xCE, + R16G16_UINT = 0xCF, + R16G16_FLOAT = 0xD0, + B10G10R10A2_UNORM = 0xD1, + B10G10R10A2_UNORM_SRGB = 0xD2, + R11G11B10_FLOAT = 0xD3, + R32_SINT = 0xD6, + R32_UINT = 0xD7, + R32_FLOAT = 0xD8, + R24_UNORM_X8_TYPELESS = 0xD9, + R24_UNORM_X8_TYPELESS_LD = 0xDC, + L32_UNORM = 0xDD, + L16A16_UNORM = 0xDF, + I24X8_UNORM = 0xE0, + L24X8_UNORM = 0xE1, + I32_FLOAT = 0xE3, + L32_FLOAT = 0xE4, + A32_FLOAT = 0xE5, + B8G8R8X8_UNORM = 0xE9, + B8G8R8X8_UNORM_SRGB = 0xEA, + R8G8B8X8_UNORM = 0xEB, + R8G8B8X8_UNORM_SRGB = 0xEC, + R9G9B9E5_SHAREDEXP = 0xED, + B10G10R10X2_UNORM = 0xEE, + L16A16_FLOAT = 0xF0, + R10G10B10X2_USCALED = 0xF3, + R8G8B8A8_SSCALED = 0xF4, + R8G8B8A8_USCALED = 0xF5, + R16G16_SSCALED = 0xF6, + R16G16_USCALED = 0xF7, + R32_SSCALED = 0xF8, + R32_USCALED = 0xF9, + B5G6R5_UNORM = 0x100, + B5G6R5_UNORM_SRGB = 0x101, + B5G5R5A1_UNORM = 0x102, + B5G5R5A1_UNORM_SRGB = 0x103, + B4G4R4A4_UNORM = 0x104, + B4G4R4A4_UNORM_SRGB = 0x105, + R8G8_UNORM = 0x106, + R8G8_SNORM = 0x107, + R8G8_SINT = 0x108, + R8G8_UINT = 0x109, + R16_UNORM = 0x10A, + R16_SNORM = 0x10B, + R16_SINT = 0x10C, + R16_UINT = 0x10D, + R16_FLOAT = 0x10E, + I16_UNORM = 0x111, + L16_UNORM = 0x112, + A16_UNORM = 0x113, + L8A8_UNORM = 0x114, + I16_FLOAT = 0x115, + L16_FLOAT = 0x116, + A16_FLOAT = 0x117, + L8A8_UNORM_SRGB = 0x118, + B5G5R5X1_UNORM = 0x11A, + B5G5R5X1_UNORM_SRGB = 0x11B, + R8G8_SSCALED = 0x11C, + R8G8_USCALED = 0x11D, + R16_SSCALED = 0x11E, + R16_USCALED = 0x11F, + L8A8_UINT = 0x126, + L8A8_SINT = 0x127, + R8_UNORM = 0x140, + R8_SNORM = 0x141, + R8_SINT = 0x142, + R8_UINT = 0x143, + A8_UNORM = 
0x144, + I8_UNORM = 0x145, + L8_UNORM = 0x146, + R8_SSCALED = 0x149, + R8_USCALED = 0x14A, + L8_UNORM_SRGB = 0x14C, + L8_UINT = 0x152, + L8_SINT = 0x153, + I8_UINT = 0x154, + I8_SINT = 0x155, + YCRCB_SWAPUVY = 0x183, + BC1_UNORM = 0x186, + BC2_UNORM = 0x187, + BC3_UNORM = 0x188, + BC4_UNORM = 0x189, + BC5_UNORM = 0x18A, + BC1_UNORM_SRGB = 0x18B, + BC2_UNORM_SRGB = 0x18C, + BC3_UNORM_SRGB = 0x18D, + YCRCB_SWAPUV = 0x18F, + R8G8B8_UNORM = 0x193, + R8G8B8_SNORM = 0x194, + R8G8B8_SSCALED = 0x195, + R8G8B8_USCALED = 0x196, + BC4_SNORM = 0x199, + BC5_SNORM = 0x19A, + R16G16B16_FLOAT = 0x19B, + R16G16B16_UNORM = 0x19C, + R16G16B16_SNORM = 0x19D, + R16G16B16_SSCALED = 0x19E, + R16G16B16_USCALED = 0x19F, + BC6H_SF16 = 0x1A1, + BC7_UNORM = 0x1A2, + BC7_UNORM_SRGB = 0x1A3, + BC6H_UF16 = 0x1A4, + R8G8B8_UNORM_SRGB = 0x1A8, + R16G16B16_UINT = 0x1B0, + R16G16B16_SINT = 0x1B1, + R10G10B10A2_SNORM = 0x1B3, + R10G10B10A2_USCALED = 0x1B4, + R10G10B10A2_SSCALED = 0x1B5, + R10G10B10A2_SINT = 0x1B6, + B10G10R10A2_SNORM = 0x1B7, + B10G10R10A2_USCALED = 0x1B8, + B10G10R10A2_SSCALED = 0x1B9, + B10G10R10A2_UINT = 0x1BA, + B10G10R10A2_SINT = 0x1BB, + R8G8B8_UINT = 0x1C8, + R8G8B8_SINT = 0x1C9, + NUM_SWR_FORMATS = 0x1CA, +}; +////////////////////////////////////////////////////////////////////////// +/// SWR_FORMAT_INFO - Format information +////////////////////////////////////////////////////////////////////////// +struct SWR_FORMAT_INFO +{ + const char* name; + SWR_TYPE type[4]; + uint32_t defaults[4]; + uint32_t swizzle[4]; ///< swizzle per component + uint32_t bpc[4]; ///< bits per component + uint32_t bpp; ///< bits per pixel + uint32_t Bpp; ///< bytes per pixel + uint32_t numComps; ///< number of components + bool isSRGB; + bool isBC; + bool isSubsampled; + bool isNormalized[4]; + float toFloat[4]; + uint32_t bcWidth; + uint32_t bcHeight; + bool isLuminance; +}; + +extern const SWR_FORMAT_INFO gFormatInfo[]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Retrieves format info struct for given format. +/// @param format - SWR format +INLINE const SWR_FORMAT_INFO& GetFormatInfo(SWR_FORMAT format) +{ + return gFormatInfo[format]; +} + +// lookup table for unorm8 srgb -> float conversion +extern const uint32_t srgb8Table[256]; diff --git a/src/gallium/drivers/swr/rasterizer/common/isa.hpp b/src/gallium/drivers/swr/rasterizer/common/isa.hpp new file mode 100644 index 00000000000..ef381799bc3 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/isa.hpp @@ -0,0 +1,235 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#pragma once + +#include <iostream> +#include <vector> +#include <bitset> +#include <array> +#include <string> +#include <algorithm> + +#if defined(_WIN32) +#include <intrin.h> +#else +#include <string.h> +#include <cpuid.h> +#endif + +class InstructionSet +{ +public: + InstructionSet() : CPU_Rep() {}; + + // getters + std::string Vendor(void) { return CPU_Rep.vendor_; } + std::string Brand(void) { return CPU_Rep.brand_; } + + bool SSE3(void) { return CPU_Rep.f_1_ECX_[0]; } + bool PCLMULQDQ(void) { return CPU_Rep.f_1_ECX_[1]; } + bool MONITOR(void) { return CPU_Rep.f_1_ECX_[3]; } + bool SSSE3(void) { return CPU_Rep.f_1_ECX_[9]; } + bool FMA(void) { return CPU_Rep.f_1_ECX_[12]; } + bool CMPXCHG16B(void) { return CPU_Rep.f_1_ECX_[13]; } + bool SSE41(void) { return CPU_Rep.f_1_ECX_[19]; } + bool SSE42(void) { return CPU_Rep.f_1_ECX_[20]; } + bool MOVBE(void) { return CPU_Rep.f_1_ECX_[22]; } + bool POPCNT(void) { return CPU_Rep.f_1_ECX_[23]; } + bool AES(void) { return CPU_Rep.f_1_ECX_[25]; } + bool XSAVE(void) { return CPU_Rep.f_1_ECX_[26]; } + bool OSXSAVE(void) { return CPU_Rep.f_1_ECX_[27]; } + bool RDRAND(void) { return CPU_Rep.f_1_ECX_[30]; } + + bool MSR(void) { return CPU_Rep.f_1_EDX_[5]; } + bool CX8(void) { return CPU_Rep.f_1_EDX_[8]; } + bool SEP(void) { return CPU_Rep.f_1_EDX_[11]; } + bool CMOV(void) { return CPU_Rep.f_1_EDX_[15]; } + bool CLFSH(void) { return CPU_Rep.f_1_EDX_[19]; } + bool MMX(void) { return CPU_Rep.f_1_EDX_[23]; } + bool FXSR(void) { return CPU_Rep.f_1_EDX_[24]; } + bool SSE(void) { return CPU_Rep.f_1_EDX_[25]; } + bool SSE2(void) { return CPU_Rep.f_1_EDX_[26]; } + + bool FSGSBASE(void) { return CPU_Rep.f_7_EBX_[0]; } + bool BMI1(void) { return CPU_Rep.f_7_EBX_[3]; } + bool HLE(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[4]; } + bool BMI2(void) { return CPU_Rep.f_7_EBX_[8]; } + bool ERMS(void) { return CPU_Rep.f_7_EBX_[9]; } + bool INVPCID(void) { return CPU_Rep.f_7_EBX_[10]; } + bool RTM(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_7_EBX_[11]; } + bool RDSEED(void) { return CPU_Rep.f_7_EBX_[18]; } + bool ADX(void) { return CPU_Rep.f_7_EBX_[19]; } + bool SHA(void) { return CPU_Rep.f_7_EBX_[29]; } + + bool PREFETCHWT1(void) { return CPU_Rep.f_7_ECX_[0]; } + + bool LAHF(void) { return CPU_Rep.f_81_ECX_[0]; } + bool LZCNT(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_ECX_[5]; } + bool ABM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[5]; } + bool SSE4a(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[6]; } + bool XOP(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[11]; } + bool TBM(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_ECX_[21]; } + + bool SYSCALL(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[11]; } + bool MMXEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[22]; } + bool RDTSCP(void) { return CPU_Rep.isIntel_ && CPU_Rep.f_81_EDX_[27]; } + bool _3DNOWEXT(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[30]; } + bool _3DNOW(void) { return CPU_Rep.isAMD_ && CPU_Rep.f_81_EDX_[31]; } + + bool AVX(void) { return CPU_Rep.f_1_ECX_[28]; } + bool F16C(void) { return CPU_Rep.f_1_ECX_[29]; } + bool AVX2(void) { return CPU_Rep.f_7_EBX_[5]; } + bool AVX512F(void) { return 
CPU_Rep.f_7_EBX_[16]; } + bool AVX512PF(void) { return CPU_Rep.f_7_EBX_[26]; } + bool AVX512ER(void) { return CPU_Rep.f_7_EBX_[27]; } + bool AVX512CD(void) { return CPU_Rep.f_7_EBX_[28]; } + +private: + class InstructionSet_Internal + { + public: + InstructionSet_Internal() + : nIds_{ 0 }, + nExIds_{ 0 }, + isIntel_{ false }, + isAMD_{ false }, + f_1_ECX_{ 0 }, + f_1_EDX_{ 0 }, + f_7_EBX_{ 0 }, + f_7_ECX_{ 0 }, + f_81_ECX_{ 0 }, + f_81_EDX_{ 0 }, + data_{}, + extdata_{} + { + //int cpuInfo[4] = {-1}; + std::array<int, 4> cpui; + + // Calling __cpuid with 0x0 as the function_id argument + // gets the number of the highest valid function ID. +#if defined(_WIN32) + __cpuid(cpui.data(), 0); + nIds_ = cpui[0]; +#else + nIds_ = __get_cpuid_max(0, NULL); +#endif + + for (int i = 0; i <= nIds_; ++i) + { +#if defined(_WIN32) + __cpuidex(cpui.data(), i, 0); +#else + int *data = cpui.data(); + __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); +#endif + data_.push_back(cpui); + } + + // Capture vendor string + char vendor[0x20]; + memset(vendor, 0, sizeof(vendor)); + *reinterpret_cast<int*>(vendor) = data_[0][1]; + *reinterpret_cast<int*>(vendor + 4) = data_[0][3]; + *reinterpret_cast<int*>(vendor + 8) = data_[0][2]; + vendor_ = vendor; + if (vendor_ == "GenuineIntel") + { + isIntel_ = true; + } + else if (vendor_ == "AuthenticAMD") + { + isAMD_ = true; + } + + // load bitset with flags for function 0x00000001 + if (nIds_ >= 1) + { + f_1_ECX_ = data_[1][2]; + f_1_EDX_ = data_[1][3]; + } + + // load bitset with flags for function 0x00000007 + if (nIds_ >= 7) + { + f_7_EBX_ = data_[7][1]; + f_7_ECX_ = data_[7][2]; + } + + // Calling __cpuid with 0x80000000 as the function_id argument + // gets the number of the highest valid extended ID. +#if defined(_WIN32) + __cpuid(cpui.data(), 0x80000000); + nExIds_ = cpui[0]; +#else + nExIds_ = __get_cpuid_max(0x80000000, NULL); +#endif + + char brand[0x40]; + memset(brand, 0, sizeof(brand)); + + for (unsigned i = 0x80000000; i <= nExIds_; ++i) + { +#if defined(_WIN32) + __cpuidex(cpui.data(), i, 0); +#else + int *data = cpui.data(); + __cpuid_count(i, 0, data[0], data[1], data[2], data[3]); +#endif + extdata_.push_back(cpui); + } + + // load bitset with flags for function 0x80000001 + if (nExIds_ >= 0x80000001) + { + f_81_ECX_ = extdata_[1][2]; + f_81_EDX_ = extdata_[1][3]; + } + + // Interpret CPU brand string if reported + if (nExIds_ >= 0x80000004) + { + memcpy(brand, extdata_[2].data(), sizeof(cpui)); + memcpy(brand + 16, extdata_[3].data(), sizeof(cpui)); + memcpy(brand + 32, extdata_[4].data(), sizeof(cpui)); + brand_ = brand; + } + }; + + int nIds_; + unsigned nExIds_; + std::string vendor_; + std::string brand_; + bool isIntel_; + bool isAMD_; + std::bitset<32> f_1_ECX_; + std::bitset<32> f_1_EDX_; + std::bitset<32> f_7_EBX_; + std::bitset<32> f_7_ECX_; + std::bitset<32> f_81_ECX_; + std::bitset<32> f_81_EDX_; + std::vector<std::array<int, 4>> data_; + std::vector<std::array<int, 4>> extdata_; + }; + const InstructionSet_Internal CPU_Rep; +}; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h new file mode 100644 index 00000000000..522ae0dd65f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -0,0 +1,220 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
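// Usage sketch (illustrative; the swr backend selection itself lives in the
// driver code, not in this header): InstructionSet runs CPUID once at
// construction, so feature checks reduce to cached bitset reads.
//
//   static InstructionSet cpu;   // hypothetical instance name
//   if (cpu.AVX2())
//       { /* load the AVX2 rasterizer backend */ }
//   else if (cpu.AVX())
//       { /* fall back to the AVX backend */ }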
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#ifndef __SWR_OS_H__ +#define __SWR_OS_H__ + +#include "core/knobs.h" + +#if (defined(FORCE_WINDOWS) || defined(_WIN32)) && !defined(FORCE_LINUX) + +#define SWR_API __cdecl + +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif + +#ifndef NOMINMAX +#define NOMINMAX +#endif +#include "Windows.h" +#include <intrin.h> +#include <cstdint> + +#define OSALIGN(RWORD, WIDTH) __declspec(align(WIDTH)) RWORD +#define THREAD __declspec(thread) +#define INLINE __forceinline +#define DEBUGBREAK __debugbreak() + +#define PRAGMA_WARNING_PUSH_DISABLE(...) \ + __pragma(warning(push));\ + __pragma(warning(disable:__VA_ARGS__)); + +#define PRAGMA_WARNING_POP() __pragma(warning(pop)) + +#if defined(_WIN32) +#if defined(_WIN64) +#define BitScanForwardSizeT BitScanForward64 +#define _mm_popcount_sizeT _mm_popcnt_u64 +#else +#define BitScanForwardSizeT BitScanForward +#define _mm_popcount_sizeT _mm_popcnt_u32 +#endif +#endif + +#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) + +#define SWR_API + +#include <stdlib.h> +#include <string.h> +#include <X11/Xmd.h> +#include <x86intrin.h> +#include <stdint.h> +#include <sys/types.h> +#include <unistd.h> +#include <sys/stat.h> + +typedef void VOID; +typedef void* LPVOID; +typedef CARD8 BOOL; +typedef wchar_t WCHAR; +typedef uint16_t UINT16; +typedef int INT; +typedef unsigned int UINT; +typedef uint32_t UINT32; +typedef uint64_t UINT64; +typedef int64_t INT64; +typedef void* HANDLE; +typedef float FLOAT; +typedef int LONG; +typedef CARD8 BYTE; +typedef unsigned char UCHAR; +typedef unsigned int DWORD; + +#undef FALSE +#define FALSE 0 + +#undef TRUE +#define TRUE 1 + +#define OSALIGN(RWORD, WIDTH) RWORD __attribute__((aligned(WIDTH))) +#define THREAD __thread +#ifndef INLINE +#define INLINE __inline +#endif +#define DEBUGBREAK asm ("int $3") +#define __cdecl +#define __declspec(X) + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +#if !defined(__clang__) && (__GNUC__) && (GCC_VERSION < 40500) +inline +uint64_t __rdtsc() +{ + long low, high; + asm volatile("rdtsc" : "=a"(low), "=d"(high)); + return (low | ((uint64_t)high << 32)); +} +#endif + +#ifndef __clang__ +// Intrinsic not defined in gcc +static INLINE +void _mm256_storeu2_m128i(__m128i *hi, __m128i *lo, __m256i a) +{ + _mm_storeu_si128((__m128i*)lo, 
_mm256_castsi256_si128(a)); + _mm_storeu_si128((__m128i*)hi, _mm256_extractf128_si256(a, 0x1)); +} +#endif + +inline +unsigned char _BitScanForward(unsigned long *Index, unsigned long Mask) +{ + // use the 'long' builtin so 64-bit masks are not truncated + *Index = __builtin_ctzl(Mask); + return (Mask != 0); +} + +inline +unsigned char _BitScanForward(unsigned int *Index, unsigned int Mask) +{ + *Index = __builtin_ctz(Mask); + return (Mask != 0); +} + +inline +unsigned char _BitScanReverse(unsigned long *Index, unsigned long Mask) +{ + // MSVC semantics: *Index is the bit position of the highest set bit, + // not the leading-zero count returned by __builtin_clzl + *Index = (8 * sizeof(Mask)) - 1 - __builtin_clzl(Mask); + return (Mask != 0); +} + +inline +unsigned char _BitScanReverse(unsigned int *Index, unsigned int Mask) +{ + *Index = 31 - __builtin_clz(Mask); + return (Mask != 0); +} + +inline +void *_aligned_malloc(unsigned int size, unsigned int alignment) +{ + void *ret; + if (posix_memalign(&ret, alignment, size)) + { + return NULL; + } + return ret; +} + +inline +unsigned char _bittest(const LONG *a, LONG b) +{ + return ((*(unsigned *)(a) & (1 << b)) != 0); +} + +#define GetCurrentProcessId getpid + +#define CreateDirectory(name, pSecurity) mkdir(name, 0777) + +#if defined(_WIN32) +static inline +unsigned int _mm_popcnt_u32(unsigned int v) +{ + return __builtin_popcount(v); +} +#endif + +#define _aligned_free free +#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) +#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) +#define InterlockedDecrement(Addend) __sync_sub_and_fetch(Addend, 1) +#define InterlockedIncrement(Addend) __sync_add_and_fetch(Addend, 1) +#define _ReadWriteBarrier() asm volatile("" ::: "memory") +#define __stdcall + +#define PRAGMA_WARNING_PUSH_DISABLE(...) +#define PRAGMA_WARNING_POP() + +#else + +#error Unsupported OS/system. + +#endif + +// Universal types +typedef BYTE KILOBYTE[1024]; +typedef KILOBYTE MEGABYTE[1024]; +typedef MEGABYTE GIGABYTE[1024]; + +#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) +#if KNOB_SIMD_WIDTH == 8 +#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32) +#endif + +#include "common/swr_assert.h" + +#endif//__SWR_OS_H__ diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp new file mode 100644 index 00000000000..454641b2751 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -0,0 +1,188 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
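// Note on the atomics above (values in the sketch are illustrative): the
// wrappers deliberately swap argument order, since Win32's
// InterlockedCompareExchange(dst, exchange, comparand) corresponds to GCC's
// __sync_val_compare_and_swap(dst, comparand, exchange); both return the
// value previously stored at dst.
//
//   volatile LONG refCount = 1;
//   InterlockedIncrement(&refCount);                  // refCount == 2
//   LONG old = InterlockedCompareExchange(&refCount, 5, 2);
//   // old == 2 and refCount == 5, because the comparand matched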
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rdtsc_buckets.cpp +* +* @brief implementation of rdtsc buckets. +* +* Notes: +* +******************************************************************************/ +#include "rdtsc_buckets.h" +#include <inttypes.h> + +THREAD UINT tlsThreadId = 0; + +void BucketManager::RegisterThread(const std::string& name) +{ + BUCKET_THREAD newThread; + newThread.name = name; + newThread.root.children.reserve(mBuckets.size()); + newThread.root.id = 0; + newThread.root.pParent = nullptr; + newThread.pCurrent = &newThread.root; + + mThreadMutex.lock(); + + // assign unique thread id for this thread + size_t id = mThreads.size(); + newThread.id = (UINT)id; + tlsThreadId = (UINT)id; + + // open threadviz file if enabled + if (mThreadViz) + { + std::stringstream ss; + // '/' is a valid path separator on both Windows and Linux + ss << mThreadVizDir << "/threadviz_thread." << newThread.id << ".dat"; + newThread.vizFile = fopen(ss.str().c_str(), "wb"); + } + + // store new thread + mThreads.push_back(newThread); + + mThreadMutex.unlock(); +} + +UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) +{ + size_t id = mBuckets.size(); + mBuckets.push_back(desc); + return (UINT)id; +} + +void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) +{ + const char *arrows[] = { + "", + "|-> ", + " |-> ", + " |-> ", + " |-> ", + " |-> ", + " |-> " + }; + + // clamp to the deepest arrow; hierarchies deeper than the table would + // otherwise index out of bounds + if (level >= sizeof(arrows) / sizeof(arrows[0])) + { + level = sizeof(arrows) / sizeof(arrows[0]) - 1; + } + + // compute percent of total cycles used by this bucket + float percentTotal = (float)((double)bucket.elapsed / (double)threadCycles * 100.0); + + // compute percent of parent cycles used by this bucket + float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); + + // compute average cycle count per invocation + UINT64 CPE = bucket.elapsed / bucket.count; + + BUCKET_DESC &desc = mBuckets[bucket.id]; + + // construct hierarchy visualization + char hier[80]; + strcpy(hier, arrows[level]); + strcat(hier, desc.name.c_str()); + + // print out + fprintf(f, "%6.2f %6.2f %-10" PRIu64 " %-10" PRIu64 " %-10u %-10lu %-10u %s\n", + percentTotal, + percentParent, + bucket.elapsed, + CPE, + bucket.count, + (unsigned long)0, + (uint32_t)0, + hier + ); + + // dump all children of this bucket + for (const BUCKET& child : bucket.children) + { + if (child.count) + { + PrintBucket(f, level + 1, threadCycles, bucket.elapsed, child); + } + } +} + +void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) +{ + // print header + fprintf(f, "\nThread %u (%s)\n", thread.id, thread.name.c_str()); + fprintf(f, " %%Tot %%Par Cycles CPE NumEvent CPE2 NumEvent2 Bucket\n"); + + // compute thread level total cycle counts across all buckets from root + const BUCKET& root = thread.root; + UINT64 totalCycles = 0; + for (const BUCKET& child : root.children) + { + totalCycles += child.elapsed; + } + + for (const BUCKET& child : root.children) + { + if (child.count) + { + PrintBucket(f, 0, totalCycles, totalCycles, child); + } + } +} + +void BucketManager::DumpThreadViz() +{ + // ensure all thread data is flushed + mThreadMutex.lock(); + for (auto& thread : mThreads) + { + fflush(thread.vizFile); + fclose(thread.vizFile); + } + mThreadMutex.unlock(); + + // dump bucket descriptions + std::stringstream ss; + ss << mThreadVizDir << "/threadviz_buckets.dat"; + + FILE* f =
fopen(ss.str().c_str(), "wb"); + for (auto& bucket : mBuckets) + { + Serialize(f, bucket); + } + fclose(f); +} + +void BucketManager::PrintReport(const std::string& filename) +{ + if (mThreadViz) + { + DumpThreadViz(); + } + else + { + FILE* f = fopen(filename.c_str(), "w"); + + mThreadMutex.lock(); + for (const BUCKET_THREAD& thread : mThreads) + { + PrintThread(f, thread); + fprintf(f, "\n"); + } + mThreadMutex.unlock(); + + fclose(f); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h new file mode 100644 index 00000000000..99cb10ec6e8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h @@ -0,0 +1,229 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rdtsc_buckets.h +* +* @brief declaration for rdtsc buckets. +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "os.h" +#include <vector> +#include <mutex> +#include <sstream> + +#include "rdtsc_buckets_shared.h" + +// unique thread id stored in thread local storage +extern THREAD UINT tlsThreadId; + +////////////////////////////////////////////////////////////////////////// +/// @brief BucketManager encapsulates a single instance of the buckets +/// functionality. There can be one or many bucket managers active +/// at any time. The manager owns all the threads and +/// bucket information that have been registered to it. +class BucketManager +{ +public: + BucketManager(bool enableThreadViz) : mThreadViz(enableThreadViz) + { + if (mThreadViz) + { + uint32_t pid = GetCurrentProcessId(); + std::stringstream str; + str << "threadviz." << pid; + mThreadVizDir = str.str(); + CreateDirectory(mThreadVizDir.c_str(), NULL); + } + } + + // removes all registered thread data + void ClearThreads() + { + mThreadMutex.lock(); + mThreads.clear(); + mThreadMutex.unlock(); + } + + // removes all registered buckets + void ClearBuckets() + { + mBuckets.clear(); + } + + /// Registers a new thread with the manager. + /// @param name - name of thread, used for labels in reports and threadviz + void RegisterThread(const std::string& name); + + /// Registers a new bucket type with the manager. 
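// End-to-end sketch of the manager (illustrative; names are hypothetical and
// the driver drives this through its own rdtsc macros). Buckets are
// registered once up front, then bracketed around the code being measured
// with the StartBucket/StopBucket members declared below:
//
//   BucketManager mgr(false);                          // threadviz disabled
//   UINT rasterId = mgr.RegisterBucket({ "Raster", "", false, 0 });
//   mgr.RegisterThread("worker0");                     // from the worker thread
//   mgr.StartCapture();
//   mgr.StartBucket(rasterId);
//   // ... timed work ...
//   mgr.StopBucket(rasterId);
//   mgr.StopCapture();
//   mgr.PrintReport("rdtsc.txt");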
Returns a unique + /// id which should be used in subsequent calls to start/stop the bucket + /// @param desc - description of the bucket + /// @return unique id + UINT RegisterBucket(const BUCKET_DESC& desc); + + // dump threadviz data + void DumpThreadViz(); + + // print report + void PrintReport(const std::string& filename); + + // start capturing + INLINE void StartCapture() + { + mCapturing = true; + } + + // stop capturing + INLINE void StopCapture() + { + mCapturing = false; + + // wait for all threads to pop back to root bucket + bool stillCapturing = true; + while (stillCapturing) + { + stillCapturing = false; + for (const BUCKET_THREAD& t : mThreads) + { + if (t.pCurrent != &t.root) + { + stillCapturing = true; + continue; + } + } + } + } + + // start a bucket + // @param id generated by RegisterBucket + INLINE void StartBucket(UINT id) + { + if (!mCapturing) return; + + SWR_ASSERT(tlsThreadId < mThreads.size()); + + BUCKET_THREAD& bt = mThreads[tlsThreadId]; + + // if threadviz is enabled, only need to dump start info to threads viz file + if (mThreadViz) + { + SWR_ASSERT(bt.vizFile != nullptr); + if (mBuckets[id].enableThreadViz) + { + VIZ_START_DATA data{ VIZ_START, id, __rdtsc() }; + Serialize(bt.vizFile, data); + } + } + else + { + if (bt.pCurrent->children.size() < mBuckets.size()) + { + bt.pCurrent->children.resize(mBuckets.size()); + } + BUCKET &child = bt.pCurrent->children[id]; + child.pParent = bt.pCurrent; + child.id = id; + child.start = __rdtsc(); + + // update thread's currently executing bucket + bt.pCurrent = &child; + } + + bt.level++; + } + + // stop the currently executing bucket + INLINE void StopBucket(UINT id) + { + SWR_ASSERT(tlsThreadId < mThreads.size()); + BUCKET_THREAD &bt = mThreads[tlsThreadId]; + + if (bt.level == 0) return; + + if (mThreadViz) + { + SWR_ASSERT(bt.vizFile != nullptr); + if (mBuckets[id].enableThreadViz) + { + VIZ_STOP_DATA data{ VIZ_STOP, __rdtsc() }; + Serialize(bt.vizFile, data); + } + } + else + { + if (bt.pCurrent->start == 0) return; + SWR_ASSERT(bt.pCurrent->id == id, "Mismatched buckets detected"); + + bt.pCurrent->elapsed += (__rdtsc() - bt.pCurrent->start); + bt.pCurrent->count++; + + // pop to parent + bt.pCurrent = bt.pCurrent->pParent; + } + + bt.level--; + } + + INLINE void AddEvent(uint32_t id, uint32_t count) + { + if (!mCapturing) return; + + SWR_ASSERT(tlsThreadId < mThreads.size()); + + BUCKET_THREAD& bt = mThreads[tlsThreadId]; + + // don't record events for threadviz + if (!mThreadViz) + { + if (bt.pCurrent->children.size() < mBuckets.size()) + { + bt.pCurrent->children.resize(mBuckets.size()); + } + BUCKET &child = bt.pCurrent->children[id]; + child.pParent = bt.pCurrent; + child.id = id; + child.count += count; + } + } + +private: + void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket); + void PrintThread(FILE* f, const BUCKET_THREAD& thread); + + // list of active threads that have registered with this manager + std::vector<BUCKET_THREAD> mThreads; + + // list of buckets registered with this manager + std::vector<BUCKET_DESC> mBuckets; + + // is capturing currently enabled + volatile bool mCapturing{ false }; + + std::mutex mThreadMutex; + + // enable threadviz + bool mThreadViz{ false }; + std::string mThreadVizDir; +}; diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h new file mode 100644 index 00000000000..41c6d5dec79 --- /dev/null +++ 
b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h @@ -0,0 +1,171 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file rdtsc_buckets_shared.h
+*
+* @brief shared structures for rdtsc buckets.
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include <vector>
+#include <string>
+#include <cassert>
+#include <cstdint>
+#include <cstdio>
+
+struct BUCKET
+{
+    uint32_t id{ 0 };
+    uint64_t start{ 0 };
+    uint64_t elapsed{ 0 };
+    uint32_t count{ 0 };
+
+    BUCKET* pParent{ nullptr };
+    std::vector<BUCKET> children;
+};
+
+struct BUCKET_DESC
+{
+    // name of bucket, used in reports
+    std::string name;
+
+    // description of bucket, used in threadviz
+    std::string description;
+
+    // enable for threadviz dumping
+    bool enableThreadViz;
+
+    // threadviz color of bucket, in RGBA8_UNORM format
+    uint32_t color;
+};
+
+struct BUCKET_THREAD
+{
+    // name of thread, used in reports
+    std::string name;
+
+    // id for this thread, assigned by the thread manager
+    uint32_t id;
+
+    // root of the bucket hierarchy for this thread
+    BUCKET root;
+
+    // currently executing bucket somewhere in the hierarchy
+    BUCKET* pCurrent;
+
+    // currently executing hierarchy level
+    uint32_t level{ 0 };
+
+    // threadviz file object
+    FILE* vizFile{ nullptr };
+
+    BUCKET_THREAD() {}
+    BUCKET_THREAD(const BUCKET_THREAD& that)
+    {
+        name = that.name;
+        id = that.id;
+        root = that.root;
+        pCurrent = &root;
+        vizFile = that.vizFile;
+    }
+};
+
+enum VIZ_TYPE
+{
+    VIZ_START = 0,
+    VIZ_STOP = 1,
+    VIZ_DATA = 2
+};
+
+struct VIZ_START_DATA
+{
+    uint8_t type;
+    uint32_t bucketId;
+    uint64_t timestamp;
+};
+
+struct VIZ_STOP_DATA
+{
+    uint8_t type;
+    uint64_t timestamp;
+};
+
+inline void Serialize(FILE* f, const VIZ_START_DATA& data)
+{
+    fwrite(&data, sizeof(VIZ_START_DATA), 1, f);
+}
+
+inline void Deserialize(FILE* f, VIZ_START_DATA& data)
+{
+    fread(&data, sizeof(VIZ_START_DATA), 1, f);
+    assert(data.type == VIZ_START);
+}
+
+inline void Serialize(FILE* f, const VIZ_STOP_DATA& data)
+{
+    fwrite(&data, sizeof(VIZ_STOP_DATA), 1, f);
+}
+
+inline void Deserialize(FILE* f, VIZ_STOP_DATA& data)
+{
+    fread(&data, sizeof(VIZ_STOP_DATA), 1, f);
+    assert(data.type == VIZ_STOP);
+}
+
+inline void Serialize(FILE* f, const std::string& string)
+{
+    // the length prefix is a single byte, so 255 characters is the maximum
+    assert(string.size() < 256);
+
+    uint8_t length = (uint8_t)string.size();
+    fwrite(&length, sizeof(length), 1, f);
+    fwrite(string.c_str(), string.size(), 1, f);
+}
+
+inline void Deserialize(FILE* f, std::string& string)
+{
+    char cstr[256];
+    uint8_t length;
+    fread(&length, sizeof(length), 1, f);
+    fread(cstr, length, 1, f);
+    cstr[length] = 0;
+    string.assign(cstr);
+}
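
// Illustrative usage sketch (editor's example, not part of the patch): the
// two helpers above implement a length-prefixed string format; a round trip
// through a temporary file looks like this. The function name and the string
// contents are hypothetical.
inline void ExampleStringRoundTrip()
{
    FILE* f = tmpfile();                // binary read/write temp file
    Serialize(f, std::string("FEClipTriangles"));
    rewind(f);                          // seek back to the one-byte length prefix
    std::string s;
    Deserialize(f, s);                  // s == "FEClipTriangles"
    fclose(f);
}
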
+inline void Serialize(FILE* f, const BUCKET_DESC& desc)
+{
+    Serialize(f, desc.name);
+    Serialize(f, desc.description);
+    fwrite(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
+    fwrite(&desc.color, sizeof(desc.color), 1, f);
+}
+
+inline void Deserialize(FILE* f, BUCKET_DESC& desc)
+{
+    Deserialize(f, desc.name);
+    Deserialize(f, desc.description);
+    fread(&desc.enableThreadViz, sizeof(desc.enableThreadViz), 1, f);
+    fread(&desc.color, sizeof(desc.color), 1, f);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
new file mode 100644
index 00000000000..8fa6d9ef408
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -0,0 +1,787 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/ + +#ifndef __SWR_SIMDINTRIN_H__ +#define __SWR_SIMDINTRIN_H__ + +#include "os.h" + +#include <cassert> + +#include <emmintrin.h> +#include <immintrin.h> +#include <xmmintrin.h> + +#if KNOB_SIMD_WIDTH == 8 +typedef __m256 simdscalar; +typedef __m256i simdscalari; +typedef uint8_t simdmask; +#else +#error Unsupported vector width +#endif + +// simd vector +OSALIGNSIMD(union) simdvector +{ + simdscalar v[4]; + struct + { + simdscalar x, y, z, w; + }; + + simdscalar& operator[] (const int i) { return v[i]; } + const simdscalar& operator[] (const int i) const { return v[i]; } +}; + +#if KNOB_SIMD_WIDTH == 8 +#define _simd128_maskstore_ps _mm_maskstore_ps +#define _simd_load_ps _mm256_load_ps +#define _simd_load1_ps _mm256_broadcast_ss +#define _simd_loadu_ps _mm256_loadu_ps +#define _simd_setzero_ps _mm256_setzero_ps +#define _simd_set1_ps _mm256_set1_ps +#define _simd_blend_ps _mm256_blend_ps +#define _simd_blendv_ps _mm256_blendv_ps +#define _simd_store_ps _mm256_store_ps +#define _simd_mul_ps _mm256_mul_ps +#define _simd_add_ps _mm256_add_ps +#define _simd_sub_ps _mm256_sub_ps +#define _simd_rsqrt_ps _mm256_rsqrt_ps +#define _simd_min_ps _mm256_min_ps +#define _simd_max_ps _mm256_max_ps +#define _simd_movemask_ps _mm256_movemask_ps +#define _simd_cvtps_epi32 _mm256_cvtps_epi32 +#define _simd_cvttps_epi32 _mm256_cvttps_epi32 +#define _simd_cvtepi32_ps _mm256_cvtepi32_ps +#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ) +#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ) +#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ) +#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ) +#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ) +#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ) +#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm) +#define _simd_and_ps _mm256_and_ps +#define _simd_or_ps _mm256_or_ps + +#define _simd_rcp_ps _mm256_rcp_ps +#define _simd_div_ps _mm256_div_ps +#define _simd_castsi_ps _mm256_castsi256_ps +#define _simd_andnot_ps _mm256_andnot_ps +#define _simd_round_ps _mm256_round_ps +#define _simd_castpd_ps _mm256_castpd_ps +#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a)) + +#define _simd_load_sd _mm256_load_sd +#define _simd_movemask_pd _mm256_movemask_pd +#define _simd_castsi_pd _mm256_castsi256_pd + +// emulated integer simd +#define SIMD_EMU_EPI(func, intrin) \ +INLINE \ +__m256i func(__m256i a, __m256i b)\ +{\ + __m128i aHi = _mm256_extractf128_si256(a, 1);\ + __m128i bHi = _mm256_extractf128_si256(b, 1);\ + __m128i aLo = _mm256_castsi256_si128(a);\ + __m128i bLo = _mm256_castsi256_si128(b);\ +\ + __m128i subLo = intrin(aLo, bLo);\ + __m128i subHi = intrin(aHi, bHi);\ +\ + __m256i result = _mm256_castsi128_si256(subLo);\ + result = _mm256_insertf128_si256(result, subHi, 1);\ +\ + return result;\ +} + +#if (KNOB_ARCH == KNOB_ARCH_AVX) +#define _simd_mul_epi32 _simdemu_mul_epi32 +#define _simd_mullo_epi32 _simdemu_mullo_epi32 +#define _simd_sub_epi32 _simdemu_sub_epi32 +#define _simd_sub_epi64 _simdemu_sub_epi64 +#define _simd_min_epi32 _simdemu_min_epi32 +#define _simd_min_epu32 _simdemu_min_epu32 +#define _simd_max_epi32 _simdemu_max_epi32 +#define _simd_max_epu32 _simdemu_max_epu32 +#define _simd_add_epi32 _simdemu_add_epi32 +#define _simd_and_si _simdemu_and_si +#define _simd_andnot_si _simdemu_andnot_si +#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32 +#define _simd_cmplt_epi32 _simdemu_cmplt_epi32 +#define 
_simd_cmpgt_epi32 _simdemu_cmpgt_epi32 +#define _simd_or_si _simdemu_or_si +#define _simd_castps_si _mm256_castps_si256 +#define _simd_adds_epu8 _simdemu_adds_epu8 +#define _simd_subs_epu8 _simdemu_subs_epu8 +#define _simd_add_epi8 _simdemu_add_epi8 +#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 +#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 +#define _simd_movemask_epi8 _simdemu_movemask_epi8 + +SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) +SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) +SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32) +SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64) +SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32) +SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32) +SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32) +SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32) +SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32) +SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128) +SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128) +SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32) +SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32) +SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32) +SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128) +SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8) +SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) +SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) +SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) +SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) + +#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) +#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) + +#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i) +#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i) +#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i) +#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a))) + +#define _simd128_fmadd_ps _mm_fmaddemu_ps +#define _simd_fmadd_ps _mm_fmaddemu256_ps +#define _simd_fmsub_ps _mm_fmsubemu256_ps +#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 +SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) + +INLINE +__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) +{ + __m128 res = _mm_mul_ps(a, b); + res = _mm_add_ps(res, c); + return res; +} + +INLINE +__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) +{ + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_add_ps(res, c); + return res; +} + +INLINE +__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) +{ + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_sub_ps(res, c); + return res; +} + +INLINE +__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) +{ + uint32_t *pOffsets = (uint32_t*)&vOffsets; + simdscalar vResult; + float* pResult = (float*)&vResult; + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) + { + uint32_t offset = pOffsets[i]; + offset = offset * scale; + pResult[i] = *(float*)(((const uint8_t*)pBase + offset)); + } + + return vResult; +} + +INLINE +__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) +{ + uint32_t *pOffsets = (uint32_t*)&vOffsets; + simdscalar vResult = vSrc; + float* pResult = (float*)&vResult; + DWORD index; + uint32_t mask = _simd_movemask_ps(vMask); + while (_BitScanForward(&index, mask)) + { + mask &= ~(1 << index); + uint32_t offset = pOffsets[index]; + offset = offset * scale; + pResult[index] = *(float*)(((const uint8_t*)pBase + offset)); + } + + return vResult; +} + +INLINE +__m256i 
_simd_abs_epi32(__m256i a) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + __m128i absLo = _mm_abs_epi32(aLo); + __m128i absHi = _mm_abs_epi32(aHi); + __m256i result = _mm256_castsi128_si256(absLo); + result = _mm256_insertf128_si256(result, absHi, 1); + return result; +} + +INLINE +int _simdemu_movemask_epi8(__m256i a) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + int resHi = _mm_movemask_epi8(aHi); + int resLo = _mm_movemask_epi8(aLo); + + return (resHi << 16) | resLo; +} +#else + +#define _simd_mul_epi32 _mm256_mul_epi32 +#define _simd_mullo_epi32 _mm256_mullo_epi32 +#define _simd_sub_epi32 _mm256_sub_epi32 +#define _simd_sub_epi64 _mm256_sub_epi64 +#define _simd_min_epi32 _mm256_min_epi32 +#define _simd_max_epi32 _mm256_max_epi32 +#define _simd_min_epu32 _mm256_min_epu32 +#define _simd_max_epu32 _mm256_max_epu32 +#define _simd_add_epi32 _mm256_add_epi32 +#define _simd_and_si _mm256_and_si256 +#define _simd_andnot_si _mm256_andnot_si256 +#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32 +#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a) +#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b) +#define _simd_or_si _mm256_or_si256 +#define _simd_castps_si _mm256_castps_si256 + +#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 +#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 + +#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a) +#define _simd_slli_epi32 _mm256_slli_epi32 +#define _simd_srai_epi32 _mm256_srai_epi32 +#define _simd_srli_epi32 _mm256_srli_epi32 +#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a))) +#define _simd128_fmadd_ps _mm_fmadd_ps +#define _simd_fmadd_ps _mm256_fmadd_ps +#define _simd_fmsub_ps _mm256_fmsub_ps +#define _simd_shuffle_epi8 _mm256_shuffle_epi8 +#define _simd_adds_epu8 _mm256_adds_epu8 +#define _simd_subs_epu8 _mm256_subs_epu8 +#define _simd_add_epi8 _mm256_add_epi8 +#define _simd_i32gather_ps _mm256_i32gather_ps +#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps +#define _simd_abs_epi32 _mm256_abs_epi32 + +#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 +#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 +#define _simd_movemask_epi8 _mm256_movemask_epi8 +#endif + +#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) +#define _simd_shuffle_ps _mm256_shuffle_ps +#define _simd_set1_epi32 _mm256_set1_epi32 +#define _simd_set1_epi8 _mm256_set1_epi8 +#define _simd_setzero_si _mm256_setzero_si256 +#define _simd_cvttps_epi32 _mm256_cvttps_epi32 +#define _simd_store_si _mm256_store_si256 +#define _simd_broadcast_ss _mm256_broadcast_ss +#define _simd_maskstore_ps _mm256_maskstore_ps +#define _simd_load_si _mm256_load_si256 +#define _simd_loadu_si _mm256_loadu_si256 +#define _simd_sub_ps _mm256_sub_ps +#define _simd_testz_ps _mm256_testz_ps +#define _simd_xor_ps _mm256_xor_ps + + +INLINE +simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask) +{ + return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask)); +} + +// convert bitmask to vector mask +INLINE +simdscalar vMask(int32_t mask) +{ + __m256i vec = _mm256_set1_epi32(mask); + const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = _simd_and_si(vec, bit); + vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec); + return _simd_castsi_ps(vec); +} + +INLINE +void _simd_mov(simdscalar &r, unsigned int 
rlane, simdscalar& s, unsigned int slane) +{ + OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH]; + _mm256_store_ps(rArray, r); + _mm256_store_ps(sArray, s); + rArray[rlane] = sArray[slane]; + r = _mm256_load_ps(rArray); +} + +INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + __m128i resHi = _mm_slli_epi32(aHi, i); + __m128i resLo = _mm_slli_epi32(aLo, i); + + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); + + return result; +} + +INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + __m128i resHi = _mm_srai_epi32(aHi, i); + __m128i resLo = _mm_srai_epi32(aLo, i); + + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); + + return result; +} + +INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) +{ + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); + + __m128i resHi = _mm_srli_epi32(aHi, i); + __m128i resLo = _mm_srli_epi32(aLo, i); + + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); + + return result; +} + +INLINE +void _simdvec_transpose(simdvector &v) +{ + SWR_ASSERT(false, "Need to implement 8 wide version"); +} + +#else +#error Unsupported vector width +#endif + +// Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww. +INLINE +void _simdvec_load_ps(simdvector& r, const float *p) +{ + r[0] = _simd_set1_ps(p[0]); + r[1] = _simd_set1_ps(p[1]); + r[2] = _simd_set1_ps(p[2]); + r[3] = _simd_set1_ps(p[3]); +} + +INLINE +void _simdvec_mov(simdvector& r, const simdscalar& s) +{ + r[0] = s; + r[1] = s; + r[2] = s; + r[3] = s; +} + +INLINE +void _simdvec_mov(simdvector& r, const simdvector& v) +{ + r[0] = v[0]; + r[1] = v[1]; + r[2] = v[2]; + r[3] = v[3]; +} + +// just move a lane from the source simdvector to dest simdvector +INLINE +void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) +{ + _simd_mov(r[0], rlane, s[0], slane); + _simd_mov(r[1], rlane, s[1], slane); + _simd_mov(r[2], rlane, s[2], slane); + _simd_mov(r[3], rlane, s[3], slane); +} + +INLINE +void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) +{ + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) +} + +INLINE +void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) +{ + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + + tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) +} + +INLINE +simdscalar _simdvec_rcp_length_ps(const simdvector& v) +{ + simdscalar length; + _simdvec_dp4_ps(length, v, v); + return _simd_rsqrt_ps(length); +} + +INLINE +void _simdvec_normalize_ps(simdvector& r, const simdvector& v) +{ + simdscalar vecLength; + vecLength 
= _simdvec_rcp_length_ps(v);
+
+    r[0] = _simd_mul_ps(v[0], vecLength);
+    r[1] = _simd_mul_ps(v[1], vecLength);
+    r[2] = _simd_mul_ps(v[2], vecLength);
+    r[3] = _simd_mul_ps(v[3], vecLength);
+}
+
+INLINE
+void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
+{
+    r[0] = _simd_mul_ps(v[0], s);
+    r[1] = _simd_mul_ps(v[1], s);
+    r[2] = _simd_mul_ps(v[2], s);
+    r[3] = _simd_mul_ps(v[3], s);
+}
+
+INLINE
+void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
+{
+    r[0] = _simd_mul_ps(v0[0], v1[0]);
+    r[1] = _simd_mul_ps(v0[1], v1[1]);
+    r[2] = _simd_mul_ps(v0[2], v1[2]);
+    r[3] = _simd_mul_ps(v0[3], v1[3]);
+}
+
+INLINE
+void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
+{
+    r[0] = _simd_add_ps(v0[0], v1[0]);
+    r[1] = _simd_add_ps(v0[1], v1[1]);
+    r[2] = _simd_add_ps(v0[2], v1[2]);
+    r[3] = _simd_add_ps(v0[3], v1[3]);
+}
+
+INLINE
+void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
+{
+    r[0] = _simd_min_ps(v0[0], s);
+    r[1] = _simd_min_ps(v0[1], s);
+    r[2] = _simd_min_ps(v0[2], s);
+    r[3] = _simd_min_ps(v0[3], s);
+}
+
+INLINE
+void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
+{
+    r[0] = _simd_max_ps(v0[0], s);
+    r[1] = _simd_max_ps(v0[1], s);
+    r[2] = _simd_max_ps(v0[2], s);
+    r[3] = _simd_max_ps(v0[3], s);
+}
+
+// Matrix4x4 * Vector4
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+INLINE
+void _simd_mat4x4_vec4_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[2] = r0;
+
+    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+    r1 = _simd_mul_ps(m, v[3]);             // (m3 * v.w)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+    result[3] = r0;
+}
+
+// Matrix4x4 * Vector3 - Direction Vector where w = 0.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
+INLINE
+void _simd_mat3x3_vec3_w0_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    result[2] = r0;
+
+    result[3] = _simd_setzero_ps();
+}
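
// Editor's sketch (illustrative, not part of the patch): a scalar reference
// for the w == 1 transforms below. Because v.w is implicitly 1, the
// translation column m[row][3] is added directly, which is why the SIMD code
// uses a plain _simd_add_ps for the last column instead of a multiply-add.
static inline void ref_mat4x4_vec3_w1(float out[4], const float m[16], const float v[3])
{
    for (int row = 0; row < 4; ++row)
    {
        out[row] = m[row * 4 + 0] * v[0]
                 + m[row * 4 + 1] * v[1]
                 + m[row * 4 + 2] * v[2]
                 + m[row * 4 + 3];          // * v.w, with v.w == 1
    }
}
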
+// Matrix4x4 * Vector3 - Position vector where w = 1.
+// outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+// outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+// outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+INLINE
+void _simd_mat4x4_vec3_w1_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[2] = r0;
+
+    m = _simd_load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+    result[3] = _simd_add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+}
+
+INLINE
+void _simd_mat4x3_vec3_w1_multiply(
+    simdvector& result,
+    const float *pMatrix,
+    const simdvector& v)
+{
+    simdscalar m;
+    simdscalar r0;
+    simdscalar r1;
+
+    m = _simd_load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[0] = r0;
+
+    m = _simd_load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[1] = r0;
+
+    m = _simd_load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+    r0 = _simd_mul_ps(m, v[0]);             // (m0 * v.x)
+    m = _simd_load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+    r1 = _simd_mul_ps(m, v[1]);             // (m1 * v.y)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y)
+    m = _simd_load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+    r1 = _simd_mul_ps(m, v[2]);             // (m2 * v.z)
+    r0 = _simd_add_ps(r0, r1);              // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+    m = _simd_load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+    r0 = _simd_add_ps(r0, m);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    result[2] = r0;
+    result[3] = _simd_set1_ps(1.0f);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Compute plane equation vA * vX + vB * vY + vC
+INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+{
+    simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
+    vOut = _simd_fmadd_ps(vB, vY, vOut);
+    return vOut;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Interpolates a single component.
+/// @param vI - barycentric I
+/// @param vJ - barycentric J
+/// @param pInterpBuffer - pointer to attribute barycentric coeffs
+template<UINT Attrib, UINT Comp>
+static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+{
+    const float *pInterpA = &pInterpBuffer[Attrib * 12 + 0 + Comp];
+    const float *pInterpB = &pInterpBuffer[Attrib * 12 + 4 + Comp];
+    const float *pInterpC = &pInterpBuffer[Attrib * 12 + 8 + Comp];
+
+    simdscalar vA = _simd_broadcast_ss(pInterpA);
+    simdscalar vB = _simd_broadcast_ss(pInterpB);
+    simdscalar vC = _simd_broadcast_ss(pInterpC);
+
+    simdscalar vk = _simd_sub_ps(_simd_sub_ps(_simd_set1_ps(1.0f), vI), vJ);
+    vC = _simd_mul_ps(vk, vC);
+
+    return vplaneps(vA, vB, vC, vI, vJ);
+}
+
+
+#endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
new file mode 100644
index 00000000000..0bffd2c8000
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.cpp
@@ -0,0 +1,238 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#include "common/os.h" +#include <stdarg.h> +#include <stdio.h> +#include <assert.h> + +#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS + +#if defined(_WIN32) +#pragma comment(lib, "user32.lib") +#endif // _WIN32 + +enum TextColor +{ + TEXT_BLACK = 0, + TEXT_RED = 1, + TEXT_GREEN = 2, + TEXT_BLUE = 4, + TEXT_PURPLE = TEXT_RED | TEXT_BLUE, + TEXT_CYAN = TEXT_GREEN | TEXT_BLUE, + TEXT_YELLOW = TEXT_RED | TEXT_GREEN, + TEXT_WHITE = TEXT_RED | TEXT_GREEN | TEXT_BLUE, +}; + +enum TextStyle +{ + TEXT_NORMAL = 0, + TEXT_INTENSITY = 1, +}; + +void SetTextColor(FILE* stream, TextColor color = TEXT_WHITE, TextStyle style = TEXT_NORMAL) +{ +#if defined(_WIN32) + + HANDLE hConsoleHandle = nullptr; + if (stream == stderr) + { + hConsoleHandle = GetStdHandle(STD_ERROR_HANDLE); + } + else if (stream == stdout) + { + hConsoleHandle = GetStdHandle(STD_OUTPUT_HANDLE); + } + else + { + // Not a console stream, do nothing + return; + } + + WORD textAttributes = 0; + if (color & TEXT_RED) + { + textAttributes |= FOREGROUND_RED; + } + if (color & TEXT_GREEN) + { + textAttributes |= FOREGROUND_GREEN; + } + if (color & TEXT_BLUE) + { + textAttributes |= FOREGROUND_BLUE; + } + if (style & TEXT_INTENSITY) + { + textAttributes |= FOREGROUND_INTENSITY; + } + SetConsoleTextAttribute(hConsoleHandle, textAttributes); + +#else // !_WIN32 + + // Print ANSI codes + uint32_t cc = 30 + (style ? 60 : 0) + color; + fprintf(stream, "\033[0m\033[%d;%dm", style, cc); + +#endif +} + +void ResetTextColor(FILE* stream) +{ +#if defined(_WIN32) + + SetTextColor(stream); + +#else // !_WIN32 + + // Print ANSI codes + fprintf(stream, "\033[0m"); + +#endif +} + +bool SwrAssert( + bool chkDebugger, + bool& enabled, + const char* pExpression, + const char* pFileName, + uint32_t lineNum, + const char* pFunction, + const char* pFmtString /* = nullptr */, + ...) 
+{ + if (!enabled) return false; + + SetTextColor(stderr, TEXT_CYAN, TEXT_NORMAL); + + fprintf(stderr, "%s(%d): ", pFileName, lineNum); + + SetTextColor(stderr, TEXT_RED, TEXT_INTENSITY); + + fprintf(stderr, "ASSERT: %s\n", pExpression); + + SetTextColor(stderr, TEXT_CYAN, TEXT_INTENSITY); + fprintf(stderr, "\t%s\n", pFunction); + + if (pFmtString) + { + SetTextColor(stderr, TEXT_YELLOW, TEXT_INTENSITY); + fprintf(stderr, "\t"); + va_list args; + va_start(args, pFmtString); + vfprintf(stderr, pFmtString, args); + va_end(args); + fprintf(stderr, "\n"); + } + ResetTextColor(stderr); + fflush(stderr); + +#if defined(_WIN32) + static const int MAX_MESSAGE_LEN = 2048; + char msgBuf[MAX_MESSAGE_LEN]; + + sprintf_s(msgBuf, "%s(%d): ASSERT: %s\n", pFileName, lineNum, pExpression); + msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; + msgBuf[MAX_MESSAGE_LEN - 1] = 0; + OutputDebugStringA(msgBuf); + + sprintf_s(msgBuf, "\t%s\n", pFunction); + msgBuf[MAX_MESSAGE_LEN - 2] = '\n'; + msgBuf[MAX_MESSAGE_LEN - 1] = 0; + OutputDebugStringA(msgBuf); + + int offset = 0; + + if (pFmtString) + { + va_list args; + va_start(args, pFmtString); + offset = _vsnprintf_s( + msgBuf, + sizeof(msgBuf), + sizeof(msgBuf), + pFmtString, + args); + va_end(args); + + if (offset < 0) { return true; } + + OutputDebugStringA("\t"); + OutputDebugStringA(msgBuf); + OutputDebugStringA("\n"); + } + + if (KNOB_ENABLE_ASSERT_DIALOGS) + { + int retval = sprintf_s( + &msgBuf[offset], + MAX_MESSAGE_LEN - offset, + "\n\n" + "File: %s\n" + "Line: %d\n" + "\n" + "Expression: %s\n\n" + "Cancel: Disable this assert for the remainder of the process\n" + "Try Again: Break into the debugger\n" + "Continue: Continue execution (but leave assert enabled)", + pFileName, + lineNum, + pExpression); + + if (retval < 0) { return true; } + + offset += retval; + + if (!IsDebuggerPresent()) + { + sprintf_s( + &msgBuf[offset], + MAX_MESSAGE_LEN - offset, + "\n\n*** NO DEBUGGER DETECTED ***\n\nPressing \"Try Again\" will cause a program crash!"); + } + + retval = MessageBoxA(nullptr, msgBuf, "Assert Failed", MB_CANCELTRYCONTINUE | MB_ICONEXCLAMATION); + + switch (retval) + { + case IDCANCEL: + enabled = false; + return false; + + case IDTRYAGAIN: + return true; + + case IDCONTINUE: + return false; + } + } + else + { + return IsDebuggerPresent() || !chkDebugger; + } +#endif // _WIN32 + + return true; +} + +#endif // SWR_ENABLE_ASSERTS diff --git a/src/gallium/drivers/swr/rasterizer/common/swr_assert.h b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h new file mode 100644 index 00000000000..fecadb3d499 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/common/swr_assert.h @@ -0,0 +1,109 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#ifndef __SWR_ASSERT_H__ +#define __SWR_ASSERT_H__ + +#if !defined(__SWR_OS_H__) +#error swr_assert.h should not be included directly, please include "common/os.h" instead. +#endif + +#if !defined(SWR_ENABLE_ASSERTS) + +#if !defined(NDEBUG) +#define SWR_ENABLE_ASSERTS 1 +#else +#define SWR_ENABLE_ASSERTS 0 +#endif // _DEBUG + +#endif // SWR_ENABLE_ASSERTS + +#if !defined(SWR_ENABLE_REL_ASSERTS) +#define SWR_ENABLE_REL_ASSERTS 1 +#endif + +#if SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS +#include "assert.h" + +#if !defined(__cplusplus) + +#pragma message("C++ is required for SWR Asserts, falling back to assert.h") + +#if SWR_ENABLE_ASSERTS +#define SWR_ASSERT(e, ...) assert(e) +#endif + +#if SWR_ENABLE_REL_ASSERTS +#define SWR_REL_ASSERT(e, ...) assert(e) +#endif + +#else + +#if SWR_ENABLE_ASSERTS +#if defined(assert) +#undef assert +#endif +#define assert(exp) SWR_ASSERT(exp) +#endif + +bool SwrAssert( + bool chkDebugger, + bool& enabled, + const char* pExpression, + const char* pFileName, + uint32_t lineNum, + const char* function, + const char* pFmtString = nullptr, + ...); + +#define _SWR_ASSERT(chkDebugger, e, ...) {\ + bool expFailed = !(e);\ + if (expFailed) {\ + static bool swrAssertEnabled = true;\ + expFailed = SwrAssert(chkDebugger, swrAssertEnabled, #e, __FILE__, __LINE__, __FUNCTION__, ##__VA_ARGS__);\ + if (expFailed) { DEBUGBREAK; }\ + }\ +} + +#if SWR_ENABLE_ASSERTS +#define SWR_ASSERT(e, ...) _SWR_ASSERT(true, e, ##__VA_ARGS__) +#endif + +#if SWR_ENABLE_REL_ASSERTS +#define SWR_REL_ASSERT(e, ...) _SWR_ASSERT(false, e, ##__VA_ARGS__) +#endif +#endif // C++ + +#endif // SWR_ENABLE_ASSERTS || SWR_ENABLE_REL_ASSERTS + +#if !SWR_ENABLE_ASSERTS +#define SWR_ASSERT(e, ...) +#endif + +#if !SWR_ENABLE_REL_ASSERTS +#define SWR_REL_ASSERT(e, ...) +#endif + +#define SWR_NOT_IMPL SWR_ASSERT(0, "%s not implemented", __FUNCTION__) + +#endif//__SWR_ASSERT_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp new file mode 100644 index 00000000000..fccccab503c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -0,0 +1,1511 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file api.cpp +* +* @brief API implementation +* +******************************************************************************/ + +#include <cfloat> +#include <cmath> +#include <cstdio> + +#include "core/api.h" +#include "core/backend.h" +#include "core/context.h" +#include "core/frontend.h" +#include "core/rasterizer.h" +#include "core/rdtsc_core.h" +#include "core/threads.h" +#include "core/tilemgr.h" +#include "core/clip.h" + +#include "common/simdintrin.h" +#include "common/os.h" + +void SetupDefaultState(SWR_CONTEXT *pContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief Create SWR Context. +/// @param pCreateInfo - pointer to creation info. +HANDLE SwrCreateContext( + const SWR_CREATECONTEXT_INFO* pCreateInfo) +{ + RDTSC_RESET(); + RDTSC_INIT(0); + + void* pContextMem = _aligned_malloc(sizeof(SWR_CONTEXT), KNOB_SIMD_WIDTH * 4); + memset(pContextMem, 0, sizeof(SWR_CONTEXT)); + SWR_CONTEXT *pContext = new (pContextMem) SWR_CONTEXT(); + + pContext->driverType = pCreateInfo->driver; + pContext->privateStateSize = pCreateInfo->privateStateSize; + + pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); + memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); + + pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); + memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); + + pContext->numSubContexts = pCreateInfo->maxSubContexts; + if (pContext->numSubContexts > 1) + { + pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64); + memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts); + } + + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) + { + pContext->dcRing[dc].pArena = new Arena(); + pContext->dcRing[dc].inUse = false; + pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); + pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. + + pContext->dsRing[dc].pArena = new Arena(); + } + + if (!KNOB_SINGLE_THREADED) + { + memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock)); + memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty)); + new (&pContext->WaitLock) std::mutex(); + new (&pContext->FifosNotEmpty) std::condition_variable(); + + CreateThreadPool(pContext, &pContext->threadPool); + } + + // Calling createThreadPool() above can set SINGLE_THREADED + if (KNOB_SINGLE_THREADED) + { + pContext->NumWorkerThreads = 1; + } + + // Allocate scratch space for workers. + ///@note We could lazily allocate this but its rather small amount of memory. + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { + ///@todo Use numa API for allocations using numa information from thread data (if exists). 
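        // Editor's aside (not part of the patch): simdintrin.h in this tree
        // only accepts KNOB_SIMD_WIDTH == 8, so the alignment requested below
        // is 8 * 4 = 32 bytes -- one full simdscalar -- and each worker gets
        // a private 32KB scratch block.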
+ pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); + } + + pContext->nextDrawId = 1; + pContext->DrawEnqueued = 1; + + // State setup AFTER context is fully initialized + SetupDefaultState(pContext); + + // initialize hot tile manager + pContext->pHotTileMgr = new HotTileMgr(); + + // initialize function pointer tables + InitClearTilesTable(); + + // initialize store tiles function + pContext->pfnLoadTile = pCreateInfo->pfnLoadTile; + pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; + pContext->pfnClearTile = pCreateInfo->pfnClearTile; + + return (HANDLE)pContext; +} + +void SwrDestroyContext(HANDLE hContext) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DestroyThreadPool(pContext, &pContext->threadPool); + + // free the fifos + for (uint32_t i = 0; i < KNOB_MAX_DRAWS_IN_FLIGHT; ++i) + { + delete pContext->dcRing[i].pArena; + delete pContext->dsRing[i].pArena; + delete(pContext->dcRing[i].pTileMgr); + delete(pContext->dcRing[i].pDispatch); + } + + // Free scratch space. + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { + _aligned_free(pContext->pScratch[i]); + } + + _aligned_free(pContext->dcRing); + _aligned_free(pContext->dsRing); + _aligned_free(pContext->subCtxSave); + + delete(pContext->pHotTileMgr); + + pContext->~SWR_CONTEXT(); + _aligned_free((SWR_CONTEXT*)hContext); +} + +void CopyState(DRAW_STATE& dst, const DRAW_STATE& src) +{ + memcpy(&dst.state, &src.state, sizeof(API_STATE)); +} + +void WakeAllThreads(SWR_CONTEXT *pContext) +{ + pContext->FifosNotEmpty.notify_all(); +} + +bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) +{ + // For single thread nothing should still be drawing. + if (KNOB_SINGLE_THREADED) { return false; } + + if (pDC->isCompute) + { + if (pDC->doneCompute) + { + pDC->inUse = false; + return false; + } + } + + // Check if backend work is done. First make sure all triangles have been binned. + if (pDC->doneFE == true) + { + // ensure workers have all moved passed this draw + if (pDC->threadsDoneFE != pContext->NumWorkerThreads) + { + return true; + } + + if (pDC->threadsDoneBE != pContext->NumWorkerThreads) + { + return true; + } + + pDC->inUse = false; // all work is done. + } + + return pDC->inUse; +} + +void QueueDraw(SWR_CONTEXT *pContext) +{ + SWR_ASSERT(pContext->pCurDrawContext->inUse == false); + pContext->pCurDrawContext->inUse = true; + + _ReadWriteBarrier(); + { + std::unique_lock<std::mutex> lock(pContext->WaitLock); + pContext->DrawEnqueued++; + } + + if (KNOB_SINGLE_THREADED) + { + // flush denormals to 0 + uint32_t mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + + std::unordered_set<uint32_t> lockedTiles; + uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + + // restore csr + _mm_setcsr(mxcsr); + } + else + { + RDTSC_START(APIDrawWakeAllThreads); + WakeAllThreads(pContext); + RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); + } + + // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. 
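    // (Editor's note, not part of the patch: the previous DC must be kept
    // here because GetDrawContext() seeds the next draw's state by calling
    // CopyState() from pPrevDrawContext.)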
+ pContext->pPrevDrawContext = pContext->pCurDrawContext; + pContext->pCurDrawContext = nullptr; +} + +///@todo Combine this with QueueDraw +void QueueDispatch(SWR_CONTEXT *pContext) +{ + SWR_ASSERT(pContext->pCurDrawContext->inUse == false); + pContext->pCurDrawContext->inUse = true; + + _ReadWriteBarrier(); + { + std::unique_lock<std::mutex> lock(pContext->WaitLock); + pContext->DrawEnqueued++; + } + + if (KNOB_SINGLE_THREADED) + { + // flush denormals to 0 + uint32_t mxcsr = _mm_getcsr(); + _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); + + uint64_t curDispatch = pContext->pCurDrawContext->drawId; + WorkOnCompute(pContext, 0, curDispatch); + + // restore csr + _mm_setcsr(mxcsr); + } + else + { + RDTSC_START(APIDrawWakeAllThreads); + WakeAllThreads(pContext); + RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); + } + + // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. + pContext->pPrevDrawContext = pContext->pCurDrawContext; + pContext->pCurDrawContext = nullptr; +} + +DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) +{ + RDTSC_START(APIGetDrawContext); + // If current draw context is null then need to obtain a new draw context to use from ring. + if (pContext->pCurDrawContext == nullptr) + { + uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; + + DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; + pContext->pCurDrawContext = pCurDrawContext; + + // Need to wait until this draw context is available to use. + while (StillDrawing(pContext, pCurDrawContext)) + { + _mm_pause(); + } + + // Assign next available entry in DS ring to this DC. + uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; + pCurDrawContext->pState = &pContext->dsRing[dsIndex]; + + Arena& stateArena = *(pCurDrawContext->pState->pArena); + + // Copy previous state to current state. + if (pContext->pPrevDrawContext) + { + DRAW_CONTEXT* pPrevDrawContext = pContext->pPrevDrawContext; + + // If we're splitting our draw then we can just use the same state from the previous + // draw. In this case, we won't increment the DS ring index so the next non-split + // draw can receive the state. + if (isSplitDraw == false) + { + CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); + + stateArena.Reset(true); // Reset memory. + pCurDrawContext->pState->pPrivateState = nullptr; + + pContext->curStateId++; // Progress state ring index forward. + } + else + { + // If its a split draw then just copy the state pointer over + // since its the same draw. + pCurDrawContext->pState = pPrevDrawContext->pState; + } + } + else + { + stateArena.Reset(); // Reset memory. + pContext->curStateId++; // Progress state ring index forward. + } + + pCurDrawContext->dependency = 0; + pCurDrawContext->pArena->Reset(); + pCurDrawContext->pContext = pContext; + pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 
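        // (Editor's note, not part of the patch: inUse stays false until
        // QueueDraw()/QueueDispatch() submits this DC -- both assert on it --
        // so a ring slot is considered free again the moment its workers
        // retire the previous draw.)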
+ pCurDrawContext->inUse = false; + + pCurDrawContext->doneCompute = false; + pCurDrawContext->doneFE = false; + pCurDrawContext->FeLock = 0; + pCurDrawContext->threadsDoneFE = 0; + pCurDrawContext->threadsDoneBE = 0; + + pCurDrawContext->pTileMgr->initialize(); + + // Assign unique drawId for this DC + pCurDrawContext->drawId = pContext->nextDrawId++; + } + else + { + SWR_ASSERT(isSplitDraw == false, "Split draw should only be used when obtaining a new DC"); + } + + RDTSC_STOP(APIGetDrawContext, 0, 0); + return pContext->pCurDrawContext; +} + +void SWR_API SwrSetActiveSubContext( + HANDLE hContext, + uint32_t subContextIndex) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + if (subContextIndex >= pContext->numSubContexts) + { + return; + } + + if (subContextIndex != pContext->curSubCtxId) + { + // Save and restore draw state + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + CopyState( + pContext->subCtxSave[pContext->curSubCtxId], + *(pDC->pState)); + + CopyState( + *(pDC->pState), + pContext->subCtxSave[subContextIndex]); + + pContext->curSubCtxId = subContextIndex; + } +} + +API_STATE* GetDrawState(SWR_CONTEXT *pContext) +{ + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_ASSERT(pDC->pState != nullptr); + + return &pDC->pState->state; +} + +void SetupDefaultState(SWR_CONTEXT *pContext) +{ + API_STATE* pState = GetDrawState(pContext); + + pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->rastState.frontWinding = SWR_FRONTWINDING_CCW; +} + +static INLINE SWR_CONTEXT* GetContext(HANDLE hContext) +{ + return (SWR_CONTEXT*)hContext; +} + +void SwrSync(HANDLE hContext, PFN_CALLBACK_FUNC pfnFunc, uint64_t userData, uint64_t userData2, uint64_t userData3) +{ + RDTSC_START(APISync); + + SWR_ASSERT(pfnFunc != nullptr); + + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->FeWork.type = SYNC; + pDC->FeWork.pfnWork = ProcessSync; + pDC->FeWork.desc.sync.pfnCallbackFunc = pfnFunc; + pDC->FeWork.desc.sync.userData = userData; + pDC->FeWork.desc.sync.userData2 = userData2; + pDC->FeWork.desc.sync.userData3 = userData3; + + // cannot execute until all previous draws have completed + pDC->dependency = pDC->drawId - 1; + + //enqueue + QueueDraw(pContext); + + RDTSC_STOP(APISync, 1, 0); +} + +void SwrWaitForIdle(HANDLE hContext) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + + RDTSC_START(APIWaitForIdle); + // Wait for all work to complete. 
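    // (Editor's note, not part of the patch: StillDrawing() reports a draw DC
    // as idle only after threadsDoneFE and threadsDoneBE both reach
    // NumWorkerThreads, so this loop is a full front-end and back-end drain.)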
+ for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) + { + DRAW_CONTEXT *pDC = &pContext->dcRing[dc]; + + while (StillDrawing(pContext, pDC)) + { + _mm_pause(); + } + } + RDTSC_STOP(APIWaitForIdle, 1, 0); +} + +void SwrSetVertexBuffers( + HANDLE hContext, + uint32_t numBuffers, + const SWR_VERTEX_BUFFER_STATE* pVertexBuffers) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + for (uint32_t i = 0; i < numBuffers; ++i) + { + const SWR_VERTEX_BUFFER_STATE *pVB = &pVertexBuffers[i]; + pState->vertexBuffers[pVB->index] = *pVB; + } +} + +void SwrSetIndexBuffer( + HANDLE hContext, + const SWR_INDEX_BUFFER_STATE* pIndexBuffer) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->indexBuffer = *pIndexBuffer; +} + +void SwrSetFetchFunc( + HANDLE hContext, + PFN_FETCH_FUNC pfnFetchFunc) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->pfnFetchFunc = pfnFetchFunc; +} + +void SwrSetSoFunc( + HANDLE hContext, + PFN_SO_FUNC pfnSoFunc, + uint32_t streamIndex) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + SWR_ASSERT(streamIndex < MAX_SO_STREAMS); + + pState->pfnSoFunc[streamIndex] = pfnSoFunc; +} + +void SwrSetSoState( + HANDLE hContext, + SWR_STREAMOUT_STATE* pSoState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->soState = *pSoState; +} + +void SwrSetSoBuffers( + HANDLE hContext, + SWR_STREAMOUT_BUFFER* pSoBuffer, + uint32_t slot) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + SWR_ASSERT((slot < 4), "There are only 4 SO buffer slots [0, 3]\nSlot requested: %d", slot); + + pState->soBuffer[slot] = *pSoBuffer; +} + +void SwrSetVertexFunc( + HANDLE hContext, + PFN_VERTEX_FUNC pfnVertexFunc) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->pfnVertexFunc = pfnVertexFunc; +} + +void SwrSetFrontendState( + HANDLE hContext, + SWR_FRONTEND_STATE *pFEState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->frontendState = *pFEState; +} + +void SwrSetGsState( + HANDLE hContext, + SWR_GS_STATE *pGSState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->gsState = *pGSState; +} + +void SwrSetGsFunc( + HANDLE hContext, + PFN_GS_FUNC pfnGsFunc) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->pfnGsFunc = pfnGsFunc; +} + +void SwrSetCsFunc( + HANDLE hContext, + PFN_CS_FUNC pfnCsFunc, + uint32_t totalThreadsInGroup) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + pState->pfnCsFunc = pfnCsFunc; + pState->totalThreadsInGroup = totalThreadsInGroup; +} + +void SwrSetTsState( + HANDLE hContext, + SWR_TS_STATE *pState) +{ + API_STATE* pApiState = GetDrawState(GetContext(hContext)); + pApiState->tsState = *pState; +} + +void SwrSetHsFunc( + HANDLE hContext, + PFN_HS_FUNC pfnFunc) +{ + API_STATE* pApiState = GetDrawState(GetContext(hContext)); + pApiState->pfnHsFunc = pfnFunc; +} + +void SwrSetDsFunc( + HANDLE hContext, + PFN_DS_FUNC pfnFunc) +{ + API_STATE* pApiState = GetDrawState(GetContext(hContext)); + pApiState->pfnDsFunc = pfnFunc; +} + +void SwrSetDepthStencilState( + HANDLE hContext, + SWR_DEPTH_STENCIL_STATE *pDSState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->depthStencilState = *pDSState; +} + +void SwrSetBackendState( + HANDLE hContext, + SWR_BACKEND_STATE *pBEState) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + pState->backendState = *pBEState; +} + +void SwrSetPixelShaderState( + HANDLE hContext, + SWR_PS_STATE *pPSState) +{ + API_STATE 
*pState = GetDrawState(GetContext(hContext)); + pState->psState = *pPSState; +} + +void SwrSetBlendState( + HANDLE hContext, + SWR_BLEND_STATE *pBlendState) +{ + API_STATE *pState = GetDrawState(GetContext(hContext)); + memcpy(&pState->blendState, pBlendState, sizeof(SWR_BLEND_STATE)); +} + +void SwrSetBlendFunc( + HANDLE hContext, + uint32_t renderTarget, + PFN_BLEND_JIT_FUNC pfnBlendFunc) +{ + SWR_ASSERT(renderTarget < SWR_NUM_RENDERTARGETS); + API_STATE *pState = GetDrawState(GetContext(hContext)); + pState->pfnBlendFunc[renderTarget] = pfnBlendFunc; +} + +void SwrSetLinkage( + HANDLE hContext, + uint32_t mask, + const uint8_t* pMap) +{ + API_STATE* pState = GetDrawState(GetContext(hContext)); + + static const uint8_t IDENTITY_MAP[] = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, + }; + static_assert(sizeof(IDENTITY_MAP) == sizeof(pState->linkageMap), + "Update for new value of MAX_ATTRIBUTES"); + + pState->linkageMask = mask; + pState->linkageCount = _mm_popcnt_u32(mask); + + if (!pMap) + { + pMap = IDENTITY_MAP; + } + memcpy(pState->linkageMap, pMap, pState->linkageCount); +} + +// update guardband multipliers for the viewport +void updateGuardband(API_STATE *pState) +{ + // guardband center is viewport center + pState->gbState.left = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; + pState->gbState.right = KNOB_GUARDBAND_WIDTH / pState->vp[0].width; + pState->gbState.top = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; + pState->gbState.bottom = KNOB_GUARDBAND_HEIGHT / pState->vp[0].height; +} + +void SwrSetRastState( + HANDLE hContext, + const SWR_RASTSTATE *pRastState) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + API_STATE* pState = GetDrawState(pContext); + + memcpy(&pState->rastState, pRastState, sizeof(SWR_RASTSTATE)); +} + +void SwrSetViewports( + HANDLE hContext, + uint32_t numViewports, + const SWR_VIEWPORT* pViewports, + const SWR_VIEWPORT_MATRIX* pMatrices) +{ + SWR_ASSERT(numViewports <= KNOB_NUM_VIEWPORTS_SCISSORS, + "Invalid number of viewports."); + + SWR_CONTEXT *pContext = GetContext(hContext); + API_STATE* pState = GetDrawState(pContext); + + memcpy(&pState->vp[0], pViewports, sizeof(SWR_VIEWPORT) * numViewports); + + if (pMatrices != nullptr) + { + memcpy(&pState->vpMatrix[0], pMatrices, sizeof(SWR_VIEWPORT_MATRIX) * numViewports); + } + else + { + // Compute default viewport transform. + for (uint32_t i = 0; i < numViewports; ++i) + { + if (pContext->driverType == DX) + { + pState->vpMatrix[i].m00 = pState->vp[i].width / 2.0f; + pState->vpMatrix[i].m11 = -pState->vp[i].height / 2.0f; + pState->vpMatrix[i].m22 = pState->vp[i].maxZ - pState->vp[i].minZ; + pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; + pState->vpMatrix[i].m31 = pState->vp[i].y - pState->vpMatrix[i].m11; + pState->vpMatrix[i].m32 = pState->vp[i].minZ; + } + else + { + // Standard, with the exception that Y is inverted. + pState->vpMatrix[i].m00 = (pState->vp[i].width - pState->vp[i].x) / 2.0f; + pState->vpMatrix[i].m11 = (pState->vp[i].y - pState->vp[i].height) / 2.0f; + pState->vpMatrix[i].m22 = (pState->vp[i].maxZ - pState->vp[i].minZ) / 2.0f; + pState->vpMatrix[i].m30 = pState->vp[i].x + pState->vpMatrix[i].m00; + pState->vpMatrix[i].m31 = pState->vp[i].height + pState->vpMatrix[i].m11; + pState->vpMatrix[i].m32 = pState->vp[i].minZ + pState->vpMatrix[i].m22; + + // Now that the matrix is calculated, clip the view coords to screen size. + // OpenGL allows for -ve x,y in the viewport. 
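+                // (Editorial worked example; not part of the original patch.)
+                // For a GL-style viewport of x = 0, y = 0, width = 1024,
+                // height = 768, minZ = 0, maxZ = 1, the expressions above give
+                // m00 = 512, m11 = -384, m22 = 0.5, m30 = 512, m31 = 384,
+                // m32 = 0.5: NDC (-1,-1) maps to window (0, 768) and NDC (1,1)
+                // maps to (1024, 0), i.e. Y is inverted as noted above.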
+                pState->vp[i].x = std::max(pState->vp[i].x, 0.0f);
+                pState->vp[i].y = std::max(pState->vp[i].y, 0.0f);
+            }
+        }
+    }
+
+    updateGuardband(pState);
+}
+
+void SwrSetScissorRects(
+    HANDLE hContext,
+    uint32_t numScissors,
+    const BBOX* pScissors)
+{
+    SWR_ASSERT(numScissors <= KNOB_NUM_VIEWPORTS_SCISSORS,
+        "Invalid number of scissor rects.");
+
+    API_STATE* pState = GetDrawState(GetContext(hContext));
+    memcpy(&pState->scissorRects[0], pScissors, numScissors * sizeof(BBOX));
+}
+
+void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
+{
+    API_STATE *pState = &pDC->pState->state;
+    uint32_t left, right, top, bottom;
+
+    // Set up scissor dimensions based on scissor or viewport
+    if (pState->rastState.scissorEnable)
+    {
+        // scissor rect right/bottom edges are exclusive; the core expects scissor dimensions to be inclusive, so one pixel is subtracted from the right/bottom edges below
+        left = pState->scissorRects[0].left;
+        right = pState->scissorRects[0].right;
+        top = pState->scissorRects[0].top;
+        bottom = pState->scissorRects[0].bottom;
+    }
+    else
+    {
+        left = (int32_t)pState->vp[0].x;
+        right = (int32_t)pState->vp[0].x + (int32_t)pState->vp[0].width;
+        top = (int32_t)pState->vp[0].y;
+        bottom = (int32_t)pState->vp[0].y + (int32_t)pState->vp[0].height;
+    }
+
+    right = std::min<uint32_t>(right, KNOB_MAX_SCISSOR_X);
+    bottom = std::min<uint32_t>(bottom, KNOB_MAX_SCISSOR_Y);
+
+    if (left > KNOB_MAX_SCISSOR_X || top > KNOB_MAX_SCISSOR_Y)
+    {
+        pState->scissorInFixedPoint.left = 0;
+        pState->scissorInFixedPoint.right = 0;
+        pState->scissorInFixedPoint.top = 0;
+        pState->scissorInFixedPoint.bottom = 0;
+    }
+    else
+    {
+        pState->scissorInFixedPoint.left = left * FIXED_POINT_SCALE;
+        pState->scissorInFixedPoint.right = right * FIXED_POINT_SCALE - 1;
+        pState->scissorInFixedPoint.top = top * FIXED_POINT_SCALE;
+        pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
+    }
+}
+
+void SetupPipeline(DRAW_CONTEXT *pDC)
+{
+    DRAW_STATE* pState = pDC->pState;
+    const SWR_RASTSTATE &rastState = pState->state.rastState;
+    BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
+    const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
+
+    // setup backend
+    if (pState->state.psState.pfnPixelShader == nullptr)
+    {
+        backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
+        // always need to generate I & J per sample for Z interpolation
+        backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[1];
+    }
+    else
+    {
+        const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
+        const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ?
1 : 0; + + // currently only support 'normal' input coverage + SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || + pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); + + SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask; + + // select backend function + switch(pState->state.psState.shadingRate) + { + case SWR_SHADING_RATE_PIXEL: + if(bMultisampleEnable) + { + // always need to generate I & J per sample for Z interpolation + barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + } + else + { + // always need to generate I & J per pixel for Z interpolation + barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); + backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + } + break; + case SWR_SHADING_RATE_SAMPLE: + SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); + // always need to generate I & J per sample for Z interpolation + barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + break; + case SWR_SHADING_RATE_COARSE: + default: + SWR_ASSERT(0 && "Invalid shading rate"); + break; + } + + // setup pointer to function that generates necessary barycentrics required by the PS + bool bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_PIXEL_MASK) > 0 ? 1 : 0; + backendFuncs.pfnCalcPixelBarycentrics = gPixelBarycentricTable[bBarycentrics]; + + bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_PER_SAMPLE_MASK) > 0 ? 1 : 0; + backendFuncs.pfnCalcSampleBarycentrics = gSampleBarycentricTable[bBarycentrics]; + + bBarycentrics = (barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0 ? 
1 : 0; + backendFuncs.pfnCalcCentroidBarycentrics = gCentroidBarycentricTable[rastState.sampleCount][bBarycentrics][rastState.samplePattern][forcedSampleCount]; + } + + PFN_PROCESS_PRIMS pfnBinner; + switch (pState->state.topology) + { + case TOP_POINT_LIST: + pState->pfnProcessPrims = ClipPoints; + pfnBinner = BinPoints; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LOOP: + case TOP_LINE_LIST_ADJ: + case TOP_LISTSTRIP_ADJ: + pState->pfnProcessPrims = ClipLines; + pfnBinner = BinLines; + break; + default: + pState->pfnProcessPrims = ClipTriangles; + pfnBinner = BinTriangles; + break; + }; + + // disable clipper if viewport transform is disabled + if (pState->state.frontendState.vpTransformDisable) + { + pState->pfnProcessPrims = pfnBinner; + } + + if ((pState->state.psState.pfnPixelShader == nullptr) && + (pState->state.depthStencilState.depthTestEnable == FALSE) && + (pState->state.depthStencilState.depthWriteEnable == FALSE) && + (pState->state.depthStencilState.stencilTestEnable == FALSE) && + (pState->state.depthStencilState.stencilWriteEnable == FALSE) && + (pState->state.linkageCount == 0)) + { + pState->pfnProcessPrims = nullptr; + pState->state.linkageMask = 0; + } + + if (pState->state.soState.rasterizerDisable == true) + { + pState->pfnProcessPrims = nullptr; + pState->state.linkageMask = 0; + } + + // set up the frontend attrib mask + pState->state.feAttribMask = pState->state.linkageMask; + if (pState->state.soState.soEnable) + { + for (uint32_t i = 0; i < 4; ++i) + { + pState->state.feAttribMask |= pState->state.soState.streamMasks[i]; + } + } + + // complicated logic to test for cases where we don't need backing hottile memory for a draw + // have to check for the special case where depth/stencil test is enabled but depthwrite is disabled. + pState->state.depthHottileEnable = ((!(pState->state.depthStencilState.depthTestEnable && + !pState->state.depthStencilState.depthWriteEnable && + pState->state.depthStencilState.depthTestFunc == ZFUNC_ALWAYS)) && + (pState->state.depthStencilState.depthTestEnable || + pState->state.depthStencilState.depthWriteEnable)) ? true : false; + + pState->state.stencilHottileEnable = (((!(pState->state.depthStencilState.stencilTestEnable && + !pState->state.depthStencilState.stencilWriteEnable && + pState->state.depthStencilState.stencilTestFunc == ZFUNC_ALWAYS)) || + // for stencil we have to check the double sided state as well + (!(pState->state.depthStencilState.doubleSidedStencilTestEnable && + !pState->state.depthStencilState.stencilWriteEnable && + pState->state.depthStencilState.backfaceStencilTestFunc == ZFUNC_ALWAYS))) && + (pState->state.depthStencilState.stencilTestEnable || + pState->state.depthStencilState.stencilWriteEnable)) ? true : false; + + uint32_t numRTs = pState->state.psState.numRenderTargets; + pState->state.colorHottileEnable = 0; + if(pState->state.psState.pfnPixelShader != nullptr) + { + for (uint32_t rt = 0; rt < numRTs; ++rt) + { + pState->state.colorHottileEnable |= + (!pState->state.blendState.renderTarget[rt].writeDisableAlpha || + !pState->state.blendState.renderTarget[rt].writeDisableRed || + !pState->state.blendState.renderTarget[rt].writeDisableGreen || + !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief InitDraw +/// @param pDC - Draw context to initialize for this draw. 
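+/// @param isSplitDraw - Whether this draw continues a split draw; scissor and
+///        pipeline setup is skipped in that case, since it was already done
+///        for the first piece of the draw.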
+void InitDraw( + DRAW_CONTEXT *pDC, + bool isSplitDraw) +{ + // We don't need to re-setup the scissors/pipeline state again for split draw. + if (isSplitDraw == false) + { + SetupMacroTileScissors(pDC); + SetupPipeline(pDC); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief We can split the draw for certain topologies for better performance. +/// @param totalVerts - Total vertices for draw +/// @param topology - Topology used for draw +uint32_t MaxVertsPerDraw( + DRAW_CONTEXT* pDC, + uint32_t totalVerts, + PRIMITIVE_TOPOLOGY topology) +{ + API_STATE& state = pDC->pState->state; + + uint32_t vertsPerDraw = totalVerts; + + if (state.soState.soEnable) + { + return totalVerts; + } + + switch (topology) + { + case TOP_POINT_LIST: + case TOP_TRIANGLE_LIST: + vertsPerDraw = KNOB_MAX_PRIMS_PER_DRAW; + break; + + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + if (pDC->pState->state.tsState.tsEnable) + { + uint32_t vertsPerPrim = topology - TOP_PATCHLIST_BASE; + vertsPerDraw = vertsPerPrim * KNOB_MAX_TESS_PRIMS_PER_DRAW; + } + break; + + default: + // We are not splitting up draws for other topologies. + break; + } + + return vertsPerDraw; +} + +// Recursive template used to auto-nest conditionals. Converts dynamic boolean function +// arguments to static template arguments. +template <bool... ArgsB> +struct FEDrawChooser +{ + // Last Arg Terminator + static PFN_FE_WORK_FUNC GetFunc(bool bArg) + { + if (bArg) + { + return ProcessDraw<ArgsB..., true>; + } + + return ProcessDraw<ArgsB..., false>; + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_FE_WORK_FUNC GetFunc(bool bArg, TArgsT... remainingArgs) + { + if (bArg) + { + return FEDrawChooser<ArgsB..., true>::GetFunc(remainingArgs...); + } + + return FEDrawChooser<ArgsB..., false>::GetFunc(remainingArgs...); + } +}; + +// Selector for correct templated Draw front-end function +INLINE +static PFN_FE_WORK_FUNC GetFEDrawFunc(bool IsIndexed, bool HasTessellation, bool HasGeometryShader, bool HasStreamOut, bool RasterizerEnabled) +{ + return FEDrawChooser<>::GetFunc(IsIndexed, HasTessellation, HasGeometryShader, HasStreamOut, RasterizerEnabled); +} + + +////////////////////////////////////////////////////////////////////////// +/// @brief DrawInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numVerts - How many vertices to read sequentially from vertex data (per instance). +/// @param startVertex - Specifies start vertex for draw. (vertex data) +/// @param numInstances - How many instances to render. 
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void DrawInstanced( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertices, + uint32_t startVertex, + uint32_t numInstances = 1, + uint32_t startInstance = 0) +{ + if (KNOB_TOSS_DRAW) + { + return; + } + + RDTSC_START(APIDraw); + + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); + uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); + int32_t remainingVerts = numVertices; + + API_STATE *pState = &pDC->pState->state; + pState->topology = topology; + pState->forceFront = false; + + // disable culling for points/lines + uint32_t oldCullMode = pState->rastState.cullMode; + if (topology == TOP_POINT_LIST) + { + pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->forceFront = true; + } + + int draw = 0; + while (remainingVerts) + { + uint32_t numVertsForDraw = (remainingVerts < maxVertsPerDraw) ? + remainingVerts : maxVertsPerDraw; + + bool isSplitDraw = (draw > 0) ? true : false; + DRAW_CONTEXT* pDC = GetDrawContext(pContext, isSplitDraw); + InitDraw(pDC, isSplitDraw); + + pDC->FeWork.type = DRAW; + pDC->FeWork.pfnWork = GetFEDrawFunc( + false, // IsIndexed + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pDC->pState->pfnProcessPrims != nullptr); + pDC->FeWork.desc.draw.numVerts = numVertsForDraw; + pDC->FeWork.desc.draw.startVertex = startVertex; + pDC->FeWork.desc.draw.numInstances = numInstances; + pDC->FeWork.desc.draw.startInstance = startInstance; + pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; + + //enqueue DC + QueueDraw(pContext); + + remainingVerts -= numVertsForDraw; + draw++; + } + + // restore culling state + pDC = GetDrawContext(pContext); + pDC->pState->state.rastState.cullMode = oldCullMode; + + RDTSC_STOP(APIDraw, numVertices * numInstances, 0); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDraw +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param startVertex - Specifies start vertex in vertex buffer for draw. +/// @param primCount - Number of vertices. +void SwrDraw( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t startVertex, + uint32_t numVertices) +{ + DrawInstanced(hContext, topology, numVertices, startVertex); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDrawInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data. +/// @param numInstances - How many instances to render. +/// @param startVertex - Specifies start vertex for draw. 
(vertex data) +/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void SwrDrawInstanced( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numVertsPerInstance, + uint32_t numInstances, + uint32_t startVertex, + uint32_t startInstance + ) +{ + DrawInstanced(hContext, topology, numVertsPerInstance, startVertex, numInstances, startInstance); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief DrawIndexedInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numIndices - Number of indices to read sequentially from index buffer. +/// @param indexOffset - Starting index into index buffer. +/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. +/// @param numInstances - Number of instances to render. +/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void DrawIndexedInstance( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t numInstances = 1, + uint32_t startInstance = 0) +{ + if (KNOB_TOSS_DRAW) + { + return; + } + + RDTSC_START(APIDrawIndexed); + + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + API_STATE* pState = &pDC->pState->state; + + int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); + uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); + int32_t remainingIndices = numIndices; + + uint32_t indexSize = 0; + switch (pState->indexBuffer.format) + { + case R32_UINT: indexSize = sizeof(uint32_t); break; + case R16_UINT: indexSize = sizeof(uint16_t); break; + case R8_UINT: indexSize = sizeof(uint8_t); break; + default: + SWR_ASSERT(0); + } + + int draw = 0; + uint8_t *pIB = (uint8_t*)pState->indexBuffer.pIndices; + pIB += (uint64_t)indexOffset * (uint64_t)indexSize; + + pState->topology = topology; + pState->forceFront = false; + + // disable culling for points/lines + uint32_t oldCullMode = pState->rastState.cullMode; + if (topology == TOP_POINT_LIST) + { + pState->rastState.cullMode = SWR_CULLMODE_NONE; + pState->forceFront = true; + } + + while (remainingIndices) + { + uint32_t numIndicesForDraw = (remainingIndices < maxIndicesPerDraw) ? + remainingIndices : maxIndicesPerDraw; + + // When breaking up draw, we need to obtain new draw context for each iteration. + bool isSplitDraw = (draw > 0) ? 
true : false; + pDC = GetDrawContext(pContext, isSplitDraw); + InitDraw(pDC, isSplitDraw); + + pDC->FeWork.type = DRAW; + pDC->FeWork.pfnWork = GetFEDrawFunc( + true, // IsIndexed + pState->tsState.tsEnable, + pState->gsState.gsEnable, + pState->soState.soEnable, + pDC->pState->pfnProcessPrims != nullptr); + pDC->FeWork.desc.draw.pDC = pDC; + pDC->FeWork.desc.draw.numIndices = numIndicesForDraw; + pDC->FeWork.desc.draw.pIB = (int*)pIB; + pDC->FeWork.desc.draw.type = pDC->pState->state.indexBuffer.format; + + pDC->FeWork.desc.draw.numInstances = numInstances; + pDC->FeWork.desc.draw.startInstance = startInstance; + pDC->FeWork.desc.draw.baseVertex = baseVertex; + pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + + //enqueue DC + QueueDraw(pContext); + + pIB += maxIndicesPerDraw * indexSize; + remainingIndices -= numIndicesForDraw; + draw++; + } + + // restore culling state + pDC = GetDrawContext(pContext); + pDC->pState->state.rastState.cullMode = oldCullMode; + + RDTSC_STOP(APIDrawIndexed, numIndices * numInstances, 0); +} + + +////////////////////////////////////////////////////////////////////////// +/// @brief DrawIndexed +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numIndices - Number of indices to read sequentially from index buffer. +/// @param indexOffset - Starting index into index buffer. +/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. +void SwrDrawIndexed( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t indexOffset, + int32_t baseVertex + ) +{ + DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDrawIndexedInstanced +/// @param hContext - Handle passed back from SwrCreateContext +/// @param topology - Specifies topology for draw. +/// @param numIndices - Number of indices to read sequentially from index buffer. +/// @param numInstances - Number of instances to render. +/// @param indexOffset - Starting index into index buffer. +/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed. 
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data) +void SwrDrawIndexedInstanced( + HANDLE hContext, + PRIMITIVE_TOPOLOGY topology, + uint32_t numIndices, + uint32_t numInstances, + uint32_t indexOffset, + int32_t baseVertex, + uint32_t startInstance) +{ + DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); +} + +// Attach surfaces to pipeline +void SwrInvalidateTiles( + HANDLE hContext, + uint32_t attachmentMask) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + // Queue a load to the hottile + pDC->FeWork.type = INVALIDATETILES; + pDC->FeWork.pfnWork = ProcessInvalidateTiles; + pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; + + //enqueue + QueueDraw(pContext); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDispatch +/// @param hContext - Handle passed back from SwrCreateContext +/// @param threadGroupCountX - Number of thread groups dispatched in X direction +/// @param threadGroupCountY - Number of thread groups dispatched in Y direction +/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction +void SwrDispatch( + HANDLE hContext, + uint32_t threadGroupCountX, + uint32_t threadGroupCountY, + uint32_t threadGroupCountZ) +{ + if (KNOB_TOSS_DRAW) + { + return; + } + + RDTSC_START(APIDispatch); + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->isCompute = true; // This is a compute context. + + // Ensure spill fill pointers are initialized to nullptr. + memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill)); + + COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); + + pTaskData->threadGroupCountX = threadGroupCountX; + pTaskData->threadGroupCountY = threadGroupCountY; + pTaskData->threadGroupCountZ = threadGroupCountZ; + + uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; + pDC->pDispatch->initialize(totalThreadGroups, pTaskData); + + QueueDispatch(pContext); + RDTSC_STOP(APIDispatch, threadGroupCountX * threadGroupCountY * threadGroupCountZ, 0); +} + +// Deswizzles, converts and stores current contents of the hot tiles to surface +// described by pState +void SwrStoreTiles( + HANDLE hContext, + SWR_RENDERTARGET_ATTACHMENT attachment, + SWR_TILE_STATE postStoreTileState) +{ + RDTSC_START(APIStoreTiles); + + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + SetupMacroTileScissors(pDC); + + pDC->FeWork.type = STORETILES; + pDC->FeWork.pfnWork = ProcessStoreTiles; + pDC->FeWork.desc.storeTiles.attachment = attachment; + pDC->FeWork.desc.storeTiles.postStoreTileState = postStoreTileState; + + //enqueue + QueueDraw(pContext); + + RDTSC_STOP(APIStoreTiles, 0, 0); +} + +void SwrClearRenderTarget( + HANDLE hContext, + uint32_t clearMask, + const float clearColor[4], + float z, + BYTE stencil) +{ + RDTSC_START(APIClearRenderTarget); + + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + SetupMacroTileScissors(pDC); + + CLEAR_FLAGS flags; + flags.mask = clearMask; + + pDC->FeWork.type = CLEAR; + pDC->FeWork.pfnWork = ProcessClear; + pDC->FeWork.desc.clear.flags = flags; + pDC->FeWork.desc.clear.clearDepth = z; + pDC->FeWork.desc.clear.clearRTColor[0] = clearColor[0]; + pDC->FeWork.desc.clear.clearRTColor[1] = 
clearColor[1];
+    pDC->FeWork.desc.clear.clearRTColor[2] = clearColor[2];
+    pDC->FeWork.desc.clear.clearRTColor[3] = clearColor[3];
+    pDC->FeWork.desc.clear.clearStencil = stencil;
+
+    // enqueue draw
+    QueueDraw(pContext);
+
+    RDTSC_STOP(APIClearRenderTarget, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns a pointer to the private context state for the current
+/// draw operation. This is used for external components such as the
+/// sampler.
+/// SWR is responsible for the allocation of the private context state.
+/// @param hContext - Handle passed back from SwrCreateContext
VOID* SwrGetPrivateContextState(
+    HANDLE hContext)
+{
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+    DRAW_STATE* pState = pDC->pState;
+
+    if (pState->pPrivateState == nullptr)
+    {
+        pState->pPrivateState = pState->pArena->AllocAligned(pContext->privateStateSize, KNOB_SIMD_WIDTH*sizeof(float));
+    }
+
+    return pState->pPrivateState;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clients can use this to allocate memory for draw/dispatch
+/// operations. The memory will automatically be freed once the operation
+/// has completed. Clients can use this to allocate binding tables,
+/// etc. needed for shader execution.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param size - Size of allocation
+/// @param align - Alignment needed for allocation.
+VOID* SwrAllocDrawContextMemory(
+    HANDLE hContext,
+    uint32_t size,
+    uint32_t align)
+{
+    SWR_CONTEXT* pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    return pDC->pState->pArena->AllocAligned(size, align);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns pointer to SWR stats.
+/// @note The counters are atomically incremented by multiple threads.
+/// When calling this, you need to ensure all previous operations
+/// have completed.
+/// @todo If necessary, add a callback to avoid stalling the pipe to
+/// sample the counters.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStats - SWR will fill this out for caller.
+void SwrGetStats(
+    HANDLE hContext,
+    SWR_STATS* pStats)
+{
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
+    pDC->FeWork.type = QUERYSTATS;
+    pDC->FeWork.pfnWork = ProcessQueryStats;
+    pDC->FeWork.desc.queryStats.pStats = pStats;
+
+    // cannot execute until all previous draws have completed
+    pDC->dependency = pDC->drawId - 1;
+
+    //enqueue
+    QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
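+/// @note (Illustrative usage added by the editor; not part of the original patch.)
+/// @code
+///     SwrEnableStats(hContext, true);
+///     // ... issue draws ...
+///     SWR_STATS stats = {};
+///     SwrGetStats(hContext, &stats);  // enqueued; depends on all prior draws
+///     SwrWaitForIdle(hContext);       // ensure the query itself has executed
+/// @endcode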
+void SwrEnableStats( + HANDLE hContext, + bool enable) +{ + SWR_CONTEXT *pContext = GetContext(hContext); + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + + pDC->pState->state.enableStats = enable; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Mark end of frame - used for performance profiling +/// @param hContext - Handle passed back from SwrCreateContext +void SWR_API SwrEndFrame( + HANDLE hContext) +{ + RDTSC_ENDFRAME(); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h new file mode 100644 index 00000000000..72fae8b2c21 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -0,0 +1,500 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file api.h +* +* @brief API definitions +* +******************************************************************************/ + +#ifndef __SWR_API_H__ +#define __SWR_API_H__ + +#include "common/os.h" + +#include <assert.h> +#include <vector> + +#include "common/simdintrin.h" +#include "common/formats.h" +#include "core/utils.h" +#include "core/state.h" + +///@todo place all the API functions into the 'swr' namespace. 
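+// ---------------------------------------------------------------------------
+// (Illustrative overview added by the editor; not part of the original patch.)
+// A minimal context lifetime, using only entry points declared in this header:
+//
+//     SWR_CREATECONTEXT_INFO info = {};
+//     info.driver = DX;             // DRIVER_TYPE picks the viewport convention
+//     info.privateStateSize = 0;    // per-draw state for external components
+//     HANDLE hContext = SwrCreateContext(&info);
+//     // ... Swr* state setters, then SwrDraw()/SwrDrawIndexed() ...
+//     SwrWaitForIdle(hContext);
+//     SwrDestroyContext(hContext);
+// ---------------------------------------------------------------------------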
+ +typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t data3); + +////////////////////////////////////////////////////////////////////////// +/// @brief Function signature for load hot tiles +/// @param hPrivateContext - handle to private data +/// @param dstFormat - format of the hot tile +/// @param renderTargetIndex - render target to store, can be color, depth or stencil +/// @param x - destination x coordinate +/// @param y - destination y coordinate +/// @param pDstHotTile - pointer to the hot tile surface +typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); + +////////////////////////////////////////////////////////////////////////// +/// @brief Function signature for store hot tiles +/// @param hPrivateContext - handle to private data +/// @param srcFormat - format of the hot tile +/// @param renderTargetIndex - render target to store, can be color, depth or stencil +/// @param x - destination x coordinate +/// @param y - destination y coordinate +/// @param pSrcHotTile - pointer to the hot tile surface +typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); + +/// @brief Function signature for clearing from the hot tiles clear value +/// @param hPrivateContext - handle to private data +/// @param renderTargetIndex - render target to store, can be color, depth or stencil +/// @param x - destination x coordinate +/// @param y - destination y coordinate +/// @param pClearColor - pointer to the hot tile's clear value +typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, + SWR_RENDERTARGET_ATTACHMENT rtIndex, + uint32_t x, uint32_t y, const float* pClearColor); + +////////////////////////////////////////////////////////////////////////// +/// SWR_CREATECONTEXT_INFO +///////////////////////////////////////////////////////////////////////// +struct SWR_CREATECONTEXT_INFO +{ + DRIVER_TYPE driver; + + // External functions (e.g. sampler) need per draw context state. + // Use SwrGetPrivateContextState() to access private state. + uint32_t privateStateSize; + + // Each SWR context can have multiple sets of active state + uint32_t maxSubContexts; + + // tile manipulation functions + PFN_LOAD_TILE pfnLoadTile; + PFN_STORE_TILE pfnStoreTile; + PFN_CLEAR_TILE pfnClearTile; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_RECT +///////////////////////////////////////////////////////////////////////// +struct SWR_RECT +{ + uint32_t left; + uint32_t right; + uint32_t top; + uint32_t bottom; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Create SWR Context. +/// @param pCreateInfo - pointer to creation info. +HANDLE SWR_API SwrCreateContext( + const SWR_CREATECONTEXT_INFO* pCreateInfo); + +////////////////////////////////////////////////////////////////////////// +/// @brief Destroys SWR Context. +/// @param hContext - Handle passed back from SwrCreateContext +void SWR_API SwrDestroyContext( + HANDLE hContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set currently active state context +/// @param subContextIndex - value from 0 to +/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0. 
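+/// @note (Illustrative example added by the editor; not part of the original patch.)
+/// @code
+///     // SWR_CREATECONTEXT_INFO::maxSubContexts must have been >= 2
+///     SwrSetActiveSubContext(hContext, 1); // state setters now target set 1
+///     SwrSetActiveSubContext(hContext, 0); // back to the default state set
+/// @endcode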
+void SWR_API SwrSetActiveSubContext( + HANDLE hContext, + uint32_t subContextIndex); + +////////////////////////////////////////////////////////////////////////// +/// @brief Sync cmd. Executes the callback func when all rendering up to this sync +/// has been completed +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFunc - pointer to callback function, +/// @param userData - user data to pass back +void SWR_API SwrSync( + HANDLE hContext, + PFN_CALLBACK_FUNC pfnFunc, + uint64_t userData, + uint64_t userData2, + uint64_t userData3 = 0); + +////////////////////////////////////////////////////////////////////////// +/// @brief Blocks until all rendering has been completed. +/// @param hContext - Handle passed back from SwrCreateContext +void SWR_API SwrWaitForIdle( + HANDLE hContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set vertex buffer state. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param numBuffers - Number of vertex buffer state descriptors. +/// @param pVertexBuffers - Array of vertex buffer state descriptors. +void SWR_API SwrSetVertexBuffers( + HANDLE hContext, + uint32_t numBuffers, + const SWR_VERTEX_BUFFER_STATE* pVertexBuffers); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set index buffer +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pIndexBuffer - Index buffer. +void SWR_API SwrSetIndexBuffer( + HANDLE hContext, + const SWR_INDEX_BUFFER_STATE* pIndexBuffer); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set fetch shader pointer. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFetchFunc - Pointer to shader. +void SWR_API SwrSetFetchFunc( + HANDLE hContext, + PFN_FETCH_FUNC pfnFetchFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set streamout shader pointer. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnSoFunc - Pointer to shader. +/// @param streamIndex - specifies stream +void SWR_API SwrSetSoFunc( + HANDLE hContext, + PFN_SO_FUNC pfnSoFunc, + uint32_t streamIndex); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set streamout state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pSoState - Pointer to streamout state. +void SWR_API SwrSetSoState( + HANDLE hContext, + SWR_STREAMOUT_STATE* pSoState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set streamout buffer state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pSoBuffer - Pointer to streamout buffer. +/// @param slot - Slot to bind SO buffer to. +void SWR_API SwrSetSoBuffers( + HANDLE hContext, + SWR_STREAMOUT_BUFFER* pSoBuffer, + uint32_t slot); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set vertex shader pointer. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnVertexFunc - Pointer to shader. +void SWR_API SwrSetVertexFunc( + HANDLE hContext, + PFN_VERTEX_FUNC pfnVertexFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set frontend state. 
+/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state +void SWR_API SwrSetFrontendState( + HANDLE hContext, + SWR_FRONTEND_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set geometry shader state. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state +void SWR_API SwrSetGsState( + HANDLE hContext, + SWR_GS_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set geometry shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to geometry shader function +void SWR_API SwrSetGsFunc( + HANDLE hContext, + PFN_GS_FUNC pfnGsFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set compute shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to compute shader function +/// @param totalThreadsInGroup - product of thread group dimensions. +void SWR_API SwrSetCsFunc( + HANDLE hContext, + PFN_CS_FUNC pfnCsFunc, + uint32_t totalThreadsInGroup); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set tessellation state. +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state +void SWR_API SwrSetTsState( + HANDLE hContext, + SWR_TS_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set hull shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFunc - Pointer to shader function +void SWR_API SwrSetHsFunc( + HANDLE hContext, + PFN_HS_FUNC pfnFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set domain shader +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pfnFunc - Pointer to shader function +void SWR_API SwrSetDsFunc( + HANDLE hContext, + PFN_DS_FUNC pfnFunc); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set depth stencil state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. +void SWR_API SwrSetDepthStencilState( + HANDLE hContext, + SWR_DEPTH_STENCIL_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set backend state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. +void SWR_API SwrSetBackendState( + HANDLE hContext, + SWR_BACKEND_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set pixel shader state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. +void SWR_API SwrSetPixelShaderState( + HANDLE hContext, + SWR_PS_STATE *pState); + +////////////////////////////////////////////////////////////////////////// +/// @brief Set blend state +/// @param hContext - Handle passed back from SwrCreateContext +/// @param pState - Pointer to state. 
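+/// @note (Editorial note; not part of the original patch.) The implementation
+/// copies the state by value, so the caller's SWR_BLEND_STATE does not need
+/// to outlive this call:
+/// @code
+///     SWR_BLEND_STATE blend = {};
+///     blend.sampleCount = SWR_MULTISAMPLE_1X;
+///     SwrSetBlendState(hContext, &blend);
+/// @endcode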
+void SWR_API SwrSetBlendState(
+    HANDLE hContext,
+    SWR_BLEND_STATE *pState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set blend function
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param renderTarget - render target index
+/// @param pfnBlendFunc - function pointer
+void SWR_API SwrSetBlendFunc(
+    HANDLE hContext,
+    uint32_t renderTarget,
+    PFN_BLEND_JIT_FUNC pfnBlendFunc);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Set linkage mask
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param mask - Specifies which vertex outputs are needed by PS.
+/// @param pMap - (Optional) Linkage map to specify where FE attributes are
+/// gathered from to supply PS attribute values. The length
+/// of the map buffer needs to match the number of set bits
+/// in "mask".
+void SWR_API SwrSetLinkage(
+    HANDLE hContext,
+    uint32_t mask,
+    const uint8_t* pMap);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDraw
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param startVertex - Specifies start vertex in vertex buffer for draw.
+/// @param primCount - Number of vertices.
+void SWR_API SwrDraw(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t startVertex,
+    uint32_t primCount);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numVertsPerInstance - How many vertices to read sequentially from vertex data.
+/// @param numInstances - How many instances to render.
+/// @param startVertex - Specifies start vertex for draw. (vertex data)
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SWR_API SwrDrawInstanced(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t numVertsPerInstance,
+    uint32_t numInstances,
+    uint32_t startVertex,
+    uint32_t startInstance);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawIndexed
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+void SWR_API SwrDrawIndexed(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t numIndices,
+    uint32_t indexOffset,
+    int32_t baseVertex);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDrawIndexedInstanced
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param topology - Specifies topology for draw.
+/// @param numIndices - Number of indices to read sequentially from index buffer.
+/// @param numInstances - Number of instances to render.
+/// @param indexOffset - Starting index into index buffer.
+/// @param baseVertex - Vertex in vertex buffer to consider as index "0". Note value is signed.
+/// @param startInstance - Which instance to start sequentially fetching from in each buffer (instanced data)
+void SWR_API SwrDrawIndexedInstanced(
+    HANDLE hContext,
+    PRIMITIVE_TOPOLOGY topology,
+    uint32_t numIndices,
+    uint32_t numInstances,
+    uint32_t indexOffset,
+    int32_t baseVertex,
+    uint32_t startInstance);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which of the surfaces attached to the hottiles should be invalidated.
+void SWR_API SwrInvalidateTiles(
+    HANDLE hContext,
+    uint32_t attachmentMask);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDispatch
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param threadGroupCountX - Number of thread groups dispatched in X direction
+/// @param threadGroupCountY - Number of thread groups dispatched in Y direction
+/// @param threadGroupCountZ - Number of thread groups dispatched in Z direction
+void SWR_API SwrDispatch(
+    HANDLE hContext,
+    uint32_t threadGroupCountX,
+    uint32_t threadGroupCountY,
+    uint32_t threadGroupCountZ);
+
+
+enum SWR_TILE_STATE
+{
+    SWR_TILE_INVALID = 0,    // tile is in an uninitialized state and should be loaded with surface contents before rendering
+    SWR_TILE_DIRTY = 2,      // tile contains newer data than the surface it represents
+    SWR_TILE_RESOLVED = 3,   // tile is in sync with the surface it represents
+};
+
+/// @todo Add a good description for what attachments are and when and why you would use the different SWR_TILE_STATEs.
+void SWR_API SwrStoreTiles(
+    HANDLE hContext,
+    SWR_RENDERTARGET_ATTACHMENT attachment,
+    SWR_TILE_STATE postStoreTileState);
+
+void SWR_API SwrClearRenderTarget(
+    HANDLE hContext,
+    uint32_t clearMask,
+    const FLOAT clearColor[4],
+    float z,
+    BYTE stencil);
+
+void SWR_API SwrSetRastState(
+    HANDLE hContext,
+    const SWR_RASTSTATE *pRastState);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrSetViewports
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numViewports - number of viewports passed in
+/// @param pViewports - Specifies extents of viewport.
+/// @param pMatrices - If not specified then SWR computes a default one.
+void SWR_API SwrSetViewports(
+    HANDLE hContext,
+    uint32_t numViewports,
+    const SWR_VIEWPORT* pViewports,
+    const SWR_VIEWPORT_MATRIX* pMatrices);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrSetScissorRects
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param numScissors - number of scissors passed in
+/// @param pScissors - array of scissors
+void SWR_API SwrSetScissorRects(
+    HANDLE hContext,
+    uint32_t numScissors,
+    const BBOX* pScissors);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns a pointer to the private context state for the current
+/// draw operation. This is used for external components such as the
+/// sampler.
+///
+/// @note Client needs to resend private state prior to each draw call.
+/// Also, SWR is responsible for the private state memory.
+/// @param hContext - Handle passed back from SwrCreateContext
+VOID* SWR_API SwrGetPrivateContextState(
+    HANDLE hContext);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clients can use this to allocate memory for draw/dispatch
+/// operations. The memory will automatically be freed once the operation
+/// has completed. Clients can use this to allocate binding tables,
+/// etc. needed for shader execution.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param size - Size of allocation
+/// @param align - Alignment needed for allocation.
+VOID* SWR_API SwrAllocDrawContextMemory(
+    HANDLE hContext,
+    uint32_t size,
+    uint32_t align);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns pointer to SWR stats.
+/// @note The counters are incremented by multiple threads.
+/// When calling this, you need to ensure all previous operations
+/// have completed.
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStats - SWR will fill this out for caller.
+void SWR_API SwrGetStats(
+    HANDLE hContext,
+    SWR_STATS* pStats);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Enables stats counting
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param enable - If true then counts are incremented.
+void SWR_API SwrEnableStats(
+    HANDLE hContext,
+    bool enable);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Mark end of frame - used for performance profiling
+/// @param hContext - Handle passed back from SwrCreateContext
+void SWR_API SwrEndFrame(
+    HANDLE hContext);
+#endif//__SWR_API_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
new file mode 100644
index 00000000000..8184c8d3f4c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
@@ -0,0 +1,166 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.cpp
+*
+* @brief Arena memory manager
+* The arena is convenient and fast for managing allocations that are
+* associated with an operation and can all be freed at once when the
+* operation has completed. Allocations are cheap, since most of the time
+* it's simply an increment of an offset. There is also no need to free
+* individual allocations; all of the arena memory can be freed at once.
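+*
+* (Illustrative sketch added by the editor; not part of the original patch.)
+* The fast path of AllocAligned() below is just a bump of the current
+* block's offset:
+*
+*     pCurBlock->offset = AlignUp(pCurBlock->offset, align);
+*     if (pCurBlock->offset + size <= pCurBlock->blockSize)
+*     {
+*         void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
+*         pCurBlock->offset += size;
+*         return pMem;
+*     }
+*
+* Only when the current block is exhausted is a new, larger block allocated
+* and chained onto the block list.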
+* +******************************************************************************/ + +#include "context.h" +#include "arena.h" + +#include <cmath> + +Arena::Arena() + : m_pCurBlock(nullptr), m_size(0) +{ + m_pMutex = new std::mutex(); +} + +Arena::~Arena() +{ + Reset(); // Reset just in case to avoid leaking memory. + + if (m_pCurBlock) + { + _aligned_free(m_pCurBlock->pMem); + delete m_pCurBlock; + } + + delete m_pMutex; +} + +///@todo Remove this when all users have stopped using this. +void Arena::Init() +{ + m_size = 0; + m_pCurBlock = nullptr; + + m_pMutex = new std::mutex(); +} + +void* Arena::AllocAligned(size_t size, size_t align) +{ + if (m_pCurBlock) + { + ArenaBlock* pCurBlock = m_pCurBlock; + pCurBlock->offset = AlignUp(pCurBlock->offset, align); + + if ((pCurBlock->offset + size) <= pCurBlock->blockSize) + { + void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset); + pCurBlock->offset += size; + m_size += size; + return pMem; + } + + // Not enough memory in this block, fall through to allocate + // a new block + } + + static const size_t ArenaBlockSize = 1024*1024; + size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize)); + blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); + + void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. + SWR_ASSERT(pMem != nullptr); + + ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock(); + SWR_ASSERT(pNewBlock != nullptr); + + if (pNewBlock != nullptr) + { + pNewBlock->pNext = m_pCurBlock; + + m_pCurBlock = pNewBlock; + m_pCurBlock->pMem = pMem; + m_pCurBlock->blockSize = blockSize; + + } + + return AllocAligned(size, align); +} + +void* Arena::Alloc(size_t size) +{ + return AllocAligned(size, 1); +} + +void* Arena::AllocAlignedSync(size_t size, size_t align) +{ + void* pAlloc = nullptr; + + SWR_ASSERT(m_pMutex != nullptr); + + m_pMutex->lock(); + pAlloc = AllocAligned(size, align); + m_pMutex->unlock(); + + return pAlloc; +} + +void* Arena::AllocSync(size_t size) +{ + void* pAlloc = nullptr; + + SWR_ASSERT(m_pMutex != nullptr); + + m_pMutex->lock(); + pAlloc = Alloc(size); + m_pMutex->unlock(); + + return pAlloc; +} + +void Arena::Reset(bool removeAll) +{ + if (m_pCurBlock) + { + m_pCurBlock->offset = 0; + + ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; + m_pCurBlock->pNext = nullptr; + while(pUsedBlocks) + { + ArenaBlock* pBlock = pUsedBlocks; + pUsedBlocks = pBlock->pNext; + + _aligned_free(pBlock->pMem); + delete pBlock; + } + + if (removeAll) + { + _aligned_free(m_pCurBlock->pMem); + delete m_pCurBlock; + m_pCurBlock = nullptr; + } + } + + m_size = 0; +} diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h new file mode 100644 index 00000000000..76eee11fb08 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -0,0 +1,69 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.h
+*
+* @brief Arena memory manager
+* The arena is convenient and fast for managing allocations that are
+* associated with an operation and can all be freed at once when the
+* operation has completed. Allocations are cheap, since most of the time
+* it's simply an increment of an offset. There is also no need to free
+* individual allocations; all of the arena memory can be freed at once.
+*
+******************************************************************************/
+#pragma once
+
+#include <mutex>
+
+class Arena
+{
+public:
+    Arena();
+    ~Arena();
+
+    void Init();
+
+    void* AllocAligned(size_t size, size_t align);
+    void* Alloc(size_t size);
+
+    void* AllocAlignedSync(size_t size, size_t align);
+    void* AllocSync(size_t size);
+
+    void Reset(bool removeAll = false);
+    size_t Size() { return m_size; }
+
+private:
+
+    struct ArenaBlock
+    {
+        void* pMem = nullptr;
+        size_t blockSize = 0;
+        size_t offset = 0;
+        ArenaBlock* pNext = nullptr;
+    };
+
+    ArenaBlock* m_pCurBlock = nullptr;
+    size_t m_size = 0;
+
+    /// @note Mutex is only used by sync allocation functions.
+    std::mutex* m_pMutex;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
new file mode 100644
index 00000000000..4a472bc9e5c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -0,0 +1,1899 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.cpp +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ + +#include <smmintrin.h> + +#include "rdtsc_core.h" +#include "backend.h" +#include "depthstencil.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" +#include "core/multisample.h" + +#include <algorithm> + +const __m128 vTileOffsetsX = {0.5, KNOB_TILE_X_DIM - 0.5, 0.5, KNOB_TILE_X_DIM - 0.5}; +const __m128 vTileOffsetsY = {0.5, 0.5, KNOB_TILE_Y_DIM - 0.5, KNOB_TILE_Y_DIM - 0.5}; + +/// @todo move to common lib +#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} +static const __m128 gMaskToVec[] = { + MASKTOVEC(0,0,0,0), + MASKTOVEC(0,0,0,1), + MASKTOVEC(0,0,1,0), + MASKTOVEC(0,0,1,1), + MASKTOVEC(0,1,0,0), + MASKTOVEC(0,1,0,1), + MASKTOVEC(0,1,1,0), + MASKTOVEC(0,1,1,1), + MASKTOVEC(1,0,0,0), + MASKTOVEC(1,0,0,1), + MASKTOVEC(1,0,1,0), + MASKTOVEC(1,0,1,1), + MASKTOVEC(1,1,0,0), + MASKTOVEC(1,1,0,1), + MASKTOVEC(1,1,1,0), + MASKTOVEC(1,1,1,1), +}; + +typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, DWORD[4]); +static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// @brief Process compute work. +/// @param pDC - pointer to draw context (dispatch). +/// @param workerId - The unique worker ID that is assigned to this thread. +/// @param threadGroupId - the linear index for the thread group within the dispatch. +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) +{ + RDTSC_START(BEDispatch); + + SWR_CONTEXT *pContext = pDC->pContext; + + const COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pDispatch->GetTasksData(); + SWR_ASSERT(pTaskData != nullptr); + + // Ensure spill fill memory has been allocated. + if (pDC->pSpillFill[workerId] == nullptr) + { + ///@todo Add state which indicates the spill fill size. 
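+        // Spill/fill memory comes from the draw context's arena, a bump
+        // allocator, so this is normally just an offset increment guarded by
+        // the arena mutex (AllocAlignedSync wraps AllocAligned; see arena.cpp).
+        // The 4MB (4096 * 1024) size is a fixed worst-case guess until the
+        // @todo above is addressed; the alignment is one SIMD register
+        // (8 floats = 32 bytes).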
+ pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8); + } + + const API_STATE& state = GetApiState(pDC); + + SWR_CS_CONTEXT csContext{ 0 }; + csContext.tileCounter = threadGroupId; + csContext.dispatchDims[0] = pTaskData->threadGroupCountX; + csContext.dispatchDims[1] = pTaskData->threadGroupCountY; + csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; + csContext.pTGSM = pContext->pScratch[workerId]; + csContext.pSpillFillBuffer = pDC->pSpillFill[workerId]; + + state.pfnCsFunc(GetPrivateState(pDC), &csContext); + + UPDATE_STAT(CsInvocations, state.totalThreadsInGroup); + + RDTSC_STOP(BEDispatch, 1, 0); +} + +void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + SYNC_DESC *pSync = (SYNC_DESC*)pUserData; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroTile, x, y); + SWR_ASSERT(x == 0 && y == 0); + + if (pSync->pfnCallbackFunc != nullptr) + { + pSync->pfnCallbackFunc(pSync->userData, pSync->userData2, pSync->userData3); + } +} + +void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + QUERY_DESC* pQueryDesc = (QUERY_DESC*)pUserData; + SWR_STATS* pStats = pQueryDesc->pStats; + SWR_CONTEXT *pContext = pDC->pContext; + + SWR_ASSERT(pStats != nullptr); + + for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) + { + pStats->DepthPassCount += pContext->stats[i].DepthPassCount; + + pStats->IaVertices += pContext->stats[i].IaVertices; + pStats->IaPrimitives += pContext->stats[i].IaPrimitives; + pStats->VsInvocations += pContext->stats[i].VsInvocations; + pStats->HsInvocations += pContext->stats[i].HsInvocations; + pStats->DsInvocations += pContext->stats[i].DsInvocations; + pStats->GsInvocations += pContext->stats[i].GsInvocations; + pStats->PsInvocations += pContext->stats[i].PsInvocations; + pStats->CInvocations += pContext->stats[i].CInvocations; + pStats->CsInvocations += pContext->stats[i].CsInvocations; + pStats->CPrimitives += pContext->stats[i].CPrimitives; + pStats->GsPrimitives += pContext->stats[i].GsPrimitives; + + for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) + { + pStats->SoWriteOffset[stream] += pContext->stats[i].SoWriteOffset[stream]; + + /// @note client is required to provide valid write offset before every draw, so we clear + /// out the contents of the write offset when storing stats + pContext->stats[i].SoWriteOffset[stream] = 0; + + pStats->SoPrimStorageNeeded[stream] += pContext->stats[i].SoPrimStorageNeeded[stream]; + pStats->SoNumPrimsWritten[stream] += pContext->stats[i].SoNumPrimsWritten[stream]; + } + } +} + +template<SWR_FORMAT format> +void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) +{ + auto lambda = [&](int comp) + { + FormatTraits<format>::storeSOA(comp, pTileBuffer, value.v[comp]); + pTileBuffer += (KNOB_SIMD_WIDTH * FormatTraits<format>::GetBPC(comp) / 8); + }; + + const uint32_t numIter = (KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM); + for (uint32_t i = 0; i < numIter; ++i) + { + UnrollerL<0, FormatTraits<format>::numComps, 1>::step(lambda); + } +} + +template<SWR_FORMAT format> +INLINE void ClearMacroTile(DRAW_CONTEXT *pDC, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t macroTile, DWORD clear[4]) +{ + // convert clear color to hottile format + // clear color is in RGBA float/uint32 + simdvector vClear; + for (uint32_t comp = 0; comp < FormatTraits<format>::numComps; ++comp) + { + simdscalar vComp; + vComp = _simd_load1_ps((const float*)&clear[comp]); + if 
(FormatTraits<format>::isNormalized(comp)) + { + vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<format>::fromFloat(comp))); + vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); + } + vComp = FormatTraits<format>::pack(comp, vComp); + vClear.v[FormatTraits<format>::swizzle(comp)] = vComp; + } + + uint32_t tileX, tileY; + MacroTileMgr::getTileIndices(macroTile, tileX, tileY); + const API_STATE& state = GetApiState(pDC); + + int top = KNOB_MACROTILE_Y_DIM_FIXED * tileY; + int bottom = top + KNOB_MACROTILE_Y_DIM_FIXED - 1; + int left = KNOB_MACROTILE_X_DIM_FIXED * tileX; + int right = left + KNOB_MACROTILE_X_DIM_FIXED - 1; + + // intersect with scissor + top = std::max(top, state.scissorInFixedPoint.top); + left = std::max(left, state.scissorInFixedPoint.left); + bottom = std::min(bottom, state.scissorInFixedPoint.bottom); + right = std::min(right, state.scissorInFixedPoint.right); + + // translate to local hottile origin + top -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; + bottom -= KNOB_MACROTILE_Y_DIM_FIXED * tileY; + left -= KNOB_MACROTILE_X_DIM_FIXED * tileX; + right -= KNOB_MACROTILE_X_DIM_FIXED * tileX; + + // convert to raster tiles + top >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + bottom >>= (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + left >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + right >>= (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + + const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); + // compute steps between raster tile samples / raster tiles / macro tile rows + const uint32_t rasterTileSampleStep = KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<format>::bpp / 8; + const uint32_t rasterTileStep = (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<format>::bpp / 8)) * numSamples; + const uint32_t macroTileRowStep = (KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * rasterTileStep; + const uint32_t pitch = (FormatTraits<format>::bpp * KNOB_MACROTILE_X_DIM / 8); + + HOTTILE *pHotTile = pDC->pContext->pHotTileMgr->GetHotTile(pDC->pContext, pDC, macroTile, rt, true, numSamples); + uint32_t rasterTileStartOffset = (ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, left, top)) * numSamples; + uint8_t* pRasterTileRow = pHotTile->pBuffer + rasterTileStartOffset; //(ComputeTileOffset2D< TilingTraits<SWR_TILE_SWRZ, FormatTraits<format>::bpp > >(pitch, x, y)) * numSamples; + + // loop over all raster tiles in the current hot tile + for (int y = top; y <= bottom; ++y) + { + uint8_t* pRasterTile = pRasterTileRow; + for (int x = left; x <= right; ++x) + { + for( int sampleNum = 0; sampleNum < numSamples; sampleNum++) + { + ClearRasterTile<format>(pRasterTile, vClear); + pRasterTile += rasterTileSampleStep; + } + } + pRasterTileRow += macroTileRowStep; + } + + pHotTile->state = HOTTILE_DIRTY; +} + + +void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData) +{ + if (KNOB_FAST_CLEAR) + { + CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + SWR_CONTEXT *pContext = pDC->pContext; + SWR_MULTISAMPLE_COUNT sampleCount = pDC->pState->state.rastState.sampleCount; + uint32_t numSamples = GetNumSamples(sampleCount); + + SWR_ASSERT(pClear->flags.bits != 0); // shouldn't be here without a reason. + + RDTSC_START(BEClear); + + if (pClear->flags.mask & SWR_CLEAR_COLOR) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_COLOR0, true, numSamples); + // All we want to do here is to mark the hot tile as being in a "needs clear" state. 
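+            // The actual memory writes are deferred: the clear value is stashed
+            // in the hot tile here and replayed lazily (see the HOTTILE_CLEAR
+            // handling in ProcessStoreTileBE below), so tiles that are cleared
+            // and never rendered to again stay cheap.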
+ pHotTile->clearData[0] = *(DWORD*)&(pClear->clearRTColor[0]); + pHotTile->clearData[1] = *(DWORD*)&(pClear->clearRTColor[1]); + pHotTile->clearData[2] = *(DWORD*)&(pClear->clearRTColor[2]); + pHotTile->clearData[3] = *(DWORD*)&(pClear->clearRTColor[3]); + pHotTile->state = HOTTILE_CLEAR; + } + + if (pClear->flags.mask & SWR_CLEAR_DEPTH) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_DEPTH, true, numSamples); + pHotTile->clearData[0] = *(DWORD*)&pClear->clearDepth; + pHotTile->state = HOTTILE_CLEAR; + } + + if (pClear->flags.mask & SWR_CLEAR_STENCIL) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, SWR_ATTACHMENT_STENCIL, true, numSamples); + + pHotTile->clearData[0] = *(DWORD*)&pClear->clearStencil; + pHotTile->state = HOTTILE_CLEAR; + } + + RDTSC_STOP(BEClear, 0, 0); + } + else + { + // Legacy clear + CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData; + RDTSC_START(BEClear); + + if (pClear->flags.mask & SWR_CLEAR_COLOR) + { + /// @todo clear data should come in as RGBA32_FLOAT + DWORD clearData[4]; + float clearFloat[4]; + clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; + clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; + clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; + clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; + clearData[0] = *(DWORD*)&clearFloat[0]; + clearData[1] = *(DWORD*)&clearFloat[1]; + clearData[2] = *(DWORD*)&clearFloat[2]; + clearData[3] = *(DWORD*)&clearFloat[3]; + + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_COLOR_HOT_TILE_FORMAT]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, SWR_ATTACHMENT_COLOR0, macroTile, clearData); + } + + if (pClear->flags.mask & SWR_CLEAR_DEPTH) + { + DWORD clearData[4]; + clearData[0] = *(DWORD*)&pClear->clearDepth; + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_DEPTH_HOT_TILE_FORMAT]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, SWR_ATTACHMENT_DEPTH, macroTile, clearData); + } + + if (pClear->flags.mask & SWR_CLEAR_STENCIL) + { + uint32_t value = pClear->clearStencil; + DWORD clearData[4]; + clearData[0] = *(DWORD*)&value; + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[KNOB_STENCIL_HOT_TILE_FORMAT]; + + pfnClearTiles(pDC, SWR_ATTACHMENT_STENCIL, macroTile, clearData); + } + + RDTSC_STOP(BEClear, 0, 0); + } +} + + +void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +{ + RDTSC_START(BEStoreTiles); + STORE_TILES_DESC *pDesc = (STORE_TILES_DESC*)pData; + SWR_CONTEXT *pContext = pDC->pContext; + +#ifdef KNOB_ENABLE_RDTSC + uint32_t numTiles = 0; +#endif + SWR_FORMAT srcFormat; + switch (pDesc->attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: srcFormat = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: srcFormat = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", pDesc->attachment); srcFormat = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroTile, x, y); + + // Only need to store the hottile if it's been rendered to... 
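+    // Fetch the hot tile without creating it (the final 'false' argument
+    // appears to be a create flag); a null result means this macrotile was
+    // never rendered, so there is nothing to flush.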
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, pDesc->attachment, false); + if (pHotTile) + { + // clear if clear is pending (i.e., not rendered to), then mark as dirty for store. + if (pHotTile->state == HOTTILE_CLEAR) + { + PFN_CLEAR_TILES pfnClearTiles = sClearTilesTable[srcFormat]; + SWR_ASSERT(pfnClearTiles != nullptr); + + pfnClearTiles(pDC, pDesc->attachment, macroTile, pHotTile->clearData); + } + + if (pHotTile->state == HOTTILE_DIRTY || pDesc->postStoreTileState == (SWR_TILE_STATE)HOTTILE_DIRTY) + { + int destX = KNOB_MACROTILE_X_DIM * x; + int destY = KNOB_MACROTILE_Y_DIM * y; + + pContext->pfnStoreTile(GetPrivateState(pDC), srcFormat, + pDesc->attachment, destX, destY, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + } + + + if (pHotTile->state == HOTTILE_DIRTY || pHotTile->state == HOTTILE_RESOLVED) + { + pHotTile->state = (HOTTILE_STATE)pDesc->postStoreTileState; + } + } + RDTSC_STOP(BEStoreTiles, numTiles, pDC->drawId); +} + + +void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +{ + INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; + SWR_CONTEXT *pContext = pDC->pContext; + + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) + { + if (pDesc->attachmentMask & (1 << i)) + { + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); + if (pHotTile) + { + pHotTile->state = HOTTILE_INVALID; + } + } + } +} + +#if KNOB_SIMD_WIDTH == 8 +const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; +const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; +const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; +#define MASK 0xff +#else +#error Unsupported vector width +#endif + +INLINE +bool CanEarlyZ(const SWR_PS_STATE *pPSState) +{ + return (pPSState->forceEarlyZ || (!pPSState->writesODepth && !pPSState->usesSourceDepth && !pPSState->usesUAV)); +} + +simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ) +{ + simdscalar vClipMask = _simd_setzero_ps(); + uint32_t numClipDistance = _mm_popcnt_u32(clipMask); + + for (uint32_t i = 0; i < numClipDistance; ++i) + { + // pull triangle clip distance values from clip buffer + simdscalar vA = _simd_broadcast_ss(pUserClipBuffer++); + simdscalar vB = _simd_broadcast_ss(pUserClipBuffer++); + simdscalar vC = _simd_broadcast_ss(pUserClipBuffer++); + + // interpolate + simdscalar vInterp = vplaneps(vA, vB, vC, vI, vJ); + + // clip if interpolated clip distance is < 0 || NAN + simdscalar vCull = _simd_cmp_ps(_simd_setzero_ps(), vInterp, _CMP_NLE_UQ); + + vClipMask = _simd_or_ps(vClipMask, vCull); + } + + return _simd_movemask_ps(vClipMask); +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +{ + + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(bIsStandardPattern) + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + 
            mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            mask[0] = _mm256_set1_epi32(-1);
+            mask[1] = _mm256_set1_epi32(-1);
+            index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+        }
+
+        // gather coverage for samples 0-7
+        sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+        if(MultisampleTraits<sampleCountT>::numSamples > 8)
+        {
+            // gather coverage for samples 8-15
+            sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+        }
+    }
+    else
+    {
+        // center coverage is the same for all samples; just broadcast to the sample slots
+        uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+        if(MultisampleTraits<sampleCountT>::numSamples == 1)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+        {
+            sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+        }
+        else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+        {
+            sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+            sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+        }
+    }
+
+    mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+                              -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+    // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+    __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+    __m256i packedCoverage1;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+        packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+    }
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+    // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+    __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+    __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+    packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+    __m256i packedSampleCoverage;
+    if(MultisampleTraits<sampleCountT>::numSamples > 8)
+    {
+        // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+        hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+        shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+        shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+        packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+
packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); + + if(!bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); +} + +template<bool perspMask> +INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + if(perspMask) + { + // evaluate I,J + psContext.vI.center = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.center, psContext.vY.center); + psContext.vJ.center = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.center, psContext.vY.center); + psContext.vI.center = _simd_mul_ps(psContext.vI.center, coeffs.vRecipDet); + psContext.vJ.center = _simd_mul_ps(psContext.vJ.center, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.center = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.center, psContext.vJ.center); + } +} + +template<bool perspMask> +INLINE void CalcSampleBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) +{ + if(perspMask) + { + // evaluate I,J + psContext.vI.sample = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.sample, psContext.vY.sample); + psContext.vJ.sample = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.sample, psContext.vY.sample); + psContext.vI.sample = _simd_mul_ps(psContext.vI.sample, coeffs.vRecipDet); + psContext.vJ.sample = _simd_mul_ps(psContext.vJ.sample, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.sample = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.sample, psContext.vJ.sample); + } +} + + 
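+//////////////////////////////////////////////////////////////////////////
+/// The barycentric helpers above all evaluate the same plane equation. As
+/// a scalar sketch (assuming vplaneps(a, b, c, x, y) computes a*x + b*y + c,
+/// consistent with its use throughout this file):
+///
+///     i        = (Ia*x + Ib*y + Ic) * recipDet;
+///     j        = (Ja*x + Jb*y + Jc) * recipDet;
+///     oneOverW = A1oW*i + B1oW*j + C1oW;
+///
+/// where A1oW/B1oW/C1oW are the per-vertex 1/w coefficients; perspective-
+/// correct attributes are interpolated the same way and then rescaled by
+/// the interpolated 1/w.
+//////////////////////////////////////////////////////////////////////////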
+
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+// Centroid behaves exactly as follows:
+// (1) If all samples in the primitive are covered, the attribute is evaluated at the pixel center (even if the sample pattern does not happen to
+//     have a sample location there).
+// (2) Else the attribute is evaluated at the first covered sample, in increasing order of sample index, where sample coverage is after ANDing the
+//     coverage with the SampleMask Rasterizer State.
+// (3) If no samples are covered, such as on helper pixels executed off the bounds of a primitive to fill out 2x2 pixel stamps, the attribute is
+//     evaluated as follows: if the SampleMask Rasterizer State is a subset of the samples in the pixel, then the first sample covered by the
+//     SampleMask Rasterizer State is the evaluation point. Otherwise (full SampleMask), the pixel center is the evaluation point.
+//////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////
+template<SWR_MULTISAMPLE_COUNT sampleCount, bool bForcedSampleCount>
+INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const coverageMask, const uint32_t sampleMask,
+                            const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL)
+{
+    uint32_t inputMask[KNOB_SIMD_WIDTH];
+
+    generateInputCoverage<sampleCount, 1, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+
+    // Case (2) - partially covered pixel
+
+    // scan for first covered sample per pixel in the 4x2 span
+    unsigned long sampleNum[KNOB_SIMD_WIDTH];
+    (inputMask[0] > 0) ? (_BitScanForward(&sampleNum[0], inputMask[0])) : (sampleNum[0] = 0);
+    (inputMask[1] > 0) ? (_BitScanForward(&sampleNum[1], inputMask[1])) : (sampleNum[1] = 0);
+    (inputMask[2] > 0) ? (_BitScanForward(&sampleNum[2], inputMask[2])) : (sampleNum[2] = 0);
+    (inputMask[3] > 0) ? (_BitScanForward(&sampleNum[3], inputMask[3])) : (sampleNum[3] = 0);
+    (inputMask[4] > 0) ? (_BitScanForward(&sampleNum[4], inputMask[4])) : (sampleNum[4] = 0);
+    (inputMask[5] > 0) ? (_BitScanForward(&sampleNum[5], inputMask[5])) : (sampleNum[5] = 0);
+    (inputMask[6] > 0) ? (_BitScanForward(&sampleNum[6], inputMask[6])) : (sampleNum[6] = 0);
+    (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
+
+    // look up and set the sample offsets from UL pixel corner for first covered sample
+    __m256 vXSample = _mm256_set_ps(MultisampleTraits<sampleCount>::X(sampleNum[7]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[6]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[5]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[4]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[3]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[2]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[1]),
+                                    MultisampleTraits<sampleCount>::X(sampleNum[0]));
+
+    __m256 vYSample = _mm256_set_ps(MultisampleTraits<sampleCount>::Y(sampleNum[7]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[6]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[5]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[4]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[3]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[2]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[1]),
+                                    MultisampleTraits<sampleCount>::Y(sampleNum[0]));
+    // add sample offset to UL pixel corner
+    vXSample = _simd_add_ps(vXSamplePosUL, vXSample);
+    vYSample = _simd_add_ps(vYSamplePosUL, vYSample);
+
+    // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
+    static const __m256i vFullyCoveredMask = MultisampleTraits<sampleCount>::FullSampleMask();
+    __m256i vInputCoveragei = _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    __m256i vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
+
+    static const __m256i vZero = _simd_setzero_si();
+    const __m256i vSampleMask = _simd_and_si(_simd_set1_epi32(sampleMask), vFullyCoveredMask);
+    __m256i vNoSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vZero);
+    __m256i vIsFullSampleMask = _simd_cmpeq_epi32(vSampleMask, vFullyCoveredMask);
+    __m256i vCase3b = _simd_and_si(vNoSamplesCovered, vIsFullSampleMask);
+
+    __m256i vEvalAtCenter = _simd_or_si(vAllSamplesCovered, vCase3b);
+
+    // set the centroid position based on results from above
+    psContext.vX.centroid = _simd_blendv_ps(vXSample, psContext.vX.center, _simd_castsi_ps(vEvalAtCenter));
+    psContext.vY.centroid = _simd_blendv_ps(vYSample, psContext.vY.center, _simd_castsi_ps(vEvalAtCenter));
+
+    // Case (3a) No samples covered and partial sample mask
+    __m256i vSomeSampleMaskSamples = _simd_cmplt_epi32(vSampleMask, vFullyCoveredMask);
+    // sample mask should never be all 0's for this case, but handle it anyway
+    unsigned long firstCoveredSampleMaskSample = 0;
+    (sampleMask > 0) ?
(_BitScanForward(&firstCoveredSampleMaskSample, sampleMask)) : (firstCoveredSampleMaskSample = 0); + + __m256i vCase3a = _simd_and_si(vNoSamplesCovered, vSomeSampleMaskSamples); + + vXSample = _simd_set1_ps(MultisampleTraits<sampleCount>::X(firstCoveredSampleMaskSample)); + vYSample = _simd_set1_ps(MultisampleTraits<sampleCount>::Y(firstCoveredSampleMaskSample)); + + // blend in case 3a pixel locations + psContext.vX.centroid = _simd_blendv_ps(psContext.vX.centroid, vXSample, _simd_castsi_ps(vCase3a)); + psContext.vY.centroid = _simd_blendv_ps(psContext.vY.centroid, vYSample, _simd_castsi_ps(vCase3a)); +} + +template<uint32_t sampleCount, uint32_t persp, uint32_t standardPattern, uint32_t forcedMultisampleCount> +INLINE void CalcCentroidBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext, + const uint64_t *const coverageMask, const uint32_t sampleMask, + const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) +{ + static const bool bPersp = (bool)persp; + static const bool bIsStandardPattern = (bool)standardPattern; + static const bool bForcedMultisampleCount = (bool)forcedMultisampleCount; + + // calculate centroid positions + if(bPersp) + { + if(bIsStandardPattern) + { + ///@ todo: don't need to generate input coverage 2x if input coverage and centroid + CalcCentroidPos<(SWR_MULTISAMPLE_COUNT)sampleCount, bForcedMultisampleCount>(psContext, coverageMask, sampleMask, vXSamplePosUL, vYSamplePosUL); + } + else + { + static const __m256 pixelCenter = _simd_set1_ps(0.5f); + psContext.vX.centroid = _simd_add_ps(vXSamplePosUL, pixelCenter); + psContext.vY.centroid = _simd_add_ps(vYSamplePosUL, pixelCenter); + } + // evaluate I,J + psContext.vI.centroid = vplaneps(coeffs.vIa, coeffs.vIb, coeffs.vIc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vJ.centroid = vplaneps(coeffs.vJa, coeffs.vJb, coeffs.vJc, psContext.vX.centroid, psContext.vY.centroid); + psContext.vI.centroid = _simd_mul_ps(psContext.vI.centroid, coeffs.vRecipDet); + psContext.vJ.centroid = _simd_mul_ps(psContext.vJ.centroid, coeffs.vRecipDet); + + // interpolate 1/w + psContext.vOneOverW.centroid = vplaneps(coeffs.vAOneOverW, coeffs.vBOneOverW, coeffs.vCOneOverW, psContext.vI.centroid, psContext.vJ.centroid); + } +} + +template<uint32_t NumRT, uint32_t sampleCountT> +void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, + const PFN_BLEND_JIT_FUNC (&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + uint8_t *pColorSample; + if(sampleCount == SWR_MULTISAMPLE_1X) + { + pColorSample = pColorBase[rt]; + } + else + { + pColorSample = pColorBase[rt] + rasterTileColorOffset; + } + + const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + sample, + pColorSample, + psContext.shaded[rt], + &psContext.oMask, + (simdscalari*)&coverageMask); + } + + // final write mask + simdscalari outputMask = _simd_castps_si(_simd_and_ps(coverageMask, depthPassMask)); + + 
///@todo can only use maskstore fast path if bpc is 32. Assuming hot tile is RGBA32_FLOAT. + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + + const uint32_t simd = KNOB_SIMD_WIDTH * sizeof(float); + + // store with color mask + if(!pRTBlend->writeDisableRed) + { + _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x); + } + if(!pRTBlend->writeDisableGreen) + { + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y); + } + if(!pRTBlend->writeDisableBlue) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z); + } + if(!pRTBlend->writeDisableAlpha) + { + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w); + } + } +} + +template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> +void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + RDTSC_START(BESetup); + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const bool bInputCoverage = (bool)inputCoverage; + static const bool bCentroidPos = (bool)centroidPos; + + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_PS_STATE *pPSState = &state.psState; + const SWR_BLEND_STATE *pBlendState = &state.blendState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + uint64_t coverageMask = work.coverageMask[0]; + + // broadcast scalars + BarycentricCoeffs coeffs; + coeffs.vIa = _simd_broadcast_ss(&work.I[0]); + coeffs.vIb = _simd_broadcast_ss(&work.I[1]); + coeffs.vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs.vJa = _simd_broadcast_ss(&work.J[0]); + coeffs.vJb = _simd_broadcast_ss(&work.J[1]); + coeffs.vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); + + coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); + coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); + coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); + + uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; + uint32_t NumRT = state.psState.numRenderTargets; + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] = renderBuffers.pColor[rt]; + } + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + RDTSC_STOP(BESetup, 0, 0); + + SWR_PS_CONTEXT psContext; + psContext.pAttribs = work.pAttribs; + psContext.pPerspAttribs = work.pPerspAttribs; + psContext.frontFace = work.triFlags.frontFacing; + psContext.primID = work.triFlags.primID; + + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs + psContext.I = work.I; + psContext.J = work.J; + psContext.recipDet = work.recipDet; + psContext.pRecipW = work.pRecipW; + psContext.pSamplePosX = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX; + psContext.pSamplePosY = (const float*)&MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY; + + for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + // UL pixel corner + psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + // pixel center + psContext.vY.center = 
_simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + + for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { + if(bInputCoverage) + { + generateInputCoverage<SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); + } + + if(coverageMask & MASK) + { + RDTSC_START(BEBarycentric); + psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + // pixel center + psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + + backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); + + if(bCentroidPos) + { + // for 1x case, centroid is pixel center + psContext.vX.centroid = psContext.vX.center; + psContext.vY.centroid = psContext.vY.center; + psContext.vI.centroid = psContext.vI.center; + psContext.vJ.centroid = psContext.vJ.center; + psContext.vOneOverW.centroid = psContext.vOneOverW.center; + } + + // interpolate z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + RDTSC_STOP(BEBarycentric, 0, 0); + + simdmask clipCoverageMask = coverageMask & MASK; + + // interpolate user clip distance if available + if(rastState.clipDistanceMask) + { + clipCoverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, + psContext.vI.center, psContext.vJ.center); + } + + simdscalar vCoverageMask = vMask(clipCoverageMask); + simdscalar depthPassMask = vCoverageMask; + simdscalar stencilPassMask = vCoverageMask; + + // Early-Z? + if(CanEarlyZ(pPSState)) + { + RDTSC_START(BEEarlyDepthTest); + depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); + RDTSC_STOP(BEEarlyDepthTest, 0, 0); + + // early-exit if no pixels passed depth or earlyZ is forced on + if(pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); + + if (!_simd_movemask_ps(depthPassMask)) + { + goto Endtile; + } + } + } + + psContext.sampleIndex = 0; + psContext.activeMask = _simd_castps_si(vCoverageMask); + + // execute pixel shader + RDTSC_START(BEPixelShader); + UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + RDTSC_STOP(BEPixelShader, 0, 0); + + vCoverageMask = _simd_castsi_ps(psContext.activeMask); + + // late-Z + if(!CanEarlyZ(pPSState)) + { + RDTSC_START(BELateDepthTest); + depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); + RDTSC_STOP(BELateDepthTest, 0, 0); + + if(!_simd_movemask_ps(depthPassMask)) + { + // need to call depth/stencil write for stencil write + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); + goto Endtile; + } + } + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT(DepthPassCount, statCount); + + // output merger + RDTSC_START(BEOutputMerger); + backendFuncs.pfnOutputMerger(psContext, pColorBase, 0, pBlendState, state.pfnBlendFunc, + vCoverageMask, depthPassMask); + + // do final depth write after 
all pixel kills + if (!pPSState->forceEarlyZ) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthBase, depthPassMask, vCoverageMask, pStencilBase, stencilPassMask); + } + RDTSC_STOP(BEOutputMerger, 0, 0); + } + +Endtile: + RDTSC_START(BEEndTile); + coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + } + RDTSC_STOP(BEEndTile, 0, 0); + } + } +} + +template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> +void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + static const bool bInputCoverage = (bool)inputCoverage; + static const bool bCentroidPos = (bool)centroidPos; + + RDTSC_START(BESetup); + + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_PS_STATE *pPSState = &state.psState; + const SWR_BLEND_STATE *pBlendState = &state.blendState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // broadcast scalars + BarycentricCoeffs coeffs; + coeffs.vIa = _simd_broadcast_ss(&work.I[0]); + coeffs.vIb = _simd_broadcast_ss(&work.I[1]); + coeffs.vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs.vJa = _simd_broadcast_ss(&work.J[0]); + coeffs.vJb = _simd_broadcast_ss(&work.J[1]); + coeffs.vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); + + coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]); + coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]); + coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]); + + uint8_t *pColorBase[SWR_NUM_RENDERTARGETS]; + uint32_t NumRT = state.psState.numRenderTargets; + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] = renderBuffers.pColor[rt]; + } + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + RDTSC_STOP(BESetup, 0, 0); + + SWR_PS_CONTEXT psContext; + psContext.pAttribs = work.pAttribs; + psContext.pPerspAttribs = work.pPerspAttribs; + psContext.pRecipW = work.pRecipW; + psContext.frontFace = work.triFlags.frontFacing; + psContext.primID = work.triFlags.primID; + + // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs + psContext.I = work.I; + psContext.J = work.J; + psContext.recipDet = work.recipDet; + psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX; + psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY; + const uint32_t numSamples = MultisampleTraits<sampleCount>::numSamples; + + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + // UL pixel corner + psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + // pixel center + 
        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+
+        for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            // pixel center
+            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+            RDTSC_START(BEBarycentric);
+            backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+            RDTSC_STOP(BEBarycentric, 0, 0);
+
+            if(bInputCoverage)
+            {
+                generateInputCoverage<sampleCount, SWR_MSAA_STANDARD_PATTERN, false>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+            }
+
+            if(bCentroidPos)
+            {
+                ///@todo: no need to generate input coverage twice when both input coverage and centroid are enabled
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+            }
+
+            for(uint32_t sample = 0; sample < numSamples; sample++)
+            {
+                if (work.coverageMask[sample] & MASK)
+                {
+                    RDTSC_START(BEBarycentric);
+
+                    // calculate per sample positions
+                    psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
+                    psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+
+                    simdmask coverageMask = work.coverageMask[sample] & MASK;
+
+                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate z
+                    psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+
+                    RDTSC_STOP(BEBarycentric, 0, 0);
+
+                    // interpolate user clip distance if available
+                    if (rastState.clipDistanceMask)
+                    {
+                        coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                                                             psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    // build the vector coverage mask after any user clip culling so clipped samples are not shaded
+                    simdscalar vCoverageMask = vMask(coverageMask);
+                    simdscalar depthPassMask = vCoverageMask;
+                    simdscalar stencilPassMask = vCoverageMask;
+
+                    // offset depth/stencil buffers current sample
+                    uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                    uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                    // Early-Z?
+                    if (CanEarlyZ(pPSState))
+                    {
+                        RDTSC_START(BEEarlyDepthTest);
+                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
+                        RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                        // early-exit if no samples passed depth or earlyZ is forced on.
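+                        // (with forceEarlyZ, the PS cannot modify Z, so depth/stencil
+                        // results can be committed before shading; the shader still
+                        // runs afterwards for any samples that survived)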
+ if (pPSState->forceEarlyZ || !_simd_movemask_ps(depthPassMask)) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + + if (!_simd_movemask_ps(depthPassMask)) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + continue; + } + } + } + + psContext.sampleIndex = sample; + psContext.activeMask = _simd_castps_si(vCoverageMask); + + // execute pixel shader + RDTSC_START(BEPixelShader); + UPDATE_STAT(PsInvocations, _mm_popcnt_u32(_simd_movemask_ps(vCoverageMask))); + state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext); + RDTSC_STOP(BEPixelShader, 0, 0); + + vCoverageMask = _simd_castsi_ps(psContext.activeMask); + + //// late-Z + if (!CanEarlyZ(pPSState)) + { + RDTSC_START(BELateDepthTest); + depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); + RDTSC_STOP(BELateDepthTest, 0, 0); + + if (!_simd_movemask_ps(depthPassMask)) + { + // need to call depth/stencil write for stencil write + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + continue; + } + } + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT(DepthPassCount, statCount); + + // output merger + RDTSC_START(BEOutputMerger); + backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, + vCoverageMask, depthPassMask); + + // do final depth write after all pixel kills + if (!pPSState->forceEarlyZ) + { + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + } + RDTSC_STOP(BEOutputMerger, 0, 0); + } + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + RDTSC_START(BEEndTile); + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + + for (uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + } + RDTSC_STOP(BEEndTile, 0, 0); + } + } +} + +template<uint32_t sampleCountT, uint32_t samplePattern, uint32_t inputCoverage, uint32_t centroidPos, uint32_t forcedSampleCount> +void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + // type safety guaranteed from template instantiation in BEChooser<>::GetFunc + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + static const bool bIsStandardPattern = (bool)samplePattern; + static const bool bInputCoverage = (bool)inputCoverage; + static const bool bCentroidPos = (bool)centroidPos; + static const bool bForcedSampleCount = (bool)forcedSampleCount; + + RDTSC_START(BESetup); + + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_PS_STATE *pPSState = &state.psState; + const SWR_BLEND_STATE *pBlendState = &state.blendState; + const BACKEND_FUNCS& backendFuncs 
        = pDC->pState->backendFuncs;
+
+    // broadcast scalars
+    BarycentricCoeffs coeffs;
+    coeffs.vIa = _simd_broadcast_ss(&work.I[0]);
+    coeffs.vIb = _simd_broadcast_ss(&work.I[1]);
+    coeffs.vIc = _simd_broadcast_ss(&work.I[2]);
+
+    coeffs.vJa = _simd_broadcast_ss(&work.J[0]);
+    coeffs.vJb = _simd_broadcast_ss(&work.J[1]);
+    coeffs.vJc = _simd_broadcast_ss(&work.J[2]);
+
+    coeffs.vZa = _simd_broadcast_ss(&work.Z[0]);
+    coeffs.vZb = _simd_broadcast_ss(&work.Z[1]);
+    coeffs.vZc = _simd_broadcast_ss(&work.Z[2]);
+
+    coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
+
+    coeffs.vAOneOverW = _simd_broadcast_ss(&work.OneOverW[0]);
+    coeffs.vBOneOverW = _simd_broadcast_ss(&work.OneOverW[1]);
+    coeffs.vCOneOverW = _simd_broadcast_ss(&work.OneOverW[2]);
+
+    uint8_t *pColorBase[SWR_NUM_RENDERTARGETS];
+    uint32_t NumRT = state.psState.numRenderTargets;
+    for(uint32_t rt = 0; rt < NumRT; ++rt)
+    {
+        pColorBase[rt] = renderBuffers.pColor[rt];
+    }
+    uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+    RDTSC_STOP(BESetup, 0, 0);
+
+    SWR_PS_CONTEXT psContext;
+    psContext.pAttribs = work.pAttribs;
+    psContext.pPerspAttribs = work.pPerspAttribs;
+    psContext.frontFace = work.triFlags.frontFacing;
+    psContext.primID = work.triFlags.primID;
+    psContext.pRecipW = work.pRecipW;
+    // save Ia/Ib/Ic and Ja/Jb/Jc if we need to reevaluate i/j/k in the shader because of pull attribs
+    psContext.I = work.I;
+    psContext.J = work.J;
+    psContext.recipDet = work.recipDet;
+    psContext.pSamplePosX = (const float*)&MultisampleTraits<sampleCount>::samplePosX;
+    psContext.pSamplePosY = (const float*)&MultisampleTraits<sampleCount>::samplePosY;
+    psContext.sampleIndex = 0;
+
+    uint32_t numCoverageSamples;
+    if(bIsStandardPattern)
+    {
+        numCoverageSamples = MultisampleTraits<sampleCount>::numSamples;
+    }
+    else
+    {
+        numCoverageSamples = 1;
+    }
+
+    uint32_t numOMSamples;
+    // the RT must be single sample if we're in forcedMSAA mode with multisampled rasterization, so the OM runs at 1x
+    if(bForcedSampleCount && (sampleCount > SWR_MULTISAMPLE_1X))
+    {
+        numOMSamples = 1;
+    }
+    // if we're instead forced to single sample rasterization, run the OM at the sample count of the RT
+    else if(bForcedSampleCount && (sampleCount == SWR_MULTISAMPLE_1X))
+    {
+        numOMSamples = GetNumSamples(pBlendState->sampleCount);
+    }
+    // else we're in normal MSAA mode and the rasterizer and OM run at the same sample count
+    else
+    {
+        numOMSamples = MultisampleTraits<sampleCount>::numSamples;
+    }
+
+    for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
+    {
+        psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+        psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+        for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
+        {
+            simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
+            psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+            // set pixel center positions
+            psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+
+            if (bInputCoverage)
+            {
+                generateInputCoverage<sampleCount, bIsStandardPattern, bForcedSampleCount>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask);
+            }
+
+            if(bCentroidPos)
+            {
+                ///@todo: no need to generate input coverage twice when both input coverage and centroid are enabled
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcCentroidBarycentrics(coeffs, psContext, &work.coverageMask[0], pBlendState->sampleMask, psContext.vX.UL, psContext.vY.UL);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+            }
+
+            // if oDepth is written to, or there is a potential to discard any samples, we need to
+            // run the PS early, then interp or broadcast Z and test
+            if(pPSState->writesODepth || pPSState->killsPixel)
+            {
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+
+                // interpolate z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+
+                // execute pixel shader
+                RDTSC_START(BEPixelShader);
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                RDTSC_STOP(BEPixelShader, 0, 0);
+            }
+            else
+            {
+                psContext.activeMask = _simd_set1_epi32(-1);
+            }
+
+            // need to declare enough space for all samples
+            simdscalar vCoverageMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar depthPassMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar stencilPassMask[MultisampleTraits<sampleCount>::numSamples];
+            simdscalar anyDepthSamplePassed = _simd_setzero_ps();
+            simdscalar anyStencilSamplePassed = _simd_setzero_ps();
+            for(uint32_t sample = 0; sample < numCoverageSamples; sample++)
+            {
+                vCoverageMask[sample] = vMask(work.coverageMask[sample] & MASK);
+
+                // pull mask back out for any discards and AND it with coverage
+                vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], _simd_castsi_ps(psContext.activeMask));
+
+                if (!_simd_movemask_ps(vCoverageMask[sample]))
+                {
+                    vCoverageMask[sample] = depthPassMask[sample] = stencilPassMask[sample] = _simd_setzero_ps();
+                    continue;
+                }
+
+                if(bForcedSampleCount)
+                {
+                    // candidate pixels (that passed coverage) will cause shader invocation if any bits in the samplemask are set
+                    const simdscalar vSampleMask = _simd_castsi_ps(_simd_cmpgt_epi32(_simd_set1_epi32(pBlendState->sampleMask), _simd_setzero_si()));
+                    anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, _simd_and_ps(vCoverageMask[sample], vSampleMask));
+                    continue;
+                }
+
+                depthPassMask[sample] = vCoverageMask[sample];
+
+                // if oDepth isn't written to, we need to interpolate Z for each sample
+                // if clip distances are enabled, we need to interpolate for each sample
+                if(!pPSState->writesODepth || rastState.clipDistanceMask)
+                {
+                    RDTSC_START(BEBarycentric);
+                    if(bIsStandardPattern)
+                    {
+                        // calculate per sample positions
+                        psContext.vX.sample = _simd_add_ps(psContext.vX.UL, MultisampleTraits<sampleCount>::vX(sample));
+                        psContext.vY.sample = _simd_add_ps(psContext.vY.UL, MultisampleTraits<sampleCount>::vY(sample));
+                    }
+                    else
+                    {
+                        psContext.vX.sample = psContext.vX.center;
+                        psContext.vY.sample = psContext.vY.center;
+                    }
+
+                    // calc I & J per sample
+                    backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
+
+                    // interpolate z
+                    if (!pPSState->writesODepth)
+                    {
+                        vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    }
+
+                    ///@todo: perspective correct vs non-perspective correct clipping?
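+                    // (ComputeUserClipMask interpolates each active clip distance
+                    // at this sample's i/j and returns a mask of pixels where the
+                    // distance is < 0 or NaN; those are removed from coverage below)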
+
+                    // interpolate clip distances
+                    if (rastState.clipDistanceMask)
+                    {
+                        uint8_t clipMask = ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+                                                               psContext.vI.sample, psContext.vJ.sample);
+                        vCoverageMask[sample] = _simd_and_ps(vCoverageMask[sample], vMask(~clipMask));
+                    }
+                    RDTSC_STOP(BEBarycentric, 0, 0);
+                }
+                // else 'broadcast' and test psContext.vZ written from the PS each sample
+                else
+                {
+                    vZ[sample] = psContext.vZ;
+                }
+
+                // offset depth/stencil buffers current sample
+                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                // ZTest for this sample
+                RDTSC_START(BEEarlyDepthTest);
+                stencilPassMask[sample] = vCoverageMask[sample];
+                depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                                                         vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
+                RDTSC_STOP(BEEarlyDepthTest, 0, 0);
+
+                anyDepthSamplePassed = _simd_or_ps(anyDepthSamplePassed, depthPassMask[sample]);
+                anyStencilSamplePassed = _simd_or_ps(anyStencilSamplePassed, stencilPassMask[sample]);
+                uint32_t statMask = _simd_movemask_ps(depthPassMask[sample]);
+                uint32_t statCount = _mm_popcnt_u32(statMask);
+                UPDATE_STAT(DepthPassCount, statCount);
+            }
+
+            // if we didn't have to execute the PS early, and at least 1 sample passed the depth test, run the PS
+            if(!pPSState->writesODepth && !pPSState->killsPixel && _simd_movemask_ps(anyDepthSamplePassed))
+            {
+                RDTSC_START(BEBarycentric);
+                backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
+                // interpolate z
+                psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                RDTSC_STOP(BEBarycentric, 0, 0);
+
+                // execute pixel shader
+                RDTSC_START(BEPixelShader);
+                state.psState.pfnPixelShader(GetPrivateState(pDC), &psContext);
+                RDTSC_STOP(BEPixelShader, 0, 0);
+            }
+            ///@todo: make sure this works for kill pixel
+            else if(!_simd_movemask_ps(anyStencilSamplePassed))
+            {
+                goto Endtile;
+            }
+
+            // loop over all samples, broadcasting the results of the PS to all passing pixels
+            for(uint32_t sample = 0; sample < numOMSamples; sample++)
+            {
+                uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample);
+                uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
+
+                // skip if none of the pixels for this sample passed
+                simdscalar coverageMaskSample;
+                simdscalar depthMaskSample;
+                simdscalar stencilMaskSample;
+                simdscalar vInterpolatedZ;
+
+                // forcedSampleCount outputs to any pixels with covered samples not masked off by SampleMask
+                // depth test is disabled, so just set the z val to 0.
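+                // (the zero is only a placeholder to give DepthStencilWrite below a
+                // defined value; nothing is depth-tested in this mode)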
+ if(bForcedSampleCount) + { + coverageMaskSample = depthMaskSample = anyDepthSamplePassed; + vInterpolatedZ = _simd_setzero_ps(); + } + else if(bIsStandardPattern) + { + if(!_simd_movemask_ps(depthPassMask[sample])) + { + depthPassMask[sample] = _simd_setzero_ps(); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[sample], pDepthSample, depthPassMask[sample], + vCoverageMask[sample], pStencilSample, stencilPassMask[sample]); + continue; + } + coverageMaskSample = vCoverageMask[sample]; + depthMaskSample = depthPassMask[sample]; + stencilMaskSample = stencilPassMask[sample]; + vInterpolatedZ = vZ[sample]; + } + else + { + // center pattern only needs to use a single depth test as all samples are at the same position + if(!_simd_movemask_ps(depthPassMask[0])) + { + depthPassMask[0] = _simd_setzero_ps(); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vZ[0], pDepthSample, depthPassMask[0], + vCoverageMask[0], pStencilSample, stencilPassMask[0]); + continue; + } + coverageMaskSample = (vCoverageMask[0]); + depthMaskSample = depthPassMask[0]; + stencilMaskSample = stencilPassMask[0]; + vInterpolatedZ = vZ[0]; + } + + // output merger + RDTSC_START(BEOutputMerger); + backendFuncs.pfnOutputMerger(psContext, pColorBase, sample, pBlendState, state.pfnBlendFunc, + coverageMaskSample, depthMaskSample); + + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, vInterpolatedZ, pDepthSample, depthMaskSample, + coverageMaskSample, pStencilSample, stencilMaskSample); + RDTSC_STOP(BEOutputMerger, 0, 0); + } + +Endtile: + RDTSC_START(BEEndTile); + for(uint32_t sample = 0; sample < numCoverageSamples; sample++) + { + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + pColorBase[rt] += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp) / 8; + } + RDTSC_STOP(BEEndTile, 0, 0); + } + } +} +// optimized backend flow with NULL PS +template<uint32_t sampleCountT> +void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) +{ + RDTSC_START(BESetup); + + static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + SWR_CONTEXT *pContext = pDC->pContext; + const API_STATE& state = GetApiState(pDC); + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // broadcast scalars + BarycentricCoeffs coeffs; + coeffs.vIa = _simd_broadcast_ss(&work.I[0]); + coeffs.vIb = _simd_broadcast_ss(&work.I[1]); + coeffs.vIc = _simd_broadcast_ss(&work.I[2]); + + coeffs.vJa = _simd_broadcast_ss(&work.J[0]); + coeffs.vJb = _simd_broadcast_ss(&work.J[1]); + coeffs.vJc = _simd_broadcast_ss(&work.J[2]); + + coeffs.vZa = _simd_broadcast_ss(&work.Z[0]); + coeffs.vZb = _simd_broadcast_ss(&work.Z[1]); + coeffs.vZc = _simd_broadcast_ss(&work.Z[2]); + + coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); + + BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + + RDTSC_STOP(BESetup, 0, 0); + + SWR_PS_CONTEXT psContext; + for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) + { + // UL pixel corner + simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + + for (uint32_t xx = x; xx 
< x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) + { + // UL pixel corners + simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + + // iterate over active samples + unsigned long sample = 0; + uint32_t sampleMask = state.blendState.sampleMask; + while (_BitScanForward(&sample, sampleMask)) + { + sampleMask &= ~(1 << sample); + if (work.coverageMask[sample] & MASK) + { + RDTSC_START(BEBarycentric); + // calculate per sample positions + psContext.vX.sample = _simd_add_ps(vXSamplePosUL, MultisampleTraits<sampleCount>::vX(sample)); + psContext.vY.sample = _simd_add_ps(vYSamplePosUL, MultisampleTraits<sampleCount>::vY(sample)); + + backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); + + // interpolate z + psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + + RDTSC_STOP(BEBarycentric, 0, 0); + + simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK); + simdscalar stencilPassMask = vCoverageMask; + + // offset depth/stencil buffers to the current sample + uint8_t *pDepthSample = pDepthBase + MultisampleTraits<sampleCount>::RasterTileDepthOffset(sample); + uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample); + + RDTSC_START(BEEarlyDepthTest); + simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); + DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, + pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); + RDTSC_STOP(BEEarlyDepthTest, 0, 0); + + uint32_t statMask = _simd_movemask_ps(depthPassMask); + uint32_t statCount = _mm_popcnt_u32(statMask); + UPDATE_STAT(DepthPassCount, statCount); + } + work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } + pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; + pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; + } + } +} + +void InitClearTilesTable() +{ + memset(sClearTilesTable, 0, sizeof(sClearTilesTable)); + + sClearTilesTable[R8G8B8A8_UNORM] = ClearMacroTile<R8G8B8A8_UNORM>; + sClearTilesTable[B8G8R8A8_UNORM] = ClearMacroTile<B8G8R8A8_UNORM>; + sClearTilesTable[R32_FLOAT] = ClearMacroTile<R32_FLOAT>; + sClearTilesTable[R32G32B32A32_FLOAT] = ClearMacroTile<R32G32B32A32_FLOAT>; + sClearTilesTable[R8_UINT] = ClearMacroTile<R8_UINT>; +} + +PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +PFN_BACKEND_FUNC gBackendSingleSample[2][2] = {}; +PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2] = {}; +PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2] = {}; +PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX] = {}; +PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2] = {}; +PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2] = {}; +PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2] = {};
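The function tables above are populated by the recursive "chooser" templates that follow. A self-contained sketch of the pattern, assuming a hypothetical two-parameter kernel (Kernel, PFN_KERNEL and Chooser are illustrative names, not part of this change):

#include <cstdint>

template <uint32_t A, uint32_t B>
void Kernel() { /* body specialized at compile time on A and B */ }

typedef void (*PFN_KERNEL)();

template <uint32_t... ArgsT>
struct Chooser
{
    // last runtime argument: terminate the recursion and pick the kernel
    static PFN_KERNEL GetFunc(uint32_t arg)
    {
        return (arg > 0) ? &Kernel<ArgsT..., 1> : &Kernel<ArgsT..., 0>;
    }

    // earlier arguments: append one static value and recurse
    template <typename... TArgsT>
    static PFN_KERNEL GetFunc(uint32_t arg, TArgsT... remainingArgs)
    {
        return (arg > 0) ? Chooser<ArgsT..., 1>::GetFunc(remainingArgs...)
                         : Chooser<ArgsT..., 0>::GetFunc(remainingArgs...);
    }
};

// usage: PFN_KERNEL pfn = Chooser<>::GetFunc(runtimeA, runtimeB);

// Recursive template used to auto-nest conditionals. Converts dynamic enum function
// arguments to static template arguments.
template <uint32_t...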
ArgsT> +struct OMChooser +{ + // Last Arg Terminator + static PFN_OUTPUT_MERGER GetFunc(SWR_MULTISAMPLE_COUNT tArg) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_1X>; break; + case SWR_MULTISAMPLE_2X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_2X>; break; + case SWR_MULTISAMPLE_4X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_4X>; break; + case SWR_MULTISAMPLE_8X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_8X>; break; + case SWR_MULTISAMPLE_16X: return OutputMerger<ArgsT..., SWR_MULTISAMPLE_16X>; break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_OUTPUT_MERGER GetFunc(uint32_t tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case 0: return OMChooser<ArgsT..., 0>::GetFunc(remainingArgs...); break; + case 1: return OMChooser<ArgsT..., 1>::GetFunc(remainingArgs...); break; + case 2: return OMChooser<ArgsT..., 2>::GetFunc(remainingArgs...); break; + case 3: return OMChooser<ArgsT..., 3>::GetFunc(remainingArgs...); break; + case 4: return OMChooser<ArgsT..., 4>::GetFunc(remainingArgs...); break; + case 5: return OMChooser<ArgsT..., 5>::GetFunc(remainingArgs...); break; + case 6: return OMChooser<ArgsT..., 6>::GetFunc(remainingArgs...); break; + case 7: return OMChooser<ArgsT..., 7>::GetFunc(remainingArgs...); break; + case 8: return OMChooser<ArgsT..., 8>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid RT index\n"); + return nullptr; + break; + } + } +}; + +// Recursive template used to auto-nest conditionals. Converts dynamic enum function +// arguments to static template arguments. +template <uint32_t... ArgsT> +struct BECentroidBarycentricChooser +{ + + // Last Arg Terminator + template <typename... TArgsT> + static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg) + { + if(tArg > 0) + { + return CalcCentroidBarycentrics<ArgsT..., 1>; + } + + return CalcCentroidBarycentrics<ArgsT..., 0>; + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_2X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_4X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_8X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_16X: return BECentroidBarycentricChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_CALC_CENTROID_BARYCENTRICS GetFunc(uint32_t tArg, TArgsT... remainingArgs) + { + if(tArg > 0) + { + return BECentroidBarycentricChooser<ArgsT..., 1>::GetFunc(remainingArgs...); + } + + return BECentroidBarycentricChooser<ArgsT..., 0>::GetFunc(remainingArgs...); + } +}; + +// Recursive template used to auto-nest conditionals. Converts dynamic enum function +// arguments to static template arguments. +template <uint32_t... 
ArgsT> +struct BEChooser +{ + // Last Arg Terminator + static PFN_BACKEND_FUNC GetFunc(SWR_BACKEND_FUNCS tArg) + { + switch(tArg) + { + case SWR_BACKEND_SINGLE_SAMPLE: return BackendSingleSample<ArgsT...>; break; + case SWR_BACKEND_MSAA_PIXEL_RATE: return BackendPixelRate<ArgsT...>; break; + case SWR_BACKEND_MSAA_SAMPLE_RATE: return BackendSampleRate<ArgsT...>; break; + default: + SWR_ASSERT(0 && "Invalid backend func\n"); + return nullptr; + break; + } + } + + + // Recursively parse args + template <typename... TArgsT> + static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_MULTISAMPLE_1X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_1X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_2X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_2X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_4X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_4X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_8X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_8X>::GetFunc(remainingArgs...); break; + case SWR_MULTISAMPLE_16X: return BEChooser<ArgsT..., SWR_MULTISAMPLE_16X>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample count\n"); + return nullptr; + break; + } + } + + // Recursively parse args + template <typename... TArgsT> + static PFN_BACKEND_FUNC GetFunc(uint32_t tArg, TArgsT... remainingArgs) + { + if(tArg > 0) + { + return BEChooser<ArgsT..., 1>::GetFunc(remainingArgs...); + } + + return BEChooser<ArgsT..., 0>::GetFunc(remainingArgs...); + } +}; + +template <uint32_t numRenderTargets, SWR_MULTISAMPLE_COUNT numSampleRates> +void InitBackendOMFuncTable(PFN_OUTPUT_MERGER (&table)[numRenderTargets][numSampleRates]) +{ + for(uint32_t rtNum = SWR_ATTACHMENT_COLOR0; rtNum < numRenderTargets; rtNum++) + { + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + table[rtNum][sampleCount] = + OMChooser<>::GetFunc((SWR_RENDERTARGET_ATTACHMENT)rtNum, (SWR_MULTISAMPLE_COUNT)sampleCount); + } + } +} + +template <SWR_MULTISAMPLE_COUNT numSampleRates> +void InitBackendBarycentricsTables(PFN_CALC_PIXEL_BARYCENTRICS (&pixelTable)[2], + PFN_CALC_SAMPLE_BARYCENTRICS (&sampleTable)[2], + PFN_CALC_CENTROID_BARYCENTRICS (¢roidTable)[numSampleRates][2][2][2]) +{ + pixelTable[0] = CalcPixelBarycentrics<0>; + pixelTable[1] = CalcPixelBarycentrics<1>; + + sampleTable[0] = CalcSampleBarycentrics<0>; + sampleTable[1] = CalcSampleBarycentrics<1>; + + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + for(uint32_t baryMask = 0; baryMask < 2; baryMask++) + { + for(uint32_t patternNum = 0; patternNum < 2; patternNum++) + { + for(uint32_t forcedSampleEnable = 0; forcedSampleEnable < 2; forcedSampleEnable++) + { + centroidTable[sampleCount][baryMask][patternNum][forcedSampleEnable]= + BECentroidBarycentricChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, baryMask, patternNum, forcedSampleEnable); + } + } + } + } +} + +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[2][2]) +{ + gBackendSingleSample[0][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); + gBackendSingleSample[0][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NONE, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); + gBackendSingleSample[1][0] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, 
SWR_INPUT_COVERAGE_NORMAL, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); + gBackendSingleSample[1][1] = BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, SWR_INPUT_COVERAGE_NORMAL, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_SINGLE_SAMPLE); +} + +template <SWR_MULTISAMPLE_COUNT numSampleRates, SWR_MSAA_SAMPLE_PATTERN numSamplePatterns, SWR_INPUT_COVERAGE numCoverageModes> +void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numSamplePatterns][numCoverageModes][2][2]) +{ + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < numSamplePatterns; samplePattern++) + { + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++) + { + for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) + { + table[sampleCount][samplePattern][inputCoverage][isCentroid][0] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE); + table[sampleCount][samplePattern][inputCoverage][isCentroid][1] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, samplePattern, inputCoverage, isCentroid, 1, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_PIXEL_RATE); + } + } + } + } +} + +template <uint32_t numSampleRates, uint32_t numCoverageModes> +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[numSampleRates][numCoverageModes][2]) +{ + for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < numSampleRates; sampleCount++) + { + for(uint32_t inputCoverage = SWR_INPUT_COVERAGE_NONE; inputCoverage < numCoverageModes; inputCoverage++) + { + table[sampleCount][inputCoverage][0] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 0, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + table[sampleCount][inputCoverage][1] = + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, inputCoverage, 1, 0, (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); + } + } +} + +void InitBackendFuncTables() +{ + InitBackendSampleFuncTable(gBackendSingleSample); + InitBackendPixelFuncTable<(SWR_MULTISAMPLE_COUNT)SWR_MULTISAMPLE_TYPE_MAX, SWR_MSAA_SAMPLE_PATTERN_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendPixelRateTable); + InitBackendSampleFuncTable<SWR_MULTISAMPLE_TYPE_MAX, SWR_INPUT_COVERAGE_MAX>(gBackendSampleRateTable); + InitBackendOMFuncTable<SWR_NUM_RENDERTARGETS+1, SWR_MULTISAMPLE_TYPE_MAX>(gBackendOutputMergerTable); + InitBackendBarycentricsTables<(SWR_MULTISAMPLE_COUNT)(SWR_MULTISAMPLE_TYPE_MAX)>(gPixelBarycentricTable, gSampleBarycentricTable, gCentroidBarycentricTable); + + gBackendNullPs[SWR_MULTISAMPLE_1X] = &BackendNullPS<SWR_MULTISAMPLE_1X>; + gBackendNullPs[SWR_MULTISAMPLE_2X] = &BackendNullPS<SWR_MULTISAMPLE_2X>; + gBackendNullPs[SWR_MULTISAMPLE_4X] = &BackendNullPS<SWR_MULTISAMPLE_4X>; + gBackendNullPs[SWR_MULTISAMPLE_8X] = &BackendNullPS<SWR_MULTISAMPLE_8X>; + gBackendNullPs[SWR_MULTISAMPLE_16X] = &BackendNullPS<SWR_MULTISAMPLE_16X>; +} diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h new file mode 100644 index 00000000000..53089e5047b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -0,0 +1,59 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file backend.h +* +* @brief Backend handles rasterization, pixel shading and output merger +* operations. +* +******************************************************************************/ +#pragma once + +#include "common/os.h" +#include "core/context.h" + +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); +void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); +void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); +void InitClearTilesTable(); + +enum SWR_BACKEND_FUNCS +{ + SWR_BACKEND_SINGLE_SAMPLE, + SWR_BACKEND_MSAA_PIXEL_RATE, + SWR_BACKEND_MSAA_SAMPLE_RATE, + SWR_BACKEND_FUNCS_MAX, +}; +void InitBackendFuncTables(); + +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; +extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; +extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; diff --git a/src/gallium/drivers/swr/rasterizer/core/blend.h b/src/gallium/drivers/swr/rasterizer/core/blend.h new file mode 100644 index 00000000000..626c237d75b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/blend.h @@ -0,0 +1,318 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file blend.h +* +* @brief Implementation for blending operations. +* +******************************************************************************/ +#include "state.h" + +template<bool Color, bool Alpha> +INLINE +void GenerateBlendFactor(SWR_BLEND_FACTOR func, simdvector &constantColor, simdvector &src, simdvector &src1, simdvector &dst, simdvector &out) +{ + simdvector result; + + switch (func) + { + case BLENDFACTOR_ZERO: + result.x = _simd_setzero_ps(); + result.y = _simd_setzero_ps(); + result.z = _simd_setzero_ps(); + result.w = _simd_setzero_ps(); + break; + + case BLENDFACTOR_ONE: + result.x = _simd_set1_ps(1.0); + result.y = _simd_set1_ps(1.0); + result.z = _simd_set1_ps(1.0); + result.w = _simd_set1_ps(1.0); + break; + + case BLENDFACTOR_SRC_COLOR: + result = src; + break; + + case BLENDFACTOR_DST_COLOR: + result = dst; + break; + + case BLENDFACTOR_INV_SRC_COLOR: + result.x = _simd_sub_ps(_simd_set1_ps(1.0), src.x); + result.y = _simd_sub_ps(_simd_set1_ps(1.0), src.y); + result.z = _simd_sub_ps(_simd_set1_ps(1.0), src.z); + result.w = _simd_sub_ps(_simd_set1_ps(1.0), src.w); + break; + + case BLENDFACTOR_INV_DST_COLOR: + result.x = _simd_sub_ps(_simd_set1_ps(1.0), dst.x); + result.y = _simd_sub_ps(_simd_set1_ps(1.0), dst.y); + result.z = _simd_sub_ps(_simd_set1_ps(1.0), dst.z); + result.w = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); + break; + + case BLENDFACTOR_SRC_ALPHA: + result.x = src.w; + result.y = src.w; + result.z = src.w; + result.w = src.w; + break; + + case BLENDFACTOR_INV_SRC_ALPHA: + { + simdscalar oneMinusSrcA = _simd_sub_ps(_simd_set1_ps(1.0), src.w); + result.x = oneMinusSrcA; + result.y = oneMinusSrcA; + result.z = oneMinusSrcA; + result.w = oneMinusSrcA; + break; + } + + case BLENDFACTOR_DST_ALPHA: + result.x = dst.w; + result.y = dst.w; + result.z = dst.w; + result.w = dst.w; + break; + + case BLENDFACTOR_INV_DST_ALPHA: + { + simdscalar oneMinusDstA = _simd_sub_ps(_simd_set1_ps(1.0), dst.w); + result.x = oneMinusDstA; + result.y = oneMinusDstA; + result.z = oneMinusDstA; + result.w = oneMinusDstA; + break; + } + + case BLENDFACTOR_SRC_ALPHA_SATURATE: + { + simdscalar sat = _simd_min_ps(src.w, _simd_sub_ps(_simd_set1_ps(1.0), dst.w)); + result.x = sat; + result.y = sat; + result.z = sat; + result.w = _simd_set1_ps(1.0); + break; + } + + case BLENDFACTOR_CONST_COLOR: + result.x = constantColor[0]; + result.y = constantColor[1]; + result.z = constantColor[2]; + result.w
= constantColor[3]; + break; + + case BLENDFACTOR_CONST_ALPHA: + result.x = result.y = result.z = result.w = constantColor[3]; + break; + + case BLENDFACTOR_INV_CONST_COLOR: + { + result.x = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[0]); + result.y = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[1]); + result.z = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[2]); + result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); + break; + } + + case BLENDFACTOR_INV_CONST_ALPHA: + { + result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), constantColor[3]); + break; + } + + case BLENDFACTOR_SRC1_COLOR: + result.x = src1.x; + result.y = src1.y; + result.z = src1.z; + result.w = src1.w; + break; + + case BLENDFACTOR_SRC1_ALPHA: + result.x = result.y = result.z = result.w = src1.w; + break; + + case BLENDFACTOR_INV_SRC1_COLOR: + result.x = _simd_sub_ps(_simd_set1_ps(1.0f), src1.x); + result.y = _simd_sub_ps(_simd_set1_ps(1.0f), src1.y); + result.z = _simd_sub_ps(_simd_set1_ps(1.0f), src1.z); + result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); + break; + + case BLENDFACTOR_INV_SRC1_ALPHA: + result.x = result.y = result.z = result.w = _simd_sub_ps(_simd_set1_ps(1.0f), src1.w); + break; + + default: SWR_ASSERT(false, "Unimplemented blend factor: %d", func); + } + + if (Color) + { + out.x = result.x; + out.y = result.y; + out.z = result.z; + } + if (Alpha) + { + out.w = result.w; + } + +} + +template<bool Color, bool Alpha> +INLINE void BlendFunc(SWR_BLEND_OP blendOp, simdvector &src, simdvector &srcFactor, simdvector &dst, simdvector &dstFactor, simdvector &out) +{ + simdvector result; + + switch (blendOp) + { + case BLENDOP_ADD: + result.x = _simd_fmadd_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_fmadd_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_fmadd_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_fmadd_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); + break; + + case BLENDOP_SUBTRACT: + result.x = _simd_fmsub_ps(srcFactor.x, src.x, _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_fmsub_ps(srcFactor.y, src.y, _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_fmsub_ps(srcFactor.z, src.z, _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_fmsub_ps(srcFactor.w, src.w, _simd_mul_ps(dstFactor.w, dst.w)); + break; + + case BLENDOP_REVSUBTRACT: + result.x = _simd_fmsub_ps(dstFactor.x, dst.x, _simd_mul_ps(srcFactor.x, src.x)); + result.y = _simd_fmsub_ps(dstFactor.y, dst.y, _simd_mul_ps(srcFactor.y, src.y)); + result.z = _simd_fmsub_ps(dstFactor.z, dst.z, _simd_mul_ps(srcFactor.z, src.z)); + result.w = _simd_fmsub_ps(dstFactor.w, dst.w, _simd_mul_ps(srcFactor.w, src.w)); + break; + + case BLENDOP_MIN: + result.x = _simd_min_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_min_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_min_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_min_ps(_simd_mul_ps(srcFactor.w, src.w), _simd_mul_ps(dstFactor.w, dst.w)); + break; + + case BLENDOP_MAX: + result.x = _simd_max_ps(_simd_mul_ps(srcFactor.x, src.x), _simd_mul_ps(dstFactor.x, dst.x)); + result.y = _simd_max_ps(_simd_mul_ps(srcFactor.y, src.y), _simd_mul_ps(dstFactor.y, dst.y)); + result.z = _simd_max_ps(_simd_mul_ps(srcFactor.z, src.z), _simd_mul_ps(dstFactor.z, dst.z)); + result.w = _simd_max_ps(_simd_mul_ps(srcFactor.w, src.w), 
_simd_mul_ps(dstFactor.w, dst.w)); + break; + + default: + SWR_ASSERT(false, "Unimplemented blend function: %d", blendOp); + } + + if (Color) + { + out.x = result.x; + out.y = result.y; + out.z = result.z; + } + if (Alpha) + { + out.w = result.w; + } +} + +template<SWR_TYPE type> +INLINE void Clamp(simdvector &src) +{ + switch (type) + { + case SWR_TYPE_FLOAT: + break; + + case SWR_TYPE_UNORM: + src.x = _simd_max_ps(src.x, _simd_setzero_ps()); + src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); + + src.y = _simd_max_ps(src.y, _simd_setzero_ps()); + src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); + + src.z = _simd_max_ps(src.z, _simd_setzero_ps()); + src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); + + src.w = _simd_max_ps(src.w, _simd_setzero_ps()); + src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); + break; + + case SWR_TYPE_SNORM: + src.x = _simd_max_ps(src.x, _simd_set1_ps(-1.0f)); + src.x = _simd_min_ps(src.x, _simd_set1_ps(1.0f)); + + src.y = _simd_max_ps(src.y, _simd_set1_ps(-1.0f)); + src.y = _simd_min_ps(src.y, _simd_set1_ps(1.0f)); + + src.z = _simd_max_ps(src.z, _simd_set1_ps(-1.0f)); + src.z = _simd_min_ps(src.z, _simd_set1_ps(1.0f)); + + src.w = _simd_max_ps(src.w, _simd_set1_ps(-1.0f)); + src.w = _simd_min_ps(src.w, _simd_set1_ps(1.0f)); + break; + + default: + SWR_ASSERT(false, "Unimplemented clamp: %d", type); + break; + } +} + +template<SWR_TYPE type> +void Blend(const SWR_BLEND_STATE *pBlendState, const SWR_RENDER_TARGET_BLEND_STATE *pState, simdvector &src, simdvector& src1, BYTE *pDst, simdvector &result) +{ + // load render target + simdvector dst; + LoadSOA<KNOB_COLOR_HOT_TILE_FORMAT>(pDst, dst); + + simdvector constColor; + constColor.x = _simd_broadcast_ss(&pBlendState->constantColor[0]); + constColor.y = _simd_broadcast_ss(&pBlendState->constantColor[1]); + constColor.z = _simd_broadcast_ss(&pBlendState->constantColor[2]); + constColor.w = _simd_broadcast_ss(&pBlendState->constantColor[3]); + + // clamp src/dst/constant + Clamp<type>(src); + Clamp<type>(src1); + Clamp<type>(dst); + Clamp<type>(constColor); + + simdvector srcFactor, dstFactor; + if (pBlendState->independentAlphaBlendEnable) + { + GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); + GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->sourceAlphaBlendFactor, constColor, src, src1, dst, srcFactor); + + GenerateBlendFactor<true, false>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); + GenerateBlendFactor<false, true>((SWR_BLEND_FACTOR)pState->destAlphaBlendFactor, constColor, src, src1, dst, dstFactor); + + BlendFunc<true, false>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<false, true>((SWR_BLEND_OP)pState->alphaBlendFunc, src, srcFactor, dst, dstFactor, result); + } + else + { + GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->sourceBlendFactor, constColor, src, src1, dst, srcFactor); + GenerateBlendFactor<true, true>((SWR_BLEND_FACTOR)pState->destBlendFactor, constColor, src, src1, dst, dstFactor); + + BlendFunc<true, true>((SWR_BLEND_OP)pState->colorBlendFunc, src, srcFactor, dst, dstFactor, result); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp new file mode 100644 index 00000000000..ce27bf71d3c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -0,0 +1,201 @@ +/**************************************************************************** +* 
Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file clip.cpp +* +* @brief Implementation for clipping +* +******************************************************************************/ + +#include <assert.h> + +#include "common/os.h" +#include "core/clip.h" + +float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) +{ + return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); +} + +template<SWR_CLIPCODES ClippingPlane> +inline void intersect( + int s, // index to the first edge vertex v1 in pInPts. + int p, // index to the second edge vertex v2 in pInPts. + const float *pInPts, // array of all the input positions. + const float *pInAttribs, // array of all attributes for all vertices. The attributes for each vertex are contiguous. + int numInAttribs, // number of attributes per vertex. + int i, // output index. + float *pOutPts, // array of output positions. We'll write our new intersection point at i*4. + float *pOutAttribs) // array of output attributes. We'll write our new attributes at i*numInAttribs. +{ + float t; + + // Find the parameter of the intersection. + // t = (v1.w - v1.x) / ((v2.x - v1.x) - (v2.w - v1.w)) for x = w (RIGHT) plane, etc. + const float *v1 = &pInPts[s*4]; + const float *v2 = &pInPts[p*4]; + + switch (ClippingPlane) + { + case FRUSTUM_LEFT: t = ComputeInterpFactor(v1[3] + v1[0], v2[3] + v2[0]); break; + case FRUSTUM_RIGHT: t = ComputeInterpFactor(v1[3] - v1[0], v2[3] - v2[0]); break; + case FRUSTUM_TOP: t = ComputeInterpFactor(v1[3] + v1[1], v2[3] + v2[1]); break; + case FRUSTUM_BOTTOM: t = ComputeInterpFactor(v1[3] - v1[1], v2[3] - v2[1]); break; + case FRUSTUM_NEAR: t = ComputeInterpFactor(v1[2], v2[2]); break; + case FRUSTUM_FAR: t = ComputeInterpFactor(v1[3] - v1[2], v2[3] - v2[2]); break; + default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + }; + + + const float *a1 = &pInAttribs[s*numInAttribs]; + const float *a2 = &pInAttribs[p*numInAttribs]; + + float *pOutP = &pOutPts[i*4]; + float *pOutA = &pOutAttribs[i*numInAttribs];
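ComputeInterpFactor follows from the boundary distance being linear along the edge: solving d1 + t*(d2 - d1) = 0 for t gives t = d1 / (d1 - d2). A worked check, where ComputeInterpFactorRef simply mirrors the function above:

#include <cassert>

static float ComputeInterpFactorRef(float d1, float d2)
{
    return d1 / (d1 - d2);
}

int main()
{
    // boundary distance +1 at v1 and -3 at v2: the linear distance
    // crosses zero a quarter of the way from v1 to v2
    assert(ComputeInterpFactorRef(1.0f, -3.0f) == 0.25f);
    return 0;
}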
+ // Interpolate new position. + for(int j = 0; j < 4; ++j) + { + pOutP[j] = v1[j] + (v2[j]-v1[j])*t; + } + + // Interpolate Attributes + for(int attr = 0; attr < numInAttribs; ++attr) + { + pOutA[attr] = a1[attr] + (a2[attr]-a1[attr])*t; + } +} + + +// Checks whether vertex v lies inside the clipping plane +// in homogeneous coords: check -w <= {x,y} <= w and 0 <= z <= w (near uses the D3D convention below) +// +template<SWR_CLIPCODES ClippingPlane> +inline int inside(const float v[4]) +{ + switch (ClippingPlane) + { + case FRUSTUM_LEFT : return (v[0]>=-v[3]); + case FRUSTUM_RIGHT : return (v[0]<= v[3]); + case FRUSTUM_TOP : return (v[1]>=-v[3]); + case FRUSTUM_BOTTOM : return (v[1]<= v[3]); + case FRUSTUM_NEAR : return (v[2]>=0.0f); + case FRUSTUM_FAR : return (v[2]<= v[3]); + default: + SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + return 0; + } +} + + +// Clips a polygon in homogeneous coordinates to a particular clipping plane. +// Takes in vertices of the polygon (InPts) and the clipping plane +// Puts the vertices of the clipped polygon in OutPts +// Returns number of points in clipped polygon +// +template<SWR_CLIPCODES ClippingPlane> +int ClipTriToPlane( const float *pInPts, int numInPts, + const float *pInAttribs, int numInAttribs, + float *pOutPts, float *pOutAttribs) +{ + int i=0; // number of vertices written to pOutPts so far; each vertex occupies 4 floats + + for (int j = 0; j < numInPts; ++j) + { + int s = j; + int p = (j + 1) % numInPts; + + int s_in = inside<ClippingPlane>(&pInPts[s*4]); + int p_in = inside<ClippingPlane>(&pInPts[p*4]); + + // test if vertex is to be added to output vertices + if (s_in != p_in) // edge crosses clipping plane + { + // find point of intersection + intersect<ClippingPlane>(s, p, pInPts, pInAttribs, numInAttribs, i, pOutPts, pOutAttribs); + i++; + } + if (p_in) // 2nd vertex is inside clipping volume, add it to output + { + // Copy 2nd vertex position of edge over to output. + for(int k = 0; k < 4; ++k) + { + pOutPts[i*4 + k] = pInPts[p*4 + k]; + }
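The loop above is the per-plane Sutherland-Hodgman step: each directed edge (s, p) emits up to two vertices. A compact restatement of the emission rule (EmitCount is an illustrative helper, not code from this commit):

// For one edge (s, p) tested against one plane:
//   edge crosses the plane -> emit the intersection point
//   p is inside            -> also emit p
// so each edge contributes 0, 1 or 2 output vertices.
static int EmitCount(bool sInside, bool pInside)
{
    int n = 0;
    if (sInside != pInside) ++n;   // crossing: intersection point
    if (pInside)            ++n;   // surviving endpoint
    return n;
}

// Copy 2nd vertex attributes of edge over to output.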
+ for(int attr = 0; attr < numInAttribs; ++attr) + { + pOutAttribs[i*numInAttribs+attr] = pInAttribs[p*numInAttribs+attr]; + } + i++; + } + // edge does not cross clipping plane and vertex outside clipping volume + // => do not add vertex + } + return i; +} + + + +void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) +{ + // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping + OSALIGN(float, 16) tempPts[6 * 4]; + OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; + + // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision + int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_FAR>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pOutTriangles, NumOutPts, pOutAttribs, numAttribs, tempPts, tempAttribs); + NumOutPts = ClipTriToPlane<FRUSTUM_TOP>(tempPts, NumOutPts, tempAttribs, numAttribs, pOutTriangles, pOutAttribs); + + SWR_ASSERT(NumOutPts <= 6); + + *numVerts = NumOutPts; + return; +} + +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +{ + RDTSC_START(FEClipTriangles); + Clipper<3> clipper(workerId, pDC); + clipper.ExecuteStage(pa, prims, primMask, primId); + RDTSC_STOP(FEClipTriangles, 1, 0); +} + +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +{ + RDTSC_START(FEClipLines); + Clipper<2> clipper(workerId, pDC); + clipper.ExecuteStage(pa, prims, primMask, primId); + RDTSC_STOP(FEClipLines, 1, 0); +} +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId) +{ + RDTSC_START(FEClipPoints); + Clipper<1> clipper(workerId, pDC); + clipper.ExecuteStage(pa, prims, primMask, primId); + RDTSC_STOP(FEClipPoints, 1, 0); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h new file mode 100644 index 00000000000..49494a4e374 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -0,0 +1,868 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file clip.h +* +* @brief Definitions for clipping +* +******************************************************************************/ +#pragma once + +#include "common/simdintrin.h" +#include "core/context.h" +#include "core/pa.h" +#include "rdtsc_core.h" + +enum SWR_CLIPCODES +{ + // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. + // Guardband is able to use a single high-bit with 4 separate LSBs, because it computes a union, rather than intersection, of clipcodes. +#define CLIPCODE_SHIFT 23 + FRUSTUM_LEFT = (0x01 << CLIPCODE_SHIFT), + FRUSTUM_TOP = (0x02 << CLIPCODE_SHIFT), + FRUSTUM_RIGHT = (0x04 << CLIPCODE_SHIFT), + FRUSTUM_BOTTOM = (0x08 << CLIPCODE_SHIFT), + + FRUSTUM_NEAR = (0x10 << CLIPCODE_SHIFT), + FRUSTUM_FAR = (0x20 << CLIPCODE_SHIFT), + + NEGW = (0x40 << CLIPCODE_SHIFT), + + GUARDBAND_LEFT = (0x80 << CLIPCODE_SHIFT | 0x1), + GUARDBAND_TOP = (0x80 << CLIPCODE_SHIFT | 0x2), + GUARDBAND_RIGHT = (0x80 << CLIPCODE_SHIFT | 0x4), + GUARDBAND_BOTTOM = (0x80 << CLIPCODE_SHIFT | 0x8) +}; + +#define FRUSTUM_CLIP_MASK (FRUSTUM_LEFT|FRUSTUM_TOP|FRUSTUM_RIGHT|FRUSTUM_BOTTOM|FRUSTUM_NEAR|FRUSTUM_FAR) +#define GUARDBAND_CLIP_MASK (FRUSTUM_NEAR|FRUSTUM_FAR|GUARDBAND_LEFT|GUARDBAND_TOP|GUARDBAND_RIGHT|GUARDBAND_BOTTOM|NEGW) + +void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, + int *numVerts, float *pOutAttribs); + +INLINE +void ComputeClipCodes(DRIVER_TYPE type, const API_STATE& state, const simdvector& vertex, simdscalar& clipCodes) +{ + clipCodes = _simd_setzero_ps(); + + // -w + simdscalar vNegW = _simd_mul_ps(vertex.w, _simd_set1_ps(-1.0f)); + + // FRUSTUM_LEFT + simdscalar vRes = _simd_cmplt_ps(vertex.x, vNegW); + clipCodes = _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_LEFT))); + + // FRUSTUM_TOP + vRes = _simd_cmplt_ps(vertex.y, vNegW); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_TOP)))); + + // FRUSTUM_RIGHT + vRes = _simd_cmpgt_ps(vertex.x, vertex.w); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_RIGHT)))); + + // FRUSTUM_BOTTOM + vRes = _simd_cmpgt_ps(vertex.y, vertex.w); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_BOTTOM)))); + + if (state.rastState.depthClipEnable) + { + // FRUSTUM_NEAR + // DX clips depth [0..w], GL clips [-w..w] + if (type == DX) + { + vRes = _simd_cmplt_ps(vertex.z, _simd_setzero_ps()); + } + else + { + vRes = _simd_cmplt_ps(vertex.z, vNegW); + } + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_NEAR)))); + + // FRUSTUM_FAR + vRes = _simd_cmpgt_ps(vertex.z, vertex.w); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(FRUSTUM_FAR)))); + } + + // NEGW + vRes = _simd_cmple_ps(vertex.w, _simd_setzero_ps()); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(NEGW)))); + + // GUARDBAND_LEFT + 
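The SIMD plane tests in ComputeClipCodes mirror a simple scalar rule. A sketch of the frustum portion, reusing the SWR_CLIPCODES values defined above (ClipCodeScalar and isDX are illustrative names, and the real code also honors depthClipEnable), before the guardband terms are ORed in below:

#include <cstdint>

static uint32_t ClipCodeScalar(float x, float y, float z, float w, bool isDX)
{
    uint32_t code = 0;
    if (x < -w)    code |= FRUSTUM_LEFT;
    if (y < -w)    code |= FRUSTUM_TOP;
    if (x > w)     code |= FRUSTUM_RIGHT;
    if (y > w)     code |= FRUSTUM_BOTTOM;
    if (isDX ? (z < 0.0f) : (z < -w))
                   code |= FRUSTUM_NEAR;   // DX clips z to [0, w], GL to [-w, w]
    if (z > w)     code |= FRUSTUM_FAR;
    if (w <= 0.0f) code |= NEGW;
    return code;
}

// A primitive is trivially rejected when the AND (intersection) of its
// vertex codes is non-zero; it needs real clipping when the OR (union)
// overlaps GUARDBAND_CLIP_MASK.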
simdscalar gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.left)); + vRes = _simd_cmplt_ps(vertex.x, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_LEFT)))); + + // GUARDBAND_TOP + gbMult = _simd_mul_ps(vNegW, _simd_set1_ps(state.gbState.top)); + vRes = _simd_cmplt_ps(vertex.y, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_TOP)))); + + // GUARDBAND_RIGHT + gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.right)); + vRes = _simd_cmpgt_ps(vertex.x, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_RIGHT)))); + + // GUARDBAND_BOTTOM + gbMult = _simd_mul_ps(vertex.w, _simd_set1_ps(state.gbState.bottom)); + vRes = _simd_cmpgt_ps(vertex.y, gbMult); + clipCodes = _simd_or_ps(clipCodes, _simd_and_ps(vRes, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_BOTTOM)))); +} + +template<uint32_t NumVertsPerPrim> +class Clipper +{ +public: + Clipper(uint32_t in_workerId, DRAW_CONTEXT* in_pDC) : + workerId(in_workerId), driverType(in_pDC->pContext->driverType), pDC(in_pDC), state(GetApiState(in_pDC)) + { + static_assert(NumVertsPerPrim >= 1 && NumVertsPerPrim <= 3, "Invalid NumVertsPerPrim"); + } + + void ComputeClipCodes(simdvector vertex[]) + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + ::ComputeClipCodes(this->driverType, this->state, vertex[i], this->clipCodes[i]); + } + } + + simdscalar ComputeClipCodeIntersection() + { + simdscalar result = this->clipCodes[0]; + for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + { + result = _simd_and_ps(result, this->clipCodes[i]); + } + return result; + } + + simdscalar ComputeClipCodeUnion() + { + simdscalar result = this->clipCodes[0]; + for (uint32_t i = 1; i < NumVertsPerPrim; ++i) + { + result = _simd_or_ps(result, this->clipCodes[i]); + } + return result; + } + + int ComputeNegWMask() + { + simdscalar clipCodeUnion = ComputeClipCodeUnion(); + clipCodeUnion = _simd_and_ps(clipCodeUnion, _simd_castsi_ps(_simd_set1_epi32(NEGW))); + return _simd_movemask_ps(_simd_cmpneq_ps(clipCodeUnion, _simd_setzero_ps())); + } + + int ComputeClipMask() + { + simdscalar clipUnion = ComputeClipCodeUnion(); + clipUnion = _simd_and_ps(clipUnion, _simd_castsi_ps(_simd_set1_epi32(GUARDBAND_CLIP_MASK))); + return _simd_movemask_ps(_simd_cmpneq_ps(clipUnion, _simd_setzero_ps())); + } + + // clipper is responsible for culling any prims with NAN coordinates + int ComputeNaNMask(simdvector prim[]) + { + simdscalar vNanMask = _simd_setzero_ps(); + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simdscalar vNan01 = _simd_cmp_ps(prim[e].v[0], prim[e].v[1], _CMP_UNORD_Q); + vNanMask = _simd_or_ps(vNanMask, vNan01); + simdscalar vNan23 = _simd_cmp_ps(prim[e].v[2], prim[e].v[3], _CMP_UNORD_Q); + vNanMask = _simd_or_ps(vNanMask, vNan23); + } + + return _simd_movemask_ps(vNanMask); + } + + int ComputeUserClipCullMask(PA_STATE& pa, simdvector prim[]) + { + uint8_t cullMask = this->state.rastState.cullDistanceMask; + simdscalar vClipCullMask = _simd_setzero_ps(); + DWORD index; + + simdvector vClipCullDistLo[3]; + simdvector vClipCullDistHi[3]; + + pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, vClipCullDistLo); + pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, vClipCullDistHi); + while (_BitScanForward(&index, cullMask)) + { + cullMask &= ~(1 << index); + uint32_t slot = index >> 2; + uint32_t component = index & 0x3; + + simdscalar vCullMaskElem = _simd_set1_ps(-1.0f); + for (uint32_t e = 0; e < 
NumVertsPerPrim; ++e) + { + simdscalar vCullComp; + if (slot == 0) + { + vCullComp = vClipCullDistLo[e][component]; + } + else + { + vCullComp = vClipCullDistHi[e][component]; + } + + // cull if cull distance < 0 || NAN + simdscalar vCull = _simd_cmp_ps(_mm256_setzero_ps(), vCullComp, _CMP_NLE_UQ); + vCullMaskElem = _simd_and_ps(vCullMaskElem, vCull); + } + vClipCullMask = _simd_or_ps(vClipCullMask, vCullMaskElem); + } + + // clipper should also discard any primitive with NAN clip distance + uint8_t clipMask = this->state.rastState.clipDistanceMask; + while (_BitScanForward(&index, clipMask)) + { + clipMask &= ~(1 << index); + uint32_t slot = index >> 2; + uint32_t component = index & 0x3; + + for (uint32_t e = 0; e < NumVertsPerPrim; ++e) + { + simdscalar vClipComp; + if (slot == 0) + { + vClipComp = vClipCullDistLo[e][component]; + } + else + { + vClipComp = vClipCullDistHi[e][component]; + } + + simdscalar vClip = _simd_cmp_ps(vClipComp, vClipComp, _CMP_UNORD_Q); + vClipCullMask = _simd_or_ps(vClipCullMask, vClip); + } + } + + return _simd_movemask_ps(vClipCullMask); + } + + // clip a single primitive + int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) + { + OSALIGN(float, 16) inVerts[3 * 4]; + OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; + + // transpose primitive position + __m128 verts[3]; + pa.AssembleSingle(VERTEX_POSITION_SLOT, primIndex, verts); + _mm_store_ps(&inVerts[0], verts[0]); + _mm_store_ps(&inVerts[4], verts[1]); + _mm_store_ps(&inVerts[8], verts[2]); + + // transpose attribs + uint32_t numScalarAttribs = this->state.linkageCount * 4; + + int idx = 0; + DWORD slot = 0; + uint32_t mapIdx = 0; + uint32_t tmpLinkage = uint32_t(this->state.linkageMask); + while (_BitScanForward(&slot, tmpLinkage)) + { + tmpLinkage &= ~(1 << slot); + // Compute absolute attrib slot in vertex array + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + this->state.linkageMap[mapIdx++]; + __m128 attrib[3]; // triangle attribs (always 4 wide) + pa.AssembleSingle(inputSlot, primIndex, attrib); + _mm_store_ps(&inAttribs[idx], attrib[0]); + _mm_store_ps(&inAttribs[idx + numScalarAttribs], attrib[1]); + _mm_store_ps(&inAttribs[idx + numScalarAttribs * 2], attrib[2]); + idx += 4; + } + + int numVerts; + Clip(inVerts, inAttribs, numScalarAttribs, pOutPos, &numVerts, pOutAttribs); + + return numVerts; + } + + // clip SIMD primitives + void ClipSimd(const simdscalar& vPrimMask, const simdscalar& vClipMask, PA_STATE& pa, const simdscalari& vPrimId) + { + // input/output vertex store for clipper + simdvertex vertices[7]; // maximum 7 verts generated per triangle + + LONG constantInterpMask = this->state.backendState.constantInterpolationMask; + uint32_t provokingVertex = 0; + if(pa.binTopology == TOP_TRIANGLE_FAN) + { + provokingVertex = this->state.frontendState.provokingVertex.triFan; + } + ///@todo: line topology for wireframe? 
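The constant-interpolation fix-up just below broadcasts the provoking vertex's attribute to every vertex of the primitive, so any later barycentric interpolation of a flat attribute collapses to a constant. A scalar sketch (Attrib and BroadcastProvoking are illustrative names, not part of this change):

#include <array>

using Attrib = std::array<float, 4>;

static void BroadcastProvoking(Attrib verts[3], unsigned provokingVertex)
{
    // afterwards every convex combination of the three vertices equals
    // the provoking vertex's value
    for (unsigned i = 0; i < 3; ++i)
        verts[i] = verts[provokingVertex];
}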
+ + // assemble pos + simdvector tmpVector[NumVertsPerPrim]; + pa.Assemble(VERTEX_POSITION_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_POSITION_SLOT] = tmpVector[i]; + } + + // assemble attribs + DWORD slot = 0; + uint32_t mapIdx = 0; + uint32_t tmpLinkage = this->state.linkageMask; + + int32_t maxSlot = -1; + while (_BitScanForward(&slot, tmpLinkage)) + { + tmpLinkage &= ~(1 << slot); + // Compute absolute attrib slot in vertex array + uint32_t mapSlot = this->state.linkageMap[mapIdx++]; + maxSlot = std::max<int32_t>(maxSlot, mapSlot); + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + mapSlot; + + pa.Assemble(inputSlot, tmpVector); + + // if constant interpolation enabled for this attribute, assign the provoking + // vertex values to all edges + if (_bittest(&constantInterpMask, slot)) + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[inputSlot] = tmpVector[provokingVertex]; + } + } + else + { + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[inputSlot] = tmpVector[i]; + } + } + } + + uint32_t numAttribs = maxSlot + 1; + + simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); + + // set up new PA for binning clipped primitives + PFN_PROCESS_PRIMS pfnBinFunc = nullptr; + PRIMITIVE_TOPOLOGY clipTopology = TOP_UNKNOWN; + if (NumVertsPerPrim == 3) + { + pfnBinFunc = BinTriangles; + clipTopology = TOP_TRIANGLE_FAN; + + // so that the binner knows to bloat wide points later + if (pa.binTopology == TOP_POINT_LIST) + clipTopology = TOP_POINT_LIST; + } + else if (NumVertsPerPrim == 2) + { + pfnBinFunc = BinLines; + clipTopology = TOP_LINE_LIST; + } + else + { + SWR_ASSERT(0 && "Unexpected points in clipper."); + } + + + uint32_t* pVertexCount = (uint32_t*)&vNumClippedVerts; + uint32_t* pPrimitiveId = (uint32_t*)&vPrimId; + + const simdscalari vOffsets = _mm256_set_epi32( + 0 * sizeof(simdvertex), // unused lane + 6 * sizeof(simdvertex), + 5 * sizeof(simdvertex), + 4 * sizeof(simdvertex), + 3 * sizeof(simdvertex), + 2 * sizeof(simdvertex), + 1 * sizeof(simdvertex), + 0 * sizeof(simdvertex)); + + // only need to gather 7 verts + // @todo dynamic mask based on actual # of verts generated per lane + const simdscalar vMask = _mm256_set_ps(0, -1, -1, -1, -1, -1, -1, -1); + + uint32_t numClippedPrims = 0; + for (uint32_t inputPrim = 0; inputPrim < pa.NumPrims(); ++inputPrim) + { + uint32_t numEmittedVerts = pVertexCount[inputPrim]; + if (numEmittedVerts < NumVertsPerPrim) + { + continue; + } + SWR_ASSERT(numEmittedVerts <= 7, "Unexpected vertex count from clipper."); + + uint32_t numEmittedPrims = GetNumPrims(clipTopology, numEmittedVerts); + numClippedPrims += numEmittedPrims; + + // transpose clipper output so that each lane's vertices are in SIMD order + // set aside space for 2 vertices, as the PA will try to read up to 16 verts + // for triangle fan + simdvertex transposedPrims[2]; + + // transpose pos + uint8_t* pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_POSITION_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_POSITION_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + + // transpose attribs + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_ATTRIB_START_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t attrib = 0; attrib < numAttribs; ++attrib) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT +
attrib; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[attribSlot][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + + PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); + + while (clipPa.GetNextStreamOutput()) + { + do + { + simdvector attrib[NumVertsPerPrim]; + bool assemble = clipPa.Assemble(VERTEX_POSITION_SLOT, attrib); + if (assemble) + { + static const uint32_t primMaskMap[] = { 0x0, 0x1, 0x3, 0x7, 0xf, 0x1f, 0x3f, 0x7f, 0xff }; + pfnBinFunc(this->pDC, clipPa, this->workerId, attrib, primMaskMap[numEmittedPrims], _simd_set1_epi32(pPrimitiveId[inputPrim])); + } + } while (clipPa.NextPrim()); + } + } + + // update global pipeline stat + SWR_CONTEXT* pContext = this->pDC->pContext; + UPDATE_STAT(CPrimitives, numClippedPrims); + } + + // execute the clipper stage + void ExecuteStage(PA_STATE& pa, simdvector prim[], uint32_t primMask, simdscalari primId) + { + // set up binner based on PA state + PFN_PROCESS_PRIMS pfnBinner; + switch (pa.binTopology) + { + case TOP_POINT_LIST: + pfnBinner = BinPoints; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LOOP: + case TOP_LINE_LIST_ADJ: + case TOP_LISTSTRIP_ADJ: + pfnBinner = BinLines; + break; + default: + pfnBinner = BinTriangles; + break; + }; + + // update clipper invocations pipeline stat + SWR_CONTEXT* pContext = this->pDC->pContext; + uint32_t numInvoc = _mm_popcnt_u32(primMask); + UPDATE_STAT(CInvocations, numInvoc); + + ComputeClipCodes(prim); + + // cull prims with NAN coords + primMask &= ~ComputeNaNMask(prim); + + // user cull distance cull + if (this->state.rastState.cullDistanceMask) + { + primMask &= ~ComputeUserClipCullMask(pa, prim); + } + + // cull prims outside view frustum + simdscalar clipIntersection = ComputeClipCodeIntersection(); + int validMask = primMask & _simd_movemask_ps(_simd_cmpeq_ps(clipIntersection, _simd_setzero_ps())); + + // skip clipping for points + uint32_t clipMask = 0; + if (NumVertsPerPrim != 1) + { + clipMask = primMask & ComputeClipMask(); + } + + if (clipMask) + { + RDTSC_START(FEGuardbandClip); + // we have to clip tris, execute the clipper, which will also + // call the binner + ClipSimd(vMask(primMask), vMask(clipMask), pa, primId); + RDTSC_STOP(FEGuardbandClip, 1, 0); + } + else if (validMask) + { + // update CPrimitives pipeline state + SWR_CONTEXT* pContext = this->pDC->pContext; + UPDATE_STAT(CPrimitives, _mm_popcnt_u32(validMask)); + + // forward valid prims directly to binner + pfnBinner(this->pDC, pa, this->workerId, prim, validMask, primId); + } + } + +private: + inline simdscalar ComputeInterpFactor(simdscalar boundaryCoord0, simdscalar boundaryCoord1) + { + return _simd_div_ps(boundaryCoord0, _simd_sub_ps(boundaryCoord0, boundaryCoord1)); + } + + inline simdscalari ComputeOffsets(uint32_t attrib, simdscalari vIndices, uint32_t component) + { + const uint32_t simdVertexStride = sizeof(simdvertex); + const uint32_t componentStride = sizeof(simdscalar); + const uint32_t attribStride = sizeof(simdvector); + const __m256i vElemOffset = _mm256_set_epi32(7 * sizeof(float), 6 * sizeof(float), 5 * sizeof(float), 4 * sizeof(float), + 3 * sizeof(float), 2 * sizeof(float), 1 * sizeof(float), 0 * sizeof(float)); + + // step to the simdvertex + simdscalari vOffsets = _simd_mullo_epi32(vIndices, _simd_set1_epi32(simdVertexStride)); + + // step to the attribute and component + vOffsets = _simd_add_epi32(vOffsets, 
_simd_set1_epi32(attribStride * attrib + componentStride * component)); + + // step to the lane + vOffsets = _simd_add_epi32(vOffsets, vElemOffset); + + return vOffsets; + } + + // gathers a single component for a given attribute for each SIMD lane + inline simdscalar GatherComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component) + { + simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); + simdscalar vSrc = _mm256_undefined_ps(); + return _simd_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1); + } + + inline void ScatterComponent(const float* pBuffer, uint32_t attrib, simdscalar vMask, simdscalari vIndices, uint32_t component, simdscalar vSrc) + { + simdscalari vOffsets = ComputeOffsets(attrib, vIndices, component); + + uint32_t* pOffsets = (uint32_t*)&vOffsets; + float* pSrc = (float*)&vSrc; + uint32_t mask = _simd_movemask_ps(vMask); + DWORD lane; + while (_BitScanForward(&lane, mask)) + { + mask &= ~(1 << lane); + uint8_t* pBuf = (uint8_t*)pBuffer + pOffsets[lane]; + *(float*)pBuf = pSrc[lane]; + } + } + + template<SWR_CLIPCODES ClippingPlane> + inline void intersect( + const simdscalar& vActiveMask, // active lanes to operate on + const simdscalari& s, // index to first edge vertex v0 in pInPts. + const simdscalari& p, // index to second edge vertex v1 in pInPts. + const simdvector& v1, // vertex 0 position + const simdvector& v2, // vertex 1 position + simdscalari& outIndex, // output index. + const float *pInVerts, // array of all the input positions. + uint32_t numInAttribs, // number of attributes per vertex. + float *pOutVerts) // array of output positions. We'll write our new intersection point at i*4. + { + // compute interpolation factor + simdscalar t; + switch (ClippingPlane) + { + case FRUSTUM_LEFT: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[0]), _simd_add_ps(v2[3], v2[0])); break; + case FRUSTUM_RIGHT: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[0]), _simd_sub_ps(v2[3], v2[0])); break; + case FRUSTUM_TOP: t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[1]), _simd_add_ps(v2[3], v2[1])); break; + case FRUSTUM_BOTTOM: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[1]), _simd_sub_ps(v2[3], v2[1])); break; + case FRUSTUM_NEAR: + // DX Znear plane is 0, GL is -w + if (this->driverType == DX) + { + t = ComputeInterpFactor(v1[2], v2[2]); + } + else + { + t = ComputeInterpFactor(_simd_add_ps(v1[3], v1[2]), _simd_add_ps(v2[3], v2[2])); + } + break; + case FRUSTUM_FAR: t = ComputeInterpFactor(_simd_sub_ps(v1[3], v1[2]), _simd_sub_ps(v2[3], v2[2])); break; + default: SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + }; + + // interpolate position and store + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vOutPos = _simd_fmadd_ps(_simd_sub_ps(v2[c], v1[c]), t, v1[c]); + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, vActiveMask, outIndex, c, vOutPos); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + } + + template<SWR_CLIPCODES ClippingPlane> + inline simdscalar inside(const simdvector& v) + { + switch 
(ClippingPlane) + { + case FRUSTUM_LEFT: return _simd_cmpge_ps(v[0], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); + case FRUSTUM_RIGHT: return _simd_cmple_ps(v[0], v[3]); + case FRUSTUM_TOP: return _simd_cmpge_ps(v[1], _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); + case FRUSTUM_BOTTOM: return _simd_cmple_ps(v[1], v[3]); + case FRUSTUM_NEAR: return _simd_cmpge_ps(v[2], this->driverType == DX ? _simd_setzero_ps() : _simd_mul_ps(v[3], _simd_set1_ps(-1.0f))); + case FRUSTUM_FAR: return _simd_cmple_ps(v[2], v[3]); + default: + SWR_ASSERT(false, "invalid clipping plane: %d", ClippingPlane); + return _simd_setzero_ps(); + } + } + + template<SWR_CLIPCODES ClippingPlane> + simdscalari ClipTriToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) + { + simdscalari vCurIndex = _simd_setzero_si(); + simdscalari vOutIndex = _simd_setzero_si(); + simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); + + while (!_simd_testz_ps(vActiveMask, vActiveMask)) // loop until activeMask is empty + { + simdscalari s = vCurIndex; + simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); + simdscalari underFlowMask = _simd_cmpgt_epi32(vNumInPts, p); + p = _simd_castps_si(_simd_blendv_ps(_simd_setzero_ps(), _simd_castsi_ps(p), _simd_castsi_ps(underFlowMask))); + + // gather position + simdvector vInPos0, vInPos1; + for (uint32_t c = 0; c < 4; ++c) + { + vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); + vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); + } + + // compute inside mask + simdscalar s_in = inside<ClippingPlane>(vInPos0); + simdscalar p_in = inside<ClippingPlane>(vInPos1); + + // compute intersection mask (s_in != p_in) + simdscalar intersectMask = _simd_xor_ps(s_in, p_in); + intersectMask = _simd_and_ps(intersectMask, vActiveMask); + + // store s if inside + s_in = _simd_and_ps(s_in, vActiveMask); + if (!_simd_testz_ps(s_in, s_in)) + { + // store position + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + } + + // store attribs + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); + } + + // compute and store intersection + if (!_simd_testz_ps(intersectMask, intersectMask)) + { + intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + + // increment outIndex for active lanes + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); + } + + // increment loop index and update active mask + vCurIndex = _simd_add_epi32(vCurIndex, _simd_set1_epi32(1)); + vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); + } + + return vOutIndex; + } + + template<SWR_CLIPCODES ClippingPlane> + simdscalari ClipLineToPlane(const float* pInVerts, const simdscalari& vNumInPts, uint32_t numInAttribs, float* pOutVerts) + { + simdscalari vCurIndex = _simd_setzero_si(); + simdscalari vOutIndex = _simd_setzero_si(); + simdscalar vActiveMask = _simd_castsi_ps(_simd_cmplt_epi32(vCurIndex, vNumInPts)); + + if (!_simd_testz_ps(vActiveMask, 
vActiveMask)) + { + simdscalari s = vCurIndex; + simdscalari p = _simd_add_epi32(s, _simd_set1_epi32(1)); + + // gather position + simdvector vInPos0, vInPos1; + for (uint32_t c = 0; c < 4; ++c) + { + vInPos0[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, s, c); + vInPos1[c] = GatherComponent(pInVerts, VERTEX_POSITION_SLOT, vActiveMask, p, c); + } + + // compute inside mask + simdscalar s_in = inside<ClippingPlane>(vInPos0); + simdscalar p_in = inside<ClippingPlane>(vInPos1); + + // compute intersection mask (s_in != p_in) + simdscalar intersectMask = _simd_xor_ps(s_in, p_in); + intersectMask = _simd_and_ps(intersectMask, vActiveMask); + + // store s if inside + s_in = _simd_and_ps(s_in, vActiveMask); + if (!_simd_testz_ps(s_in, s_in)) + { + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, s_in, vOutIndex, c, vInPos0[c]); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); + } + + // compute and store intersection + if (!_simd_testz_ps(intersectMask, intersectMask)) + { + intersect<ClippingPlane>(intersectMask, s, p, vInPos0, vInPos1, vOutIndex, pInVerts, numInAttribs, pOutVerts); + + // increment outIndex for active lanes + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), intersectMask); + } + + // store p if inside + p_in = _simd_and_ps(p_in, vActiveMask); + if (!_simd_testz_ps(p_in, p_in)) + { + for (uint32_t c = 0; c < 4; ++c) + { + ScatterComponent(pOutVerts, VERTEX_POSITION_SLOT, p_in, vOutIndex, c, vInPos1[c]); + } + + // interpolate attributes and store + for (uint32_t a = 0; a < numInAttribs; ++a) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + a; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, p_in, p, c); + ScatterComponent(pOutVerts, attribSlot, p_in, vOutIndex, c, vAttrib); + } + } + + // increment outIndex + vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), p_in); + } + } + + return vOutIndex; + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Vertical clipper. Clips SIMD primitives at a time + /// @param pVertices - pointer to vertices in SOA form. 
Clipper will read input and write results to this buffer + /// @param vPrimMask - mask of valid input primitives, including non-clipped prims + /// @param numAttribs - number of valid input attribs, including position + simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) + { + // temp storage + simdvertex tempVertices[7]; + float* pTempVerts = (float*)&tempVertices[0]; + + // zero out num input verts for non-active lanes + simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); + vNumInPts = _simd_blendv_epi32(_simd_setzero_si(), vNumInPts, vClipMask); + + // clip prims to frustum + simdscalari vNumOutPts; + if (NumVertsPerPrim == 3) + { + vNumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipTriToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipTriToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipTriToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); + } + else + { + SWR_ASSERT(NumVertsPerPrim == 2); + vNumOutPts = ClipLineToPlane<FRUSTUM_NEAR>(pVertices, vNumInPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane<FRUSTUM_FAR>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipLineToPlane<FRUSTUM_LEFT>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane<FRUSTUM_RIGHT>(pTempVerts, vNumOutPts, numAttribs, pVertices); + vNumOutPts = ClipLineToPlane<FRUSTUM_BOTTOM>(pVertices, vNumOutPts, numAttribs, pTempVerts); + vNumOutPts = ClipLineToPlane<FRUSTUM_TOP>(pTempVerts, vNumOutPts, numAttribs, pVertices); + } + + // restore num verts for non-clipped, active lanes + simdscalar vNonClippedMask = _simd_andnot_ps(vClipMask, vPrimMask); + vNumOutPts = _simd_blendv_epi32(vNumOutPts, _simd_set1_epi32(NumVertsPerPrim), vNonClippedMask); + + return vNumOutPts; + } + + const uint32_t workerId; + const DRIVER_TYPE driverType; + DRAW_CONTEXT* pDC; + const API_STATE& state; + simdscalar clipCodes[NumVertsPerPrim]; +}; + + +// pipeline stage functions +void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); +void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId); diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h new file mode 100644 index 00000000000..4a214aff1c8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -0,0 +1,495 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file context.h +* +* @brief Definitions for SWR_CONTEXT and DRAW_CONTEXT +* The SWR_CONTEXT is our global context and contains the DC ring, +* thread state, etc. +* +* The DRAW_CONTEXT contains all state associated with a draw operation. +* +******************************************************************************/ +#pragma once + +#include <condition_variable> +#include <algorithm> + +#include "core/api.h" +#include "core/utils.h" +#include "core/arena.h" +#include "core/fifo.hpp" +#include "core/knobs.h" +#include "common/simdintrin.h" +#include "core/threads.h" + +// x.8 fixed point precision values +#define FIXED_POINT_SHIFT 8 +#define FIXED_POINT_SCALE 256 + +// x.16 fixed point precision values +#define FIXED_POINT16_SHIFT 16 +#define FIXED_POINT16_SCALE 65536 + +struct SWR_CONTEXT; +struct DRAW_CONTEXT; + +struct TRI_FLAGS +{ + uint32_t frontFacing : 1; + uint32_t yMajor : 1; + uint32_t coverageMask : (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); + uint32_t reserved : 32 - 1 - 1 - (SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); + float pointSize; + uint32_t primID; + uint32_t renderTargetArrayIndex; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TRIANGLE_DESC +///////////////////////////////////////////////////////////////////////// +struct SWR_TRIANGLE_DESC +{ + float I[3]; + float J[3]; + float Z[3]; + float OneOverW[3]; + float recipDet; + + float *pRecipW; + float *pAttribs; + float *pPerspAttribs; + float *pSamplePos; + float *pUserClipBuffer; + + uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; + + TRI_FLAGS triFlags; +}; + +struct TRIANGLE_WORK_DESC +{ + float *pTriBuffer; + float *pAttribs; + float *pUserClipBuffer; + uint32_t numAttribs; + TRI_FLAGS triFlags; +}; + +union CLEAR_FLAGS +{ + struct + { + uint32_t mask : 3; + }; + uint32_t bits; +}; + +struct CLEAR_DESC +{ + CLEAR_FLAGS flags; + float clearRTColor[4]; // RGBA_32F + float clearDepth; // [0..1] + BYTE clearStencil; +}; + +struct INVALIDATE_TILES_DESC +{ + uint32_t attachmentMask; +}; + +struct SYNC_DESC +{ + PFN_CALLBACK_FUNC pfnCallbackFunc; + uint64_t userData; + uint64_t userData2; + uint64_t userData3; +}; + +struct QUERY_DESC +{ + SWR_STATS* pStats; +}; + +struct STORE_TILES_DESC +{ + SWR_RENDERTARGET_ATTACHMENT attachment; + SWR_TILE_STATE postStoreTileState; +}; + +struct COMPUTE_DESC +{ + uint32_t threadGroupCountX; + uint32_t threadGroupCountY; + uint32_t threadGroupCountZ; +}; + 
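The FE_WORK and BE_WORK structures defined just below pair a dispatch function pointer with a tagged union of work descriptors, so heterogeneous work items (sync, draw/triangle, clear, tile invalidate/store, query) can flow through a single queue and be executed uniformly by worker threads. A minimal standalone sketch of that pattern, where WorkItem, ClearDesc, SyncDesc, and DoClear are illustrative stand-ins rather than types from this header:

#include <cstdint>

struct ClearDesc { float rgba[4]; };
struct SyncDesc  { uint64_t userData; };

struct WorkItem;
typedef void (*PfnWork)(const WorkItem& item);

struct WorkItem
{
    PfnWork pfnWork;   // producer points this at the handler for the active union member
    union
    {
        ClearDesc clear;
        SyncDesc  sync;
    } desc;
};

static void DoClear(const WorkItem& item)
{
    // a real handler would consume item.desc.clear here
    (void)item.desc.clear.rgba[0];
}

// producer: WorkItem w; w.pfnWork = DoClear; w.desc.clear = ...; then enqueue w
// consumer: w.pfnWork(w);  // dispatch requires no switch over a type tag

Keeping the function pointer next to its descriptor lets the worker loop dispatch without branching on the work kind; the explicit WORK_TYPE tag in BE_WORK and FE_WORK stays available for consumers that do need to inspect it.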
+typedef void(*PFN_WORK_FUNC)(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc); + +enum WORK_TYPE +{ + SYNC, + DRAW, + CLEAR, + INVALIDATETILES, + STORETILES, + QUERYSTATS, +}; + +struct BE_WORK +{ + WORK_TYPE type; + PFN_WORK_FUNC pfnWork; + union + { + SYNC_DESC sync; + TRIANGLE_WORK_DESC tri; + CLEAR_DESC clear; + INVALIDATE_TILES_DESC invalidateTiles; + STORE_TILES_DESC storeTiles; + QUERY_DESC queryStats; + } desc; +}; + +struct DRAW_WORK +{ + DRAW_CONTEXT* pDC; + union + { + uint32_t numIndices; // DrawIndexed: Number of indices for draw. + uint32_t numVerts; // Draw: Number of verts (triangles, lines, etc) + }; + union + { + const int32_t* pIB; // DrawIndexed: App supplied indices + uint32_t startVertex; // Draw: Starting vertex in VB to render from. + }; + int32_t baseVertex; + uint32_t numInstances; // Number of instances + uint32_t startInstance; // Instance offset + uint32_t startPrimID; // starting primitiveID for this draw batch + uint32_t startVertexID; // starting VertexID for this draw batch (only needed for non-indexed draws) + SWR_FORMAT type; // index buffer type +}; + +typedef void(*PFN_FE_WORK_FUNC)(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t workerId, void* pDesc); +struct FE_WORK +{ + WORK_TYPE type; + PFN_FE_WORK_FUNC pfnWork; + union + { + SYNC_DESC sync; + DRAW_WORK draw; + CLEAR_DESC clear; + INVALIDATE_TILES_DESC invalidateTiles; + STORE_TILES_DESC storeTiles; + QUERY_DESC queryStats; + } desc; +}; + +struct GUARDBAND +{ + float left, right, top, bottom; +}; + +struct PA_STATE; + +// function signature for pipeline stages that execute after primitive assembly +typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], + uint32_t primMask, simdscalari primID); + +OSALIGNLINE(struct) API_STATE +{ + // Vertex Buffers + SWR_VERTEX_BUFFER_STATE vertexBuffers[KNOB_NUM_STREAMS]; + + // Index Buffer + SWR_INDEX_BUFFER_STATE indexBuffer; + + // FS - Fetch Shader State + PFN_FETCH_FUNC pfnFetchFunc; + + // VS - Vertex Shader State + PFN_VERTEX_FUNC pfnVertexFunc; + + // GS - Geometry Shader State + PFN_GS_FUNC pfnGsFunc; + SWR_GS_STATE gsState; + + // CS - Compute Shader + PFN_CS_FUNC pfnCsFunc; + uint32_t totalThreadsInGroup; + + // FE - Frontend State + SWR_FRONTEND_STATE frontendState; + + // SOS - Streamout Shader State + PFN_SO_FUNC pfnSoFunc[MAX_SO_STREAMS]; + + // Streamout state + SWR_STREAMOUT_STATE soState; + mutable SWR_STREAMOUT_BUFFER soBuffer[MAX_SO_STREAMS]; + + // Tessellation State + PFN_HS_FUNC pfnHsFunc; + PFN_DS_FUNC pfnDsFunc; + SWR_TS_STATE tsState; + + // Specifies which VS outputs are sent to PS. 
+    // Does not include position
+    uint32_t linkageMask;
+    uint32_t linkageCount;
+    uint8_t linkageMap[MAX_ATTRIBUTES];
+
+    // attrib mask, specifies the total set of attributes used
+    // by the frontend (vs, so, gs)
+    uint32_t feAttribMask;
+
+    PRIMITIVE_TOPOLOGY topology;
+    bool forceFront;
+
+    // RS - Rasterizer State
+    SWR_RASTSTATE rastState;
+    // floating point multisample offsets
+    float samplePos[SWR_MAX_NUM_MULTISAMPLES * 2];
+
+    GUARDBAND gbState;
+
+    SWR_VIEWPORT vp[KNOB_NUM_VIEWPORTS_SCISSORS];
+    SWR_VIEWPORT_MATRIX vpMatrix[KNOB_NUM_VIEWPORTS_SCISSORS];
+
+    BBOX scissorRects[KNOB_NUM_VIEWPORTS_SCISSORS];
+    BBOX scissorInFixedPoint;
+
+    // Backend state
+    SWR_BACKEND_STATE backendState;
+
+    // PS - Pixel shader state
+    SWR_PS_STATE psState;
+
+    SWR_DEPTH_STENCIL_STATE depthStencilState;
+
+    // OM - Output Merger State
+    SWR_BLEND_STATE blendState;
+    PFN_BLEND_JIT_FUNC pfnBlendFunc[SWR_NUM_RENDERTARGETS];
+
+    // Stats are incremented when this is true.
+    bool enableStats;
+
+    struct
+    {
+        uint32_t colorHottileEnable : 8;
+        uint32_t depthHottileEnable : 1;
+        uint32_t stencilHottileEnable : 1;
+    };
+};
+
+class MacroTileMgr;
+class DispatchQueue;
+
+struct RenderOutputBuffers
+{
+    uint8_t* pColor[SWR_NUM_RENDERTARGETS];
+    uint8_t* pDepth;
+    uint8_t* pStencil;
+};
+
+// Plane equation A/B/C coeffs used to evaluate I/J barycentric coords
+struct BarycentricCoeffs
+{
+    simdscalar vIa;
+    simdscalar vIb;
+    simdscalar vIc;
+
+    simdscalar vJa;
+    simdscalar vJb;
+    simdscalar vJc;
+
+    simdscalar vZa;
+    simdscalar vZb;
+    simdscalar vZc;
+
+    simdscalar vRecipDet;
+
+    simdscalar vAOneOverW;
+    simdscalar vBOneOverW;
+    simdscalar vCOneOverW;
+};
+
+// pipeline function pointer types
+typedef void(*PFN_BACKEND_FUNC)(DRAW_CONTEXT*, uint32_t, uint32_t, uint32_t, SWR_TRIANGLE_DESC&, RenderOutputBuffers&);
+typedef void(*PFN_OUTPUT_MERGER)(SWR_PS_CONTEXT &, uint8_t* (&)[SWR_NUM_RENDERTARGETS], uint32_t, const SWR_BLEND_STATE*,
+                                 const PFN_BLEND_JIT_FUNC (&)[SWR_NUM_RENDERTARGETS], simdscalar&, simdscalar);
+typedef void(*PFN_CALC_PIXEL_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &);
+typedef void(*PFN_CALC_SAMPLE_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT&);
+typedef void(*PFN_CALC_CENTROID_BARYCENTRICS)(const BarycentricCoeffs&, SWR_PS_CONTEXT &, const uint64_t *const, const uint32_t,
+                                              const simdscalar, const simdscalar);
+
+struct BACKEND_FUNCS
+{
+    PFN_BACKEND_FUNC pfnBackend;
+    PFN_CALC_PIXEL_BARYCENTRICS pfnCalcPixelBarycentrics;
+    PFN_CALC_SAMPLE_BARYCENTRICS pfnCalcSampleBarycentrics;
+    PFN_CALC_CENTROID_BARYCENTRICS pfnCalcCentroidBarycentrics;
+    PFN_OUTPUT_MERGER pfnOutputMerger;
+};
+
+// Draw State
+struct DRAW_STATE
+{
+    API_STATE state;
+
+    void* pPrivateState;  // It's required that the driver set this up for each draw.
+
+    // pipeline function pointers, filled in by API thread when setting up the draw
+    BACKEND_FUNCS backendFuncs;
+    PFN_PROCESS_PRIMS pfnProcessPrims;
+
+    Arena* pArena;  // This should only be used by API thread.
+};
+
+// Draw Context
+// The API thread sets up a draw context that exists for the life of the draw.
+// This draw context maintains all of the state needed for the draw operation.
+struct DRAW_CONTEXT
+{
+    SWR_CONTEXT *pContext;
+
+    uint64_t drawId;
+
+    bool isCompute;  // Is this DC a compute context?
+
+    FE_WORK FeWork;
+    volatile OSALIGNLINE(uint32_t) FeLock;
+    volatile OSALIGNLINE(bool) inUse;
+    volatile OSALIGNLINE(bool) doneFE;  // Is FE work done for this draw?
+
+    // Have all worker threads moved past draw in DC ring?
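+    // (threadsDoneFE and threadsDoneBE below count the worker threads that have
+    // retired this draw's frontend and backend work, respectively.)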
+    volatile OSALIGNLINE(uint32_t) threadsDoneFE;
+    volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+
+    uint64_t dependency;
+
+    MacroTileMgr* pTileMgr;
+
+    // The following fields are valid if isCompute is true.
+    volatile OSALIGNLINE(bool) doneCompute;  // Is this dispatch done? (isCompute)
+    DispatchQueue* pDispatch;                // Queue for thread groups. (isCompute)
+
+    DRAW_STATE* pState;
+    Arena* pArena;
+
+    uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
+};
+
+INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
+{
+    SWR_ASSERT(pDC != nullptr);
+    SWR_ASSERT(pDC->pState != nullptr);
+
+    return pDC->pState->state;
+}
+
+INLINE void* GetPrivateState(const DRAW_CONTEXT* pDC)
+{
+    SWR_ASSERT(pDC != nullptr);
+    SWR_ASSERT(pDC->pState != nullptr);
+
+    return pDC->pState->pPrivateState;
+}
+
+class HotTileMgr;
+
+struct SWR_CONTEXT
+{
+    // Draw Context Ring
+    // Each draw needs its own state in order to support multiple draws in flight across multiple threads.
+    // We maintain N draw contexts configured as a ring. The size of the ring limits the maximum number
+    // of draws that can be in flight at any given time.
+    //
+    // Description:
+    // 1. State - When an application first sets state we'll request a new draw context to use.
+    //    a. If there are no available draw contexts then we'll have to wait until one becomes free.
+    //    b. If one is available then set pCurDrawContext to point to it and mark it in use.
+    //    c. All state calls set state on pCurDrawContext.
+    // 2. Draw - Creates and submits a work item associated with the current draw context.
+    //    a. Set pPrevDrawContext = pCurDrawContext
+    //    b. Set pCurDrawContext to NULL.
+    // 3. State - When an application sets state after a draw:
+    //    a. Same as step 1.
+    //    b. State is copied from prev draw context to current.
+    DRAW_CONTEXT* dcRing;
+
+    DRAW_CONTEXT *pCurDrawContext;   // This points to DC entry in ring for an unsubmitted draw.
+    DRAW_CONTEXT *pPrevDrawContext;  // This points to DC entry for the previous context submitted that we can copy state from.
+
+    // Draw State Ring
+    // When draws are very large (lots of primitives), the API thread will break them up.
+    // These split draws all have identical state, so instead of storing the state directly
+    // in the Draw Context (DC) we store it in a Draw State (DS). This allows multiple DCs
+    // to reference a single entry in the DS ring.
+    DRAW_STATE* dsRing;
+
+    uint32_t curStateId;      // Current index to the next available entry in the DS ring.
+
+    DRAW_STATE* subCtxSave;   // Save area for inactive contexts.
+    uint32_t curSubCtxId;     // Current index for active state subcontext.
+    uint32_t numSubContexts;  // Number of available subcontexts
+
+    uint32_t NumWorkerThreads;
+
+    THREAD_POOL threadPool;   // Thread pool associated with this context
+
+    std::condition_variable FifosNotEmpty;
+    std::mutex WaitLock;
+
+    // Draw Contexts will get a unique drawId generated from this
+    uint64_t nextDrawId;
+
+    // most recent draw id enqueued by the API thread
+    // written by the API thread, read by multiple workers
+    OSALIGNLINE(volatile uint64_t) DrawEnqueued;
+
+    DRIVER_TYPE driverType;
+
+    uint32_t privateStateSize;
+
+    HotTileMgr *pHotTileMgr;
+
+    // tile load/store functions, passed in at create context time
+    PFN_LOAD_TILE pfnLoadTile;
+    PFN_STORE_TILE pfnStoreTile;
+    PFN_CLEAR_TILE pfnClearTile;
+
+    // Global Stats
+    SWR_STATS stats[KNOB_MAX_NUM_THREADS];
+
+    // Scratch space for workers.
+ uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; +}; + +void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); +void WakeAllThreads(SWR_CONTEXT *pContext); + +#define UPDATE_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name += count; } +#define SET_STAT(name, count) if (GetApiState(pDC).enableStats) { pContext->stats[workerId].name = count; } diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h new file mode 100644 index 00000000000..4f245c8c53e --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -0,0 +1,245 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file depthstencil.h +* +* @brief Implements depth/stencil functionality +* +******************************************************************************/ +#pragma once +#include "common/os.h" +#include "format_conversion.h" + +INLINE +void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simdscalar &stencilps) +{ + simdscalari stencil = _simd_castps_si(stencilps); + + switch (op) + { + case STENCILOP_KEEP: + break; + case STENCILOP_ZERO: + stencilps = _simd_blendv_ps(stencilps, _simd_setzero_ps(), mask); + break; + case STENCILOP_REPLACE: + stencilps = _simd_blendv_ps(stencilps, stencilRefps, mask); + break; + case STENCILOP_INCRSAT: + { + simdscalari stencilincr = _simd_adds_epu8(stencil, _simd_set1_epi32(1)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); + break; + } + case STENCILOP_DECRSAT: + { + simdscalari stencildecr = _simd_subs_epu8(stencil, _simd_set1_epi32(1)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); + break; + } + case STENCILOP_INCR: + { + simdscalari stencilincr = _simd_add_epi8(stencil, _simd_set1_epi32(1)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencilincr), mask); + break; + } + case STENCILOP_DECR: + { + simdscalari stencildecr = _simd_add_epi8(stencil, _simd_set1_epi32((-1) & 0xff)); + stencilps = _simd_blendv_ps(stencilps, _simd_castsi_ps(stencildecr), mask); + break; + } + case STENCILOP_INVERT: + { + simdscalar stencilinvert = _simd_andnot_ps(stencilps, _simd_cmpeq_ps(_simd_setzero_ps(), _simd_setzero_ps())); + stencilps = _simd_blendv_ps(stencilps, stencilinvert, mask); + break; + } + default: + break; + } +} + + +INLINE +simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, + bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase, + simdscalar* pStencilMask) +{ + static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); + static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); + + simdscalar depthResult = _simd_set1_ps(-1.0f); + simdscalar zbuf; + + // clamp Z to viewport [minZ..maxZ] + simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); + simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); + interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ)); + + if (pDSState->depthTestEnable) + { + switch (pDSState->depthTestFunc) + { + case ZFUNC_NEVER: depthResult = _simd_setzero_ps(); break; + case ZFUNC_ALWAYS: break; + default: + zbuf = _simd_load_ps((const float*)pDepthBase); + } + + switch (pDSState->depthTestFunc) + { + case ZFUNC_LE: depthResult = _simd_cmple_ps(interpZ, zbuf); break; + case ZFUNC_LT: depthResult = _simd_cmplt_ps(interpZ, zbuf); break; + case ZFUNC_GT: depthResult = _simd_cmpgt_ps(interpZ, zbuf); break; + case ZFUNC_GE: depthResult = _simd_cmpge_ps(interpZ, zbuf); break; + case ZFUNC_EQ: depthResult = _simd_cmpeq_ps(interpZ, zbuf); break; + case ZFUNC_NE: depthResult = _simd_cmpneq_ps(interpZ, zbuf); break; + } + } + + simdscalar stencilMask = _simd_set1_ps(-1.0f); + + if (pDSState->stencilTestEnable) + { + uint8_t stencilRefValue; + uint32_t stencilTestFunc; + uint8_t stencilTestMask; + if (frontFacing || !pDSState->doubleSidedStencilTestEnable) + { + stencilRefValue = pDSState->stencilRefValue; + stencilTestFunc = pDSState->stencilTestFunc; + stencilTestMask = pDSState->stencilTestMask; + } + else + { + stencilRefValue = pDSState->backfaceStencilRefValue; 
+ stencilTestFunc = pDSState->backfaceStencilTestFunc; + stencilTestMask = pDSState->backfaceStencilTestMask; + } + + simdvector sbuf; + simdscalar stencilWithMask; + simdscalar stencilRef; + switch(stencilTestFunc) + { + case ZFUNC_NEVER: stencilMask = _simd_setzero_ps(); break; + case ZFUNC_ALWAYS: break; + default: + LoadSOA<R8_UINT>(pStencilBase, sbuf); + + // apply stencil read mask + stencilWithMask = _simd_castsi_ps(_simd_and_si(_simd_castps_si(sbuf.v[0]), _simd_set1_epi32(stencilTestMask))); + + // do stencil compare in float to avoid simd integer emulation in AVX1 + stencilWithMask = _simd_cvtepi32_ps(_simd_castps_si(stencilWithMask)); + + stencilRef = _simd_set1_ps((float)(stencilRefValue & stencilTestMask)); + break; + } + + switch(stencilTestFunc) + { + case ZFUNC_LE: stencilMask = _simd_cmple_ps(stencilRef, stencilWithMask); break; + case ZFUNC_LT: stencilMask = _simd_cmplt_ps(stencilRef, stencilWithMask); break; + case ZFUNC_GT: stencilMask = _simd_cmpgt_ps(stencilRef, stencilWithMask); break; + case ZFUNC_GE: stencilMask = _simd_cmpge_ps(stencilRef, stencilWithMask); break; + case ZFUNC_EQ: stencilMask = _simd_cmpeq_ps(stencilRef, stencilWithMask); break; + case ZFUNC_NE: stencilMask = _simd_cmpneq_ps(stencilRef, stencilWithMask); break; + } + } + + simdscalar depthWriteMask = _simd_and_ps(depthResult, stencilMask); + depthWriteMask = _simd_and_ps(depthWriteMask, coverageMask); + + *pStencilMask = stencilMask; + return depthWriteMask; +} + +INLINE +void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, + bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, + BYTE *pStencilBase, const simdscalar& stencilMask) +{ + if (pDSState->depthWriteEnable) + { + // clamp Z to viewport [minZ..maxZ] + simdscalar vMinZ = _simd_broadcast_ss(&pViewport->minZ); + simdscalar vMaxZ = _simd_broadcast_ss(&pViewport->maxZ); + interpZ = _simd_min_ps(vMaxZ, _simd_max_ps(vMinZ, interpZ)); + + simdscalar vMask = _simd_and_ps(depthMask, coverageMask); + _simd_maskstore_ps((float*)pDepthBase, _simd_castps_si(vMask), interpZ); + } + + if (pDSState->stencilWriteEnable) + { + simdvector sbuf; + LoadSOA<R8_UINT>(pStencilBase, sbuf); + simdscalar stencilbuf = sbuf.v[0]; + + uint8_t stencilRefValue; + uint32_t stencilFailOp; + uint32_t stencilPassDepthPassOp; + uint32_t stencilPassDepthFailOp; + uint8_t stencilWriteMask; + if (frontFacing || !pDSState->doubleSidedStencilTestEnable) + { + stencilRefValue = pDSState->stencilRefValue; + stencilFailOp = pDSState->stencilFailOp; + stencilPassDepthPassOp = pDSState->stencilPassDepthPassOp; + stencilPassDepthFailOp = pDSState->stencilPassDepthFailOp; + stencilWriteMask = pDSState->stencilWriteMask; + } + else + { + stencilRefValue = pDSState->backfaceStencilRefValue; + stencilFailOp = pDSState->backfaceStencilFailOp; + stencilPassDepthPassOp = pDSState->backfaceStencilPassDepthPassOp; + stencilPassDepthFailOp = pDSState->backfaceStencilPassDepthFailOp; + stencilWriteMask = pDSState->backfaceStencilWriteMask; + } + + simdscalar stencilps = stencilbuf; + simdscalar stencilRefps = _simd_castsi_ps(_simd_set1_epi32(stencilRefValue)); + + simdscalar stencilFailMask = _simd_andnot_ps(stencilMask, coverageMask); + simdscalar stencilPassDepthPassMask = _simd_and_ps(stencilMask, depthMask); + simdscalar stencilPassDepthFailMask = _simd_and_ps(stencilMask, _simd_andnot_ps(depthMask, _simd_set1_ps(-1))); + + simdscalar origStencil = stencilps; + + 
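+        // Each covered lane falls in exactly one of the three masks above
+        // (stencil fail, stencil pass/depth fail, stencil pass/depth pass), so
+        // the three StencilOp applications below are mutually exclusive per lane;
+        // lanes outside coverageMask are restored from origStencil by the final blend.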
StencilOp((SWR_STENCILOP)stencilFailOp, stencilFailMask, stencilRefps, stencilps); + StencilOp((SWR_STENCILOP)stencilPassDepthFailOp, stencilPassDepthFailMask, stencilRefps, stencilps); + StencilOp((SWR_STENCILOP)stencilPassDepthPassOp, stencilPassDepthPassMask, stencilRefps, stencilps); + + // apply stencil write mask + simdscalari vWriteMask = _simd_set1_epi32(stencilWriteMask); + stencilps = _simd_and_ps(stencilps, _simd_castsi_ps(vWriteMask)); + stencilps = _simd_or_ps(_simd_andnot_ps(_simd_castsi_ps(vWriteMask), origStencil), stencilps); + + simdvector stencilResult; + stencilResult.v[0] = _simd_blendv_ps(origStencil, stencilps, coverageMask); + StoreSOA<R8_UINT>(stencilResult, pStencilBase); + } + +} diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp new file mode 100644 index 00000000000..7e556012e6b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -0,0 +1,136 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file fifo.hpp +* +* @brief Definitions for our fifos used for thread communication. 
+* +******************************************************************************/ +#pragma once + + +#include "common/os.h" +#include "arena.h" + +#include <vector> +#include <cassert> + +template<class T> +struct QUEUE +{ + OSALIGNLINE(volatile uint32_t) mLock{ 0 }; + OSALIGNLINE(volatile uint32_t) mNumEntries{ 0 }; + std::vector<T*> mBlocks; + T* mCurBlock{ nullptr }; + uint32_t mHead{ 0 }; + uint32_t mTail{ 0 }; + uint32_t mCurBlockIdx{ 0 }; + + // power of 2 + static const uint32_t mBlockSizeShift = 6; + static const uint32_t mBlockSize = 1 << mBlockSizeShift; + + void clear(Arena& arena) + { + mHead = 0; + mTail = 0; + mBlocks.clear(); + T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); + mBlocks.push_back(pNewBlock); + mCurBlock = pNewBlock; + mCurBlockIdx = 0; + + mNumEntries = 0; + _ReadWriteBarrier(); + mLock = 0; + } + + uint32_t getNumQueued() + { + return mNumEntries; + } + + bool tryLock() + { + if (mLock) + { + return false; + } + + // try to lock the FIFO + LONG initial = InterlockedCompareExchange(&mLock, 1, 0); + return (initial == 0); + } + + void unlock() + { + mLock = 0; + } + + T* peek() + { + if (mNumEntries == 0) + { + return nullptr; + } + uint32_t block = mHead >> mBlockSizeShift; + return &mBlocks[block][mHead & (mBlockSize-1)]; + } + + void dequeue_noinc() + { + mHead ++; + mNumEntries --; + } + + bool enqueue_try_nosync(Arena& arena, const T* entry) + { + memcpy(&mCurBlock[mTail], entry, sizeof(T)); + + mTail ++; + if (mTail == mBlockSize) + { + if (++mCurBlockIdx < mBlocks.size()) + { + mCurBlock = mBlocks[mCurBlockIdx]; + } + else + { + T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); + SWR_ASSERT(newBlock); + + mBlocks.push_back(newBlock); + mCurBlock = newBlock; + } + + mTail = 0; + } + + mNumEntries ++; + return true; + } + + void destroy() + { + } + +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h new file mode 100644 index 00000000000..83d85fc86d8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -0,0 +1,196 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+*
+* @file format_conversion.h
+*
+* @brief Helpers for converting pixel formats to and from SOA RGBA32_FLOAT.
+*
+******************************************************************************/
+#include "format_types.h"
+#include "format_traits.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads SIMD packed pixels in SOA format and converts to
+///        SOA RGBA32_FLOAT format.
+/// @param pSrc - source data in SOA form
+/// @param dst - output data in SOA form
+template<SWR_FORMAT SrcFormat>
+INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+{
+    // fast path for float32
+    if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
+    {
+        auto lambda = [&](int comp)
+        {
+            simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp*sizeof(simdscalar)));
+
+            dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
+        };
+
+        UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
+        return;
+    }
+
+    auto lambda = [&](int comp)
+    {
+        // load SIMD components
+        simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
+
+        // unpack
+        vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
+
+        // convert
+        if (FormatTraits<SrcFormat>::isNormalized(comp))
+        {
+            vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
+            vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
+        }
+
+        dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
+
+        pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
+    };
+
+    UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Clamps the given component based on the requirements of the
+///        Format template arg
+/// @param vComp - SIMD vector of floats
+/// @param Component - component
+template<SWR_FORMAT Format>
+INLINE simdscalar Clamp(simdscalar vComp, uint32_t Component)
+{
+    if (FormatTraits<Format>::isNormalized(Component))
+    {
+        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
+        {
+            vComp = _simd_max_ps(vComp, _simd_setzero_ps());
+        }
+
+        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
+        {
+            vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
+        }
+        vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f));
+    }
+    else if (FormatTraits<Format>::GetBPC(Component) < 32)
+    {
+        if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
+        {
+            int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
+            int iMin = 0;
+            simdscalari vCompi = _simd_castps_si(vComp);
+            vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
+            vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
+            vComp = _simd_castsi_ps(vCompi);
+        }
+        else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
+        {
+            int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
+            int iMin = -1 - iMax;
+            simdscalari vCompi = _simd_castps_si(vComp);
+            vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
+            vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
+            vComp = _simd_castsi_ps(vCompi);
+        }
+    }
+
+    return vComp;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Normalizes the given component based on the requirements of the
+///        Format template arg
+/// @param vComp - SIMD vector of floats
+/// @param Component - component
+template<SWR_FORMAT Format>
+INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
+{
+    if (FormatTraits<Format>::isNormalized(Component))
+    {
+        vComp = _simd_mul_ps(vComp,
_simd_set1_ps(FormatTraits<Format>::fromFloat(Component))); + vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); + } + return vComp; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert and store simdvector of pixels in SOA +/// RGBA32_FLOAT to SOA format +/// @param src - source data in SOA form +/// @param dst - output data in SOA form +template<SWR_FORMAT DstFormat> +INLINE void StoreSOA(const simdvector &src, BYTE *pDst) +{ + // fast path for float32 + if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) + { + for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) + { + simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; + + // Gamma-correct + if (FormatTraits<DstFormat>::isSRGB) + { + if (comp < 3) // Input format is always RGBA32_FLOAT. + { + vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); + } + } + + _simd_store_ps((float*)(pDst + comp*sizeof(simdscalar)), vComp); + } + return; + } + + auto lambda = [&](int comp) + { + simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; + + // Gamma-correct + if (FormatTraits<DstFormat>::isSRGB) + { + if (comp < 3) // Input format is always RGBA32_FLOAT. + { + vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); + } + } + + // clamp + vComp = Clamp<DstFormat>(vComp, comp); + + // normalize + vComp = Normalize<DstFormat>(vComp, comp); + + // pack + vComp = FormatTraits<DstFormat>::pack(comp, vComp); + + // store + FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp); + + pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; + }; + + UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/format_traits.h b/src/gallium/drivers/swr/rasterizer/core/format_traits.h new file mode 100644 index 00000000000..52340f4987a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/format_traits.h @@ -0,0 +1,3548 @@ + +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file format_traits.h +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#pragma once + +#include "format_types.h" +#include "utils.h" + + +////////////////////////////////////////////////////////////////////////// +/// FormatSwizzle - Component swizzle selects +////////////////////////////////////////////////////////////////////////// +template<UINT comp0 = 0, uint32_t comp1 = 0, uint32_t comp2 = 0, uint32_t comp3 = 0> +struct FormatSwizzle +{ + // Return swizzle select for component. + INLINE static uint32_t swizzle(UINT c) + { + static const uint32_t s[4] = { comp0, comp1, comp2, comp3 }; + return s[c]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits - Format traits +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT format> +struct FormatTraits : + ComponentTraits<SWR_TYPE_UNKNOWN, 0>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0> +{ + static const uint32_t bpp{ 0 }; + static const uint32_t numComps{ 0 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{1}; + static const uint32_t bcHeight{1}; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32G32B32A32_FLOAT> - Format traits specialization for R32G32B32A32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32G32B32A32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 128 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose32_32_32_32 TransposeT; + typedef Format4<32, 32, 32, 32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32G32B32A32_SINT> - Format traits specialization for R32G32B32A32_SINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32G32B32A32_SINT> : + ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 128 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose32_32_32_32 TransposeT; + typedef Format4<32, 32, 32, 32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32G32B32A32_UINT> - Format traits specialization for R32G32B32A32_UINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32G32B32A32_UINT> : + ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, 
SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32X32_FLOAT> - Format traits specialization for R32G32B32X32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32X32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_SSCALED> - Format traits specialization for R32G32B32A32_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32A32_USCALED> - Format traits specialization for R32G32B32A32_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32A32_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32_32 TransposeT;
+    typedef Format4<32, 32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_FLOAT> - Format traits specialization for R32G32B32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_SINT> - Format traits specialization for R32G32B32_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_UINT> - Format traits specialization for R32G32B32_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_SSCALED> - Format traits specialization for R32G32B32_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32B32_USCALED> - Format traits specialization for R32G32B32_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32B32_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 96 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32_32 TransposeT;
+    typedef Format3<32, 32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_UNORM> - Format traits specialization for R16G16B16A16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SNORM> - Format traits specialization for R16G16B16A16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SINT> - Format traits specialization for R16G16B16A16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_UINT> - Format traits specialization for R16G16B16A16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_FLOAT> - Format traits specialization for R16G16B16A16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_FLOAT> - Format traits specialization for R32G32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_SINT> - Format traits specialization for R32G32_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 32, SWR_TYPE_SINT, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_UINT> - Format traits specialization for R32G32_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS> - Format traits specialization for R32_FLOAT_X8X24_TYPELESS
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<X32_TYPELESS_G8X24_UINT> - Format traits specialization for X32_TYPELESS_G8X24_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<X32_TYPELESS_G8X24_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32A32_FLOAT> - Format traits specialization for L32A32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L32A32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16X16_UNORM> - Format traits specialization for R16G16B16X16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16X16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNUSED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16X16_FLOAT> - Format traits specialization for R16G16B16X16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16X16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_UNUSED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L32X32_FLOAT> - Format traits specialization for L32X32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L32X32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I32X32_FLOAT> - Format traits specialization for I32X32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I32X32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_SSCALED> - Format traits specialization for R16G16B16A16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16A16_USCALED> - Format traits specialization for R16G16B16A16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16A16_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16_16 TransposeT;
+    typedef Format4<16, 16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_SSCALED> - Format traits specialization for R32G32_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 32, SWR_TYPE_SSCALED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32G32_USCALED> - Format traits specialization for R32G32_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32G32_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 32, SWR_TYPE_USCALED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT_X8X24_TYPELESS_LD> - Format traits specialization for R32_FLOAT_X8X24_TYPELESS_LD
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT_X8X24_TYPELESS_LD> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32, SWR_TYPE_UNUSED, 32>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose32_32 TransposeT;
+    typedef Format2<32, 32> FormatT;
+};
+
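The fourth value in each Defaults<> list above is the raw bit pattern substituted for a missing alpha component: the integer formats use 0x1, while the float formats use 0x3f800000, which is the IEEE-754 single-precision encoding of 1.0f. A minimal standalone check of that bit pattern (illustrative only, not part of this patch):

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    int main()
    {
        // 0x3f800000 reinterpreted as a float is exactly 1.0f, the natural
        // default alpha for the float formats in this header.
        uint32_t bits = 0x3f800000;
        float alpha;
        std::memcpy(&alpha, &bits, sizeof(alpha));
        assert(alpha == 1.0f);
        return 0;
    }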
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8A8_UNORM> - Format traits specialization for B8G8R8A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B8G8R8A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B8G8R8A8_UNORM_SRGB> - Format traits specialization for B8G8R8A8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B8G8R8A8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UNORM> - Format traits specialization for R10G10B10A2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UNORM_SRGB> - Format traits specialization for R10G10B10A2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_UINT> - Format traits specialization for R10G10B10A2_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UNORM> - Format traits specialization for R8G8B8A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UNORM_SRGB> - Format traits specialization for R8G8B8A8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SNORM> - Format traits specialization for R8G8B8A8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_SINT> - Format traits specialization for R8G8B8A8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8A8_UINT> - Format traits specialization for R8G8B8A8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8A8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_UNORM> - Format traits specialization for R16G16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SNORM> - Format traits specialization for R16G16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_SINT> - Format traits specialization for R16G16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_UINT> - Format traits specialization for R16G16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16_FLOAT> - Format traits specialization for R16G16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UNORM> - Format traits specialization for B10G10R10A2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_UNORM_SRGB> - Format traits specialization for B10G10R10A2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R11G11B10_FLOAT> - Format traits specialization for R11G11B10_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R11G11B10_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 11, SWR_TYPE_FLOAT, 10>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose11_11_10 TransposeT;
+    typedef Format3<11, 11, 10> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_SINT> - Format traits specialization for R32_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 32>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_UINT> - Format traits specialization for R32_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 32>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R32_FLOAT> - Format traits specialization for R32_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R32_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 32>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<32> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R24_UNORM_X8_TYPELESS> - Format traits specialization for R24_UNORM_X8_TYPELESS
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R24_UNORM_X8_TYPELESS> :
+    ComponentTraits<SWR_TYPE_UNORM, 24>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<24> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R24_UNORM_X8_TYPELESS_LD> - Format traits specialization for R24_UNORM_X8_TYPELESS_LD
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R24_UNORM_X8_TYPELESS_LD> :
+    ComponentTraits<SWR_TYPE_UNORM, 24>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<32> TransposeT;
+    typedef Format1<24> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16A16_UNORM> - Format traits specialization for L16A16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L16A16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16 TransposeT;
+    typedef Format2<16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I24X8_UNORM> - Format traits specialization for I24X8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I24X8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose24_8 TransposeT;
+    typedef Format2<24, 8> FormatT;
+};
+
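Since every trait above is a compile-time constant, callers can compute layout information without a runtime format switch. A hypothetical sketch (RowSizeBytes is not part of this patch) of how the traits might be consumed, assuming bpp is bits per element and noting that bcWidth is 1 for these uncompressed formats:

    #include <cstdint>

    // Hypothetical helper: bytes needed for one row of 'width' pixels.
    // For the formats above bcWidth is 1, so this reduces to width * bpp / 8.
    template <SWR_FORMAT format>
    constexpr uint32_t RowSizeBytes(uint32_t width)
    {
        return (width / FormatTraits<format>::bcWidth) *
               (FormatTraits<format>::bpp / 8);
    }

    // e.g. RowSizeBytes<R32_FLOAT>(100) == 400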
+////////////////////////////////////////////////////////////////////////// +/// FormatTraits<L24X8_UNORM> - Format traits specialization for L24X8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<L24X8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 24, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose24_8 TransposeT; + typedef Format2<24, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<I32_FLOAT> - Format traits specialization for I32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<I32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<L32_FLOAT> - Format traits specialization for L32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<L32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<A32_FLOAT> - Format traits specialization for A32_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<A32_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 32>, + FormatSwizzle<3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B8G8R8X8_UNORM> - Format traits specialization for B8G8R8X8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B8G8R8X8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 
8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B8G8R8X8_UNORM_SRGB> - Format traits specialization for B8G8R8X8_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B8G8R8X8_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8X8_UNORM> - Format traits specialization for R8G8B8X8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8X8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8X8_UNORM_SRGB> - Format traits specialization for R8G8B8X8_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8X8_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNUSED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R9G9B9E5_SHAREDEXP> - Format traits specialization for R9G9B9E5_SHAREDEXP +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R9G9B9E5_SHAREDEXP> : + ComponentTraits<SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 9, 
SWR_TYPE_UINT, 9, SWR_TYPE_UINT, 5>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose9_9_9_5 TransposeT; + typedef Format4<9, 9, 9, 5> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B10G10R10X2_UNORM> - Format traits specialization for B10G10R10X2_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B10G10R10X2_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNORM, 10, SWR_TYPE_UNUSED, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<L16A16_FLOAT> - Format traits specialization for L16A16_FLOAT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<L16A16_FLOAT> : + ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>, + FormatSwizzle<0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 1 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose16_16 TransposeT; + typedef Format2<16, 16> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R10G10B10X2_USCALED> - Format traits specialization for R10G10B10X2_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R10G10B10X2_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_UNUSED, 2>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8A8_SSCALED> - Format traits specialization for R8G8B8A8_SSCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8A8_SSCALED> : + ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>, + 
FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8A8_USCALED> - Format traits specialization for R8G8B8A8_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8A8_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>, + FormatSwizzle<0, 1, 2, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8_8 TransposeT; + typedef Format4<8, 8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R16G16_SSCALED> - Format traits specialization for R16G16_SSCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R16G16_SSCALED> : + ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose16_16 TransposeT; + typedef Format2<16, 16> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R16G16_USCALED> - Format traits specialization for R16G16_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R16G16_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose16_16 TransposeT; + typedef Format2<16, 16> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32_SSCALED> - Format traits specialization for R32_SSCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32_SSCALED> : + ComponentTraits<SWR_TYPE_SSCALED, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t 
alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R32_USCALED> - Format traits specialization for R32_USCALED +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R32_USCALED> : + ComponentTraits<SWR_TYPE_USCALED, 32>, + FormatSwizzle<0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 1 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef TransposeSingleComponent<32> TransposeT; + typedef Format1<32> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G6R5_UNORM> - Format traits specialization for B5G6R5_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G6R5_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, + FormatSwizzle<2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_6_5 TransposeT; + typedef Format3<5, 6, 5> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G6R5_UNORM_SRGB> - Format traits specialization for B5G6R5_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G6R5_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 6, SWR_TYPE_UNORM, 5>, + FormatSwizzle<2, 1, 0>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_6_5 TransposeT; + typedef Format3<5, 6, 5> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G5R5A1_UNORM> - Format traits specialization for B5G5R5A1_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G5R5A1_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 
}; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_5_5_1 TransposeT; + typedef Format4<5, 5, 5, 1> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B5G5R5A1_UNORM_SRGB> - Format traits specialization for B5G5R5A1_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B5G5R5A1_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 1>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose5_5_5_1 TransposeT; + typedef Format4<5, 5, 5, 1> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B4G4R4A4_UNORM> - Format traits specialization for B4G4R4A4_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B4G4R4A4_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose4_4_4_4 TransposeT; + typedef Format4<4, 4, 4, 4> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B4G4R4A4_UNORM_SRGB> - Format traits specialization for B4G4R4A4_UNORM_SRGB +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B4G4R4A4_UNORM_SRGB> : + ComponentTraits<SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4, SWR_TYPE_UNORM, 4>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ true }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose4_4_4_4 TransposeT; + typedef Format4<4, 4, 4, 4> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8_UNORM> - Format traits specialization for R8G8_UNORM +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8_UNORM> : + ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>, + FormatSwizzle<0, 1>, + Defaults<0, 0, 0, 0x3f800000> +{ + static const uint32_t bpp{ 16 }; + static const uint32_t numComps{ 2 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8 TransposeT; + 
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_SNORM> - Format traits specialization for R8G8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_SINT> - Format traits specialization for R8G8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_UINT> - Format traits specialization for R8G8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_UNORM> - Format traits specialization for R16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_SNORM> - Format traits specialization for R16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_SINT> - Format traits specialization for R16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_UINT> - Format traits specialization for R16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_FLOAT> - Format traits specialization for R16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I16_UNORM> - Format traits specialization for I16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
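A quick aside on how these traits get consumed: every field is a compile-time constant, so sizing math folds away entirely at compile time. A minimal sketch, not part of the patch, assuming formats.h is in scope:

    // Hypothetical helper: bytes per row of 'width' texels, rounding the
    // width up to whole bcWidth-sized blocks first (bcWidth is 1 for the
    // uncompressed formats above).
    template <SWR_FORMAT format>
    constexpr uint32_t RowPitchBytes(uint32_t width)
    {
        return ((width + FormatTraits<format>::bcWidth - 1) /
                FormatTraits<format>::bcWidth) * FormatTraits<format>::bpp / 8;
    }

    static_assert(RowPitchBytes<R16_UNORM>(4) == 8, "16 bpp -> 2 bytes per texel");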
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16_UNORM> - Format traits specialization for L16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A16_UNORM> - Format traits specialization for A16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<A16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_UNORM> - Format traits specialization for L8A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I16_FLOAT> - Format traits specialization for I16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
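The I16/L16/A16 variants above share one stored component and differ only in routing: FormatSwizzle<3> sends the single channel to the alpha slot, while hasAlpha and alphaComp record whether alpha comes from memory at all. A sketch of a compile-time query built on that, again assuming formats.h is included:

    // Illustrative only: formats without stored alpha fall back to the
    // Defaults<> value (0x3f800000 is the bit pattern of 1.0f).
    template <SWR_FORMAT format>
    constexpr bool AlphaIsStored() { return FormatTraits<format>::hasAlpha; }

    static_assert(AlphaIsStored<L8A8_UNORM>(), "alpha is stored component 1");
    static_assert(!AlphaIsStored<R16_UNORM>(), "alpha synthesized as 1.0f");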
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L16_FLOAT> - Format traits specialization for L16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A16_FLOAT> - Format traits specialization for A16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<A16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_UNORM_SRGB> - Format traits specialization for L8A8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B5G5R5X1_UNORM> - Format traits specialization for B5G5R5X1_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B5G5R5X1_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose5_5_5_1 TransposeT;
+    typedef Format4<5, 5, 5, 1> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B5G5R5X1_UNORM_SRGB> - Format traits specialization for B5G5R5X1_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B5G5R5X1_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNORM, 5, SWR_TYPE_UNUSED, 1>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose5_5_5_1 TransposeT;
+    typedef Format4<5, 5, 5, 1> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_SSCALED> - Format traits specialization for R8G8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8_USCALED> - Format traits specialization for R8G8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
+    FormatSwizzle<0, 1>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_SSCALED> - Format traits specialization for R16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16_USCALED> - Format traits specialization for R16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 16>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<16> TransposeT;
+    typedef Format1<16> FormatT;
+};
+
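The SCALED variants reuse the UNORM layouts but change conversion semantics: UNORM/SNORM map the stored integer range onto [0, 1] / [-1, 1], while USCALED/SSCALED convert the raw integer to float unchanged. A scalar sketch of the distinction, not part of the patch:

    #include <cstdint>

    static inline float Unorm8ToFloat(uint8_t v)   { return v * (1.0f / 255.0f); } // -> [0, 1]
    static inline float Uscaled8ToFloat(uint8_t v) { return (float)v; }            // -> [0, 255]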
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_UINT> - Format traits specialization for L8A8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8A8_SINT> - Format traits specialization for L8A8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8A8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 16 };
+    static const uint32_t numComps{ 2 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 1 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8 TransposeT;
+    typedef Format2<8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_UNORM> - Format traits specialization for R8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SNORM> - Format traits specialization for R8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SINT> - Format traits specialization for R8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_UINT> - Format traits specialization for R8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<A8_UNORM> - Format traits specialization for A8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<A8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_UNORM> - Format traits specialization for I8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UNORM> - Format traits specialization for L8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_SSCALED> - Format traits specialization for R8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8_USCALED> - Format traits specialization for R8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UNORM_SRGB> - Format traits specialization for L8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_UINT> - Format traits specialization for L8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<L8_SINT> - Format traits specialization for L8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<L8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_UINT> - Format traits specialization for I8_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I8_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<I8_SINT> - Format traits specialization for I8_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<I8_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 8 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef TransposeSingleComponent<8> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<YCRCB_SWAPUVY> - Format traits specialization for YCRCB_SWAPUVY
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<YCRCB_SWAPUVY> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ true };
+    static const uint32_t bcWidth{ 2 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
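YCRCB_SWAPUVY is one of the few entries with isSubsampled{ true }: bcWidth{ 2 } records that two horizontally adjacent texels share one 32-bit word (4:2:2 packing, two luma samples plus one shared chroma pair). The byte order differs between the SWAPUVY/SWAPUV variants, but the addressing consequence is the same; a sketch under that assumption, not part of the patch:

    #include <cstdint>

    // Illustrative only: in a 4:2:2 packed format the byte offset is
    // computed per texel *pair*, so x is first rounded down to even.
    static inline uint32_t Ycbcr422WordOffset(uint32_t x, uint32_t y, uint32_t pitchBytes)
    {
        return y * pitchBytes + (x / 2) * 4; // one 4-byte word covers two texels
    }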
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC1_UNORM> - Format traits specialization for BC1_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC1_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC2_UNORM> - Format traits specialization for BC2_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC2_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC3_UNORM> - Format traits specialization for BC3_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC3_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC4_UNORM> - Format traits specialization for BC4_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC4_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC5_UNORM> - Format traits specialization for BC5_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC5_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
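For the block-compressed entries, bpp counts bits per bcWidth x bcHeight block rather than per texel (64 for BC1/BC4, 128 for the rest), so surface sizing has to round both dimensions up to whole blocks. A hypothetical sizing helper, assuming formats.h is included:

    template <SWR_FORMAT format>
    constexpr uint32_t SurfaceBytes(uint32_t width, uint32_t height)
    {
        return ((width  + FormatTraits<format>::bcWidth  - 1) / FormatTraits<format>::bcWidth) *
               ((height + FormatTraits<format>::bcHeight - 1) / FormatTraits<format>::bcHeight) *
               (FormatTraits<format>::bpp / 8);
    }

    static_assert(SurfaceBytes<BC1_UNORM>(8, 8) == 32, "2x2 blocks of 8 bytes");
    static_assert(SurfaceBytes<BC3_UNORM>(8, 8) == 64, "2x2 blocks of 16 bytes");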
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC1_UNORM_SRGB> - Format traits specialization for BC1_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC1_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC2_UNORM_SRGB> - Format traits specialization for BC2_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC2_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC3_UNORM_SRGB> - Format traits specialization for BC3_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC3_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<YCRCB_SWAPUV> - Format traits specialization for YCRCB_SWAPUV
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<YCRCB_SWAPUV> :
+    ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ true };
+    static const uint32_t bcWidth{ 2 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8_8 TransposeT;
+    typedef Format4<8, 8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UNORM> - Format traits specialization for R8G8B8_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SNORM> - Format traits specialization for R8G8B8_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8, SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_SSCALED> - Format traits specialization for R8G8B8_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8, SWR_TYPE_SSCALED, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_USCALED> - Format traits specialization for R8G8B8_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8, SWR_TYPE_USCALED, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC4_SNORM> - Format traits specialization for BC4_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC4_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 64 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<64> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC5_SNORM> - Format traits specialization for BC5_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC5_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_FLOAT> - Format traits specialization for R16G16B16_FLOAT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_FLOAT> :
+    ComponentTraits<SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16, SWR_TYPE_FLOAT, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_UNORM> - Format traits specialization for R16G16B16_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16, SWR_TYPE_UNORM, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SNORM> - Format traits specialization for R16G16B16_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16, SWR_TYPE_SNORM, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SSCALED> - Format traits specialization for R16G16B16_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16, SWR_TYPE_SSCALED, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_USCALED> - Format traits specialization for R16G16B16_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16, SWR_TYPE_USCALED, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC6H_SF16> - Format traits specialization for BC6H_SF16
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC6H_SF16> :
+    ComponentTraits<SWR_TYPE_SNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC7_UNORM> - Format traits specialization for BC7_UNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC7_UNORM> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC7_UNORM_SRGB> - Format traits specialization for BC7_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC7_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<BC6H_UF16> - Format traits specialization for BC6H_UF16
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<BC6H_UF16> :
+    ComponentTraits<SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 128 };
+    static const uint32_t numComps{ 1 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ true };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 4 };
+    static const uint32_t bcHeight{ 4 };
+
+    typedef TransposeSingleComponent<128> TransposeT;
+    typedef Format1<8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R8G8B8_UNORM_SRGB> - Format traits specialization for R8G8B8_UNORM_SRGB
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R8G8B8_UNORM_SRGB> :
+    ComponentTraits<SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8, SWR_TYPE_UNORM, 8>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 24 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ true };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose8_8_8 TransposeT;
+    typedef Format3<8, 8, 8> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_UINT> - Format traits specialization for R16G16B16_UINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_UINT> :
+    ComponentTraits<SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16, SWR_TYPE_UINT, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R16G16B16_SINT> - Format traits specialization for R16G16B16_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R16G16B16_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16, SWR_TYPE_SINT, 16>,
+    FormatSwizzle<0, 1, 2>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 48 };
+    static const uint32_t numComps{ 3 };
+    static const bool hasAlpha{ false };
+    static const uint32_t alphaComp{ 0 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose16_16_16 TransposeT;
+    typedef Format3<16, 16, 16> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SNORM> - Format traits specialization for R10G10B10A2_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_USCALED> - Format traits specialization for R10G10B10A2_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SSCALED> - Format traits specialization for R10G10B10A2_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<R10G10B10A2_SINT> - Format traits specialization for R10G10B10A2_SINT
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<R10G10B10A2_SINT> :
+    ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>,
+    FormatSwizzle<0, 1, 2, 3>,
+    Defaults<0, 0, 0, 0x1>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SNORM> - Format traits specialization for B10G10R10A2_SNORM
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SNORM> :
+    ComponentTraits<SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 10, SWR_TYPE_SNORM, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_USCALED> - Format traits specialization for B10G10R10A2_USCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_USCALED> :
+    ComponentTraits<SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 10, SWR_TYPE_USCALED, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// FormatTraits<B10G10R10A2_SSCALED> - Format traits specialization for B10G10R10A2_SSCALED
+//////////////////////////////////////////////////////////////////////////
+template<> struct FormatTraits<B10G10R10A2_SSCALED> :
+    ComponentTraits<SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 10, SWR_TYPE_SSCALED, 2>,
+    FormatSwizzle<2, 1, 0, 3>,
+    Defaults<0, 0, 0, 0x3f800000>
+{
+    static const uint32_t bpp{ 32 };
+    static const uint32_t numComps{ 4 };
+    static const bool hasAlpha{ true };
+    static const uint32_t alphaComp{ 3 };
+    static const bool isSRGB{ false };
+    static const bool isBC{ false };
+    static const bool isSubsampled{ false };
+    static const uint32_t bcWidth{ 1 };
+    static const uint32_t bcHeight{ 1 };
+
+    typedef Transpose10_10_10_2 TransposeT;
+    typedef Format4<10, 10, 10, 2> FormatT;
+};
+
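The B10G10R10A2 family differs from R10G10B10A2 only in FormatSwizzle<2, 1, 0, 3>, which routes stored channel 0 to the blue output slot. Assuming ComponentTraits lists stored channels from the least-significant bits upward (consistent with Format4<10, 10, 10, 2>), a scalar sketch of what the swizzle means:

    #include <cstdint>

    // Illustrative only, not part of the patch; the low-bits-first channel
    // order is an assumption.
    static inline void UnpackB10G10R10A2(uint32_t texel, uint32_t rgba[4])
    {
        uint32_t c0 = (texel >>  0) & 0x3ff; // stored channel 0
        uint32_t c1 = (texel >> 10) & 0x3ff; // stored channel 1
        uint32_t c2 = (texel >> 20) & 0x3ff; // stored channel 2
        uint32_t c3 = (texel >> 30) & 0x3;   // stored channel 3
        // FormatSwizzle<2, 1, 0, 3>: stored channel i lands in slot swizzle[i].
        rgba[2] = c0; rgba[1] = c1; rgba[0] = c2; rgba[3] = c3;
    }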
Format traits specialization for B10G10R10A2_UINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B10G10R10A2_UINT> : + ComponentTraits<SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 10, SWR_TYPE_UINT, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<B10G10R10A2_SINT> - Format traits specialization for B10G10R10A2_SINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<B10G10R10A2_SINT> : + ComponentTraits<SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 10, SWR_TYPE_SINT, 2>, + FormatSwizzle<2, 1, 0, 3>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 32 }; + static const uint32_t numComps{ 4 }; + static const bool hasAlpha{ true }; + static const uint32_t alphaComp{ 3 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose10_10_10_2 TransposeT; + typedef Format4<10, 10, 10, 2> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8_UINT> - Format traits specialization for R8G8B8_UINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8_UINT> : + ComponentTraits<SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8, SWR_TYPE_UINT, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 24 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8 TransposeT; + typedef Format3<8, 8, 8> FormatT; +}; + +////////////////////////////////////////////////////////////////////////// +/// FormatTraits<R8G8B8_SINT> - Format traits specialization for R8G8B8_SINT +////////////////////////////////////////////////////////////////////////// +template<> struct FormatTraits<R8G8B8_SINT> : + ComponentTraits<SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8, SWR_TYPE_SINT, 8>, + FormatSwizzle<0, 1, 2>, + Defaults<0, 0, 0, 0x1> +{ + static const uint32_t bpp{ 24 }; + static const uint32_t numComps{ 3 }; + static const bool hasAlpha{ false }; + static const uint32_t alphaComp{ 0 }; + static const bool isSRGB{ false }; + static const bool isBC{ false }; + static const bool isSubsampled{ false }; + static const uint32_t bcWidth{ 1 }; + static const uint32_t bcHeight{ 1 }; + + typedef Transpose8_8_8 TransposeT; + typedef Format3<8, 8, 8> FormatT; +}; + diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h new file mode 100644 index 00000000000..aa350259a15 --- /dev/null +++ 
b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -0,0 +1,1075 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file format_types.h
+*
+* @brief Definitions for SWR_FORMAT functions.
+*
+******************************************************************************/
+#pragma once
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking same pixel sizes
+//////////////////////////////////////////////////////////////////////////
+template <uint32_t NumBits, bool Signed = false>
+struct PackTraits
+{
+    static const uint32_t MyNumBits = NumBits;
+    static simdscalar loadSOA(const BYTE *pSrc) = delete;
+    static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+    static simdscalar unpack(simdscalar &in) = delete;
+    static simdscalar pack(simdscalar &in) = delete;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking unused channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<0, false>
+{
+    static const uint32_t MyNumBits = 0;
+
+    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
+    static void storeSOA(BYTE *pDst, simdscalar src) { return; }
+    static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
+    static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
+};
+
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 8 bit unsigned channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<8, false>
+{
+    static const uint32_t MyNumBits = 8;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        // store simd bytes
+#if KNOB_SIMD_WIDTH == 8
+        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
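+        // unpack widens the 8 packed 8-bit channel values loaded by loadSOA
+        // (one per SIMD lane) into eight 32-bit integer lanes.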
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepu8_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m128i res16 = _mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i res8 = _mm_packus_epi16(res16, _mm_undefined_si128());
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 8 bit signed channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<8, true>
+{
+    static const uint32_t MyNumBits = 8;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_castpd_ps(_mm_load_sd((double*)pSrc));
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+        // store simd bytes
+#if KNOB_SIMD_WIDTH == 8
+        _mm_storel_pd((double*)pDst, _mm_castps_pd(_mm256_castps256_ps128(src)));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        SWR_ASSERT(0); // I think this may be incorrect.
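+        // Editor's note (an assumption about the concern above): _mm_shuffle_epi8
+        // zero-fills bytes whose selector has the high bit set (0x80), so resHi
+        // below is zero-extended rather than sign-extended; negative SINT8 values
+        // in bytes 4..7 would decode incorrectly. A sign-correct sketch for the
+        // high half would be:
+        //     __m128i resHi = _mm_cvtepi8_epi32(_mm_srli_si128(src, 4));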
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepi8_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepi8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m128i res16 = _mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1));
+        __m128i res8 = _mm_packs_epi16(res16, _mm_undefined_si128());
+        return _mm256_castsi256_ps(_mm256_castsi128_si256(res8));
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 16 bit unsigned channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<16, false>
+{
+    static const uint32_t MyNumBits = 16;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        // store 16B (2B * 8)
+        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepu16_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepu16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m256i res = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        return _mm256_castsi256_ps(res);
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 16 bit signed channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<16, true>
+{
+    static const uint32_t MyNumBits = 16;
+
+    static simdscalar loadSOA(const BYTE *pSrc)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        __m256 result = _mm256_setzero_ps();
+        __m128 vLo = _mm_load_ps((const float*)pSrc);
+        return _mm256_insertf128_ps(result, vLo, 0);
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static void storeSOA(BYTE *pDst, simdscalar src)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        // store 16B (2B * 8)
+        _mm_store_ps((float*)pDst, _mm256_castps256_ps128(src));
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar unpack(simdscalar &in)
+    {
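+        // Editor's note (assumption): as with the signed 8-bit path above, the
+        // pshufb-based AVX variant below zero-extends the upper lanes, so negative
+        // SINT16 values in words 4..7 would lose their sign bits; a sign-correct
+        // high half might be _mm_cvtepi16_epi32(_mm_srli_si128(src, 8)).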
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH==KNOB_ARCH_AVX
+        SWR_ASSERT(0); // I think this is incorrectly implemented
+        __m128i src = _mm_castps_si128(_mm256_castps256_ps128(in));
+        __m128i resLo = _mm_cvtepi16_epi32(src);
+        __m128i resHi = _mm_shuffle_epi8(src,
+            _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+
+        __m256i result = _mm256_castsi128_si256(resLo);
+        result = _mm256_insertf128_si256(result, resHi, 1);
+        return _mm256_castsi256_ps(result);
+#elif KNOB_ARCH==KNOB_ARCH_AVX2
+        return _mm256_castsi256_ps(_mm256_cvtepi16_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+
+    static simdscalar pack(simdscalar &in)
+    {
+#if KNOB_SIMD_WIDTH == 8
+        simdscalari src = _simd_castps_si(in);
+        __m256i res = _mm256_castsi128_si256(_mm_packs_epi32(_mm256_castsi256_si128(src), _mm256_extractf128_si256(src, 1)));
+        return _mm256_castsi256_ps(res);
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// PackTraits - Helpers for packing / unpacking 32 bit channels
+//////////////////////////////////////////////////////////////////////////
+template <>
+struct PackTraits<32, false>
+{
+    static const uint32_t MyNumBits = 32;
+
+    static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
+    static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+    static simdscalar unpack(simdscalar &in) { return in; }
+    static simdscalar pack(simdscalar &in) { return in; }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_TYPE type, uint32_t NumBits>
+struct TypeTraits : PackTraits<NumBits>
+{
+    static const SWR_TYPE MyType = type;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 8> : PackTraits<8>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 8> : PackTraits<8, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 16> : PackTraits<16>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+    static float toFloat() { return 0.0; }
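+    // Editor's note: integer (UINT/SINT) formats are not scaled through float;
+    // toFloat()/fromFloat() are placeholders here and fromFloat() asserts if it
+    // is ever called.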
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 16> : PackTraits<16, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UINT32
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UINT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SINT32
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SINT, 32> : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SINT;
+    static float toFloat() { return 0.0; }
+    static float fromFloat() { SWR_ASSERT(0); return 0.0; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM5
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 5> : PackTraits<5>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 31.0f; }
+    static float fromFloat() { return 31.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM6
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 6> : PackTraits<6>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 63.0f; }
+    static float fromFloat() { return 63.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 8> : PackTraits<8>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 255.0f; }
+    static float fromFloat() { return 255.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SNORM8
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SNORM, 8> : PackTraits<8, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
+    static float toFloat() { return 1.0f / 127.0f; }
+    static float fromFloat() { return 127.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_UNORM, 16> : PackTraits<16>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 65535.0f; }
+    static float fromFloat() { return 65535.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for SNORM16
+//////////////////////////////////////////////////////////////////////////
+template<> struct TypeTraits<SWR_TYPE_SNORM, 16> : PackTraits<16, true>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_SNORM;
+    static float toFloat() { return 1.0f / 32767.0f; }
+    static float fromFloat() { return 32767.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// TypeTraits - Format type traits specialization for UNORM24
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct TypeTraits < SWR_TYPE_UNORM, 24 > : PackTraits<32>
+{
+    static const SWR_TYPE MyType = SWR_TYPE_UNORM;
+    static float toFloat() { return 1.0f / 16777215.0f; }
+    static float fromFloat() { return 16777215.0f; }
+    static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); }
+};
+
+//////////////////////////////////////////////////////////////////////////
+// FLOAT Specializations from here on...
+//////////////////////////////////////////////////////////////////////////
+#define TO_M128i(a) _mm_castps_si128(a)
+#define TO_M128(a) _mm_castsi128_ps(a)
+
+#include "math.h"
+
+template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
+inline static __m128 fastpow(__m128 arg) {
+    __m128 ret = arg;
+
+    static const __m128 factor = _mm_set1_ps(exp2(127.0f * expden / expnum - 127.0f)
+        * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum));
+
+    // Apply a constant pre-correction factor.
+    ret = _mm_mul_ps(ret, factor);
+
+    // Reinterpret arg as integer to obtain logarithm.
+    //asm("cvtdq2ps %1, %0" : "=x" (ret) : "x" (ret));
+    ret = _mm_cvtepi32_ps(_mm_castps_si128(ret));
+
+    // Multiply logarithm by power.
+    ret = _mm_mul_ps(ret, _mm_set1_ps(1.0f * expnum / expden));
+
+    // Convert back to "integer" to exponentiate.
+    //asm("cvtps2dq %1, %0" : "=x" (ret) : "x" (ret));
+    ret = _mm_castsi128_ps(_mm_cvtps_epi32(ret));
+
+    return ret;
+}
+
+inline static __m128 pow512_4(__m128 arg) {
+    // 5/12 is too small, so compute the 4th root of 20/12 instead.
+    // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
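+    // Editor's note: concretely, x^(5/12) = (x^(5/3))^(1/4). fastpow<2, 3, ...>
+    // approximates x^(2/3), xover = x * x^(2/3) ~= x^(5/3) (with xunder as the
+    // matching low-side estimate), and the two x*rsqrt(x) steps at the end take
+    // the fourth root of the blended result.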
+    // weighting coefficient: a^-1/2 = 2 a; a = 2^-2/3
+    __m128 xf = fastpow< 2, 3, int(0.629960524947437 * 1e9), int(1e9) >(arg);
+    __m128 xover = _mm_mul_ps(arg, xf);
+
+    __m128 xfm1 = _mm_rsqrt_ps(xf);
+    __m128 x2 = _mm_mul_ps(arg, arg);
+    __m128 xunder = _mm_mul_ps(x2, xfm1);
+
+    // sqrt2 * over + 2 * sqrt2 * under
+    __m128 xavg = _mm_mul_ps(_mm_set1_ps(1.0f / (3.0f * 0.629960524947437f) * 0.999852f),
+        _mm_add_ps(xover, xunder));
+
+    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+    xavg = _mm_mul_ps(xavg, _mm_rsqrt_ps(xavg));
+    return xavg;
+}
+
+inline static __m128 powf_wrapper(__m128 Base, float Exp)
+{
+    float *f = (float *)(&Base);
+
+    return _mm_set_ps(powf(f[0], Exp),
+        powf(f[1], Exp),
+        powf(f[2], Exp),
+        powf(f[3], Exp));
+}
+
+static inline __m128 ConvertFloatToSRGB2(__m128& Src)
+{
+    // create a mask with 0xFFFFFFFF in the DWORDs where the source is <= the minimal SRGB float value
+    __m128i CmpToSRGBThresholdMask = TO_M128i(_mm_cmpnlt_ps(_mm_set1_ps(0.0031308f), Src));
+
+    // squeeze the mask down to 16 bits (4 bits per DWORD)
+    int CompareResult = _mm_movemask_epi8(CmpToSRGBThresholdMask);
+
+    __m128 Result;
+
+    if (CompareResult == 0xFFFF)
+    {
+        // all DWORDs are <= the threshold
+        Result = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
+    }
+    else if (CompareResult == 0x0)
+    {
+        // all DWORDs are > the threshold
+        __m128 fSrc_0RGB = Src;
+
+        // --> 1.055f * c^(1.0f/2.4f) - 0.055f
+#if KNOB_USE_FAST_SRGB == TRUE
+        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
+        __m128 f = pow512_4(fSrc_0RGB);
+#else
+        __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f);
+#endif
+        f = _mm_mul_ps(f, _mm_set1_ps(1.055f));
+        Result = _mm_sub_ps(f, _mm_set1_ps(0.055f));
+    }
+    else
+    {
+        // some DWORDs are <= the threshold and some are > threshold
+        __m128 Src_0RGB_mul_denorm = _mm_mul_ps(Src, _mm_set1_ps(12.92f));
+
+        __m128 fSrc_0RGB = Src;
+
+        // --> 1.055f * c^(1.0f/2.4f) - 0.055f
+#if KNOB_USE_FAST_SRGB == TRUE
+        // 1.0f / 2.4f is 5.0f / 12.0f which is used for approximation.
+ __m128 f = pow512_4(fSrc_0RGB); +#else + __m128 f = powf_wrapper(fSrc_0RGB, 1.0f / 2.4f); +#endif + f = _mm_mul_ps(f, _mm_set1_ps(1.055f)); + f = _mm_sub_ps(f, _mm_set1_ps(0.055f)); + + // Clear the alpha (is garbage after the sub) + __m128i i = _mm_and_si128(TO_M128i(f), _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF)); + + __m128i LessThanPart = _mm_and_si128(CmpToSRGBThresholdMask, TO_M128i(Src_0RGB_mul_denorm)); + __m128i GreaterEqualPart = _mm_andnot_si128(CmpToSRGBThresholdMask, i); + __m128i CombinedParts = _mm_or_si128(LessThanPart, GreaterEqualPart); + + Result = TO_M128(CombinedParts); + } + + return Result; +} + +////////////////////////////////////////////////////////////////////////// +/// TypeTraits - Format type traits specialization for FLOAT16 +////////////////////////////////////////////////////////////////////////// +template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> +{ + static const SWR_TYPE MyType = SWR_TYPE_FLOAT; + static float toFloat() { return 1.0f; } + static float fromFloat() { return 1.0f; } + static simdscalar convertSrgb(simdscalar &in) { SWR_ASSERT(0); return _simd_setzero_ps(); } + + static simdscalar pack(const simdscalar &in) + { +#if KNOB_SIMD_WIDTH == 8 +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // input is 8 packed float32, output is 8 packed float16 + simdscalari src = _simd_castps_si(in); + + static const uint32_t FLOAT_EXP_BITS = 8; + static const uint32_t FLOAT_MANTISSA_BITS = 23; + static const uint32_t FLOAT_MANTISSA_MASK = (1U << FLOAT_MANTISSA_BITS) - 1; + static const uint32_t FLOAT_EXP_MASK = ((1U << FLOAT_EXP_BITS) - 1) << FLOAT_MANTISSA_BITS; + + static const uint32_t HALF_EXP_BITS = 5; + static const uint32_t HALF_MANTISSA_BITS = 10; + static const uint32_t HALF_MANTISSA_MASK = (1U << HALF_MANTISSA_BITS) - 1; + static const uint32_t HALF_EXP_MASK = ((1U << HALF_EXP_BITS) - 1) << HALF_MANTISSA_BITS; + + // minimum exponent required, exponents below this are flushed to 0. 
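+        // Editor's note: half-precision has a 5-bit exponent biased by 15, so the
+        // smallest normal half exponent is -14; rebased to float32's bias of 127
+        // that is -14 + 127 = 113 (FLOAT_EXP_MIN below). Anything more than
+        // HALF_MANTISSA_BITS + 1 below that cannot even round into a half
+        // subnormal, which gives the flush-to-zero bound.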
+ static const int32_t HALF_EXP_MIN = -14; + static const int32_t FLOAT_EXP_BIAS = 127; + static const int32_t FLOAT_EXP_MIN = HALF_EXP_MIN + FLOAT_EXP_BIAS; + static const int32_t FLOAT_EXP_MIN_FTZ = FLOAT_EXP_MIN - (HALF_MANTISSA_BITS + 1); // +1 for the lack of implicit significand + + // maximum exponent required, exponents above this are set to infinity + static const int32_t HALF_EXP_MAX = 15; + static const int32_t FLOAT_EXP_MAX = HALF_EXP_MAX + FLOAT_EXP_BIAS; + + const simdscalari vSignMask = _simd_set1_epi32(0x80000000); + const simdscalari vExpMask = _simd_set1_epi32(FLOAT_EXP_MASK); + const simdscalari vManMask = _simd_set1_epi32(FLOAT_MANTISSA_MASK); + const simdscalari vExpMin = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN << FLOAT_MANTISSA_BITS)); + const simdscalari vExpMinFtz = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MIN_FTZ << FLOAT_MANTISSA_BITS)); + const simdscalari vExpMax = _simd_set1_epi32(FLOAT_EXP_MASK & uint32_t(FLOAT_EXP_MAX << FLOAT_MANTISSA_BITS)); + + simdscalari vSign = _simd_and_si(src, vSignMask); + simdscalari vExp = _simd_and_si(src, vExpMask); + simdscalari vMan = _simd_and_si(src, vManMask); + + simdscalari vFTZMask = _simd_cmplt_epi32(vExp, vExpMinFtz); + simdscalari vDenormMask = _simd_andnot_si(vFTZMask, _simd_cmplt_epi32(vExp, vExpMin)); + simdscalari vInfMask = _simd_cmpeq_epi32(vExpMask, vExp); + simdscalari vClampMask = _simd_andnot_si(vInfMask, _simd_cmplt_epi32(vExpMax, vExp)); + + simdscalari vHalfExp = _simd_add_epi32(_simd_sub_epi32(vExp, vExpMin), _simd_set1_epi32(1U << FLOAT_MANTISSA_BITS)); + + // pack output 16-bits into the lower 16-bits of each 32-bit channel + simdscalari vDst = _simd_and_si(_simd_srli_epi32(vHalfExp, 13), _simd_set1_epi32(HALF_EXP_MASK)); + vDst = _simd_or_si(vDst, _simd_srli_epi32(vMan, FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); + + // Flush To Zero + vDst = _simd_andnot_si(vFTZMask, vDst); + // Apply Infinites / NaN + vDst = _simd_or_si(vDst, _simd_and_si(vInfMask, _simd_set1_epi32(HALF_EXP_MASK))); + + // Apply clamps + vDst = _simd_andnot_si(vClampMask, vDst); + vDst = _simd_or_si(vDst, + _simd_and_si(vClampMask, _simd_set1_epi32(0x7BFF))); + + // Compute Denormals (subnormals) + if (!_mm256_testz_si256(vDenormMask, vDenormMask)) + { + uint32_t *pDenormMask = (uint32_t*)&vDenormMask; + uint32_t *pExp = (uint32_t*)&vExp; + uint32_t *pMan = (uint32_t*)&vMan; + uint32_t *pDst = (uint32_t*)&vDst; + for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i) + { + if (pDenormMask[i]) + { + // Need to compute subnormal value + uint32_t exponent = pExp[i] >> FLOAT_MANTISSA_BITS; + uint32_t mantissa = pMan[i] | + (1U << FLOAT_MANTISSA_BITS); // Denorms include no "implicit" 1s. 
Make it explicit + + pDst[i] = mantissa >> ((FLOAT_EXP_MIN - exponent) + (FLOAT_MANTISSA_BITS - HALF_MANTISSA_BITS)); + } + } + } + + // Add in sign bits + vDst = _simd_or_si(vDst, _simd_srli_epi32(vSign, 16)); + + // Pack to lower 128-bits + vDst = _mm256_castsi128_si256(_mm_packus_epi32(_mm256_castsi256_si128(vDst), _mm256_extractf128_si256(vDst, 1))); + +#if 0 +#if !defined(NDEBUG) + simdscalari vCheck = _mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC)); + + for (uint32_t i = 0; i < 4; ++i) + { + SWR_ASSERT(vCheck.m256i_i32[i] == vDst.m256i_i32[i]); + } +#endif +#endif + + return _simd_castsi_ps(vDst); + +#else + return _mm256_castsi256_ps(_mm256_castsi128_si256(_mm256_cvtps_ph(in, _MM_FROUND_TRUNC))); +#endif +#else +#error Unsupported vector width +#endif + } + + static simdscalar unpack(const simdscalar &in) + { + // input is 8 packed float16, output is 8 packed float32 + SWR_ASSERT(0); // @todo + return _simd_setzero_ps(); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// TypeTraits - Format type traits specialization for FLOAT32 +////////////////////////////////////////////////////////////////////////// +template<> struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> +{ + static const SWR_TYPE MyType = SWR_TYPE_FLOAT; + static float toFloat() { return 1.0f; } + static float fromFloat() { return 1.0f; } + static inline simdscalar convertSrgb(simdscalar &in) + { +#if (KNOB_ARCH == KNOB_ARCH_AVX || KNOB_ARCH == KNOB_ARCH_AVX2) + __m128 srcLo = _mm256_extractf128_ps(in, 0); + __m128 srcHi = _mm256_extractf128_ps(in, 1); + + srcLo = ConvertFloatToSRGB2(srcLo); + srcHi = ConvertFloatToSRGB2(srcHi); + + in = _mm256_insertf128_ps(in, srcLo, 0); + in = _mm256_insertf128_ps(in, srcHi, 1); + +#endif + return in; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Format1 - Bitfield for single component formats. +////////////////////////////////////////////////////////////////////////// +template<uint32_t x> +struct Format1 +{ + union + { + uint32_t r : x; + + ///@ The following are here to provide full template needed in Formats. + uint32_t g : x; + uint32_t b : x; + uint32_t a : x; + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// Format1 - Bitfield for single component formats - 8 bit specialization +////////////////////////////////////////////////////////////////////////// +template<> +struct Format1<8> +{ + union + { + uint8_t r; + + ///@ The following are here to provide full template needed in Formats. + uint8_t g; + uint8_t b; + uint8_t a; + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// Format1 - Bitfield for single component formats - 16 bit specialization +////////////////////////////////////////////////////////////////////////// +template<> +struct Format1<16> +{ + union + { + uint16_t r; + + ///@ The following are here to provide full template needed in Formats. + uint16_t g; + uint16_t b; + uint16_t a; + }; +}; + +////////////////////////////////////////////////////////////////////////// +/// Format2 - Bitfield for 2 component formats. +////////////////////////////////////////////////////////////////////////// +template<uint32_t x, uint32_t y> +union Format2 +{ + struct + { + uint32_t r : x; + uint32_t g : y; + }; + struct + { + ///@ The following are here to provide full template needed in Formats. 
+        uint32_t b : x;
+        uint32_t a : y;
+    };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format2 - Bitfield for 2 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+union Format2<8,8>
+{
+    struct
+    {
+        uint16_t r : 8;
+        uint16_t g : 8;
+    };
+    struct
+    {
+        ///@ The following are here to provide full template needed in Formats.
+        uint16_t b : 8;
+        uint16_t a : 8;
+    };
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format3 - Bitfield for 3 component formats.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z>
+union Format3
+{
+    struct
+    {
+        uint32_t r : x;
+        uint32_t g : y;
+        uint32_t b : z;
+    };
+    uint32_t a; ///@note This is here to provide full template needed in Formats.
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format3 - Bitfield for 3 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+union Format3<5,6,5>
+{
+    struct
+    {
+        uint16_t r : 5;
+        uint16_t g : 6;
+        uint16_t b : 5;
+    };
+    uint16_t a; ///@note This is here to provide full template needed in Formats.
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+struct Format4
+{
+    uint32_t r : x;
+    uint32_t g : y;
+    uint32_t b : z;
+    uint32_t a : w;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format4<5,5,5,1>
+{
+    uint16_t r : 5;
+    uint16_t g : 5;
+    uint16_t b : 5;
+    uint16_t a : 1;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Format4 - Bitfield for 4 component formats - 16 bit specialization
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct Format4<4,4,4,4>
+{
+    uint16_t r : 4;
+    uint16_t g : 4;
+    uint16_t b : 4;
+    uint16_t a : 4;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Defaults - Default component values for missing channels.
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t x, uint32_t y, uint32_t z, uint32_t w>
+struct Defaults
+{
+    INLINE static uint32_t GetDefault(uint32_t comp)
+    {
+        static const uint32_t defaults[4]{ x, y, z, w };
+        return defaults[comp];
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ComponentTraits - Component type traits.
+//////////////////////////////////////////////////////////////////////////
+template<SWR_TYPE X, uint32_t NumBitsX, SWR_TYPE Y = SWR_TYPE_UNKNOWN, uint32_t NumBitsY = 0, SWR_TYPE Z = SWR_TYPE_UNKNOWN, uint32_t NumBitsZ = 0, SWR_TYPE W = SWR_TYPE_UNKNOWN, uint32_t NumBitsW = 0>
+struct ComponentTraits
+{
+    INLINE static SWR_TYPE GetType(uint32_t comp)
+    {
+        static const SWR_TYPE CompType[4]{ X, Y, Z, W };
+        return CompType[comp];
+    }
+
+    INLINE static uint32_t GetBPC(uint32_t comp)
+    {
+        static const uint32_t MyBpc[4]{ NumBitsX, NumBitsY, NumBitsZ, NumBitsW };
+        return MyBpc[comp];
+    }
+
+    INLINE static bool isNormalized(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0:
+            return (X == SWR_TYPE_UNORM || X == SWR_TYPE_SNORM) ? true : false;
+        case 1:
+            return (Y == SWR_TYPE_UNORM || Y == SWR_TYPE_SNORM) ? true : false;
+        case 2:
+            return (Z == SWR_TYPE_UNORM || Z == SWR_TYPE_SNORM) ? true : false;
+        case 3:
+            return (W == SWR_TYPE_UNORM || W == SWR_TYPE_SNORM) ? true : false;
+        }
+        SWR_ASSERT(0);
+        return false;
+    }
+
+    INLINE static float toFloat(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::toFloat();
+        case 1:
+            return TypeTraits<Y, NumBitsY>::toFloat();
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::toFloat();
+        case 3:
+            return TypeTraits<W, NumBitsW>::toFloat();
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::toFloat();
+    }
+
+    INLINE static float fromFloat(uint32_t comp)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::fromFloat();
+        case 1:
+            return TypeTraits<Y, NumBitsY>::fromFloat();
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::fromFloat();
+        case 3:
+            return TypeTraits<W, NumBitsW>::fromFloat();
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::fromFloat();
+    }
+
+    INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
+        case 3:
+            return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+    }
+
+    INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+    {
+        switch (comp)
+        {
+        case 0:
+            TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
+            return;
+        case 1:
+            TypeTraits<Y, NumBitsY>::storeSOA(pDst, src);
+            return;
+        case 2:
+            TypeTraits<Z, NumBitsZ>::storeSOA(pDst, src);
+            return;
+        case 3:
+            TypeTraits<W, NumBitsW>::storeSOA(pDst, src);
+            return;
+        }
+        SWR_ASSERT(0);
+        TypeTraits<X, NumBitsX>::storeSOA(pDst, src);
+    }
+
+    INLINE static simdscalar unpack(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::unpack(in);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::unpack(in);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::unpack(in);
+        case 3:
+            return TypeTraits<W, NumBitsW>::unpack(in);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::unpack(in);
+    }
+
+    INLINE static simdscalar pack(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::pack(in);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::pack(in);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::pack(in);
+        case 3:
+            return TypeTraits<W, NumBitsW>::pack(in);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::pack(in);
+    }
+
+    INLINE static simdscalar convertSrgb(uint32_t comp, simdscalar &in)
+    {
+        switch (comp)
+        {
+        case 0:
+            return TypeTraits<X, NumBitsX>::convertSrgb(in);
+        case 1:
+            return TypeTraits<Y, NumBitsY>::convertSrgb(in);
+        case 2:
+            return TypeTraits<Z, NumBitsZ>::convertSrgb(in);
+        case 3:
+            return TypeTraits<W, NumBitsW>::convertSrgb(in);
+        }
+        SWR_ASSERT(0);
+        return TypeTraits<X, NumBitsX>::convertSrgb(in);
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
new file mode 100644
index 00000000000..f43a672bd82
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -0,0 +1,2345 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file frontend.cpp
+*
+* @brief Implementation for Frontend which handles vertex processing,
+*        primitive assembly, clipping, binning, etc.
+*
+******************************************************************************/
+
+#include "api.h"
+#include "frontend.h"
+#include "backend.h"
+#include "context.h"
+#include "rdtsc_core.h"
+#include "rasterizer.h"
+#include "utils.h"
+#include "threads.h"
+#include "pa.h"
+#include "clip.h"
+#include "tilemgr.h"
+#include "tessellator.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper function to generate a bitmask
+static INLINE uint32_t GenMask(uint32_t numBits)
+{
+    SWR_ASSERT(numBits <= (sizeof(uint32_t) * 8), "Too many bits (%d) for %s", numBits, __FUNCTION__);
+    return ((1U << numBits) - 1);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Offsets added to post-viewport vertex positions based on
+/// raster state.
+static const simdscalar g_pixelOffsets[SWR_PIXEL_LOCATION_UL + 1] =
+{
+    _simd_set1_ps(0.0f), // SWR_PIXEL_LOCATION_CENTER
+    _simd_set1_ps(0.5f), // SWR_PIXEL_LOCATION_UL
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrSync.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to sync callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessSync(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    SYNC_DESC *pSync = (SYNC_DESC*)pUserData;
+    BE_WORK work;
+    work.type = SYNC;
+    work.pfnWork = ProcessSyncBE;
+    work.desc.sync = *pSync;
+
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    pTileMgr->enqueue(0, 0, &work);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrGetStats.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to stats callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessQueryStats(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    QUERY_DESC *pQueryStats = (QUERY_DESC*)pUserData;
+    BE_WORK work;
+    work.type = QUERYSTATS;
+    work.pfnWork = ProcessQueryStatsBE;
+    work.desc.queryStats = *pQueryStats;
+
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+    pTileMgr->enqueue(0, 0, &work);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrClearRenderTarget.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to clear callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessClear(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    CLEAR_DESC *pClear = (CLEAR_DESC*)pUserData;
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+
+    const API_STATE& state = GetApiState(pDC);
+
+    // queue a clear to each macro tile
+    // compute macro tile bounds for the current scissor/viewport
+    uint32_t macroTileLeft = state.scissorInFixedPoint.left / KNOB_MACROTILE_X_DIM_FIXED;
+    uint32_t macroTileRight = state.scissorInFixedPoint.right / KNOB_MACROTILE_X_DIM_FIXED;
+    uint32_t macroTileTop = state.scissorInFixedPoint.top / KNOB_MACROTILE_Y_DIM_FIXED;
+    uint32_t macroTileBottom = state.scissorInFixedPoint.bottom / KNOB_MACROTILE_Y_DIM_FIXED;
+
+    BE_WORK work;
+    work.type = CLEAR;
+    work.pfnWork = ProcessClearBE;
+    work.desc.clear = *pClear;
+
+    for (uint32_t y = macroTileTop; y <= macroTileBottom; ++y)
+    {
+        for (uint32_t x = macroTileLeft; x <= macroTileRight; ++x)
+        {
+            pTileMgr->enqueue(x, y, &work);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrStoreTiles.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessStoreTiles(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    RDTSC_START(FEProcessStoreTiles);
+    STORE_TILES_DESC *pStore = (STORE_TILES_DESC*)pUserData;
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+
+    const API_STATE& state = GetApiState(pDC);
+
+    // queue a store to each macro tile
+    // compute macro tile bounds for the current render target
+    const uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
+    const uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
+    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+
+    // store tiles
+    BE_WORK work;
+    work.type = STORETILES;
+    work.pfnWork = ProcessStoreTileBE;
+    work.desc.storeTiles = *pStore;
+
+    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    {
+        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        {
+            pTileMgr->enqueue(x, y, &work);
+        }
+    }
+
+    RDTSC_STOP(FEProcessStoreTiles, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief FE handler for SwrInvalidateTiles.
+/// @param pContext - pointer to SWR context.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pUserData - Pointer to user data passed back to callback.
+/// @todo This should go away when we switch this to use compute threading.
+void ProcessInvalidateTiles(
+    SWR_CONTEXT *pContext,
+    DRAW_CONTEXT *pDC,
+    uint32_t workerId,
+    void *pUserData)
+{
+    RDTSC_START(FEProcessInvalidateTiles);
+    INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData;
+    MacroTileMgr *pTileMgr = pDC->pTileMgr;
+
+    const API_STATE& state = GetApiState(pDC);
+
+    // queue an invalidate to each macro tile
+    // compute macro tile bounds for the current render target
+    uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
+    uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
+
+    uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
+    uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+
+    // invalidate tiles
+    BE_WORK work;
+    work.type = INVALIDATETILES;
+    work.pfnWork = ProcessInvalidateTilesBE;
+    work.desc.invalidateTiles = *pInv;
+
+    for (uint32_t x = 0; x < numMacroTilesX; ++x)
+    {
+        for (uint32_t y = 0; y < numMacroTilesY; ++y)
+        {
+            pTileMgr->enqueue(x, y, &work);
+        }
+    }
+
+    RDTSC_STOP(FEProcessInvalidateTiles, 0, pDC->drawId);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of primitives given the number of verts.
+/// @param mode - primitive topology for draw operation.
+/// @param numPrims - number of vertices or indices for draw.
+/// @todo Frontend needs to be refactored. This will go in appropriate place then.
+uint32_t GetNumPrims(
+    PRIMITIVE_TOPOLOGY mode,
+    uint32_t numPrims)
+{
+    switch (mode)
+    {
+    case TOP_POINT_LIST: return numPrims;
+    case TOP_TRIANGLE_LIST: return numPrims / 3;
+    case TOP_TRIANGLE_STRIP: return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_FAN: return numPrims < 3 ? 0 : numPrims - 2;
+    case TOP_TRIANGLE_DISC: return numPrims < 2 ? 0 : numPrims - 1;
+    case TOP_QUAD_LIST: return numPrims / 4;
+    case TOP_QUAD_STRIP: return numPrims < 4 ? 0 : (numPrims - 2) / 2;
+    case TOP_LINE_STRIP: return numPrims < 2 ?
0 : numPrims - 1; + case TOP_LINE_LIST: return numPrims / 2; + case TOP_LINE_LOOP: return numPrims; + case TOP_RECT_LIST: return numPrims / 3; + case TOP_LINE_LIST_ADJ: return numPrims / 4; + case TOP_LISTSTRIP_ADJ: return numPrims < 3 ? 0 : numPrims - 3; + case TOP_TRI_LIST_ADJ: return numPrims / 6; + case TOP_TRI_STRIP_ADJ: return numPrims < 4 ? 0 : (numPrims / 2) - 2; + + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + return numPrims / (mode - TOP_PATCHLIST_BASE); + + case TOP_POLYGON: + case TOP_POINT_LIST_BF: + case TOP_LINE_STRIP_CONT: + case TOP_LINE_STRIP_BF: + case TOP_LINE_STRIP_CONT_BF: + case TOP_TRIANGLE_FAN_NOSTIPPLE: + case TOP_TRI_STRIP_REVERSE: + case TOP_PATCHLIST_BASE: + case TOP_UNKNOWN: + SWR_ASSERT(false, "Unsupported topology: %d", mode); + return 0; + } + + return 0; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Computes the number of verts given the number of primitives. +/// @param mode - primitive topology for draw operation. +/// @param numPrims - number of primitives for draw. +uint32_t GetNumVerts( + PRIMITIVE_TOPOLOGY mode, + uint32_t numPrims) +{ + switch (mode) + { + case TOP_POINT_LIST: return numPrims; + case TOP_TRIANGLE_LIST: return numPrims * 3; + case TOP_TRIANGLE_STRIP: return numPrims ? numPrims + 2 : 0; + case TOP_TRIANGLE_FAN: return numPrims ? numPrims + 2 : 0; + case TOP_TRIANGLE_DISC: return numPrims ? numPrims + 1 : 0; + case TOP_QUAD_LIST: return numPrims * 4; + case TOP_QUAD_STRIP: return numPrims ? numPrims * 2 + 2 : 0; + case TOP_LINE_STRIP: return numPrims ? numPrims + 1 : 0; + case TOP_LINE_LIST: return numPrims * 2; + case TOP_LINE_LOOP: return numPrims; + case TOP_RECT_LIST: return numPrims * 3; + case TOP_LINE_LIST_ADJ: return numPrims * 4; + case TOP_LISTSTRIP_ADJ: return numPrims ? numPrims + 3 : 0; + case TOP_TRI_LIST_ADJ: return numPrims * 6; + case TOP_TRI_STRIP_ADJ: return numPrims ? 
(numPrims + 2) * 2 : 0; + + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + return numPrims * (mode - TOP_PATCHLIST_BASE); + + case TOP_POLYGON: + case TOP_POINT_LIST_BF: + case TOP_LINE_STRIP_CONT: + case TOP_LINE_STRIP_BF: + case TOP_LINE_STRIP_CONT_BF: + case TOP_TRIANGLE_FAN_NOSTIPPLE: + case TOP_TRI_STRIP_REVERSE: + case TOP_PATCHLIST_BASE: + case TOP_UNKNOWN: + SWR_ASSERT(false, "Unsupported topology: %d", mode); + return 0; + } + + return 0; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Return number of verts per primitive. +/// @param topology - topology +/// @param includeAdjVerts - include adjacent verts in primitive vertices +INLINE uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts) +{ + uint32_t numVerts = 0; + switch (topology) + { + case TOP_POINT_LIST: + case TOP_POINT_LIST_BF: + numVerts = 1; + break; + case TOP_LINE_LIST: + case TOP_LINE_STRIP: + case TOP_LINE_LIST_ADJ: + case TOP_LINE_LOOP: + case TOP_LINE_STRIP_CONT: + case TOP_LINE_STRIP_BF: + case TOP_LISTSTRIP_ADJ: + numVerts = 2; + break; + case TOP_TRIANGLE_LIST: + case TOP_TRIANGLE_STRIP: + case TOP_TRIANGLE_FAN: + case TOP_TRI_LIST_ADJ: + case TOP_TRI_STRIP_ADJ: + case TOP_TRI_STRIP_REVERSE: + case TOP_RECT_LIST: + numVerts = 3; + break; + case TOP_QUAD_LIST: + case TOP_QUAD_STRIP: + numVerts = 4; + break; + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + numVerts = topology - TOP_PATCHLIST_BASE; + break; + default: + SWR_ASSERT(false, "Unsupported topology: %d", topology); + break; + } + + if (includeAdjVerts) + { + switch (topology) + { + case TOP_LISTSTRIP_ADJ: + case TOP_LINE_LIST_ADJ: numVerts = 4; break; + case TOP_TRI_STRIP_ADJ: + case TOP_TRI_LIST_ADJ: numVerts = 6; break; + default: break; + } + } + + return numVerts; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate mask from remaining work. +/// @param numWorkItems - Number of items being worked on by a SIMD. 
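+/// @return vector mask with an all-ones lane for each of the first
+///         min(numWorkItems, KNOB_SIMD_WIDTH) items and zeroes elsewhere.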
+static INLINE simdscalari GenerateMask(uint32_t numItemsRemaining)
+{
+    uint32_t numActive = (numItemsRemaining >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : numItemsRemaining;
+    uint32_t mask = (numActive > 0) ? ((1 << numActive) - 1) : 0;
+    return _simd_castps_si(vMask(mask));
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief StreamOut - Streams vertex data out to SO buffers.
+///        Generally, we are only streaming out a SIMD's worth of triangles.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param numPrims - Number of prims to streamout (e.g. points, lines, tris)
+static void StreamOut(
+    DRAW_CONTEXT* pDC,
+    PA_STATE& pa,
+    uint32_t workerId,
+    uint32_t* pPrimData,
+    uint32_t streamIndex)
+{
+    RDTSC_START(FEStreamout);
+
+    SWR_CONTEXT* pContext = pDC->pContext;
+
+    const API_STATE& state = GetApiState(pDC);
+    const SWR_STREAMOUT_STATE &soState = state.soState;
+
+    uint32_t soVertsPerPrim = NumVertsPerPrim(pa.binTopology, false);
+
+    // The pPrimData buffer is sparse in that we allocate memory for all 32 attributes for each vertex.
+    uint32_t primDataDwordVertexStride = (KNOB_NUM_ATTRIBUTES * sizeof(float) * 4) / sizeof(uint32_t);
+
+    SWR_STREAMOUT_CONTEXT soContext = { 0 };
+
+    // Setup buffer state pointers.
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        soContext.pBuffer[i] = &state.soBuffer[i];
+    }
+
+    uint32_t numPrims = pa.NumPrims();
+    for (uint32_t primIndex = 0; primIndex < numPrims; ++primIndex)
+    {
+        DWORD slot = 0;
+        uint32_t soMask = soState.streamMasks[streamIndex];
+
+        // Write all entries into primitive data buffer for SOS.
+        while (_BitScanForward(&slot, soMask))
+        {
+            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
+            uint32_t paSlot = slot + VERTEX_ATTRIB_START_SLOT;
+            pa.AssembleSingle(paSlot, primIndex, attrib);
+
+            // Attribute offset is a relative offset from the start of the vertex.
+            // Note that attributes start at slot 1 in the PA buffer; writing prim
+            // data starting at slot 0 would use (slot - 1), but GL works slightly
+            // differently and needs slot 0 as well, so the slot is used unchanged.
+            uint32_t primDataAttribOffset = slot * sizeof(float) * 4 / sizeof(uint32_t);
+
+            // Store each vertex's attrib at appropriate locations in pPrimData buffer.
+            for (uint32_t v = 0; v < soVertsPerPrim; ++v)
+            {
+                uint32_t* pPrimDataAttrib = pPrimData + primDataAttribOffset + (v * primDataDwordVertexStride);
+
+                _mm_store_ps((float*)pPrimDataAttrib, attrib[v]);
+            }
+            soMask &= ~(1 << slot);
+        }
+
+        // Update pPrimData pointer
+        soContext.pPrimData = pPrimData;
+
+        // Call SOS
+        SWR_ASSERT(state.pfnSoFunc[streamIndex] != nullptr, "Trying to execute uninitialized streamout jit function.");
+        state.pfnSoFunc[streamIndex](soContext);
+    }
+
+    // Update SO write offset. The driver provides memory for the update.
+    for (uint32_t i = 0; i < 4; ++i)
+    {
+        if (state.soBuffer[i].pWriteOffset)
+        {
+            *state.soBuffer[i].pWriteOffset = soContext.pBuffer[i]->streamOffset * sizeof(uint32_t);
+
+            // The SOS increments the existing write offset, so the stat is set to
+            // the absolute offset rather than incremented by a relative amount.
+            SET_STAT(SoWriteOffset[i], soContext.pBuffer[i]->streamOffset);
+        }
+    }
+
+    UPDATE_STAT(SoPrimStorageNeeded[streamIndex], soContext.numPrimStorageNeeded);
+    UPDATE_STAT(SoNumPrimsWritten[streamIndex], soContext.numPrimsWritten);
+
+    RDTSC_STOP(FEStreamout, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the number of invocations for one SIMD iteration. The
+///        current index is the start of the SIMD; the max index is the last
+///        index over all work items. If less than a full SIMD's worth of
+///        work remains, the remainder is returned.
+/// @param curIndex - The start index for the SIMD.
+/// @param maxIndex - The last index for all work items.
+static INLINE uint32_t GetNumInvocations(
+    uint32_t curIndex,
+    uint32_t maxIndex)
+{
+    uint32_t remainder = (maxIndex - curIndex);
+    return (remainder >= KNOB_SIMD_WIDTH) ? KNOB_SIMD_WIDTH : remainder;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Converts a streamId buffer to a cut buffer for the given stream id.
+///        The geometry shader will loop over each active streamout buffer, assembling
+///        primitives for the downstream stages. When multistream output is enabled,
+///        the generated stream ID buffer from the GS needs to be converted to a cut
+///        buffer for the primitive assembler.
+/// @param stream - stream id to generate the cut buffer for
+/// @param pStreamIdBase - pointer to the stream ID buffer
+/// @param numEmittedVerts - Number of total verts emitted by the GS
+/// @param pCutBuffer - output buffer to write cuts to
+void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t numEmittedVerts, uint8_t *pCutBuffer)
+{
+    SWR_ASSERT(stream < MAX_SO_STREAMS);
+
+    // Each emitted vert carries a 2-bit stream ID on input; the output is one cut bit
+    // per vert, set for any vert that does NOT belong to the requested stream. e.g.
+    // for stream == 1, the input byte 0b01000101 (verts 0, 1 and 3 on stream 1, vert 2
+    // on stream 0) produces the output bits 0b0100.
+    uint32_t numInputBytes = (numEmittedVerts * 2 + 7) / 8;
+    uint32_t numOutputBytes = std::max(numInputBytes / 2, 1U);
+
+    for (uint32_t b = 0; b < numOutputBytes; ++b)
+    {
+        uint8_t curInputByte = pStreamIdBase[2*b];
+        uint8_t outByte = 0;
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            if ((curInputByte & 0x3) != stream)
+            {
+                outByte |= (1 << i);
+            }
+            curInputByte >>= 2;
+        }
+
+        curInputByte = pStreamIdBase[2 * b + 1];
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            if ((curInputByte & 0x3) != stream)
+            {
+                outByte |= (1 << (i + 4));
+            }
+            curInputByte >>= 2;
+        }
+
+        *pCutBuffer++ = outByte;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Implements GS stage.
+/// @param pDC - pointer to draw context.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param pa - The primitive assembly object.
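+/// @param pCutBuffer - GS output cut (or stream ID) buffer
+/// @param pStreamCutBuffer - scratch buffer used to convert a stream ID buffer to per-stream cuts
+/// @param pSoPrimData - scratch buffer for staging streamout primitive data
+/// @param primID - SIMD of input primitive IDs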
+/// @param pGsOut - output stream for GS +template < + bool HasStreamOutT, + bool HasRastT> +static void GeometryShaderStage( + DRAW_CONTEXT *pDC, + uint32_t workerId, + PA_STATE& pa, + void* pGsOut, + void* pCutBuffer, + void* pStreamCutBuffer, + uint32_t* pSoPrimData, + simdscalari primID) +{ + RDTSC_START(FEGeometryShader); + + SWR_GS_CONTEXT gsContext; + SWR_CONTEXT* pContext = pDC->pContext; + + const API_STATE& state = GetApiState(pDC); + const SWR_GS_STATE* pState = &state.gsState; + + SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); + SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); + + gsContext.pStream = (uint8_t*)pGsOut; + gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; + gsContext.PrimitiveID = primID; + + uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); + simdvector attrib[MAX_ATTRIBUTES]; + + // assemble all attributes for the input primitive + for (uint32_t slot = 0; slot < pState->numInputAttribs; ++slot) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; + pa.Assemble(attribSlot, attrib); + + for (uint32_t i = 0; i < numVertsPerPrim; ++i) + { + gsContext.vert[i].attrib[attribSlot] = attrib[i]; + } + } + + // assemble position + pa.Assemble(VERTEX_POSITION_SLOT, attrib); + for (uint32_t i = 0; i < numVertsPerPrim; ++i) + { + gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; + } + + const uint32_t vertexStride = sizeof(simdvertex); + const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; + const uint32_t inputPrimStride = numSimdBatches * vertexStride; + const uint32_t instanceStride = inputPrimStride * KNOB_SIMD_WIDTH; + uint32_t cutPrimStride; + uint32_t cutInstanceStride; + + if (pState->isSingleStream) + { + cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; + cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; + } + else + { + cutPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4); + cutInstanceStride = cutPrimStride * KNOB_SIMD_WIDTH; + } + + // record valid prims from the frontend to avoid over binning the newly generated + // prims from the GS + uint32_t numInputPrims = pa.NumPrims(); + + for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) + { + gsContext.InstanceID = instance; + gsContext.mask = GenerateMask(numInputPrims); + + // execute the geometry shader + state.pfnGsFunc(GetPrivateState(pDC), &gsContext); + + gsContext.pStream += instanceStride; + gsContext.pCutOrStreamIdBuffer += cutInstanceStride; + } + + // set up new binner and state for the GS output topology + PFN_PROCESS_PRIMS pfnClipFunc = nullptr; + if (HasRastT) + { + switch (pState->outputTopology) + { + case TOP_TRIANGLE_STRIP: pfnClipFunc = ClipTriangles; break; + case TOP_LINE_STRIP: pfnClipFunc = ClipLines; break; + case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; + default: SWR_ASSERT(false, "Unexpected GS output topology: %d", pState->outputTopology); + } + } + + // foreach input prim: + // - setup a new PA based on the emitted verts for that prim + // - loop over the new verts, calling PA to assemble each prim + uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; + uint32_t* pPrimitiveId = (uint32_t*)&primID; + + uint32_t totalPrimsGenerated = 0; + for (uint32_t inputPrim = 0; inputPrim < numInputPrims; ++inputPrim) + { + uint8_t* pInstanceBase = (uint8_t*)pGsOut + inputPrim * inputPrimStride; + uint8_t* pCutBufferBase = (uint8_t*)pCutBuffer + inputPrim * cutPrimStride; + for (uint32_t instance = 0; 
instance < pState->instanceCount; ++instance) + { + uint32_t numEmittedVerts = pVertexCount[inputPrim]; + if (numEmittedVerts == 0) + { + continue; + } + + uint8_t* pBase = pInstanceBase + instance * instanceStride; + uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; + + DWORD numAttribs; + _BitScanReverse(&numAttribs, state.feAttribMask); + numAttribs++; + + for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) + { + bool processCutVerts = false; + + uint8_t* pCutBuffer = pCutBase; + + // assign default stream ID, only relevant when GS is outputting a single stream + uint32_t streamID = 0; + if (pState->isSingleStream) + { + processCutVerts = true; + streamID = pState->singleStreamID; + if (streamID != stream) continue; + } + else + { + // early exit if this stream is not enabled for streamout + if (HasStreamOutT && !state.soState.streamEnable[stream]) + { + continue; + } + + // multi-stream output, need to translate StreamID buffer to a cut buffer + ProcessStreamIdBuffer(stream, pCutBase, numEmittedVerts, (uint8_t*)pStreamCutBuffer); + pCutBuffer = (uint8_t*)pStreamCutBuffer; + processCutVerts = false; + } + + PA_STATE_CUT gsPa(pDC, pBase, numEmittedVerts, pCutBuffer, numEmittedVerts, numAttribs, pState->outputTopology, processCutVerts); + + while (gsPa.GetNextStreamOutput()) + { + do + { + bool assemble = gsPa.Assemble(VERTEX_POSITION_SLOT, attrib); + + if (assemble) + { + totalPrimsGenerated += gsPa.NumPrims(); + + if (HasStreamOutT) + { + StreamOut(pDC, gsPa, workerId, pSoPrimData, stream); + } + + if (HasRastT && state.soState.streamToRasterizer == stream) + { + simdscalari vPrimId; + // pull primitiveID from the GS output if available + if (state.gsState.emitsPrimitiveID) + { + simdvector primIdAttrib[3]; + gsPa.Assemble(VERTEX_PRIMID_SLOT, primIdAttrib); + vPrimId = _simd_castps_si(primIdAttrib[0].x); + } + else + { + vPrimId = _simd_set1_epi32(pPrimitiveId[inputPrim]); + } + + pfnClipFunc(pDC, gsPa, workerId, attrib, GenMask(gsPa.NumPrims()), vPrimId); + } + } + } while (gsPa.NextPrim()); + } + } + } + } + + // update GS pipeline stats + UPDATE_STAT(GsInvocations, numInputPrims * pState->instanceCount); + UPDATE_STAT(GsPrimitives, totalPrimsGenerated); + + RDTSC_STOP(FEGeometryShader, 1, 0); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Allocate GS buffers +/// @param pDC - pointer to draw context. 
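+/// @param ppStreamCutBuffer - pointer to the scratch allocation used to convert a
+///        multi-stream stream ID buffer into per-stream cuts (multi-stream only)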
+/// @param state - API state +/// @param ppGsOut - pointer to GS output buffer allocation +/// @param ppCutBuffer - pointer to GS output cut buffer allocation +static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer, + void **ppStreamCutBuffer) +{ + Arena* pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + SWR_ASSERT(state.gsState.gsEnable); + // allocate arena space to hold GS output verts + // @todo pack attribs + // @todo support multiple streams + const uint32_t vertexStride = sizeof(simdvertex); + const uint32_t numSimdBatches = (state.gsState.maxNumVerts + KNOB_SIMD_WIDTH - 1) / KNOB_SIMD_WIDTH; + uint32_t size = state.gsState.instanceCount * numSimdBatches * vertexStride * KNOB_SIMD_WIDTH; + *ppGsOut = pArena->AllocAligned(size, KNOB_SIMD_WIDTH * sizeof(float)); + + const uint32_t cutPrimStride = (state.gsState.maxNumVerts + 7) / 8; + const uint32_t streamIdPrimStride = AlignUp(state.gsState.maxNumVerts * 2 / 8, 4); + const uint32_t cutBufferSize = cutPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; + const uint32_t streamIdSize = streamIdPrimStride * state.gsState.instanceCount * KNOB_SIMD_WIDTH; + + // allocate arena space to hold cut or streamid buffer, which is essentially a bitfield sized to the + // maximum vertex output as defined by the GS state, per SIMD lane, per GS instance + + // allocate space for temporary per-stream cut buffer if multi-stream is enabled + if (state.gsState.isSingleStream) + { + *ppCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); + *ppStreamCutBuffer = nullptr; + } + else + { + *ppCutBuffer = pArena->AllocAligned(streamIdSize, KNOB_SIMD_WIDTH * sizeof(float)); + *ppStreamCutBuffer = pArena->AllocAligned(cutBufferSize, KNOB_SIMD_WIDTH * sizeof(float)); + } + +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Contains all data generated by the HS and passed to the +/// tessellator and DS. +struct TessellationThreadLocalData +{ + SWR_HS_CONTEXT hsContext; + ScalarPatch patchData[KNOB_SIMD_WIDTH]; + void* pTxCtx; + size_t tsCtxSize; + + simdscalar* pDSOutput; + size_t numDSOutputVectors; +}; + +THREAD TessellationThreadLocalData* gt_pTessellationThreadData = nullptr; + +////////////////////////////////////////////////////////////////////////// +/// @brief Allocate tessellation data for this worker thread. +INLINE +static void AllocateTessellationData(SWR_CONTEXT* pContext) +{ + /// @TODO - Don't use thread local storage. Use Worker local storage instead. + if (gt_pTessellationThreadData == nullptr) + { + gt_pTessellationThreadData = (TessellationThreadLocalData*) + _aligned_malloc(sizeof(TessellationThreadLocalData), 64); + memset(gt_pTessellationThreadData, 0, sizeof(*gt_pTessellationThreadData)); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Implements Tessellation Stages. +/// @param pDC - pointer to draw context. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param pa - The primitive assembly object. 
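+/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled
+/// @tparam HasStreamOutT - Is stream-out enabled
+/// @tparam HasRastT - Is rasterization enabled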
+/// @param pGsOut - output stream for GS +template < + bool HasGeometryShaderT, + bool HasStreamOutT, + bool HasRastT> +static void TessellationStages( + DRAW_CONTEXT *pDC, + uint32_t workerId, + PA_STATE& pa, + void* pGsOut, + void* pCutBuffer, + void* pCutStreamBuffer, + uint32_t* pSoPrimData, + simdscalari primID) +{ + const API_STATE& state = GetApiState(pDC); + const SWR_TS_STATE& tsState = state.tsState; + SWR_CONTEXT *pContext = pDC->pContext; // Needed for UPDATE_STATS macro + + SWR_ASSERT(gt_pTessellationThreadData); + + HANDLE tsCtx = TSInitCtx( + tsState.domain, + tsState.partitioning, + tsState.tsOutputTopology, + gt_pTessellationThreadData->pTxCtx, + gt_pTessellationThreadData->tsCtxSize); + if (tsCtx == nullptr) + { + gt_pTessellationThreadData->pTxCtx = _aligned_malloc(gt_pTessellationThreadData->tsCtxSize, 64); + tsCtx = TSInitCtx( + tsState.domain, + tsState.partitioning, + tsState.tsOutputTopology, + gt_pTessellationThreadData->pTxCtx, + gt_pTessellationThreadData->tsCtxSize); + } + SWR_ASSERT(tsCtx); + + PFN_PROCESS_PRIMS pfnClipFunc = nullptr; + if (HasRastT) + { + switch (tsState.postDSTopology) + { + case TOP_TRIANGLE_LIST: pfnClipFunc = ClipTriangles; break; + case TOP_LINE_LIST: pfnClipFunc = ClipLines; break; + case TOP_POINT_LIST: pfnClipFunc = ClipPoints; break; + default: SWR_ASSERT(false, "Unexpected DS output topology: %d", tsState.postDSTopology); + } + } + + SWR_HS_CONTEXT& hsContext = gt_pTessellationThreadData->hsContext; + hsContext.pCPout = gt_pTessellationThreadData->patchData; + hsContext.PrimitiveID = primID; + + uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, false); + // Max storage for one attribute for an entire simdprimitive + simdvector simdattrib[MAX_NUM_VERTS_PER_PRIM]; + + // assemble all attributes for the input primitives + for (uint32_t slot = 0; slot < tsState.numHsInputAttribs; ++slot) + { + uint32_t attribSlot = VERTEX_ATTRIB_START_SLOT + slot; + pa.Assemble(attribSlot, simdattrib); + + for (uint32_t i = 0; i < numVertsPerPrim; ++i) + { + hsContext.vert[i].attrib[attribSlot] = simdattrib[i]; + } + } + +#if defined(_DEBUG) + memset(hsContext.pCPout, 0x90, sizeof(ScalarPatch) * KNOB_SIMD_WIDTH); +#endif + + uint32_t numPrims = pa.NumPrims(); + hsContext.mask = GenerateMask(numPrims); + + // Run the HS + RDTSC_START(FEHullShader); + state.pfnHsFunc(GetPrivateState(pDC), &hsContext); + RDTSC_STOP(FEHullShader, 0, 0); + + UPDATE_STAT(HsInvocations, numPrims); + + const uint32_t* pPrimId = (const uint32_t*)&primID; + + for (uint32_t p = 0; p < numPrims; ++p) + { + // Run Tessellator + SWR_TS_TESSELLATED_DATA tsData = { 0 }; + RDTSC_START(FETessellation); + TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); + RDTSC_STOP(FETessellation, 0, 0); + + if (tsData.NumPrimitives == 0) + { + continue; + } + SWR_ASSERT(tsData.NumDomainPoints); + + // Allocate DS Output memory + uint32_t requiredDSVectorInvocations = AlignUp(tsData.NumDomainPoints, KNOB_SIMD_WIDTH) / KNOB_SIMD_WIDTH; + size_t requiredDSOutputVectors = requiredDSVectorInvocations * tsState.numDsOutputAttribs; + size_t requiredAllocSize = sizeof(simdvector) * requiredDSOutputVectors; + if (requiredDSOutputVectors > gt_pTessellationThreadData->numDSOutputVectors) + { + _aligned_free(gt_pTessellationThreadData->pDSOutput); + gt_pTessellationThreadData->pDSOutput = (simdscalar*)_aligned_malloc(requiredAllocSize, 64); + gt_pTessellationThreadData->numDSOutputVectors = requiredDSOutputVectors; + } + SWR_ASSERT(gt_pTessellationThreadData->pDSOutput); + 
SWR_ASSERT(gt_pTessellationThreadData->numDSOutputVectors >= requiredDSOutputVectors); + +#if defined(_DEBUG) + memset(gt_pTessellationThreadData->pDSOutput, 0x90, requiredAllocSize); +#endif + + // Run Domain Shader + SWR_DS_CONTEXT dsContext; + dsContext.PrimitiveID = pPrimId[p]; + dsContext.pCpIn = &hsContext.pCPout[p]; + dsContext.pDomainU = (simdscalar*)tsData.pDomainPointsU; + dsContext.pDomainV = (simdscalar*)tsData.pDomainPointsV; + dsContext.pOutputData = gt_pTessellationThreadData->pDSOutput; + dsContext.vectorStride = requiredDSVectorInvocations; + + uint32_t dsInvocations = 0; + + for (dsContext.vectorOffset = 0; dsContext.vectorOffset < requiredDSVectorInvocations; ++dsContext.vectorOffset) + { + dsContext.mask = GenerateMask(tsData.NumDomainPoints - dsInvocations); + + RDTSC_START(FEDomainShader); + state.pfnDsFunc(GetPrivateState(pDC), &dsContext); + RDTSC_STOP(FEDomainShader, 0, 0); + + dsInvocations += KNOB_SIMD_WIDTH; + } + UPDATE_STAT(DsInvocations, tsData.NumDomainPoints); + + PA_TESS tessPa( + pDC, + dsContext.pOutputData, + dsContext.vectorStride, + tsState.numDsOutputAttribs, + tsData.ppIndices, + tsData.NumPrimitives, + tsState.postDSTopology); + + while (tessPa.HasWork()) + { + if (HasGeometryShaderT) + { + GeometryShaderStage<HasStreamOutT, HasRastT>( + pDC, workerId, tessPa, pGsOut, pCutBuffer, pCutStreamBuffer, pSoPrimData, + _simd_set1_epi32(dsContext.PrimitiveID)); + } + else + { + if (HasStreamOutT) + { + StreamOut(pDC, tessPa, workerId, pSoPrimData, 0); + } + + if (HasRastT) + { + simdvector prim[3]; // Only deal with triangles, lines, or points + RDTSC_START(FEPAAssemble); +#if SWR_ENABLE_ASSERTS + bool assemble = +#endif + tessPa.Assemble(VERTEX_POSITION_SLOT, prim); + RDTSC_STOP(FEPAAssemble, 1, 0); + SWR_ASSERT(assemble); + + SWR_ASSERT(pfnClipFunc); + pfnClipFunc(pDC, tessPa, workerId, prim, + GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID)); + } + } + + tessPa.NextPrim(); + + } // while (tessPa.HasWork()) + } // for (uint32_t p = 0; p < numPrims; ++p) + + TSDestroyCtx(tsCtx); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief FE handler for SwrDraw. +/// @tparam IsIndexedT - Is indexed drawing enabled +/// @tparam HasTessellationT - Is tessellation enabled +/// @tparam HasGeometryShaderT - Is the geometry shader stage enabled +/// @tparam HasStreamOutT - Is stream-out enabled +/// @tparam HasRastT - Is rasterization enabled +/// @param pContext - pointer to SWR context. +/// @param pDC - pointer to draw context. +/// @param workerId - thread's worker id. 
+/// @param pUserData - Pointer to DRAW_WORK +template < + bool IsIndexedT, + bool HasTessellationT, + bool HasGeometryShaderT, + bool HasStreamOutT, + bool HasRastT> +void ProcessDraw( + SWR_CONTEXT *pContext, + DRAW_CONTEXT *pDC, + uint32_t workerId, + void *pUserData) +{ + +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_QUEUE_FE) + { + return; + } +#endif + + RDTSC_START(FEProcessDraw); + + DRAW_WORK& work = *(DRAW_WORK*)pUserData; + const API_STATE& state = GetApiState(pDC); + __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + SWR_VS_CONTEXT vsContext; + simdvertex vin; + + int indexSize = 0; + uint32_t endVertex = work.numVerts; + + const int32_t* pLastRequestedIndex = nullptr; + if (IsIndexedT) + { + switch (work.type) + { + case R32_UINT: + indexSize = sizeof(uint32_t); + pLastRequestedIndex = &(work.pIB[endVertex]); + break; + case R16_UINT: + indexSize = sizeof(uint16_t); + // nasty address offset to last index + pLastRequestedIndex = (int32_t*)(&(((uint16_t*)work.pIB)[endVertex])); + break; + case R8_UINT: + indexSize = sizeof(uint8_t); + // nasty address offset to last index + pLastRequestedIndex = (int32_t*)(&(((uint8_t*)work.pIB)[endVertex])); + break; + default: + SWR_ASSERT(0); + } + } + else + { + // No cuts, prune partial primitives. + endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts)); + } + + SWR_FETCH_CONTEXT fetchInfo = { 0 }; + fetchInfo.pStreams = &state.vertexBuffers[0]; + fetchInfo.StartInstance = work.startInstance; + fetchInfo.StartVertex = 0; + + vsContext.pVin = &vin; + + if (IsIndexedT) + { + fetchInfo.BaseVertex = work.baseVertex; + + // if the entire index buffer isn't being consumed, set the last index + // so that fetches < a SIMD wide will be masked off + fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); + if (pLastRequestedIndex < fetchInfo.pLastIndex) + { + fetchInfo.pLastIndex = pLastRequestedIndex; + } + } + else + { + fetchInfo.StartVertex = work.startVertex; + } + +#ifdef KNOB_ENABLE_RDTSC + uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); +#endif + + void* pGsOut = nullptr; + void* pCutBuffer = nullptr; + void* pStreamCutBuffer = nullptr; + if (HasGeometryShaderT) + { + AllocateGsBuffers(pDC, state, &pGsOut, &pCutBuffer, &pStreamCutBuffer); + } + + if (HasTessellationT) + { + SWR_ASSERT(state.tsState.tsEnable == true); + SWR_ASSERT(state.pfnHsFunc != nullptr); + SWR_ASSERT(state.pfnDsFunc != nullptr); + + AllocateTessellationData(pContext); + } + else + { + SWR_ASSERT(state.tsState.tsEnable == false); + SWR_ASSERT(state.pfnHsFunc == nullptr); + SWR_ASSERT(state.pfnDsFunc == nullptr); + } + + // allocate space for streamout input prim data + uint32_t* pSoPrimData = nullptr; + if (HasStreamOutT) + { + pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); + + // update the + for (uint32_t i = 0; i < 4; ++i) + { + SET_STAT(SoWriteOffset[i], state.soBuffer[i].streamOffset); + } + + } + + // choose primitive assembler + PA_FACTORY<IsIndexedT> paFactory(pDC, state.topology, work.numVerts); + PA_STATE& pa = paFactory.GetPA(); + + /// @todo: temporarily move instance loop in the FE to ensure SO ordering + for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) + { + simdscalari vIndex; + uint32_t i = 0; + + if (IsIndexedT) + { + fetchInfo.pIndices = work.pIB; + } + else + { + vIndex = _simd_add_epi32(_simd_set1_epi32(work.startVertexID), vScale); + fetchInfo.pIndices = (const int32_t*)&vIndex; + } + + 
fetchInfo.CurInstance = instanceNum; + vsContext.InstanceID = instanceNum; + + while (pa.HasWork()) + { + // PaGetNextVsOutput currently has the side effect of updating some PA state machine state. + // So we need to keep this outside of (i < endVertex) check. + simdmask* pvCutIndices = nullptr; + if (IsIndexedT) + { + pvCutIndices = &pa.GetNextVsIndices(); + } + + simdvertex& vout = pa.GetNextVsOutput(); + vsContext.pVout = &vout; + + if (i < endVertex) + { + + // 1. Execute FS/VS for a single SIMD. + RDTSC_START(FEFetchShader); + state.pfnFetchFunc(fetchInfo, vin); + RDTSC_STOP(FEFetchShader, 0, 0); + + // forward fetch generated vertex IDs to the vertex shader + vsContext.VertexID = fetchInfo.VertexID; + + // Setup active mask for vertex shader. + vsContext.mask = GenerateMask(endVertex - i); + + // forward cut mask to the PA + if (IsIndexedT) + { + *pvCutIndices = _simd_movemask_ps(_simd_castsi_ps(fetchInfo.CutMask)); + } + + UPDATE_STAT(IaVertices, GetNumInvocations(i, endVertex)); + +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_FETCH) +#endif + { + RDTSC_START(FEVertexShader); + state.pfnVertexFunc(GetPrivateState(pDC), &vsContext); + RDTSC_STOP(FEVertexShader, 0, 0); + + UPDATE_STAT(VsInvocations, GetNumInvocations(i, endVertex)); + } + } + + // 2. Assemble primitives given the last two SIMD. + do + { + simdvector prim[MAX_NUM_VERTS_PER_PRIM]; + // PaAssemble returns false if there is not enough verts to assemble. + RDTSC_START(FEPAAssemble); + bool assemble = pa.Assemble(VERTEX_POSITION_SLOT, prim); + RDTSC_STOP(FEPAAssemble, 1, 0); + +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_FETCH) +#endif + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_VS) +#endif + { + if (assemble) + { + UPDATE_STAT(IaPrimitives, pa.NumPrims()); + + if (HasTessellationT) + { + TessellationStages<HasGeometryShaderT, HasStreamOutT, HasRastT>( + pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); + } + else if (HasGeometryShaderT) + { + GeometryShaderStage<HasStreamOutT, HasRastT>( + pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); + } + else + { + // If streamout is enabled then stream vertices out to memory. 
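+                            // (Position is all that was assembled above; StreamOut
+                            // re-assembles each SO-enabled attribute slot itself via
+                            // AssembleSingle.)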
+ if (HasStreamOutT) + { + StreamOut(pDC, pa, workerId, pSoPrimData, 0); + } + + if (HasRastT) + { + SWR_ASSERT(pDC->pState->pfnProcessPrims); + pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, + GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID)); + } + } + } + } + } + } while (pa.NextPrim()); + + i += KNOB_SIMD_WIDTH; + if (IsIndexedT) + { + fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); + } + else + { + vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); + } + } + pa.Reset(); + } + + RDTSC_STOP(FEProcessDraw, numPrims * work.numInstances, pDC->drawId); +} +// Explicit Instantiation of all combinations +template void ProcessDraw<false, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, false, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<false, true, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, false, false>(SWR_CONTEXT *pContext, 
DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, false, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, false, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, false, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, false, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, true, false>(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +template void ProcessDraw<true, true, true, true, true >(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); + + +////////////////////////////////////////////////////////////////////////// +/// @brief Processes attributes for the backend based on linkage mask and +/// linkage map. Essentially just doing an SOA->AOS conversion and pack. +/// @param pDC - Draw context +/// @param pa - Primitive Assembly state +/// @param linkageMask - Specifies which VS outputs are routed to PS. +/// @param pLinkageMap - maps VS attribute slot to PS slot +/// @param triIndex - Triangle to process attributes for +/// @param pBuffer - Output result +template<uint32_t NumVerts> +INLINE void ProcessAttributes( + DRAW_CONTEXT *pDC, + PA_STATE&pa, + uint32_t linkageMask, + const uint8_t* pLinkageMap, + uint32_t triIndex, + float *pBuffer) +{ + DWORD slot = 0; + uint32_t mapIdx = 0; + LONG constantInterpMask = pDC->pState->state.backendState.constantInterpolationMask; + const uint32_t provokingVertex = pDC->pState->state.frontendState.topologyProvokingVertex; + + while (_BitScanForward(&slot, linkageMask)) + { + linkageMask &= ~(1 << slot); // done with this bit. + + // compute absolute slot in vertex attrib array + uint32_t inputSlot = VERTEX_ATTRIB_START_SLOT + pLinkageMap[mapIdx]; + + __m128 attrib[3]; // triangle attribs (always 4 wide) + pa.AssembleSingle(inputSlot, triIndex, attrib); + + if (_bittest(&constantInterpMask, mapIdx)) + { + for (uint32_t i = 0; i < NumVerts; ++i) + { + _mm_store_ps(pBuffer, attrib[provokingVertex]); + pBuffer += 4; + } + } + else + { + for (uint32_t i = 0; i < NumVerts; ++i) + { + _mm_store_ps(pBuffer, attrib[i]); + pBuffer += 4; + } + } + + // pad out the attrib buffer to 3 verts to ensure the triangle + // interpolation code in the pixel shader works correctly for the + // 3 topologies - point, line, tri. This effectively zeros out the + // effect of the missing vertices in the triangle interpolation. 
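+        // e.g. for a line (NumVerts == 2) the buffer receives v0, v1, v1: the padded
+        // third slot holds a copy of the last real vertex's attribute.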
+        for (uint32_t i = NumVerts; i < 3; ++i)
+        {
+            _mm_store_ps(pBuffer, attrib[NumVerts - 1]);
+            pBuffer += 4;
+        }
+
+        mapIdx++;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Processes enabled user clip distances. Loads the active clip
+///        distances from the PA, sets up barycentric equations, and
+///        stores the results to the output buffer
+/// @param pa - Primitive Assembly state
+/// @param primIndex - primitive index to process
+/// @param clipDistMask - mask of enabled clip distances
+/// @param pUserClipBuffer - buffer to store results
+template<uint32_t NumVerts>
+void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask, float* pUserClipBuffer)
+{
+    DWORD clipDist;
+    while (_BitScanForward(&clipDist, clipDistMask))
+    {
+        clipDistMask &= ~(1 << clipDist);
+        uint32_t clipSlot = clipDist >> 2;
+        uint32_t clipComp = clipDist & 0x3;
+        uint32_t clipAttribSlot = clipSlot == 0 ?
+            VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
+
+        __m128 primClipDist[3];
+        pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
+
+        float vertClipDist[NumVerts];
+        for (uint32_t e = 0; e < NumVerts; ++e)
+        {
+            OSALIGNSIMD(float) aVertClipDist[4];
+            _mm_store_ps(aVertClipDist, primClipDist[e]);
+            vertClipDist[e] = aVertClipDist[clipComp];
+        }
+
+        // setup plane equations for barycentric interpolation in the backend
+        float baryCoeff[NumVerts];
+        for (uint32_t e = 0; e < NumVerts - 1; ++e)
+        {
+            baryCoeff[e] = vertClipDist[e] - vertClipDist[NumVerts - 1];
+        }
+        baryCoeff[NumVerts - 1] = vertClipDist[NumVerts - 1];
+
+        for (uint32_t e = 0; e < NumVerts; ++e)
+        {
+            *(pUserClipBuffer++) = baryCoeff[e];
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin triangle primitives to macro tiles. Performs setup, clipping,
+///        culling, viewport transform, etc.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param tri - Contains triangle position data for a SIMD's worth of triangles.
+/// @param triMask - Mask of valid triangle lanes.
+/// @param primID - Primitive ID for each triangle.
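+/// The flow below: perspective divide and viewport transform, conversion to x.8
+/// fixed point, triangle setup with zero-area and cull-mode culling, bounding box
+/// and scissor tests, then per-triangle enqueue to all covered macrotiles.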
+void BinTriangles( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector tri[3], + uint32_t triMask, + simdscalari primID) +{ + RDTSC_START(FEBinTriangles); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + + // Simple wireframe mode for debugging purposes only + + simdscalar vRecipW0 = _simd_set1_ps(1.0f); + simdscalar vRecipW1 = _simd_set1_ps(1.0f); + simdscalar vRecipW2 = _simd_set1_ps(1.0f); + + if (!feState.vpTransformDisable) + { + // perspective divide + vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), tri[0].w); + vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), tri[1].w); + vRecipW2 = _simd_div_ps(_simd_set1_ps(1.0f), tri[2].w); + + tri[0].v[0] = _simd_mul_ps(tri[0].v[0], vRecipW0); + tri[1].v[0] = _simd_mul_ps(tri[1].v[0], vRecipW1); + tri[2].v[0] = _simd_mul_ps(tri[2].v[0], vRecipW2); + + tri[0].v[1] = _simd_mul_ps(tri[0].v[1], vRecipW0); + tri[1].v[1] = _simd_mul_ps(tri[1].v[1], vRecipW1); + tri[2].v[1] = _simd_mul_ps(tri[2].v[1], vRecipW2); + + tri[0].v[2] = _simd_mul_ps(tri[0].v[2], vRecipW0); + tri[1].v[2] = _simd_mul_ps(tri[1].v[2], vRecipW1); + tri[2].v[2] = _simd_mul_ps(tri[2].v[2], vRecipW2); + + // viewport transform to screen coords + viewportTransform<3>(tri, state.vpMatrix[0]); + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + tri[0].x = _simd_add_ps(tri[0].x, offset); + tri[0].y = _simd_add_ps(tri[0].y, offset); + + tri[1].x = _simd_add_ps(tri[1].x, offset); + tri[1].y = _simd_add_ps(tri[1].y, offset); + + tri[2].x = _simd_add_ps(tri[2].x, offset); + tri[2].y = _simd_add_ps(tri[2].y, offset); + + // convert to fixed point + simdscalari vXi[3], vYi[3]; + vXi[0] = fpToFixedPointVertical(tri[0].x); + vYi[0] = fpToFixedPointVertical(tri[0].y); + vXi[1] = fpToFixedPointVertical(tri[1].x); + vYi[1] = fpToFixedPointVertical(tri[1].y); + vXi[2] = fpToFixedPointVertical(tri[2].x); + vYi[2] = fpToFixedPointVertical(tri[2].y); + + // triangle setup + simdscalari vAi[3], vBi[3]; + triangleSetupABIntVertical(vXi, vYi, vAi, vBi); + + // determinant + simdscalari vDet[2]; + calcDeterminantIntVertical(vAi, vBi, vDet); + + // cull zero area + int maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[0], _simd_setzero_si()))); + int maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpeq_epi64(vDet[1], _simd_setzero_si()))); + + int cullZeroAreaMask = maskLo | ((maskHi << KNOB_SIMD_WIDTH / 2)); + + uint32_t origTriMask = triMask; + triMask &= ~cullZeroAreaMask; + + // determine front winding tris + // CW +det + // CCW -det + maskLo = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[0], _simd_setzero_si()))); + maskHi = _simd_movemask_pd(_simd_castsi_pd(_simd_cmpgt_epi64(vDet[1], _simd_setzero_si()))); + int cwTriMask = maskLo | (maskHi << (KNOB_SIMD_WIDTH /2) ); + + uint32_t frontWindingTris; + if (rastState.frontWinding == SWR_FRONTWINDING_CW) + { + frontWindingTris = cwTriMask; + } + else + { + frontWindingTris = ~cwTriMask; + } + + // cull + uint32_t cullTris; + switch ((SWR_CULLMODE)rastState.cullMode) + { + case SWR_CULLMODE_BOTH: cullTris = 0xffffffff; break; + case SWR_CULLMODE_NONE: cullTris = 0x0; break; + case SWR_CULLMODE_FRONT: cullTris = frontWindingTris; break; + case SWR_CULLMODE_BACK: cullTris = ~frontWindingTris; break; + default: SWR_ASSERT(false, "Invalid cull mode: %d", rastState.cullMode); cullTris = 0x0; break; + } + + 
triMask &= ~cullTris; + + if (origTriMask ^ triMask) + { + RDTSC_EVENT(FECullZeroAreaAndBackface, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + + // compute per tri backface + uint32_t frontFaceMask = frontWindingTris; + + uint32_t *pPrimID = (uint32_t *)&primID; + DWORD triIndex = 0; + + if (!triMask) + { + goto endBinTriangles; + } + + // Calc bounding box of triangles + simdBBox bbox; + calcBoundingBoxIntVertical(vXi, vYi, bbox); + + // determine if triangle falls between pixel centers and discard + // only discard for non-MSAA case + // (left + 127) & ~255 + // (right + 128) & ~255 + + if(rastState.sampleCount == SWR_MULTISAMPLE_1X) + { + origTriMask = triMask; + + int cullCenterMask; + { + simdscalari left = _simd_add_epi32(bbox.left, _simd_set1_epi32(127)); + left = _simd_and_si(left, _simd_set1_epi32(~255)); + simdscalari right = _simd_add_epi32(bbox.right, _simd_set1_epi32(128)); + right = _simd_and_si(right, _simd_set1_epi32(~255)); + + simdscalari vMaskH = _simd_cmpeq_epi32(left, right); + + simdscalari top = _simd_add_epi32(bbox.top, _simd_set1_epi32(127)); + top = _simd_and_si(top, _simd_set1_epi32(~255)); + simdscalari bottom = _simd_add_epi32(bbox.bottom, _simd_set1_epi32(128)); + bottom = _simd_and_si(bottom, _simd_set1_epi32(~255)); + + simdscalari vMaskV = _simd_cmpeq_epi32(top, bottom); + vMaskV = _simd_or_si(vMaskH, vMaskV); + cullCenterMask = _simd_movemask_ps(_simd_castsi_ps(vMaskV)); + } + + triMask &= ~cullCenterMask; + + if(origTriMask ^ triMask) + { + RDTSC_EVENT(FECullBetweenCenters, _mm_popcnt_u32(origTriMask ^ triMask), 0); + } + } + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. + bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); + bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); + bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); + bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); + + // Cull tris completely outside scissor + { + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + triMask = triMask & ~maskOutsideScissor; + } + + if (!triMask) + { + goto endBinTriangles; + } + + // Convert triangle bbox to macrotile units. 
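+    // The bbox is still in x.8 fixed point; the shift strips the fractional bits and
+    // the macrotile dimension bits in one step (e.g. with a hypothetical 64-pixel
+    // macrotile, KNOB_MACROTILE_X_DIM_FIXED_SHIFT would be 6 + FIXED_POINT_SHIFT).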
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.left); + _simd_store_si((simdscalari*)aMTRight, bbox.right); + _simd_store_si((simdscalari*)aMTTop, bbox.top); + _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); + + // transpose verts needed for backend + /// @todo modify BE to take non-transformed verts + __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8]; + vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x); + vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y); + vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z); + vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vRecipW2); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[3]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii; + vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + // scan remaining valid triangles and bin each separately + while (_BitScanForward(&triIndex, triMask)) + { + uint32_t linkageCount = state.linkageCount; + uint32_t linkageMask = state.linkageMask; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = state.forceFront ? 
1 : ((frontFaceMask >> triIndex) & 1);
+        desc.triFlags.primID = pPrimID[triIndex];
+        desc.triFlags.renderTargetArrayIndex = aRTAI[triIndex];
+
+        if (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN)
+        {
+            work.pfnWork = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount];
+        }
+        else
+        {
+            // for center sample pattern, all samples are at pixel center; calculate coverage
+            // once at center and broadcast the results in the backend
+            work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
+        }
+
+        Arena* pArena = pDC->pArena;
+        SWR_ASSERT(pArena != nullptr);
+
+        // store active attribs
+        float *pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+        desc.pAttribs = pAttribs;
+        desc.numAttribs = linkageCount;
+        ProcessAttributes<3>(pDC, pa, linkageMask, state.linkageMap, triIndex, desc.pAttribs);
+
+        // store triangle vertex data
+        desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
+
+        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+
+        // store user clip distances
+        if (rastState.clipDistanceMask)
+        {
+            uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask);
+            desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 3 * sizeof(float));
+            ProcessUserClipDist<3>(pa, triIndex, rastState.clipDistanceMask, desc.pUserClipBuffer);
+        }
+
+        MacroTileMgr *pTileMgr = pDC->pTileMgr;
+        for (uint32_t y = aMTTop[triIndex]; y <= aMTBottom[triIndex]; ++y)
+        {
+            for (uint32_t x = aMTLeft[triIndex]; x <= aMTRight[triIndex]; ++x)
+            {
+#if KNOB_ENABLE_TOSS_POINTS
+                if (!KNOB_TOSS_SETUP_TRIS)
+#endif
+                {
+                    pTileMgr->enqueue(x, y, &work);
+                }
+            }
+        }
+
+        triMask &= ~(1 << triIndex);
+    }
+
+endBinTriangles:
+    RDTSC_STOP(FEBinTriangles, 1, 0);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Bin SIMD points to the backend.
+/// @param pDC - pointer to draw context.
+/// @param pa - The primitive assembly object.
+/// @param workerId - thread's worker id. Each thread has a unique id.
+/// @param prim - Contains point position data for a SIMD's worth of points.
+/// @param primMask - Mask of valid point lanes.
+/// @param primID - Primitive ID for each point.
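+/// Size-1 points (see CanUseSimplePoints) pack their raster-tile-relative x,y into
+/// the coverage mask and bin to a single macrotile via RasterizeSimplePoint; larger
+/// points bloat a bounding box by half the point size and go through RasterizeTriPoint.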
+void BinPoints( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[3], + uint32_t primMask, + simdscalari primID) +{ + RDTSC_START(FEBinPoints); + + simdvector& primVerts = prim[0]; + + const API_STATE& state = GetApiState(pDC); + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + const SWR_RASTSTATE& rastState = state.rastState; + + if (!feState.vpTransformDisable) + { + // perspective divide + simdscalar vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), primVerts.w); + primVerts.x = _simd_mul_ps(primVerts.x, vRecipW0); + primVerts.y = _simd_mul_ps(primVerts.y, vRecipW0); + primVerts.z = _simd_mul_ps(primVerts.z, vRecipW0); + + // viewport transform to screen coords + viewportTransform<1>(&primVerts, state.vpMatrix[0]); + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + primVerts.x = _simd_add_ps(primVerts.x, offset); + primVerts.y = _simd_add_ps(primVerts.y, offset); + + // convert to fixed point + simdscalari vXi, vYi; + vXi = fpToFixedPointVertical(primVerts.x); + vYi = fpToFixedPointVertical(primVerts.y); + + if (CanUseSimplePoints(pDC)) + { + // adjust for top-left rule + vXi = _simd_sub_epi32(vXi, _simd_set1_epi32(1)); + vYi = _simd_sub_epi32(vYi, _simd_set1_epi32(1)); + + // cull points off the top-left edge of the viewport + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vXi)); + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vYi)); + + // compute macro tile coordinates + simdscalari macroX = _simd_srai_epi32(vXi, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + simdscalari macroY = _simd_srai_epi32(vYi, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMacroX[KNOB_SIMD_WIDTH], aMacroY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMacroX, macroX); + _simd_store_si((simdscalari*)aMacroY, macroY); + + // compute raster tile coordinates + simdscalari rasterX = _simd_srai_epi32(vXi, KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + simdscalari rasterY = _simd_srai_epi32(vYi, KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + + // compute raster tile relative x,y for coverage mask + simdscalari tileAlignedX = _simd_slli_epi32(rasterX, KNOB_TILE_X_DIM_SHIFT); + simdscalari tileAlignedY = _simd_slli_epi32(rasterY, KNOB_TILE_Y_DIM_SHIFT); + + simdscalari tileRelativeX = _simd_sub_epi32(_simd_srai_epi32(vXi, FIXED_POINT_SHIFT), tileAlignedX); + simdscalari tileRelativeY = _simd_sub_epi32(_simd_srai_epi32(vYi, FIXED_POINT_SHIFT), tileAlignedY); + + OSALIGNSIMD(uint32_t) aTileRelativeX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileRelativeY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileRelativeX, tileRelativeX); + _simd_store_si((simdscalari*)aTileRelativeY, tileRelativeY); + + OSALIGNSIMD(uint32_t) aTileAlignedX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(uint32_t) aTileAlignedY[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aTileAlignedX, tileAlignedX); + _simd_store_si((simdscalari*)aTileAlignedY, tileAlignedY); + + OSALIGNSIMD(float) aZ[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aZ, primVerts.z); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai; + pa.Assemble(VERTEX_RTAI_SLOT, &vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai.x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + uint32_t *pPrimID = (uint32_t *)&primID; + DWORD primIndex = 0; + // scan 
remaining valid triangles and bin each separately + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = state.linkageCount; + uint32_t linkageMask = state.linkageMask; + + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + // points are always front facing + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + + work.pfnWork = RasterizeSimplePoint; + + Arena* pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store attributes + float *pAttribs = (float*)pArena->AllocAligned(3 * numScalarAttribs * sizeof(float), 16); + desc.pAttribs = pAttribs; + desc.numAttribs = linkageCount; + + ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, pAttribs); + + // store raster tile aligned x, y, perspective correct z + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *(uint32_t*)pTriBuffer++ = aTileAlignedX[primIndex]; + *(uint32_t*)pTriBuffer++ = aTileAlignedY[primIndex]; + *pTriBuffer = aZ[primIndex]; + + uint32_t tX = aTileRelativeX[primIndex]; + uint32_t tY = aTileRelativeY[primIndex]; + + // pack the relative x,y into the coverageMask, the rasterizer will + // generate the true coverage mask from it + work.desc.tri.triFlags.coverageMask = tX | (tY << 4); + + // bin it + MacroTileMgr *pTileMgr = pDC->pTileMgr; +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(aMacroX[primIndex], aMacroY[primIndex], &work); + } + primMask &= ~(1 << primIndex); + } + } + else + { + // non simple points need to be potentially binned to multiple macro tiles + simdscalar vPointSize; + if (rastState.pointParam) + { + simdvector size[3]; + pa.Assemble(VERTEX_POINT_SIZE_SLOT, size); + vPointSize = size[0].x; + } + else + { + vPointSize = _simd_set1_ps(rastState.pointSize); + } + + // bloat point to bbox + simdBBox bbox; + bbox.left = bbox.right = vXi; + bbox.top = bbox.bottom = vYi; + + simdscalar vHalfWidth = _simd_mul_ps(vPointSize, _simd_set1_ps(0.5f)); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + bbox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); + bbox.right = _simd_add_epi32(bbox.right, vHalfWidthi); + bbox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); + bbox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); + + // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive. + bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left)); + bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top)); + bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right)); + bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom)); + + // Cull bloated points completely outside scissor + simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right); + simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom); + simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY); + uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY)); + primMask = primMask & ~maskOutsideScissor; + + // Convert bbox to macrotile units. 
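+        // Unlike the simple-point path above, a bloated point's bbox can span several
+        // macrotiles; the prim is enqueued to every macrotile its bbox touches below.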
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT); + bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT); + + OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH]; + _simd_store_si((simdscalari*)aMTLeft, bbox.left); + _simd_store_si((simdscalari*)aMTRight, bbox.right); + _simd_store_si((simdscalari*)aMTTop, bbox.top); + _simd_store_si((simdscalari*)aMTBottom, bbox.bottom); + + // store render target array index + OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH]; + if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex) + { + simdvector vRtai[2]; + pa.Assemble(VERTEX_RTAI_SLOT, vRtai); + simdscalari vRtaii = _simd_castps_si(vRtai[0].x); + _simd_store_si((simdscalari*)aRTAI, vRtaii); + } + else + { + _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si()); + } + + OSALIGNSIMD(float) aPointSize[KNOB_SIMD_WIDTH]; + _simd_store_ps((float*)aPointSize, vPointSize); + + uint32_t *pPrimID = (uint32_t *)&primID; + + OSALIGNSIMD(float) aPrimVertsX[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsY[KNOB_SIMD_WIDTH]; + OSALIGNSIMD(float) aPrimVertsZ[KNOB_SIMD_WIDTH]; + + _simd_store_ps((float*)aPrimVertsX, primVerts.x); + _simd_store_ps((float*)aPrimVertsY, primVerts.y); + _simd_store_ps((float*)aPrimVertsZ, primVerts.z); + + // scan remaining valid prims and bin each separately + DWORD primIndex; + while (_BitScanForward(&primIndex, primMask)) + { + uint32_t linkageCount = state.linkageCount; + uint32_t linkageMask = state.linkageMask; + uint32_t numScalarAttribs = linkageCount * 4; + + BE_WORK work; + work.type = DRAW; + + TRIANGLE_WORK_DESC &desc = work.desc.tri; + + desc.triFlags.frontFacing = 1; + desc.triFlags.primID = pPrimID[primIndex]; + desc.triFlags.pointSize = aPointSize[primIndex]; + desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex]; + + work.pfnWork = RasterizeTriPoint; + + Arena* pArena = pDC->pArena; + SWR_ASSERT(pArena != nullptr); + + // store active attribs + desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16); + desc.numAttribs = linkageCount; + ProcessAttributes<1>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs); + + // store point vertex data + float *pTriBuffer = (float*)pArena->AllocAligned(4 * sizeof(float), 16); + desc.pTriBuffer = pTriBuffer; + *pTriBuffer++ = aPrimVertsX[primIndex]; + *pTriBuffer++ = aPrimVertsY[primIndex]; + *pTriBuffer = aPrimVertsZ[primIndex]; + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + } + + + + + RDTSC_STOP(FEBinPoints, 1, 0); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Bin SIMD lines to the backend. +/// @param pDC - pointer to draw context. 
+/// @param pa - The primitive assembly object. +/// @param workerId - thread's worker id. Even thread has a unique id. +/// @param tri - Contains line position data for SIMDs worth of points. +/// @param primID - Primitive ID for each line. +void BinLines( + DRAW_CONTEXT *pDC, + PA_STATE& pa, + uint32_t workerId, + simdvector prim[], + uint32_t primMask, + simdscalari primID) +{ + RDTSC_START(FEBinLines); + + const API_STATE& state = GetApiState(pDC); + const SWR_RASTSTATE& rastState = state.rastState; + const SWR_FRONTEND_STATE& feState = state.frontendState; + const SWR_GS_STATE& gsState = state.gsState; + + simdscalar vRecipW0 = _simd_set1_ps(1.0f); + simdscalar vRecipW1 = _simd_set1_ps(1.0f); + + if (!feState.vpTransformDisable) + { + // perspective divide + vRecipW0 = _simd_div_ps(_simd_set1_ps(1.0f), prim[0].w); + vRecipW1 = _simd_div_ps(_simd_set1_ps(1.0f), prim[1].w); + + prim[0].v[0] = _simd_mul_ps(prim[0].v[0], vRecipW0); + prim[1].v[0] = _simd_mul_ps(prim[1].v[0], vRecipW1); + + prim[0].v[1] = _simd_mul_ps(prim[0].v[1], vRecipW0); + prim[1].v[1] = _simd_mul_ps(prim[1].v[1], vRecipW1); + + prim[0].v[2] = _simd_mul_ps(prim[0].v[2], vRecipW0); + prim[1].v[2] = _simd_mul_ps(prim[1].v[2], vRecipW1); + + // viewport transform to screen coords + viewportTransform<2>(prim, state.vpMatrix[0]); + } + + // adjust for pixel center location + simdscalar offset = g_pixelOffsets[rastState.pixelLocation]; + prim[0].x = _simd_add_ps(prim[0].x, offset); + prim[0].y = _simd_add_ps(prim[0].y, offset); + + prim[1].x = _simd_add_ps(prim[1].x, offset); + prim[1].y = _simd_add_ps(prim[1].y, offset); + + // convert to fixed point + simdscalari vXi[2], vYi[2]; + vXi[0] = fpToFixedPointVertical(prim[0].x); + vYi[0] = fpToFixedPointVertical(prim[0].y); + vXi[1] = fpToFixedPointVertical(prim[1].x); + vYi[1] = fpToFixedPointVertical(prim[1].y); + + // compute x-major vs y-major mask + simdscalari xLength = _simd_abs_epi32(_simd_sub_epi32(vXi[0], vXi[1])); + simdscalari yLength = _simd_abs_epi32(_simd_sub_epi32(vYi[0], vYi[1])); + simdscalar vYmajorMask = _simd_castsi_ps(_simd_cmpgt_epi32(yLength, xLength)); + uint32_t yMajorMask = _simd_movemask_ps(vYmajorMask); + + // cull zero-length lines + simdscalari vZeroLengthMask = _simd_cmpeq_epi32(xLength, _simd_setzero_si()); + vZeroLengthMask = _simd_and_si(vZeroLengthMask, _simd_cmpeq_epi32(yLength, _simd_setzero_si())); + + primMask &= ~_simd_movemask_ps(_simd_castsi_ps(vZeroLengthMask)); + + uint32_t *pPrimID = (uint32_t *)&primID; + + simdscalar vUnused = _simd_setzero_ps(); + + // Calc bounding box of lines + simdBBox bbox; + bbox.left = _simd_min_epi32(vXi[0], vXi[1]); + bbox.right = _simd_max_epi32(vXi[0], vXi[1]); + bbox.top = _simd_min_epi32(vYi[0], vYi[1]); + bbox.bottom = _simd_max_epi32(vYi[0], vYi[1]); + + // bloat bbox by line width along minor axis + simdscalar vHalfWidth = _simd_set1_ps(rastState.lineWidth / 2.0f); + simdscalari vHalfWidthi = fpToFixedPointVertical(vHalfWidth); + simdBBox bloatBox; + bloatBox.left = _simd_sub_epi32(bbox.left, vHalfWidthi); + bloatBox.right = _simd_add_epi32(bbox.right, vHalfWidthi); + bloatBox.top = _simd_sub_epi32(bbox.top, vHalfWidthi); + bloatBox.bottom = _simd_add_epi32(bbox.bottom, vHalfWidthi); + + bbox.left = _simd_blendv_epi32(bbox.left, bloatBox.left, vYmajorMask); + bbox.right = _simd_blendv_epi32(bbox.right, bloatBox.right, vYmajorMask); + bbox.top = _simd_blendv_epi32(bloatBox.top, bbox.top, vYmajorMask); + bbox.bottom = _simd_blendv_epi32(bloatBox.bottom, bbox.bottom, vYmajorMask); + + // Intersect 
+ // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since right/bottom edge is exclusive.
+ bbox.left = _simd_max_epi32(bbox.left, _simd_set1_epi32(state.scissorInFixedPoint.left));
+ bbox.top = _simd_max_epi32(bbox.top, _simd_set1_epi32(state.scissorInFixedPoint.top));
+ bbox.right = _simd_min_epi32(_simd_sub_epi32(bbox.right, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.right));
+ bbox.bottom = _simd_min_epi32(_simd_sub_epi32(bbox.bottom, _simd_set1_epi32(1)), _simd_set1_epi32(state.scissorInFixedPoint.bottom));
+
+ // Cull prims completely outside scissor
+ {
+ simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.left, bbox.right);
+ simdscalari maskOutsideScissorY = _simd_cmpgt_epi32(bbox.top, bbox.bottom);
+ simdscalari maskOutsideScissorXY = _simd_or_si(maskOutsideScissorX, maskOutsideScissorY);
+ uint32_t maskOutsideScissor = _simd_movemask_ps(_simd_castsi_ps(maskOutsideScissorXY));
+ primMask = primMask & ~maskOutsideScissor;
+ }
+
+ if (!primMask)
+ {
+ goto endBinLines;
+ }
+
+ // Convert line bbox to macrotile units.
+ bbox.left = _simd_srai_epi32(bbox.left, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.top = _simd_srai_epi32(bbox.top, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+ bbox.right = _simd_srai_epi32(bbox.right, KNOB_MACROTILE_X_DIM_FIXED_SHIFT);
+ bbox.bottom = _simd_srai_epi32(bbox.bottom, KNOB_MACROTILE_Y_DIM_FIXED_SHIFT);
+
+ OSALIGNSIMD(uint32_t) aMTLeft[KNOB_SIMD_WIDTH], aMTRight[KNOB_SIMD_WIDTH], aMTTop[KNOB_SIMD_WIDTH], aMTBottom[KNOB_SIMD_WIDTH];
+ _simd_store_si((simdscalari*)aMTLeft, bbox.left);
+ _simd_store_si((simdscalari*)aMTRight, bbox.right);
+ _simd_store_si((simdscalari*)aMTTop, bbox.top);
+ _simd_store_si((simdscalari*)aMTBottom, bbox.bottom);
+
+ // transpose verts needed for backend
+ /// @todo modify BE to take non-transformed verts
+ __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+ vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
+ vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
+ vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
+ vTranspose3x8(vHorizW, vRecipW0, vRecipW1, vUnused);
+
+ // store render target array index
+ OSALIGNSIMD(uint32_t) aRTAI[KNOB_SIMD_WIDTH];
+ if (gsState.gsEnable && gsState.emitsRenderTargetArrayIndex)
+ {
+ simdvector vRtai[2];
+ pa.Assemble(VERTEX_RTAI_SLOT, vRtai);
+ simdscalari vRtaii = _simd_castps_si(vRtai[0].x);
+ _simd_store_si((simdscalari*)aRTAI, vRtaii);
+ }
+ else
+ {
+ _simd_store_si((simdscalari*)aRTAI, _simd_setzero_si());
+ }
+
+ // scan remaining valid prims and bin each separately
+ DWORD primIndex;
+ while (_BitScanForward(&primIndex, primMask))
+ {
+ uint32_t linkageCount = state.linkageCount;
+ uint32_t linkageMask = state.linkageMask;
+ uint32_t numScalarAttribs = linkageCount * 4;
+
+ BE_WORK work;
+ work.type = DRAW;
+
+ TRIANGLE_WORK_DESC &desc = work.desc.tri;
+
+ desc.triFlags.frontFacing = 1;
+ desc.triFlags.primID = pPrimID[primIndex];
+ desc.triFlags.yMajor = (yMajorMask >> primIndex) & 1;
+ desc.triFlags.renderTargetArrayIndex = aRTAI[primIndex];
+
+ work.pfnWork = RasterizeLine;
+
+ Arena* pArena = pDC->pArena;
+ SWR_ASSERT(pArena != nullptr);
+
+ // store active attribs
+ desc.pAttribs = (float*)pArena->AllocAligned(numScalarAttribs * 3 * sizeof(float), 16);
+ desc.numAttribs = linkageCount;
+ ProcessAttributes<2>(pDC, pa, linkageMask, state.linkageMap, primIndex, desc.pAttribs);
+
+ // store line vertex data
+ desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
+ _mm_store_ps(&desc.pTriBuffer[0], 
vHorizX[primIndex]); + _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]); + _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]); + _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]); + + // store user clip distances + if (rastState.clipDistanceMask) + { + uint32_t numClipDist = _mm_popcnt_u32(rastState.clipDistanceMask); + desc.pUserClipBuffer = (float*)pArena->Alloc(numClipDist * 2 * sizeof(float)); + ProcessUserClipDist<2>(pa, primIndex, rastState.clipDistanceMask, desc.pUserClipBuffer); + } + + MacroTileMgr *pTileMgr = pDC->pTileMgr; + for (uint32_t y = aMTTop[primIndex]; y <= aMTBottom[primIndex]; ++y) + { + for (uint32_t x = aMTLeft[primIndex]; x <= aMTRight[primIndex]; ++x) + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_SETUP_TRIS) +#endif + { + pTileMgr->enqueue(x, y, &work); + } + } + } + + primMask &= ~(1 << primIndex); + } + +endBinLines: + + RDTSC_STOP(FEBinLines, 1, 0); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h new file mode 100644 index 00000000000..acb935fc251 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -0,0 +1,327 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file frontend.h +* +* @brief Definitions for Frontend which handles vertex processing, +* primitive assembly, clipping, binning, etc. 
+* +******************************************************************************/ +#pragma once +#include "context.h" + +INLINE +__m128i fpToFixedPoint(const __m128 vIn) +{ + __m128 vFixed = _mm_mul_ps(vIn, _mm_set1_ps(FIXED_POINT_SCALE)); + return _mm_cvtps_epi32(vFixed); +} + +INLINE +simdscalari fpToFixedPointVertical(const simdscalar vIn) +{ + simdscalar vFixed = _simd_mul_ps(vIn, _simd_set1_ps(FIXED_POINT_SCALE)); + return _simd_cvtps_epi32(vFixed); +} + + +// Calculates the A and B coefficients for the 3 edges of the triangle +// +// maths for edge equations: +// standard form of a line in 2d +// Ax + By + C = 0 +// A = y0 - y1 +// B = x1 - x0 +// C = x0y1 - x1y0 +INLINE +void triangleSetupAB(const __m128 vX, const __m128 vY, __m128 & vA, __m128 & vB) +{ + // vYsub = y1 y2 y0 dc + __m128 vYsub = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(3, 0, 2, 1)); + // vY = y0 y1 y2 dc + vA = _mm_sub_ps(vY, vYsub); + + // Result: + // A[0] = y0 - y1 + // A[1] = y1 - y2 + // A[2] = y2 - y0 + + // vXsub = x1 x2 x0 dc + __m128 vXsub = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(3, 0, 2, 1)); + // vX = x0 x1 x2 dc + vB = _mm_sub_ps(vXsub, vX); + + // Result: + // B[0] = x1 - x0 + // B[1] = x2 - x1 + // B[2] = x0 - x2 +} + +INLINE +void triangleSetupABVertical(const simdscalar vX[3], const simdscalar vY[3], simdscalar (&vA)[3], simdscalar (&vB)[3]) +{ + // generate edge equations + // A = y0 - y1 + // B = x1 - x0 + vA[0] = _simd_sub_ps(vY[0], vY[1]); + vA[1] = _simd_sub_ps(vY[1], vY[2]); + vA[2] = _simd_sub_ps(vY[2], vY[0]); + + vB[0] = _simd_sub_ps(vX[1], vX[0]); + vB[1] = _simd_sub_ps(vX[2], vX[1]); + vB[2] = _simd_sub_ps(vX[0], vX[2]); +} + +INLINE +void triangleSetupABInt(const __m128i vX, const __m128i vY, __m128i & vA, __m128i & vB) +{ + // generate edge equations + // A = y0 - y1 + // B = x1 - x0 + // C = x0y1 - x1y0 + __m128i vYsub = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 2, 1)); + vA = _mm_sub_epi32(vY, vYsub); + + __m128i vXsub = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 2, 1)); + vB = _mm_sub_epi32(vXsub, vX); +} + +INLINE +void triangleSetupABIntVertical(const simdscalari vX[3], const simdscalari vY[3], simdscalari (&vA)[3], simdscalari (&vB)[3]) +{ + // A = y0 - y1 + // B = x1 - x0 + vA[0] = _simd_sub_epi32(vY[0], vY[1]); + vA[1] = _simd_sub_epi32(vY[1], vY[2]); + vA[2] = _simd_sub_epi32(vY[2], vY[0]); + + vB[0] = _simd_sub_epi32(vX[1], vX[0]); + vB[1] = _simd_sub_epi32(vX[2], vX[1]); + vB[2] = _simd_sub_epi32(vX[0], vX[2]); +} +// Calculate the determinant of the triangle +// 2 vectors between the 3 points: P, Q +// Px = x0-x2, Py = y0-y2 +// Qx = x1-x2, Qy = y1-y2 +// |Px Qx| +// det = | | = PxQy - PyQx +// |Py Qy| +// simplifies to : (x0-x2)*(y1-y2) - (y0-y2)*(x1-x2) +// try to reuse our A & B coef's already calculated. 
factor out a -1 from Py and Qx +// : B[2]*A[1] - (-(y2-y0))*(-(x2-x1)) +// : B[2]*A[1] - (-1)(-1)(y2-y0)*(x2-x1) +// : B[2]*A[1] - A[2]*B[1] +INLINE +float calcDeterminantInt(const __m128i vA, const __m128i vB) +{ + // vAShuf = [A1, A0, A2, A0] + __m128i vAShuf = _mm_shuffle_epi32(vA, _MM_SHUFFLE(0, 2, 0, 1)); + // vBShuf = [B2, B0, B1, B0] + __m128i vBShuf = _mm_shuffle_epi32(vB, _MM_SHUFFLE(0, 1, 0, 2)); + // vMul = [A1*B2, B1*A2] + __m128i vMul = _mm_mul_epi32(vAShuf, vBShuf); + + // shuffle upper to lower + // vMul2 = [B1*A2, B1*A2] + __m128i vMul2 = _mm_shuffle_epi32(vMul, _MM_SHUFFLE(3, 2, 3, 2)); + //vMul = [A1*B2 - B1*A2] + vMul = _mm_sub_epi64(vMul, vMul2); + + // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned + OSALIGN(int64_t, 16) result; + _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul)); + + double fResult = (double)result; + fResult = fResult * (1.0 / FIXED_POINT16_SCALE); + + return (float)fResult; +} + +INLINE +void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3], simdscalari *pvDet) +{ + // refer to calcDeterminantInt comment for calculation explanation + // A1*B2 + simdscalari vA1Lo = _simd_unpacklo_epi32(vA[1], vA[1]); // 0 0 1 1 4 4 5 5 + simdscalari vA1Hi = _simd_unpackhi_epi32(vA[1], vA[1]); // 2 2 3 3 6 6 7 7 + + simdscalari vB2Lo = _simd_unpacklo_epi32(vB[2], vB[2]); + simdscalari vB2Hi = _simd_unpackhi_epi32(vB[2], vB[2]); + + simdscalari vA1B2Lo = _simd_mul_epi32(vA1Lo, vB2Lo); // 0 1 4 5 + simdscalari vA1B2Hi = _simd_mul_epi32(vA1Hi, vB2Hi); // 2 3 6 7 + + // B1*A2 + simdscalari vA2Lo = _simd_unpacklo_epi32(vA[2], vA[2]); + simdscalari vA2Hi = _simd_unpackhi_epi32(vA[2], vA[2]); + + simdscalari vB1Lo = _simd_unpacklo_epi32(vB[1], vB[1]); + simdscalari vB1Hi = _simd_unpackhi_epi32(vB[1], vB[1]); + + simdscalari vA2B1Lo = _simd_mul_epi32(vA2Lo, vB1Lo); + simdscalari vA2B1Hi = _simd_mul_epi32(vA2Hi, vB1Hi); + + // A1*B2 - A2*B1 + simdscalari detLo = _simd_sub_epi64(vA1B2Lo, vA2B1Lo); + simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); + + // shuffle 0 1 4 5 -> 0 1 2 3 + simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20); + simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31); + + pvDet[0] = vResultLo; + pvDet[1] = vResultHi; +} + +INLINE +void triangleSetupC(const __m128 vX, const __m128 vY, const __m128 vA, const __m128 &vB, __m128 &vC) +{ + // C = -Ax - By + vC = _mm_mul_ps(vA, vX); + __m128 vCy = _mm_mul_ps(vB, vY); + vC = _mm_mul_ps(vC, _mm_set1_ps(-1.0f)); + vC = _mm_sub_ps(vC, vCy); +} + +INLINE +void viewportTransform(__m128 &vX, __m128 &vY, __m128 &vZ, const SWR_VIEWPORT_MATRIX &vpMatrix) +{ + vX = _mm_mul_ps(vX, _mm_set1_ps(vpMatrix.m00)); + vX = _mm_add_ps(vX, _mm_set1_ps(vpMatrix.m30)); + + vY = _mm_mul_ps(vY, _mm_set1_ps(vpMatrix.m11)); + vY = _mm_add_ps(vY, _mm_set1_ps(vpMatrix.m31)); + + vZ = _mm_mul_ps(vZ, _mm_set1_ps(vpMatrix.m22)); + vZ = _mm_add_ps(vZ, _mm_set1_ps(vpMatrix.m32)); +} + +template<uint32_t NumVerts> +INLINE +void viewportTransform(simdvector *v, const SWR_VIEWPORT_MATRIX & vpMatrix) +{ + simdscalar m00 = _simd_load1_ps(&vpMatrix.m00); + simdscalar m30 = _simd_load1_ps(&vpMatrix.m30); + simdscalar m11 = _simd_load1_ps(&vpMatrix.m11); + simdscalar m31 = _simd_load1_ps(&vpMatrix.m31); + simdscalar m22 = _simd_load1_ps(&vpMatrix.m22); + simdscalar m32 = _simd_load1_ps(&vpMatrix.m32); + + for (uint32_t i = 0; i < NumVerts; ++i) + { + v[i].x = _simd_fmadd_ps(v[i].x, m00, m30); + v[i].y = _simd_fmadd_ps(v[i].y, m11, m31); + v[i].z = 
_simd_fmadd_ps(v[i].z, m22, m32); + } +} + +INLINE +void calcBoundingBoxInt(const __m128i &vX, const __m128i &vY, BBOX &bbox) +{ + // Need horizontal fp min here + __m128i vX1 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 2, 0, 1)); + __m128i vX2 = _mm_shuffle_epi32(vX, _MM_SHUFFLE(3, 0, 1, 2)); + + __m128i vY1 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 2, 0, 1)); + __m128i vY2 = _mm_shuffle_epi32(vY, _MM_SHUFFLE(3, 0, 1, 2)); + + + __m128i vMinX = _mm_min_epi32(vX, vX1); + vMinX = _mm_min_epi32(vMinX, vX2); + + __m128i vMaxX = _mm_max_epi32(vX, vX1); + vMaxX = _mm_max_epi32(vMaxX, vX2); + + __m128i vMinY = _mm_min_epi32(vY, vY1); + vMinY = _mm_min_epi32(vMinY, vY2); + + __m128i vMaxY = _mm_max_epi32(vY, vY1); + vMaxY = _mm_max_epi32(vMaxY, vY2); + + bbox.left = _mm_extract_epi32(vMinX, 0); + bbox.right = _mm_extract_epi32(vMaxX, 0); + bbox.top = _mm_extract_epi32(vMinY, 0); + bbox.bottom = _mm_extract_epi32(vMaxY, 0); + +#if 0 + Jacob: A = _mm_shuffle_ps(X, Y, 0 0 0 0) +B = _mm_shuffle_ps(Z, W, 0 0 0 0) +A = _mm_shuffle_epi32(A, 3 0 3 0) +A = _mm_shuffle_ps(A, B, 1 0 1 0) +#endif + +} + +INLINE +void calcBoundingBoxIntVertical(const simdscalari (&vX)[3], const simdscalari (&vY)[3], simdBBox &bbox) +{ + simdscalari vMinX = vX[0]; + vMinX = _simd_min_epi32(vMinX, vX[1]); + vMinX = _simd_min_epi32(vMinX, vX[2]); + + simdscalari vMaxX = vX[0]; + vMaxX = _simd_max_epi32(vMaxX, vX[1]); + vMaxX = _simd_max_epi32(vMaxX, vX[2]); + + simdscalari vMinY = vY[0]; + vMinY = _simd_min_epi32(vMinY, vY[1]); + vMinY = _simd_min_epi32(vMinY, vY[2]); + + simdscalari vMaxY = vY[0]; + vMaxY = _simd_max_epi32(vMaxY, vY[1]); + vMaxY = _simd_max_epi32(vMaxY, vY[2]); + + bbox.left = vMinX; + bbox.right = vMaxX; + bbox.top = vMinY; + bbox.bottom = vMaxY; +} + +INLINE +bool CanUseSimplePoints(DRAW_CONTEXT *pDC) +{ + const API_STATE& state = GetApiState(pDC); + + return (state.rastState.sampleCount == SWR_MULTISAMPLE_1X && + state.rastState.pointSize == 1.0f && + !state.rastState.pointParam && + !state.rastState.pointSpriteEnable); +} + +uint32_t GetNumPrims(PRIMITIVE_TOPOLOGY mode, uint32_t numElements); +uint32_t NumVertsPerPrim(PRIMITIVE_TOPOLOGY topology, bool includeAdjVerts); + +// Templated Draw front-end function. 
All combinations of template parameter values are available +template <bool IsIndexedT, bool HasTessellationT, bool HasGeometryShaderT, bool HasStreamOutT, bool HasRastT> +void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); + +void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); + +struct PA_STATE_BASE; // forward decl +void BinTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector tri[3], uint32_t primMask, simdscalari primID); +void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); +void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID); + diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h new file mode 100644 index 00000000000..d7feb86273d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -0,0 +1,142 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file knobs.h +* +* @brief Static (Compile-Time) Knobs for Core. 
+* +******************************************************************************/ +#pragma once + +#include <stdint.h> +#include <gen_knobs.h> + +#define KNOB_ARCH_AVX 0 +#define KNOB_ARCH_AVX2 1 +#define KNOB_ARCH_AVX512 2 + +/////////////////////////////////////////////////////////////////////////////// +// Architecture validation +/////////////////////////////////////////////////////////////////////////////// +#if !defined(KNOB_ARCH) +#define KNOB_ARCH KNOB_ARCH_AVX +#endif + +#if (KNOB_ARCH == KNOB_ARCH_AVX) +#define KNOB_ARCH_ISA AVX +#define KNOB_ARCH_STR "AVX" +#define KNOB_SIMD_WIDTH 8 +#elif (KNOB_ARCH == KNOB_ARCH_AVX2) +#define KNOB_ARCH_ISA AVX2 +#define KNOB_ARCH_STR "AVX2" +#define KNOB_SIMD_WIDTH 8 +#elif (KNOB_ARCH == KNOB_ARCH_AVX512) +#define KNOB_ARCH_ISA AVX512F +#define KNOB_ARCH_STR "AVX512" +#define KNOB_SIMD_WIDTH 16 +#error "AVX512 not yet supported" +#else +#error "Unknown architecture" +#endif + +#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") + +/////////////////////////////////////////////////////////////////////////////// +// Configuration knobs +/////////////////////////////////////////////////////////////////////////////// +#define KNOB_MAX_NUM_THREADS 256 // Supports up to dual-HSW-Xeon. + +// Maximum supported number of active vertex buffer streams +#define KNOB_NUM_STREAMS 32 + +// Maximum supported number of attributes per vertex +#define KNOB_NUM_ATTRIBUTES 38 + +// Maximum supported active viewports and scissors +#define KNOB_NUM_VIEWPORTS_SCISSORS 16 + +// Guardband range used by the clipper +#define KNOB_GUARDBAND_WIDTH 32768.0f +#define KNOB_GUARDBAND_HEIGHT 32768.0f + +/////////////////////////////// +// Macro tile configuration +/////////////////////////////// + +// raster tile dimensions +#define KNOB_TILE_X_DIM 8 +#define KNOB_TILE_X_DIM_SHIFT 3 +#define KNOB_TILE_Y_DIM 8 +#define KNOB_TILE_Y_DIM_SHIFT 3 + +// fixed macrotile pixel dimension for now, eventually will be +// dynamically set based on tile format and pixel size +#define KNOB_MACROTILE_X_DIM 64 +#define KNOB_MACROTILE_Y_DIM 64 +#define KNOB_MACROTILE_X_DIM_FIXED (KNOB_MACROTILE_X_DIM << 8) +#define KNOB_MACROTILE_Y_DIM_FIXED (KNOB_MACROTILE_Y_DIM << 8) +#define KNOB_MACROTILE_X_DIM_FIXED_SHIFT 14 +#define KNOB_MACROTILE_Y_DIM_FIXED_SHIFT 14 +#define KNOB_MACROTILE_X_DIM_IN_TILES (KNOB_MACROTILE_X_DIM >> KNOB_TILE_X_DIM_SHIFT) +#define KNOB_MACROTILE_Y_DIM_IN_TILES (KNOB_MACROTILE_Y_DIM >> KNOB_TILE_Y_DIM_SHIFT) + +// total # of hot tiles available. 
This should be enough to +// fully render a 16kx16k 128bpp render target +#define KNOB_NUM_HOT_TILES_X 256 +#define KNOB_NUM_HOT_TILES_Y 256 +#define KNOB_COLOR_HOT_TILE_FORMAT R32G32B32A32_FLOAT +#define KNOB_DEPTH_HOT_TILE_FORMAT R32_FLOAT +#define KNOB_STENCIL_HOT_TILE_FORMAT R8_UINT + +// Max scissor rectangle +#define KNOB_MAX_SCISSOR_X KNOB_NUM_HOT_TILES_X * KNOB_MACROTILE_X_DIM +#define KNOB_MAX_SCISSOR_Y KNOB_NUM_HOT_TILES_Y * KNOB_MACROTILE_Y_DIM + +#if KNOB_SIMD_WIDTH==8 && KNOB_TILE_X_DIM < 4 +#error "incompatible width/tile dimensions" +#endif + +#if KNOB_SIMD_WIDTH == 8 +#define SIMD_TILE_X_DIM 4 +#define SIMD_TILE_Y_DIM 2 +#else +#error "Invalid simd width" +#endif + +/////////////////////////////////////////////////////////////////////////////// +// Optimization knobs +/////////////////////////////////////////////////////////////////////////////// +#define KNOB_USE_FAST_SRGB TRUE + +// enables cut-aware primitive assembler +#define KNOB_ENABLE_CUT_AWARE_PA TRUE + +/////////////////////////////////////////////////////////////////////////////// +// Debug knobs +/////////////////////////////////////////////////////////////////////////////// +//#define KNOB_ENABLE_RDTSC + +// Set to 1 to use the dynamic KNOB_TOSS_XXXX knobs. +#if !defined(KNOB_ENABLE_TOSS_POINTS) +#define KNOB_ENABLE_TOSS_POINTS 0 +#endif + diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h new file mode 100644 index 00000000000..3f19555557f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -0,0 +1,98 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file knobs_init.h +* +* @brief Dynamic Knobs Initialization for Core. 
+* +******************************************************************************/ +#pragma once + +#include <core/knobs.h> +#include <stdlib.h> +#include <string.h> +#include <ctype.h> +#include <stdio.h> + +// Assume the type is compatible with a 32-bit integer +template <typename T> +static inline void ConvertEnvToKnob(const char* pOverride, T& knobValue) +{ + uint32_t value = 0; + if (sscanf(pOverride, "%u", &value)) + { + knobValue = static_cast<T>(value); + } +} + +static inline void ConvertEnvToKnob(const char* pOverride, bool& knobValue) +{ + size_t len = strlen(pOverride); + if (len == 1) + { + auto c = tolower(pOverride[0]); + if (c == 'y' || c == 't' || c == '1') + { + knobValue = true; + return; + } + if (c == 'n' || c == 'f' || c == '0') + { + knobValue = false; + return; + } + } + + // Try converting to a number and casting to bool + uint32_t value = 0; + if (sscanf(pOverride, "%u", &value)) + { + knobValue = value != 0; + return; + } +} + +static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue) +{ + float value = knobValue; + if (sscanf(pOverride, "%f", &value)) + { + knobValue = value; + } +} + +template <typename T> +static inline void InitKnob(T& knob) +{ + + // TODO, read registry first + + // Second, read environment variables + const char* pOverride = getenv(knob.Name()); + + if (pOverride) + { + auto knobValue = knob.Value(); + ConvertEnvToKnob(pOverride, knobValue); + knob.Value(knobValue); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.cpp b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp new file mode 100644 index 00000000000..d51a546b063 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.cpp @@ -0,0 +1,51 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file multisample.cpp +* +******************************************************************************/ + +#include "multisample.h" + +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosXi[2] {0xC0, 0x40}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosYi[2] {0xC0, 0x40}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosXi[4] {0x60, 0xE0, 0x20, 0xA0}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosYi[4] {0x20, 0x60, 0xA0, 0xE0}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosXi[8] {0x90, 0x70, 0xD0, 0x50, 0x30, 0x10, 0xB0, 0xF0}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosYi[8] {0x50, 0xB0, 0x90, 0x30, 0xD0, 0x70, 0xF0, 0x10}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosXi[16] +{0x90, 0x70, 0x50, 0xC0, 0x30, 0xA0, 0xD0, 0xB0, 0x60, 0x80, 0x40, 0x20, 0x00, 0xF0, 0xE0, 0x10}; +const uint32_t MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosYi[16] +{0x90, 0x50, 0xA0, 0x70, 0x60, 0xD0, 0xB0, 0x30, 0xE0, 0x10, 0x20, 0xC0, 0x80, 0x40, 0xF0, 0x00}; + +const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosX{0.5f}; +const float MultisampleTraits<SWR_MULTISAMPLE_1X>::samplePosY{0.5f}; +const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosX[2]{0.75f, 0.25f}; +const float MultisampleTraits<SWR_MULTISAMPLE_2X>::samplePosY[2]{0.75f, 0.25f}; +const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosX[4]{0.375f, 0.875, 0.125, 0.625}; +const float MultisampleTraits<SWR_MULTISAMPLE_4X>::samplePosY[4]{0.125, 0.375, 0.625, 0.875}; +const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosX[8]{0.5625, 0.4375, 0.8125, 0.3125, 0.1875, 0.0625, 0.6875, 0.9375}; +const float MultisampleTraits<SWR_MULTISAMPLE_8X>::samplePosY[8]{0.3125, 0.6875, 0.5625, 0.1875, 0.8125, 0.4375, 0.9375, 0.0625}; +const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosX[16] +{0.5625, 0.4375, 0.3125, 0.7500, 0.1875, 0.6250, 0.8125, 0.6875, 0.3750, 0.5000, 0.2500, 0.1250, 0.0000, 0.9375, 0.8750, 0.0625}; +const float MultisampleTraits<SWR_MULTISAMPLE_16X>::samplePosY[16] +{0.5625, 0.3125, 0.6250, 0.4375, 0.3750, 0.8125, 0.6875, 0.1875, 0.8750, 0.0625, 0.1250, 0.7500, 0.5000, 0.2500, 0.9375, 0.0000}; diff --git a/src/gallium/drivers/swr/rasterizer/core/multisample.h b/src/gallium/drivers/swr/rasterizer/core/multisample.h new file mode 100644 index 00000000000..4ae777e2fc5 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/multisample.h @@ -0,0 +1,620 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file multisample.h +* +******************************************************************************/ + +#pragma once + +#include "context.h" +#include "format_traits.h" + +INLINE +uint32_t GetNumSamples(SWR_MULTISAMPLE_COUNT sampleCount) +{ + static const uint32_t sampleCountLUT[SWR_MULTISAMPLE_TYPE_MAX] {1, 2, 4, 8, 16}; + assert(sampleCount < SWR_MULTISAMPLE_TYPE_MAX); + return sampleCountLUT[sampleCount]; +} + +INLINE +SWR_MULTISAMPLE_COUNT GetSampleCount(uint32_t numSamples) +{ + switch(numSamples) + { + case 1: return SWR_MULTISAMPLE_1X; + case 2: return SWR_MULTISAMPLE_2X; + case 4: return SWR_MULTISAMPLE_4X; + case 8: return SWR_MULTISAMPLE_8X; + case 16: return SWR_MULTISAMPLE_16X; + default: assert(0); return SWR_MULTISAMPLE_1X; + } +} + +// hardcoded offsets based on Direct3d standard multisample positions +// 8 x 8 pixel grid ranging from (0, 0) to (15, 15), with (0, 0) = UL pixel corner +// coords are 0.8 fixed point offsets from (0, 0) +template<SWR_MULTISAMPLE_COUNT sampleCount> +struct MultisampleTraits +{ + INLINE static __m128i vXi(uint32_t sampleNum) = delete; + INLINE static __m128i vYi(uint32_t sampleNum) = delete; + INLINE static simdscalar vX(uint32_t sampleNum) = delete; + INLINE static simdscalar vY(uint32_t sampleNum) = delete; + INLINE static float X(uint32_t sampleNum) = delete; + INLINE static float Y(uint32_t sampleNum) = delete; + INLINE static __m128i TileSampleOffsetsX() = delete; + INLINE static __m128i TileSampleOffsetsY() = delete; + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) = delete; + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) = delete; + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) = delete; + INLINE static simdscalari FullSampleMask() = delete; + + static const uint32_t numSamples = 0; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_1X> +{ + INLINE static __m128i vXi(uint32_t sampleNum) + { + static const __m128i X = _mm_set1_epi32(samplePosXi); + return X; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + static const __m128i Y = _mm_set1_epi32(samplePosYi); + return Y; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X = _simd_set1_ps(0.5f); + return X; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y = _simd_set1_ps(0.5f); + return Y; + } + + INLINE static float X(uint32_t sampleNum) {return samplePosX;}; + INLINE static float Y(uint32_t sampleNum) {return samplePosY;}; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x80; + static const uint32_t bboxRightEdge = 0x80; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x80; + static const uint32_t bboxBottomEdge = 0x80; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + return 0; + } + + INLINE static uint32_t 
RasterTileDepthOffset(uint32_t sampleNum) + { + return 0; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + return 0; + } + + INLINE static simdscalari FullSampleMask(){return _simd_set1_epi32(0x1);}; + + static const uint32_t samplePosXi {0x80}; + static const uint32_t samplePosYi {0x80}; + static const float samplePosX; + static const float samplePosY; + static const uint32_t numSamples = 1; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_2X> +{ + INLINE static __m128i vXi(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + static const __m128i X[numSamples] {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1])}; + return X[sampleNum]; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + SWR_ASSERT(sampleNum < numSamples); + static const __m128i Y[numSamples] {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1])}; + return Y[sampleNum]; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] {_simd_set1_ps(0.75f), _simd_set1_ps(0.25f)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x40; + static const uint32_t bboxRightEdge = 0xC0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x40; + static const uint32_t bboxBottomEdge = 0xC0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) + }; + assert(sampleNum < numSamples); + return RasterTileColorOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileDepthOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) + }; + assert(sampleNum < numSamples); + return RasterTileDepthOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileStencilOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) + }; + assert(sampleNum < numSamples); + return RasterTileStencilOffsets[sampleNum]; + } + + INLINE static simdscalari FullSampleMask() + { + static const simdscalari mask =_simd_set1_epi32(0x3); + return mask; + } + + static const uint32_t samplePosXi[2]; + static const uint32_t samplePosYi[2]; + static const float samplePosX[2]; + static const float samplePosY[2]; + static const uint32_t numSamples = 2; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_4X> +{ + INLINE static 
__m128i vXi(uint32_t sampleNum) + { + static const __m128i X[numSamples] + {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3])}; + SWR_ASSERT(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + static const __m128i Y[numSamples] + {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3])}; + SWR_ASSERT(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] + {_simd_set1_ps(0.375f), _simd_set1_ps(0.875), _simd_set1_ps(0.125), _simd_set1_ps(0.625)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] + {_simd_set1_ps(0.125), _simd_set1_ps(0.375f), _simd_set1_ps(0.625), _simd_set1_ps(0.875)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x20; + static const uint32_t bboxRightEdge = 0xE0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x20; + static const uint32_t bboxBottomEdge = 0xE0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + }; + assert(sampleNum < numSamples); + return RasterTileColorOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileDepthOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, + }; + assert(sampleNum < numSamples); + return RasterTileDepthOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileStencilOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3, + }; + assert(sampleNum < numSamples); + return RasterTileStencilOffsets[sampleNum]; + } + + INLINE static simdscalari FullSampleMask() + { + static const simdscalari mask = _simd_set1_epi32(0xF); + 
return mask; + } + + static const uint32_t samplePosXi[4]; + static const uint32_t samplePosYi[4]; + static const float samplePosX[4]; + static const float samplePosY[4]; + static const uint32_t numSamples = 4; +}; + +template<> +struct MultisampleTraits<SWR_MULTISAMPLE_8X> +{ + INLINE static __m128i vXi(uint32_t sampleNum) + { + static const __m128i X[numSamples] + {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]), + _mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7])}; + SWR_ASSERT(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static __m128i vYi(uint32_t sampleNum) + { + static const __m128i Y[numSamples] + {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]), + _mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7])}; + SWR_ASSERT(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] + {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.8125), _simd_set1_ps(0.3125), + _simd_set1_ps(0.1875), _simd_set1_ps(0.0625), _simd_set1_ps(0.6875), _simd_set1_ps(0.9375)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] + {_simd_set1_ps(0.3125), _simd_set1_ps(0.6875), _simd_set1_ps(0.5625), _simd_set1_ps(0.1875), + _simd_set1_ps(0.8125), _simd_set1_ps(0.4375), _simd_set1_ps(0.9375), _simd_set1_ps(0.0625)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x10; + static const uint32_t bboxRightEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x10; + static const uint32_t bboxBottomEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, + }; + assert(sampleNum < numSamples); + return RasterTileColorOffsets[sampleNum]; + } + + 
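+ // Each per-sample plane of a hot tile occupies KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM *
+ // FormatTraits<format>::bpp / 8 bytes, so the offset tables in these accessors
+ // simply encode sampleNum * planeSize. Equivalent computed form (sketch only;
+ // "planeSize" is an illustrative name, not a symbol defined here):
+ //   return sampleNum * planeSize;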
+ INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileDepthOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileDepthOffsets[sampleNum];
+ }
+
+ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum)
+ {
+ static const uint32_t RasterTileStencilOffsets[numSamples]
+ { 0,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8),
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6,
+ (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7,
+ };
+ assert(sampleNum < numSamples);
+ return RasterTileStencilOffsets[sampleNum];
+ }
+
+ INLINE static simdscalari FullSampleMask()
+ {
+ static const simdscalari mask = _simd_set1_epi32(0xFF);
+ return mask;
+ }
+
+ static const uint32_t samplePosXi[8];
+ static const uint32_t samplePosYi[8];
+ static const float samplePosX[8];
+ static const float samplePosY[8];
+ static const uint32_t numSamples = 8;
+};
+
+template<>
+struct MultisampleTraits<SWR_MULTISAMPLE_16X>
+{
+ INLINE static __m128i vXi(uint32_t sampleNum)
+ {
+ static const __m128i X[numSamples]
+ {_mm_set1_epi32(samplePosXi[0]), _mm_set1_epi32(samplePosXi[1]), _mm_set1_epi32(samplePosXi[2]), _mm_set1_epi32(samplePosXi[3]),
+ _mm_set1_epi32(samplePosXi[4]), _mm_set1_epi32(samplePosXi[5]), _mm_set1_epi32(samplePosXi[6]), _mm_set1_epi32(samplePosXi[7]),
+ _mm_set1_epi32(samplePosXi[8]), _mm_set1_epi32(samplePosXi[9]), _mm_set1_epi32(samplePosXi[10]), _mm_set1_epi32(samplePosXi[11]),
+ _mm_set1_epi32(samplePosXi[12]), _mm_set1_epi32(samplePosXi[13]), _mm_set1_epi32(samplePosXi[14]), _mm_set1_epi32(samplePosXi[15])};
+ SWR_ASSERT(sampleNum < numSamples);
+ return X[sampleNum];
+ }
+
+ INLINE static __m128i vYi(uint32_t sampleNum)
+ {
+ static const __m128i Y[numSamples]
+ {_mm_set1_epi32(samplePosYi[0]), _mm_set1_epi32(samplePosYi[1]), _mm_set1_epi32(samplePosYi[2]), _mm_set1_epi32(samplePosYi[3]),
+ _mm_set1_epi32(samplePosYi[4]), _mm_set1_epi32(samplePosYi[5]), _mm_set1_epi32(samplePosYi[6]), _mm_set1_epi32(samplePosYi[7]),
+ _mm_set1_epi32(samplePosYi[8]), _mm_set1_epi32(samplePosYi[9]), _mm_set1_epi32(samplePosYi[10]), _mm_set1_epi32(samplePosYi[11]),
+ _mm_set1_epi32(samplePosYi[12]), _mm_set1_epi32(samplePosYi[13]), _mm_set1_epi32(samplePosYi[14]), _mm_set1_epi32(samplePosYi[15])};
+ SWR_ASSERT(sampleNum < numSamples);
+ return Y[sampleNum];
+ }
+
+ 
INLINE static simdscalar vX(uint32_t sampleNum) + { + static const simdscalar X[numSamples] + {_simd_set1_ps(0.5625), _simd_set1_ps(0.4375), _simd_set1_ps(0.3125), _simd_set1_ps(0.7500), + _simd_set1_ps(0.1875), _simd_set1_ps(0.6250), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), + _simd_set1_ps(0.3750), _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.1250), + _simd_set1_ps(0.0000), _simd_set1_ps(0.9375), _simd_set1_ps(0.8750), _simd_set1_ps(0.0625)}; + assert(sampleNum < numSamples); + return X[sampleNum]; + } + + INLINE static simdscalar vY(uint32_t sampleNum) + { + static const simdscalar Y[numSamples] + {_simd_set1_ps(0.5625), _simd_set1_ps(0.3125), _simd_set1_ps(0.6250), _simd_set1_ps(0.4375), + _simd_set1_ps(0.3750), _simd_set1_ps(0.8125), _simd_set1_ps(0.6875), _simd_set1_ps(0.1875), + _simd_set1_ps(0.8750), _simd_set1_ps(0.0625), _simd_set1_ps(0.1250), _simd_set1_ps(0.7500), + _simd_set1_ps(0.5000), _simd_set1_ps(0.2500), _simd_set1_ps(0.9375), _simd_set1_ps(0.0000)}; + assert(sampleNum < numSamples); + return Y[sampleNum]; + } + + INLINE static float X(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosX[sampleNum]; }; + INLINE static float Y(uint32_t sampleNum) { SWR_ASSERT(sampleNum < numSamples); return samplePosY[sampleNum]; }; + + INLINE static __m128i TileSampleOffsetsX() + { + static const uint32_t bboxLeftEdge = 0x00; + static const uint32_t bboxRightEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetX = _mm_set_epi32(bboxRightEdge, bboxLeftEdge, bboxRightEdge, bboxLeftEdge); + return tileSampleOffsetX; + } + + INLINE static __m128i TileSampleOffsetsY() + { + static const uint32_t bboxTopEdge = 0x00; + static const uint32_t bboxBottomEdge = 0xF0; + // BR, BL, UR, UL + static const __m128i tileSampleOffsetY = _mm_set_epi32(bboxBottomEdge, bboxBottomEdge, bboxTopEdge, bboxTopEdge); + return tileSampleOffsetY; + } + + INLINE static uint32_t RasterTileColorOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileColorOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8) * 15, + }; + assert(sampleNum < numSamples); + return 
RasterTileColorOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileDepthOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileDepthOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8) * 15, + }; + assert(sampleNum < numSamples); + return RasterTileDepthOffsets[sampleNum]; + } + + INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) + { + static const uint32_t RasterTileStencilOffsets[numSamples] + { 0, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8), + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 2, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 3, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 4, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 5, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 6, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 7, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 8, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 9, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 10, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 11, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 12, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 13, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 14, + (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8) * 15, + }; + assert(sampleNum < numSamples); + return RasterTileStencilOffsets[sampleNum]; + } + + INLINE static simdscalari FullSampleMask() + { + static const simdscalari mask = _simd_set1_epi32(0xFFFF); + return mask; + } + + static const uint32_t samplePosXi[16]; 
+ static const uint32_t samplePosYi[16];
+ static const float samplePosX[16];
+ static const float samplePosY[16];
+ static const uint32_t numSamples = 16;
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
new file mode 100644
index 00000000000..2028d9fbcfe
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -0,0 +1,1208 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file pa.h
+*
+* @brief Definitions for primitive assembly.
+* N primitives are assembled at a time, where N is the SIMD width.
+* A state machine, that is specific for a given topology, drives the
+* assembly of vertices into triangles.
+*
+******************************************************************************/
+#pragma once
+
+#include "frontend.h"
+
+struct PA_STATE
+{
+ DRAW_CONTEXT *pDC; // draw context
+ uint8_t* pStreamBase; // vertex stream
+ uint32_t streamSizeInVerts; // total size of the input stream in verts
+
+ // The topology the binner will use. In some cases the FE changes the topology from the api state.
+ PRIMITIVE_TOPOLOGY binTopology;
+
+ PA_STATE() {}
+ PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
+ pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {}
+
+ virtual bool HasWork() = 0;
+ virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0;
+ virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0;
+ virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
+ virtual bool NextPrim() = 0;
+ virtual simdvertex& GetNextVsOutput() = 0;
+ virtual bool GetNextStreamOutput() = 0;
+ virtual simdmask& GetNextVsIndices() = 0;
+ virtual uint32_t NumPrims() = 0;
+ virtual void Reset() = 0;
+ virtual simdscalari GetPrimID(uint32_t startID) = 0;
+};
+
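+// A frontend loop drives this interface roughly as follows (illustrative
+// sketch only; the mask helper and the position slot used here are
+// assumptions, not definitions from this file):
+//
+//     while (pa.HasWork())
+//     {
+//         simdvector prims[3];
+//         if (pa.Assemble(VERTEX_POSITION_SLOT, prims))
+//         {
+//             uint32_t primMask = GenMask(pa.NumPrims());
+//             BinTriangles(pDC, pa, workerId, prims, primMask, pa.GetPrimID(startPrimID));
+//         }
+//         pa.NextPrim();
+//     }
+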
Often the PA function needs 2 simd vertices in order to assemble the next triangle.
+// 1. We call this the current and previous simd vertex.
+// 2. The SSE simd is 4-wide which is not a multiple of 3 needed for triangles. In
+// order to assemble the second triangle, for a triangle list, we'll need the
+// last vertex from the previous simd and the first 2 vertices from the current simd.
+// 3. At times the PA can assemble multiple triangles from the 2 simd vertices.
+//
+// This optimized PA is not cut aware, so it should only be used for non-indexed draws or
+// draws without cuts.
+struct PA_STATE_OPT : public PA_STATE
+{
+ simdvertex leadingVertex; // For tri-fan
+ uint32_t numPrims; // Total number of primitives for draw.
+ uint32_t numPrimsComplete; // Total number of complete primitives.
+
+ uint32_t numSimdPrims; // Number of prims in current simd.
+
+ uint32_t cur; // index to current VS output.
+ uint32_t prev; // index to prev VS output. Not really needed in the state.
+ uint32_t first; // index to first VS output. Used for trifan.
+
+ uint32_t counter; // state counter
+ bool reset; // reset state
+
+ uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2})
+ simdscalari primID;
+
+ typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
+ typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+
+ PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles.
+ PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle.
+ PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset
+
+ // state used to advance the PA when Next is called
+ PFN_PA_FUNC pfnPaNextFunc;
+ uint32_t nextNumSimdPrims;
+ uint32_t nextNumPrimsIncrement;
+ bool nextReset;
+ bool isStreaming;
+
+ simdmask tmpIndices; // temporary index store for unused virtual function
+
+ PA_STATE_OPT() {}
+ PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
+ bool in_isStreaming, PRIMITIVE_TOPOLOGY topo = TOP_UNKNOWN);
+
+ bool HasWork()
+ {
+ return (this->numPrimsComplete < this->numPrims) ? true : false;
+ }
+
+ simdvector& GetSimdVector(uint32_t index, uint32_t slot)
+ {
+ simdvertex* pVertex = (simdvertex*)pStreamBase;
+ return pVertex[index].attrib[slot];
+ }
+
+ // Assembles 4 triangles. Each simdvector is a single vertex from 4
+ // triangles (xxxx yyyy zzzz wwww) and there are 3 verts per triangle.
+ bool Assemble(uint32_t slot, simdvector verts[])
+ {
+ return this->pfnPaFunc(*this, slot, verts);
+ }
+
+ // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
+ void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+ {
+ return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
+ }
+
+ bool NextPrim()
+ {
+ this->pfnPaFunc = this->pfnPaNextFunc;
+ this->numSimdPrims = this->nextNumSimdPrims;
+ this->numPrimsComplete += this->nextNumPrimsIncrement;
+ this->reset = this->nextReset;
+
+ if (this->isStreaming)
+ {
+ this->reset = false;
+ }
+
+ bool morePrims = false;
+
+ if (this->numSimdPrims > 0)
+ {
+ morePrims = true;
+ this->numSimdPrims--;
+ }
+ else
+ {
+ this->counter = (this->reset) ?
0 : (this->counter + 1); + this->reset = false; + } + + this->pfnPaFunc = this->pfnPaNextFunc; + + if (!HasWork()) + { + morePrims = false; // no more to do + } + + return morePrims; + } + + simdvertex& GetNextVsOutput() + { + // increment cur and prev indices + const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD_WIDTH; + this->prev = this->cur; // prev is undefined for first state. + this->cur = this->counter % numSimdVerts; + + simdvertex* pVertex = (simdvertex*)pStreamBase; + return pVertex[this->cur]; + } + + simdmask& GetNextVsIndices() + { + // unused in optimized PA, pass tmp buffer back + return tmpIndices; + } + + bool GetNextStreamOutput() + { + this->prev = this->cur; + this->cur = this->counter; + + return HasWork(); + } + + uint32_t NumPrims() + { + return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? + (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; + } + + void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) + { + this->pfnPaNextFunc = pfnPaNextFunc; + this->nextNumSimdPrims = numSimdPrims; + this->nextNumPrimsIncrement = numPrimsIncrement; + this->nextReset = reset; + + this->pfnPaSingleFunc = pfnPaNextSingleFunc; + } + + void Reset() + { + this->pfnPaFunc = this->pfnPaFuncReset; + this->numPrimsComplete = 0; + this->numSimdPrims = 0; + this->cur = 0; + this->prev = 0; + this->first = 0; + this->counter = 0; + this->reset = false; + } + + simdscalari GetPrimID(uint32_t startID) + { + return _simd_add_epi32(this->primID, + _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); + } +}; + +// helper C wrappers to avoid having to rewrite all the PA topology state functions +INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) +{ + return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); +} +INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) +{ + return pa.GetSimdVector(index, slot); +} + +INLINE __m128 swizzleLane0(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane1(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane2(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane3(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0); +} + +INLINE __m128 swizzleLane4(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); + +} + +INLINE __m128 swizzleLane5(const simdvector &a) +{ + simdscalar 
tmp0 = _mm256_unpacklo_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpacklo_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); +} + +INLINE __m128 swizzleLane6(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1); +} + +INLINE __m128 swizzleLane7(const simdvector &a) +{ + simdscalar tmp0 = _mm256_unpackhi_ps(a.x, a.z); + simdscalar tmp1 = _mm256_unpackhi_ps(a.y, a.w); + return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1); +} + +INLINE __m128 swizzleLaneN(const simdvector &a, int lane) +{ + switch (lane) { + case 0: + return swizzleLane0(a); + case 1: + return swizzleLane1(a); + case 2: + return swizzleLane2(a); + case 3: + return swizzleLane3(a); + case 4: + return swizzleLane4(a); + case 5: + return swizzleLane5(a); + case 6: + return swizzleLane6(a); + case 7: + return swizzleLane7(a); + default: + return _mm_setzero_ps(); + } +} + +// Cut-aware primitive assembler. +struct PA_STATE_CUT : public PA_STATE +{ + simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex + uint32_t numVerts; // number of vertices available in buffer store + uint32_t numAttribs; // number of attributes + int32_t numRemainingVerts; // number of verts remaining to be assembled + uint32_t numVertsToAssemble; // total number of verts to assemble for the draw + OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather + simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd + uint32_t numPrimsAssembled; // number of primitives that are fully assembled + uint32_t headVertex; // current unused vertex slot in vertex buffer store + uint32_t tailVertex; // beginning vertex currently assembling + uint32_t curVertex; // current unprocessed vertex + uint32_t startPrimId; // starting prim id + simdscalari vPrimId; // vector of prim ID + bool needOffsets; // need to compute gather offsets for current SIMD + uint32_t vertsPerPrim; + simdvertex tmpVertex; // temporary simdvertex for unimplemented API + bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they + // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored,
+ // while the GS sends valid verts for every index.
+ // Topology state tracking
+ uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
+ uint32_t curIndex;
+ bool reverseWinding; // indicates reverse winding for strips
+ int32_t adjExtraVert; // extra vert used for tristrip w/ adj
+
+ typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
+ PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert
+
+ PA_STATE_CUT() {}
+ PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
+ uint32_t in_numAttribs, PRIMITIVE_TOPOLOGY topo, bool in_processCutVerts)
+ : PA_STATE(pDC, in_pStream, in_streamSizeInVerts)
+ {
+ numVerts = in_streamSizeInVerts;
+ numAttribs = in_numAttribs;
+ binTopology = topo;
+ needOffsets = false;
+ processCutVerts = in_processCutVerts;
+
+ numVertsToAssemble = numRemainingVerts = in_numVerts;
+ numPrimsAssembled = 0;
+ headVertex = tailVertex = curVertex = 0;
+
+ curIndex = 0;
+ pCutIndices = in_pIndices;
+ memset(indices, 0, sizeof(indices));
+ vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0);
+ reverseWinding = false;
+ adjExtraVert = -1;
+
+ bool gsEnabled = pDC->pState->state.gsState.gsEnable;
+ vertsPerPrim = NumVertsPerPrim(topo, gsEnabled);
+
+ switch (topo)
+ {
+ case TOP_TRIANGLE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertTriList; break;
+ case TOP_TRI_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertTriListAdj : &PA_STATE_CUT::ProcessVertTriListAdjNoGs; break;
+ case TOP_TRIANGLE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertTriStrip; break;
+ case TOP_TRI_STRIP_ADJ: if (gsEnabled)
+ {
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < true > ;
+ }
+ else
+ {
+ pfnPa = &PA_STATE_CUT::ProcessVertTriStripAdj < false > ;
+ }
+ break;
+
+ case TOP_POINT_LIST: pfnPa = &PA_STATE_CUT::ProcessVertPointList; break;
+ case TOP_LINE_LIST: pfnPa = &PA_STATE_CUT::ProcessVertLineList; break;
+ case TOP_LINE_LIST_ADJ: pfnPa = gsEnabled ? &PA_STATE_CUT::ProcessVertLineListAdj : &PA_STATE_CUT::ProcessVertLineListAdjNoGs; break;
+ case TOP_LINE_STRIP: pfnPa = &PA_STATE_CUT::ProcessVertLineStrip; break;
+ case TOP_LISTSTRIP_ADJ: pfnPa = gsEnabled ?
&PA_STATE_CUT::ProcessVertLineStripAdj : &PA_STATE_CUT::ProcessVertLineStripAdjNoGs; break; + default: assert(0 && "Unimplemented topology"); + } + } + + simdvertex& GetNextVsOutput() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; + this->headVertex = (this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts; + this->needOffsets = true; + return ((simdvertex*)pStreamBase)[vertexIndex]; + } + + simdmask& GetNextVsIndices() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; + simdmask* pCurCutIndex = this->pCutIndices + vertexIndex; + return *pCurCutIndex; + } + + simdvector& GetSimdVector(uint32_t index, uint32_t slot) + { + // unused + SWR_ASSERT(0 && "Not implemented"); + return this->tmpVertex.attrib[0]; + } + + bool GetNextStreamOutput() + { + this->headVertex += KNOB_SIMD_WIDTH; + this->needOffsets = true; + return HasWork(); + } + + simdscalari GetPrimID(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); + } + + void Reset() + { + this->numRemainingVerts = this->numVertsToAssemble; + this->numPrimsAssembled = 0; + this->curIndex = 0; + this->curVertex = 0; + this->tailVertex = 0; + this->headVertex = 0; + this->reverseWinding = false; + this->adjExtraVert = -1; + this->vPrimId = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + } + + bool HasWork() + { + return this->numRemainingVerts > 0 || this->adjExtraVert != -1; + } + + bool IsVertexStoreFull() + { + return ((this->headVertex + KNOB_SIMD_WIDTH) % this->numVerts) == this->tailVertex; + } + + void RestartTopology() + { + this->curIndex = 0; + this->reverseWinding = false; + this->adjExtraVert = -1; + } + + bool IsCutIndex(uint32_t vertex) + { + uint32_t vertexIndex = vertex / KNOB_SIMD_WIDTH; + uint32_t vertexOffset = vertex & (KNOB_SIMD_WIDTH - 1); + return _bittest((const LONG*)&this->pCutIndices[vertexIndex], vertexOffset) == 1; + } + + // iterates across the unprocessed verts until we hit the end or we + // have assembled SIMD prims + void ProcessVerts() + { + while (this->numPrimsAssembled != KNOB_SIMD_WIDTH && + this->numRemainingVerts > 0 && + this->curVertex != this->headVertex) + { + // if cut index, restart topology + if (IsCutIndex(this->curVertex)) + { + if (this->processCutVerts) + { + (this->*pfnPa)(this->curVertex, false); + } + // finish off tri strip w/ adj before restarting topo + if (this->adjExtraVert != -1) + { + (this->*pfnPa)(this->curVertex, true); + } + RestartTopology(); + } + else + { + (this->*pfnPa)(this->curVertex, false); + } + + this->curVertex = (this->curVertex + 1) % this->numVerts; + this->numRemainingVerts--; + } + + // special case last primitive for tri strip w/ adj + if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts == 0 && this->adjExtraVert != -1) + { + (this->*pfnPa)(this->curVertex, true); + } + } + + void Advance() + { + // done with current batch + // advance tail to the current unsubmitted vertex + this->tailVertex = this->curVertex; + this->numPrimsAssembled = 0; + this->vPrimId = _simd_add_epi32(vPrimId, _simd_set1_epi32(KNOB_SIMD_WIDTH)); + } + + bool NextPrim() + { + // if we've assembled enough prims, we can advance to the next set of verts + if (this->numPrimsAssembled == KNOB_SIMD_WIDTH || this->numRemainingVerts <= 0) + { + Advance(); + } + return false; + } + + void ComputeOffsets() + { + for (uint32_t v = 0; v < this->vertsPerPrim; ++v) + { + simdscalari vIndices = *(simdscalari*)&this->indices[v][0]; + + // step to simdvertex batch + const uint32_t simdShift = 3; // @todo make knob + 
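// Editor's note, a worked example (assuming KNOB_SIMD_WIDTH == 8, so simdShift == 3
+ // and simdMask == 0x7): a gathered vertex index of 19 lives in simdvertex block
+ // 19 >> 3 == 2 at lane 19 & 7 == 3, giving a byte offset of
+ // 2 * sizeof(simdvertex) + 3 * sizeof(float) before the per-slot and per-component
+ // offsets are added in Assemble().
+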
simdscalari vVertexBatch = _simd_srai_epi32(vIndices, simdShift); + this->vOffsets[v] = _simd_mullo_epi32(vVertexBatch, _simd_set1_epi32(sizeof(simdvertex))); + + // step to index + const uint32_t simdMask = 0x7; // @todo make knob + simdscalari vVertexIndex = _simd_and_si(vIndices, _simd_set1_epi32(simdMask)); + this->vOffsets[v] = _simd_add_epi32(this->vOffsets[v], _simd_mullo_epi32(vVertexIndex, _simd_set1_epi32(sizeof(float)))); + } + } + + bool Assemble(uint32_t slot, simdvector result[]) + { + // process any outstanding verts + ProcessVerts(); + + // return false if we don't have enough prims assembled + if (this->numPrimsAssembled != KNOB_SIMD_WIDTH && this->numRemainingVerts > 0) + { + return false; + } + + // cache off gather offsets given the current SIMD set of indices the first time we get an assemble + if (this->needOffsets) + { + ComputeOffsets(); + this->needOffsets = false; + } + + for (uint32_t v = 0; v < this->vertsPerPrim; ++v) + { + simdscalari offsets = this->vOffsets[v]; + + // step to attribute + offsets = _simd_add_epi32(offsets, _simd_set1_epi32(slot * sizeof(simdvector))); + + float* pBase = (float*)this->pStreamBase; + for (uint32_t c = 0; c < 4; ++c) + { + result[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); + + // move base to next component + pBase += KNOB_SIMD_WIDTH; + } + } + + return true; + } + + void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) + { + // move to slot + for (uint32_t v = 0; v < this->vertsPerPrim; ++v) + { + uint32_t* pOffset = (uint32_t*)&this->vOffsets[v]; + uint32_t offset = pOffset[triIndex]; + offset += sizeof(simdvector) * slot; + float* pVert = (float*)&tri[v]; + for (uint32_t c = 0; c < 4; ++c) + { + float* pComponent = (float*)(this->pStreamBase + offset); + pVert[c] = *pComponent; + offset += KNOB_SIMD_WIDTH * sizeof(float); + } + } + } + + uint32_t NumPrims() + { + return this->numPrimsAssembled; + } + + // Per-topology functions + void ProcessVertTriStrip(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 3) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + if (reverseWinding) + { + this->indices[1][this->numPrimsAssembled] = this->vert[2]; + this->indices[2][this->numPrimsAssembled] = this->vert[1]; + } + else + { + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + } + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->curIndex = 2; + this->reverseWinding ^= 1; + } + } + + template<bool gsEnabled> + void AssembleTriStripAdj() + { + if (!gsEnabled) + { + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[4]; + + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + + this->vert[4] = this->vert[2]; + this->vert[2] = this->vert[1]; + } + else + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + this->indices[3][this->numPrimsAssembled] = this->vert[3]; + this->indices[4][this->numPrimsAssembled] = this->vert[4]; + this->indices[5][this->numPrimsAssembled] = this->vert[5]; + } + this->numPrimsAssembled++; + } + + + 
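// Editor's note: with no GS bound only the three true triangle corners (pattern
+ // verts 0, 2, 4 of the 6-vert adjacency layout) are binned, which is why
+ // AssembleTriStripAdj above temporarily compacts vert[2]/vert[4] into slots 1/2
+ // before writing the gather indices; with a GS bound all six adjacency verts are
+ // forwarded unchanged.
+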
template<bool gsEnabled>
+ void ProcessVertTriStripAdj(uint32_t index, bool finish)
+ {
+ // handle last primitive of tristrip
+ if (finish && this->adjExtraVert != -1)
+ {
+ this->vert[3] = this->adjExtraVert;
+ AssembleTriStripAdj<gsEnabled>();
+ this->adjExtraVert = -1;
+ return;
+ }
+
+ switch (this->curIndex)
+ {
+ case 0:
+ case 1:
+ case 2:
+ case 4:
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ break;
+ case 3:
+ this->vert[5] = index;
+ this->curIndex++;
+ break;
+ case 5:
+ if (this->adjExtraVert == -1)
+ {
+ this->adjExtraVert = index;
+ }
+ else
+ {
+ this->vert[3] = index;
+ if (!gsEnabled)
+ {
+ AssembleTriStripAdj<gsEnabled>();
+
+ uint32_t nextTri[6];
+ if (this->reverseWinding)
+ {
+ nextTri[0] = this->vert[4];
+ nextTri[1] = this->vert[0];
+ nextTri[2] = this->vert[2];
+ nextTri[4] = this->vert[3];
+ nextTri[5] = this->adjExtraVert;
+ }
+ else
+ {
+ nextTri[0] = this->vert[2];
+ nextTri[1] = this->adjExtraVert;
+ nextTri[2] = this->vert[3];
+ nextTri[4] = this->vert[4];
+ nextTri[5] = this->vert[0];
+ }
+ for (uint32_t i = 0; i < 6; ++i)
+ {
+ this->vert[i] = nextTri[i];
+ }
+
+ this->adjExtraVert = -1;
+ this->reverseWinding ^= 1;
+ }
+ else
+ {
+ this->curIndex++;
+ }
+ }
+ break;
+ case 6:
+ SWR_ASSERT(this->adjExtraVert != -1, "Algorithm failure!");
+ AssembleTriStripAdj<gsEnabled>();
+
+ uint32_t nextTri[6];
+ if (this->reverseWinding)
+ {
+ nextTri[0] = this->vert[4];
+ nextTri[1] = this->vert[0];
+ nextTri[2] = this->vert[2];
+ nextTri[4] = this->vert[3];
+ nextTri[5] = this->adjExtraVert;
+ }
+ else
+ {
+ nextTri[0] = this->vert[2];
+ nextTri[1] = this->adjExtraVert;
+ nextTri[2] = this->vert[3];
+ nextTri[4] = this->vert[4];
+ nextTri[5] = this->vert[0];
+ }
+ for (uint32_t i = 0; i < 6; ++i)
+ {
+ this->vert[i] = nextTri[i];
+ }
+ this->reverseWinding ^= 1;
+ this->adjExtraVert = index;
+ this->curIndex--;
+ break;
+ }
+ }
+
+ void ProcessVertTriList(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 3)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[1];
+ this->indices[2][this->numPrimsAssembled] = this->vert[2];
+
+ // increment numPrimsAssembled
+ this->numPrimsAssembled++;
+
+ // set up next prim state
+ this->curIndex = 0;
+ }
+ }
+
+ void ProcessVertTriListAdj(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 6)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[1];
+ this->indices[2][this->numPrimsAssembled] = this->vert[2];
+ this->indices[3][this->numPrimsAssembled] = this->vert[3];
+ this->indices[4][this->numPrimsAssembled] = this->vert[4];
+ this->indices[5][this->numPrimsAssembled] = this->vert[5];
+
+ // increment numPrimsAssembled
+ this->numPrimsAssembled++;
+
+ // set up next prim state
+ this->curIndex = 0;
+ }
+ }
+
+ void ProcessVertTriListAdjNoGs(uint32_t index, bool finish)
+ {
+ this->vert[this->curIndex] = index;
+ this->curIndex++;
+ if (this->curIndex == 6)
+ {
+ // assembled enough verts for prim, add to gather indices
+ this->indices[0][this->numPrimsAssembled] = this->vert[0];
+ this->indices[1][this->numPrimsAssembled] = this->vert[2];
+ this->indices[2][this->numPrimsAssembled] = this->vert[4];
+
+ // increment numPrimsAssembled
+
this->numPrimsAssembled++; + + // set up next prim state + this->curIndex = 0; + } + } + + + void ProcessVertLineList(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 2) + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + + this->numPrimsAssembled++; + this->curIndex = 0; + } + } + + void ProcessVertLineStrip(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 2) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->curIndex = 1; + } + } + + void ProcessVertLineStripAdj(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + this->indices[3][this->numPrimsAssembled] = this->vert[3]; + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[3]; + this->curIndex = 3; + } + } + + void ProcessVertLineStripAdjNoGs(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + // assembled enough verts for prim, add to gather indices + this->indices[0][this->numPrimsAssembled] = this->vert[1]; + this->indices[1][this->numPrimsAssembled] = this->vert[2]; + + // increment numPrimsAssembled + this->numPrimsAssembled++; + + // set up next prim state + this->vert[0] = this->vert[1]; + this->vert[1] = this->vert[2]; + this->vert[2] = this->vert[3]; + this->curIndex = 3; + } + } + + void ProcessVertLineListAdj(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->indices[1][this->numPrimsAssembled] = this->vert[1]; + this->indices[2][this->numPrimsAssembled] = this->vert[2]; + this->indices[3][this->numPrimsAssembled] = this->vert[3]; + + this->numPrimsAssembled++; + this->curIndex = 0; + } + } + + void ProcessVertLineListAdjNoGs(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 4) + { + this->indices[0][this->numPrimsAssembled] = this->vert[1]; + this->indices[1][this->numPrimsAssembled] = this->vert[2]; + + this->numPrimsAssembled++; + this->curIndex = 0; + } + } + + void ProcessVertPointList(uint32_t index, bool finish) + { + this->vert[this->curIndex] = index; + this->curIndex++; + if (this->curIndex == 1) + { + this->indices[0][this->numPrimsAssembled] = this->vert[0]; + this->numPrimsAssembled++; + this->curIndex = 0; + } + } +}; + +// Primitive Assembly for data output from the DomainShader. 
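+// Editor's sketch (hypothetical caller; variable names are illustrative only):
+// the tessellation frontend would drive PA_TESS roughly like this, with ppIndices
+// pointing at the three index lists produced by the tessellator:
+//
+//   PA_TESS tessPa(pDC, pDsOut, strideInVectors, numAttribs, ppIndices, numPrims, TOP_TRIANGLE_LIST);
+//   while (tessPa.HasWork())
+//   {
+//       simdvector prim[3];
+//       if (tessPa.Assemble(0, prim)) // slot 0 = position
+//       {
+//           // bin up to KNOB_SIMD_WIDTH primitives here
+//       }
+//       tessPa.NextPrim();
+//   }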
+struct PA_TESS : PA_STATE +{ + PA_TESS( + DRAW_CONTEXT *in_pDC, + const simdscalar* in_pVertData, + uint32_t in_attributeStrideInVectors, + uint32_t in_numAttributes, + uint32_t* (&in_ppIndices)[3], + uint32_t in_numPrims, + PRIMITIVE_TOPOLOGY in_binTopology) : + + PA_STATE(in_pDC, nullptr, 0), + m_pVertexData(in_pVertData), + m_attributeStrideInVectors(in_attributeStrideInVectors), + m_numAttributes(in_numAttributes), + m_numPrims(in_numPrims) + { + m_vPrimId = _simd_setzero_si(); + binTopology = in_binTopology; + m_ppIndices[0] = in_ppIndices[0]; + m_ppIndices[1] = in_ppIndices[1]; + m_ppIndices[2] = in_ppIndices[2]; + + switch (binTopology) + { + case TOP_POINT_LIST: + m_numVertsPerPrim = 1; + break; + + case TOP_LINE_LIST: + m_numVertsPerPrim = 2; + break; + + case TOP_TRIANGLE_LIST: + m_numVertsPerPrim = 3; + break; + + default: + SWR_ASSERT(0, "Invalid binTopology (%d) for %s", binTopology, __FUNCTION__); + break; + } + } + + bool HasWork() + { + return m_numPrims != 0; + } + + simdvector& GetSimdVector(uint32_t index, uint32_t slot) + { + SWR_ASSERT(0, "%s NOT IMPLEMENTED", __FUNCTION__); + static simdvector junk = { 0 }; + return junk; + } + + static simdscalari GenPrimMask(uint32_t numPrims) + { + SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); +#if KNOB_SIMD_WIDTH == 8 + static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = + { + -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0 + }; +#elif KNOB_SIMD_WIDTH == 16 + static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; +#else +#error "Help, help, I can't get up!" +#endif + + return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); + } + + bool Assemble(uint32_t slot, simdvector verts[]) + { + static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); + SWR_ASSERT(slot < m_numAttributes); + + uint32_t numPrimsToAssemble = PA_TESS::NumPrims(); + if (0 == numPrimsToAssemble) + { + return false; + } + + simdscalari mask = GenPrimMask(numPrimsToAssemble); + + const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) + { + simdscalari indices = _simd_load_si((const simdscalari*)m_ppIndices[i]); + + const float* pBase = pBaseAttrib; + for (uint32_t c = 0; c < 4; ++c) + { + verts[i].v[c] = _simd_mask_i32gather_ps( + _simd_setzero_ps(), + pBase, + indices, + _simd_castsi_ps(mask), + 4 /* gcc doesn't like sizeof(float) */); + pBase += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; + } + } + + return true; + } + + void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) + { + SWR_ASSERT(slot < m_numAttributes); + SWR_ASSERT(primIndex < PA_TESS::NumPrims()); + + const float* pVertDataBase = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) + { + uint32_t index = m_ppIndices[i][primIndex]; + const float* pVertData = pVertDataBase; + float* pVert = (float*)&verts[i]; + + for (uint32_t c = 0; c < 4; ++c) + { + pVert[c] = pVertData[index]; + pVertData += m_attributeStrideInVectors * KNOB_SIMD_WIDTH; + } + } + } + + bool NextPrim() + { + uint32_t numPrims = PA_TESS::NumPrims(); + m_numPrims -= numPrims; + m_ppIndices[0] += numPrims; + m_ppIndices[1] += numPrims; + m_ppIndices[2] += numPrims; + + return HasWork(); + } + + simdvertex& GetNextVsOutput() + { + SWR_ASSERT(0, "%s", 
__FUNCTION__); + static simdvertex junk; + return junk; + } + + bool GetNextStreamOutput() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + return false; + } + + simdmask& GetNextVsIndices() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + static simdmask junk; + return junk; + } + + uint32_t NumPrims() + { + return std::min<uint32_t>(m_numPrims, KNOB_SIMD_WIDTH); + } + + void Reset() { SWR_ASSERT(0); }; + + simdscalari GetPrimID(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); + } + +private: + const simdscalar* m_pVertexData = nullptr; + uint32_t m_attributeStrideInVectors = 0; + uint32_t m_numAttributes = 0; + uint32_t m_numPrims = 0; + uint32_t* m_ppIndices[3]; + + uint32_t m_numVertsPerPrim = 0; + + simdscalari m_vPrimId; +}; + +// Primitive Assembler factory class, responsible for creating and initializing the correct assembler +// based on state. +template <bool IsIndexedT> +struct PA_FACTORY +{ + PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) + { +#if KNOB_ENABLE_CUT_AWARE_PA == TRUE + const API_STATE& state = GetApiState(pDC); + if ((IsIndexedT && ( + topo == TOP_TRIANGLE_STRIP || topo == TOP_POINT_LIST || + topo == TOP_LINE_LIST || topo == TOP_LINE_STRIP || + topo == TOP_TRIANGLE_LIST || topo == TOP_LINE_LIST_ADJ || + topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || + topo == TOP_TRI_STRIP_ADJ)) || + + // non-indexed draws with adjacency topologies must use cut-aware PA until we add support + // for them in the optimized PA + (!IsIndexedT && ( + topo == TOP_LINE_LIST_ADJ || topo == TOP_LISTSTRIP_ADJ || topo == TOP_TRI_LIST_ADJ || topo == TOP_TRI_STRIP_ADJ))) + { + memset(&indexStore, 0, sizeof(indexStore)); + DWORD numAttribs; + _BitScanReverse(&numAttribs, state.feAttribMask); + numAttribs++; + new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, + &this->indexStore[0], numVerts, numAttribs, state.topology, false); + cutPA = true; + } + else +#endif + { + uint32_t numPrims = GetNumPrims(in_topo, numVerts); + new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, false); + cutPA = false; + } + + } + + PA_STATE& GetPA() + { +#if KNOB_ENABLE_CUT_AWARE_PA == TRUE + if (cutPA) + { + return this->paCut; + } + else +#endif + { + return this->paOpt; + } + } + + PA_STATE_OPT paOpt; + PA_STATE_CUT paCut; + bool cutPA; + + PRIMITIVE_TOPOLOGY topo; + + simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; + simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp new file mode 100644 index 00000000000..9850b436e39 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -0,0 +1,1177 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file pa_avx.cpp +* +* @brief AVX implementation for primitive assembly. +* N primitives are assembled at a time, where N is the SIMD width. +* A state machine, that is specific for a given topology, drives the +* assembly of vertices into triangles. +* +******************************************************************************/ +#include "context.h" +#include "pa.h" +#include "frontend.h" + +#if (KNOB_SIMD_WIDTH == 8) + +bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); + +bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t index, __m128 verts[]); + +bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 lineverts[]); + +bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[]); +void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); + +template <uint32_t TotalControlPoints> +void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output + // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. + // Each attribute has 4 components. + + /// @todo Optimize this + + float* pOutVec = (float*)verts; + + for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) + { + uint32_t input_cp = primIndex * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; + + // Loop over all components of the attribute + for (uint32_t i = 0; i < 4; ++i) + { + const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); + pOutVec[cp * 4 + i] = pInputVec[input_lane]; + } + } +} + +template<uint32_t TotalControlPoints, uint32_t CurrentControlPoints = 1> +static bool PaPatchList(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState( + pa, + PaPatchList<TotalControlPoints, CurrentControlPoints + 1>, + PaPatchListSingle<TotalControlPoints>); + + return false; +} + +template<uint32_t TotalControlPoints> +static bool PaPatchListTerm(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output + // KNOB_SIMD_WIDTH * 1 patch. This function is called once per attribute. + // Each attribute has 4 components. + + /// @todo Optimize this + + // Loop over all components of the attribute + for (uint32_t i = 0; i < 4; ++i) + { + for (uint32_t cp = 0; cp < TotalControlPoints; ++cp) + { + float vec[KNOB_SIMD_WIDTH]; + for (uint32_t lane = 0; lane < KNOB_SIMD_WIDTH; ++lane) + { + uint32_t input_cp = lane * TotalControlPoints + cp; + uint32_t input_vec = input_cp / KNOB_SIMD_WIDTH; + uint32_t input_lane = input_cp % KNOB_SIMD_WIDTH; + + const float* pInputVec = (const float*)(&PaGetSimdVector(pa, input_vec, slot)[i]); + vec[lane] = pInputVec[input_lane]; + } + verts[cp][i] = _simd_loadu_ps(vec); + } + } + + SetNextPaState( + pa, + PaPatchList<TotalControlPoints>, + PaPatchListSingle<TotalControlPoints>, + 0, + KNOB_SIMD_WIDTH, + true); + + return true; +} + +#define PA_PATCH_LIST_TERMINATOR(N) \ + template<> bool PaPatchList<N, N>(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])\ + { return PaPatchListTerm<N>(pa, slot, verts); } +PA_PATCH_LIST_TERMINATOR(1) +PA_PATCH_LIST_TERMINATOR(2) +PA_PATCH_LIST_TERMINATOR(3) +PA_PATCH_LIST_TERMINATOR(4) +PA_PATCH_LIST_TERMINATOR(5) +PA_PATCH_LIST_TERMINATOR(6) +PA_PATCH_LIST_TERMINATOR(7) +PA_PATCH_LIST_TERMINATOR(8) +PA_PATCH_LIST_TERMINATOR(9) +PA_PATCH_LIST_TERMINATOR(10) +PA_PATCH_LIST_TERMINATOR(11) +PA_PATCH_LIST_TERMINATOR(12) +PA_PATCH_LIST_TERMINATOR(13) +PA_PATCH_LIST_TERMINATOR(14) +PA_PATCH_LIST_TERMINATOR(15) +PA_PATCH_LIST_TERMINATOR(16) +PA_PATCH_LIST_TERMINATOR(17) +PA_PATCH_LIST_TERMINATOR(18) +PA_PATCH_LIST_TERMINATOR(19) +PA_PATCH_LIST_TERMINATOR(20) +PA_PATCH_LIST_TERMINATOR(21) +PA_PATCH_LIST_TERMINATOR(22) +PA_PATCH_LIST_TERMINATOR(23) +PA_PATCH_LIST_TERMINATOR(24) +PA_PATCH_LIST_TERMINATOR(25) +PA_PATCH_LIST_TERMINATOR(26) +PA_PATCH_LIST_TERMINATOR(27) +PA_PATCH_LIST_TERMINATOR(28) +PA_PATCH_LIST_TERMINATOR(29) +PA_PATCH_LIST_TERMINATOR(30) +PA_PATCH_LIST_TERMINATOR(31) +PA_PATCH_LIST_TERMINATOR(32) +#undef PA_PATCH_LIST_TERMINATOR + +bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, 
simdvector verts[]) +{ + SetNextPaState(pa, PaTriList1, PaTriListSingle0); + return false; // Not enough vertices to assemble 4 or 8 triangles. +} + +bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaTriList2, PaTriListSingle0); + return false; // Not enough vertices to assemble 8 triangles. +} + +bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + simdvector& c = PaGetSimdVector(pa, 2, slot); + simdscalar s; + + // Tri Pattern - provoking vertex is always v0 + // v0 -> 0 3 6 9 12 15 18 21 + // v1 -> 1 4 7 10 13 16 19 22 + // v2 -> 2 5 8 11 14 17 20 23 + + for(int i = 0; i < 4; ++i) + { + simdvector& v0 = verts[0]; + v0[i] = _simd_blend_ps(a[i], b[i], 0x92); + v0[i] = _simd_blend_ps(v0[i], c[i], 0x24); + v0[i] = _mm256_permute_ps(v0[i], 0x6C); + s = _mm256_permute2f128_ps(v0[i], v0[i], 0x21); + v0[i] = _simd_blend_ps(v0[i], s, 0x44); + + simdvector& v1 = verts[1]; + v1[i] = _simd_blend_ps(a[i], b[i], 0x24); + v1[i] = _simd_blend_ps(v1[i], c[i], 0x49); + v1[i] = _mm256_permute_ps(v1[i], 0xB1); + s = _mm256_permute2f128_ps(v1[i], v1[i], 0x21); + v1[i] = _simd_blend_ps(v1[i], s, 0x66); + + simdvector& v2 = verts[2]; + v2[i] = _simd_blend_ps(a[i], b[i], 0x49); + v2[i] = _simd_blend_ps(v2[i], c[i], 0x92); + v2[i] = _mm256_permute_ps(v2[i], 0xC6); + s = _mm256_permute2f128_ps(v2[i], v2[i], 0x21); + v2[i] = _simd_blend_ps(v2[i], s, 0x22); + } + + SetNextPaState(pa, PaTriList0, PaTriListSingle0, 0, KNOB_SIMD_WIDTH, true); + return true; +} + +void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + // We have 12 simdscalars contained within 3 simdvectors which + // hold at least 8 triangles worth of data. We want to assemble a single + // triangle with data in horizontal form. + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + simdvector& c = PaGetSimdVector(pa, 2, slot); + + // Convert from vertical to horizontal. + // Tri Pattern - provoking vertex is always v0 + // v0 -> 0 3 6 9 12 15 18 21 + // v1 -> 1 4 7 10 13 16 19 22 + // v2 -> 2 5 8 11 14 17 20 23 + switch(primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + verts[2] = swizzleLane2(a); + break; + case 1: + verts[0] = swizzleLane3(a); + verts[1] = swizzleLane4(a); + verts[2] = swizzleLane5(a); + break; + case 2: + verts[0] = swizzleLane6(a); + verts[1] = swizzleLane7(a); + verts[2] = swizzleLane0(b); + break; + case 3: + verts[0] = swizzleLane1(b); + verts[1] = swizzleLane2(b); + verts[2] = swizzleLane3(b); + break; + case 4: + verts[0] = swizzleLane4(b); + verts[1] = swizzleLane5(b); + verts[2] = swizzleLane6(b); + break; + case 5: + verts[0] = swizzleLane7(b); + verts[1] = swizzleLane0(c); + verts[2] = swizzleLane1(c); + break; + case 6: + verts[0] = swizzleLane2(c); + verts[1] = swizzleLane3(c); + verts[2] = swizzleLane4(c); + break; + case 7: + verts[0] = swizzleLane5(c); + verts[1] = swizzleLane6(c); + verts[2] = swizzleLane7(c); + break; + }; +} + +bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0); + return false; // Not enough vertices to assemble 8 triangles. 
+} + +bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + simdscalar s; + + for(int i = 0; i < 4; ++i) + { + simdscalar a0 = a[i]; + simdscalar b0 = b[i]; + + // Tri Pattern - provoking vertex is always v0 + // v0 -> 01234567 + // v1 -> 13355779 + // v2 -> 22446688 + simdvector& v0 = verts[0]; + v0[i] = a0; + + // s -> 4567891011 + s = _mm256_permute2f128_ps(a0, b0, 0x21); + // s -> 23456789 + s = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2)); + + simdvector& v1 = verts[1]; + // v1 -> 13355779 + v1[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(3, 1, 3, 1)); + + simdvector& v2 = verts[2]; + // v2 -> 22446688 + v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(2, 2, 2, 2)); + } + + SetNextPaState(pa, PaTriStrip1, PaTriStripSingle0, 0, KNOB_SIMD_WIDTH); + return true; +} + +void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + + // Convert from vertical to horizontal. + // Tri Pattern - provoking vertex is always v0 + // v0 -> 01234567 + // v1 -> 13355779 + // v2 -> 22446688 + switch(primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + verts[2] = swizzleLane2(a); + break; + case 1: + verts[0] = swizzleLane1(a); + verts[1] = swizzleLane3(a); + verts[2] = swizzleLane2(a); + break; + case 2: + verts[0] = swizzleLane2(a); + verts[1] = swizzleLane3(a); + verts[2] = swizzleLane4(a); + break; + case 3: + verts[0] = swizzleLane3(a); + verts[1] = swizzleLane5(a); + verts[2] = swizzleLane4(a); + break; + case 4: + verts[0] = swizzleLane4(a); + verts[1] = swizzleLane5(a); + verts[2] = swizzleLane6(a); + break; + case 5: + verts[0] = swizzleLane5(a); + verts[1] = swizzleLane7(a); + verts[2] = swizzleLane6(a); + break; + case 6: + verts[0] = swizzleLane6(a); + verts[1] = swizzleLane7(a); + verts[2] = swizzleLane0(b); + break; + case 7: + verts[0] = swizzleLane7(a); + verts[1] = swizzleLane1(b); + verts[2] = swizzleLane0(b); + break; + }; +} + +bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.cur, slot); + + // Extract vertex 0 to every lane of first vector + for(int i = 0; i < 4; ++i) + { + __m256 a0 = a[i]; + simdvector& v0 = verts[0]; + v0[i] = _simd_shuffle_ps(a0, a0, _MM_SHUFFLE(0, 0, 0, 0)); + v0[i] = _mm256_permute2f128_ps(v0[i], a0, 0x00); + } + + // store off leading vertex for attributes + simdvertex* pVertex = (simdvertex*)pa.pStreamBase; + pa.leadingVertex = pVertex[pa.cur]; + + SetNextPaState(pa, PaTriFan1, PaTriFanSingle0); + return false; // Not enough vertices to assemble 8 triangles. +} + +bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& leadVert = pa.leadingVertex.attrib[slot]; + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + simdscalar s; + + // need to fill vectors 1/2 with new verts, and v0 with anchor vert. 
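+ // Editor's illustration (8-wide SIMD assumed): with anchor vertex c the fan emits
+ // tri k = { c, k+1, k+2 }, so v0 broadcasts c to every lane while v1 and v2 walk
+ // verts k+1 and k+2 of the incoming stream.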
+ for(int i = 0; i < 4; ++i)
+ {
+ simdscalar a0 = a[i];
+ simdscalar b0 = b[i];
+
+ __m256 comp = leadVert[i];
+ simdvector& v0 = verts[0];
+ v0[i] = _simd_shuffle_ps(comp, comp, _MM_SHUFFLE(0, 0, 0, 0));
+ v0[i] = _mm256_permute2f128_ps(v0[i], comp, 0x00);
+
+ simdvector& v2 = verts[2];
+ s = _mm256_permute2f128_ps(a0, b0, 0x21);
+ v2[i] = _simd_shuffle_ps(a0, s, _MM_SHUFFLE(1, 0, 3, 2));
+
+ simdvector& v1 = verts[1];
+ v1[i] = _simd_shuffle_ps(a0, v2[i], _MM_SHUFFLE(2, 1, 2, 1));
+ }
+
+ SetNextPaState(pa, PaTriFan1, PaTriFanSingle0, 0, KNOB_SIMD_WIDTH);
+ return true;
+}
+
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ // vert 0 from leading vertex
+ simdvector& lead = pa.leadingVertex.attrib[slot];
+ verts[0] = swizzleLane0(lead);
+
+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
+
+ // vert 1
+ if (primIndex < 7)
+ {
+ verts[1] = swizzleLaneN(a, primIndex + 1);
+ }
+ else
+ {
+ verts[1] = swizzleLane0(b);
+ }
+
+ // vert 2
+ if (primIndex < 6)
+ {
+ verts[2] = swizzleLaneN(a, primIndex + 2);
+ }
+ else
+ {
+ verts[2] = swizzleLaneN(b, primIndex - 6);
+ }
+}
+
+bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaQuadList1, PaQuadListSingle0);
+ return false; // Not enough vertices to assemble 8 triangles.
+}
+
+bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, 0, slot);
+ simdvector& b = PaGetSimdVector(pa, 1, slot);
+ simdscalar s1, s2;
+
+ for(int i = 0; i < 4; ++i)
+ {
+ simdscalar a0 = a[i];
+ simdscalar b0 = b[i];
+
+ s1 = _mm256_permute2f128_ps(a0, b0, 0x20);
+ s2 = _mm256_permute2f128_ps(a0, b0, 0x31);
+
+ simdvector& v0 = verts[0];
+ v0[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(0, 0, 0, 0));
+
+ simdvector& v1 = verts[1];
+ v1[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(2, 1, 2, 1));
+
+ simdvector& v2 = verts[2];
+ v2[i] = _simd_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 3, 2));
+ }
+
+ SetNextPaState(pa, PaQuadList0, PaQuadListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, 0, slot);
+ simdvector& b = PaGetSimdVector(pa, 1, slot);
+
+ switch (primIndex)
+ {
+ case 0:
+ // triangle 0 - 0 1 2
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane1(a);
+ verts[2] = swizzleLane2(a);
+ break;
+
+ case 1:
+ // triangle 1 - 0 2 3
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane2(a);
+ verts[2] = swizzleLane3(a);
+ break;
+
+ case 2:
+ // triangle 2 - 4 5 6
+ verts[0] = swizzleLane4(a);
+ verts[1] = swizzleLane5(a);
+ verts[2] = swizzleLane6(a);
+ break;
+
+ case 3:
+ // triangle 3 - 4 6 7
+ verts[0] = swizzleLane4(a);
+ verts[1] = swizzleLane6(a);
+ verts[2] = swizzleLane7(a);
+ break;
+
+ case 4:
+ // triangle 4 - 8 9 10 (0 1 2)
+ verts[0] = swizzleLane0(b);
+ verts[1] = swizzleLane1(b);
+ verts[2] = swizzleLane2(b);
+ break;
+
+ case 5:
+ // triangle 5 - 8 10 11 (0 2 3)
+ verts[0] = swizzleLane0(b);
+ verts[1] = swizzleLane2(b);
+ verts[2] = swizzleLane3(b);
+ break;
+
+ case 6:
+ // triangle 6 - 12 13 14 (4 5 6)
+ verts[0] = swizzleLane4(b);
+ verts[1] = swizzleLane5(b);
+ verts[2] = swizzleLane6(b);
+ break;
+
+ case 7:
+ // triangle 7 - 12 14 15 (4 6 7)
+ verts[0] = swizzleLane4(b);
+ verts[1] = swizzleLane6(b);
+ verts[2] = swizzleLane7(b);
+ break;
+ }
+}
+
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
+{
+
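// A line loop is just a line strip whose final segment reconnects to vertex 0
+ // (editor's note): reuse the strip path here, then patch verts[1] below when
+ // this is the last line of the draw.
+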
PaLineStripSingle0(pa, slot, lineIndex, verts); + + if (pa.numPrimsComplete + lineIndex == pa.numPrims - 1) { + simdvector &start = PaGetSimdVector(pa, pa.first, slot); + verts[1] = swizzleLane0(start); + } +} + +bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0); + return false; +} + +bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + PaLineStrip1(pa, slot, verts); + + if (pa.numPrimsComplete + KNOB_SIMD_WIDTH > pa.numPrims - 1) { + // loop reconnect now + int lane = pa.numPrims - pa.numPrimsComplete - 1; + simdvector &start = PaGetSimdVector(pa, pa.first, slot); + for (int i = 0; i < 4; i++) { + float *startVtx = (float *)&(start[i]); + float *targetVtx = (float *)&(verts[1][i]); + targetVtx[lane] = startVtx[0]; + } + } + + SetNextPaState(pa, PaLineLoop1, PaLineLoopSingle0, 0, KNOB_SIMD_WIDTH); + return true; +} + + +bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaLineList1, PaLineListSingle0); + return false; // Not enough vertices to assemble 8 lines +} + +bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, 0, slot); + simdvector& b = PaGetSimdVector(pa, 1, slot); + /// @todo: verify provoking vertex is correct + // Line list 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + + // shuffle: + // 0 2 4 6 8 10 12 14 + // 1 3 5 7 9 11 13 15 + + for (uint32_t i = 0; i < 4; ++i) + { + // 0 1 2 3 8 9 10 11 + __m256 vALowBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x20); + // 4 5 6 7 12 13 14 15 + __m256 vAHighBHigh = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x31); + + // 0 2 4 6 8 10 12 14 + verts[0].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(2, 0, 2, 0)); + // 1 3 5 7 9 11 13 15 + verts[1].v[i] = _mm256_shuffle_ps(vALowBLow, vAHighBHigh, _MM_SHUFFLE(3, 1, 3, 1)); + } + + SetNextPaState(pa, PaLineList0, PaLineListSingle0, 0, KNOB_SIMD_WIDTH, true); + return true; +} + +void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) +{ + simdvector &a = PaGetSimdVector(pa, pa.prev, slot); + simdvector &b = PaGetSimdVector(pa, pa.cur, slot); + + switch (primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + break; + case 1: + verts[0] = swizzleLane2(a); + verts[1] = swizzleLane3(a); + break; + case 2: + verts[0] = swizzleLane4(a); + verts[1] = swizzleLane5(a); + break; + case 3: + verts[0] = swizzleLane6(a); + verts[1] = swizzleLane7(a); + break; + case 4: + verts[0] = swizzleLane0(b); + verts[1] = swizzleLane1(b); + break; + case 5: + verts[0] = swizzleLane2(b); + verts[1] = swizzleLane3(b); + break; + case 6: + verts[0] = swizzleLane4(b); + verts[1] = swizzleLane5(b); + break; + case 7: + verts[0] = swizzleLane6(b); + verts[1] = swizzleLane7(b); + break; + } +} + +bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0); + return false; // Not enough vertices to assemble 8 lines +} + +bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) +{ + simdvector& a = PaGetSimdVector(pa, pa.prev, slot); + simdvector& b = PaGetSimdVector(pa, pa.cur, slot); + + /// @todo: verify provoking vertex is correct + // Line list 0 1 2 3 4 5 6 7 + // 8 9 10 11 12 13 14 15 + + // shuffle: + // 0 1 2 3 4 5 6 7 + // 1 2 3 4 5 6 7 8 + + verts[0] = a; + + for(uint32_t i = 0; i < 4; ++i) + { + // 1 2 3 x 5 6 7 x + __m256 vPermA = _mm256_permute_ps(a.v[i], 
0x39); // indices hi->low 00 11 10 01 (0 3 2 1)
+ // 4 5 6 7 8 9 10 11
+ __m256 vAHighBLow = _mm256_permute2f128_ps(a.v[i], b.v[i], 0x21);
+
+ // x x x 4 x x x 8
+ __m256 vPermB = _mm256_permute_ps(vAHighBLow, 0); // indices hi->low (0 0 0 0)
+
+ verts[1].v[i] = _mm256_blend_ps(vPermA, vPermB, 0x88);
+ }
+
+ SetNextPaState(pa, PaLineStrip1, PaLineStripSingle0, 0, KNOB_SIMD_WIDTH);
+ return true;
+}
+
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t lineIndex, __m128 verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, pa.prev, slot);
+ simdvector& b = PaGetSimdVector(pa, pa.cur, slot);
+
+ switch (lineIndex)
+ {
+ case 0:
+ verts[0] = swizzleLane0(a);
+ verts[1] = swizzleLane1(a);
+ break;
+ case 1:
+ verts[0] = swizzleLane1(a);
+ verts[1] = swizzleLane2(a);
+ break;
+ case 2:
+ verts[0] = swizzleLane2(a);
+ verts[1] = swizzleLane3(a);
+ break;
+ case 3:
+ verts[0] = swizzleLane3(a);
+ verts[1] = swizzleLane4(a);
+ break;
+ case 4:
+ verts[0] = swizzleLane4(a);
+ verts[1] = swizzleLane5(a);
+ break;
+ case 5:
+ verts[0] = swizzleLane5(a);
+ verts[1] = swizzleLane6(a);
+ break;
+ case 6:
+ verts[0] = swizzleLane6(a);
+ verts[1] = swizzleLane7(a);
+ break;
+ case 7:
+ verts[0] = swizzleLane7(a);
+ verts[1] = swizzleLane0(b);
+ break;
+ }
+}
+
+bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ simdvector& a = PaGetSimdVector(pa, pa.cur, slot);
+
+ verts[0] = a; // points only have 1 vertex.
+
+ SetNextPaState(pa, PaPoints0, PaPointsSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+{
+ simdvector &a = PaGetSimdVector(pa, pa.cur, slot);
+ switch(primIndex)
+ {
+ case 0:
+ verts[0] = swizzleLane0(a);
+ break;
+ case 1:
+ verts[0] = swizzleLane1(a);
+ break;
+ case 2:
+ verts[0] = swizzleLane2(a);
+ break;
+ case 3:
+ verts[0] = swizzleLane3(a);
+ break;
+ case 4:
+ verts[0] = swizzleLane4(a);
+ break;
+ case 5:
+ verts[0] = swizzleLane5(a);
+ break;
+ case 6:
+ verts[0] = swizzleLane6(a);
+ break;
+ case 7:
+ verts[0] = swizzleLane7(a);
+ break;
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 0 for RECT_LIST topology.
+/// There are not enough vertices to assemble 8 triangles.
+bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[])
+{
+ SetNextPaState(pa, PaRectList1, PaRectListSingle0);
+ return false;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 1 for RECT_LIST topology.
+/// Rect lists have the following format.
+/// w x y z
+/// v2 o---o v5 o---o v8 o---o v11 o---o
+/// | \ | | \ | | \ | | \ |
+/// v1 o---o v4 o---o v7 o---o v10 o---o
+/// v0 v3 v6 v9
+///
+/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied.
+///
+/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2
+/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5
+/// etc.
+///
+/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2
+/// where v0 contains all the first vertices for 8 triangles.
+///
+/// Result:
+/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 }
+/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 }
+/// verts[2] = { v2, w, v5, x, v8, y, v11, z }
+///
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
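+/// Editor's note: the implied corner completes a parallelogram; opposite corners
+/// share a midpoint, so w + v1 == v0 + v2 and hence w = v0 - v1 + v2, which is
+/// exactly what the (v0 - v1) + v2 computation in the loop below evaluates.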
+bool PaRectList1(
+ PA_STATE_OPT& pa,
+ uint32_t slot,
+ simdvector verts[])
+{
+ // SIMD vectors a and b are the last two vertical outputs from the vertex shader.
+ simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 }
+ simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 }
+
+ __m256 tmp0, tmp1, tmp2;
+
+ // Loop over each component in the simdvector.
+ for(int i = 0; i < 4; ++i)
+ {
+ simdvector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 }
+ tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 }
+ v0[i] = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care.
+ tmp1 = _mm256_permute_ps(v0[i], 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * }
+ v0[i] = _mm256_permute_ps(v0[i], 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 }
+ v0[i] = _mm256_blend_ps(tmp1, v0[i], 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 }
+
+ /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'.
+ /// AVX2 should make this much cheaper.
+ simdvector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 }
+ v1[i] = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * }
+ tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 }
+ tmp2 = _mm256_blend_ps(v1[i], tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 }
+ tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * }
+ v1[i] = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 }
+ v1[i] = _mm256_blend_ps(tmp2, v1[i], 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 }
+ v1[i] = _mm256_blend_ps(v1[i], tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 }
+
+ // verts[2] = { v2, w, v5, x, v8, y, v11, z }
+ simdvector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z }
+ v2[i] = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * }
+ tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * }
+ v2[i] = _mm256_blend_ps(tmp1, v2[i], 0xF0);
+
+ // Need to compute 4th implied vertex for the rectangle.
+ tmp2 = _mm256_sub_ps(v0[i], v1[i]);
+ tmp2 = _mm256_add_ps(tmp2, v2[i]); // tmp2 = { w, *, x, *, y, *, z, * }
+ tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z }
+ v2[i] = _mm256_blend_ps(v2[i], tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z }
+ }
+
+ SetNextPaState(pa, PaRectList1, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief State 2 for RECT_LIST topology.
+/// Not implemented unless there is a use case for more than 8 rects.
+/// @param pa - State for PA state machine.
+/// @param slot - Index into VS output which is either a position (slot 0) or attribute.
+/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc.
+bool PaRectList2(
+ PA_STATE_OPT& pa,
+ uint32_t slot,
+ simdvector verts[])
+{
+ SWR_ASSERT(0); // Is rect list used for anything other than clears?
+ SetNextPaState(pa, PaRectList0, PaRectListSingle0, 0, KNOB_SIMD_WIDTH, true);
+ return true;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief This procedure is called by the Binner to assemble the attributes.
+/// Unlike position, which is stored vertically, the attributes are
+/// stored horizontally.
+////////////////////////////////////////////////////////////////////////// +/// @brief This procedure is called by the Binner to assemble the attributes. +/// Unlike position, which is stored vertically, the attributes are +/// stored horizontally. The outputs from the VS, labeled as 'a' and +/// 'b', are vertical. This function needs to transpose the lanes +/// containing the vertical attribute data into horizontal form. +/// @param pa - State for PA state machine. +/// @param slot - Index into VS output for a given attribute. +/// @param primIndex - Binner processes each triangle individually. +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. +void PaRectListSingle0( + PA_STATE_OPT& pa, + uint32_t slot, + uint32_t primIndex, + __m128 verts[]) +{ + // We have 12 simdscalars contained within 3 simdvectors which + // hold at least 8 triangles' worth of data. We want to assemble a single + // triangle with data in horizontal form. + simdvector& a = PaGetSimdVector(pa, 0, slot); + + // Convert from vertical to horizontal. + switch(primIndex) + { + case 0: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane1(a); + verts[2] = swizzleLane2(a); + break; + case 1: + verts[0] = swizzleLane0(a); + verts[1] = swizzleLane2(a); + verts[2] = _mm_blend_ps(verts[0], verts[1], 0x2); + break; + case 2: + case 3: + case 4: + case 5: + case 6: + case 7: + SWR_ASSERT(0); + break; + }; +} + +PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* pStream, uint32_t in_streamSizeInVerts, + bool in_isStreaming, PRIMITIVE_TOPOLOGY topo) : PA_STATE(in_pDC, pStream, in_streamSizeInVerts), numPrims(in_numPrims), numPrimsComplete(0), numSimdPrims(0), + cur(0), prev(0), first(0), counter(0), reset(false), pfnPaFunc(nullptr), isStreaming(in_isStreaming) +{ + const API_STATE& state = GetApiState(pDC); + + this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; + + switch (this->binTopology) + { + case TOP_TRIANGLE_LIST: + this->pfnPaFunc = PaTriList0; + break; + case TOP_TRIANGLE_STRIP: + this->pfnPaFunc = PaTriStrip0; + break; + case TOP_TRIANGLE_FAN: + this->pfnPaFunc = PaTriFan0; + break; + case TOP_QUAD_LIST: + this->pfnPaFunc = PaQuadList0; + this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles + break; + case TOP_QUAD_STRIP: + // quad strip pattern when decomposed into triangles is the same as triangle strips + this->pfnPaFunc = PaTriStrip0; + this->numPrims = in_numPrims * 2; // Convert quad primitives into triangles + break; + case TOP_LINE_LIST: + this->pfnPaFunc = PaLineList0; + this->numPrims = in_numPrims; + break; + case TOP_LINE_STRIP: + this->pfnPaFunc = PaLineStrip0; + this->numPrims = in_numPrims; + break; + case TOP_LINE_LOOP: + this->pfnPaFunc = PaLineLoop0; + this->numPrims = in_numPrims; + break; + case TOP_POINT_LIST: + // use point binner and rasterizer if supported + this->pfnPaFunc = PaPoints0; + this->numPrims = in_numPrims; + break; + case TOP_RECT_LIST: + this->pfnPaFunc = PaRectList0; + this->numPrims = in_numPrims * 2; + break; + + case TOP_PATCHLIST_1: + this->pfnPaFunc = PaPatchList<1>; + break; + case TOP_PATCHLIST_2: + this->pfnPaFunc = PaPatchList<2>; + break; + case TOP_PATCHLIST_3: + this->pfnPaFunc = PaPatchList<3>; + break; + case TOP_PATCHLIST_4: + this->pfnPaFunc = PaPatchList<4>; + break; + case TOP_PATCHLIST_5: + this->pfnPaFunc = PaPatchList<5>; + break; + case TOP_PATCHLIST_6: + this->pfnPaFunc = PaPatchList<6>; + break; + case TOP_PATCHLIST_7: + this->pfnPaFunc = PaPatchList<7>; + break; + case TOP_PATCHLIST_8: + this->pfnPaFunc = PaPatchList<8>; + break; + case TOP_PATCHLIST_9: + this->pfnPaFunc = PaPatchList<9>; + break; + case TOP_PATCHLIST_10: + this->pfnPaFunc = PaPatchList<10>; + break; + case
TOP_PATCHLIST_11: + this->pfnPaFunc = PaPatchList<11>; + break; + case TOP_PATCHLIST_12: + this->pfnPaFunc = PaPatchList<12>; + break; + case TOP_PATCHLIST_13: + this->pfnPaFunc = PaPatchList<13>; + break; + case TOP_PATCHLIST_14: + this->pfnPaFunc = PaPatchList<14>; + break; + case TOP_PATCHLIST_15: + this->pfnPaFunc = PaPatchList<15>; + break; + case TOP_PATCHLIST_16: + this->pfnPaFunc = PaPatchList<16>; + break; + case TOP_PATCHLIST_17: + this->pfnPaFunc = PaPatchList<17>; + break; + case TOP_PATCHLIST_18: + this->pfnPaFunc = PaPatchList<18>; + break; + case TOP_PATCHLIST_19: + this->pfnPaFunc = PaPatchList<19>; + break; + case TOP_PATCHLIST_20: + this->pfnPaFunc = PaPatchList<20>; + break; + case TOP_PATCHLIST_21: + this->pfnPaFunc = PaPatchList<21>; + break; + case TOP_PATCHLIST_22: + this->pfnPaFunc = PaPatchList<22>; + break; + case TOP_PATCHLIST_23: + this->pfnPaFunc = PaPatchList<23>; + break; + case TOP_PATCHLIST_24: + this->pfnPaFunc = PaPatchList<24>; + break; + case TOP_PATCHLIST_25: + this->pfnPaFunc = PaPatchList<25>; + break; + case TOP_PATCHLIST_26: + this->pfnPaFunc = PaPatchList<26>; + break; + case TOP_PATCHLIST_27: + this->pfnPaFunc = PaPatchList<27>; + break; + case TOP_PATCHLIST_28: + this->pfnPaFunc = PaPatchList<28>; + break; + case TOP_PATCHLIST_29: + this->pfnPaFunc = PaPatchList<29>; + break; + case TOP_PATCHLIST_30: + this->pfnPaFunc = PaPatchList<30>; + break; + case TOP_PATCHLIST_31: + this->pfnPaFunc = PaPatchList<31>; + break; + case TOP_PATCHLIST_32: + this->pfnPaFunc = PaPatchList<32>; + break; + + default: + SWR_ASSERT(0); + break; + }; + + this->pfnPaFuncReset = this->pfnPaFunc; + + // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); + // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3); + simdscalari id8 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + simdscalari id4 = _mm256_set_epi32(3, 3, 2, 2, 1, 1, 0, 0); + + switch(this->binTopology) + { + case TOP_TRIANGLE_LIST: + case TOP_TRIANGLE_STRIP: + case TOP_TRIANGLE_FAN: + case TOP_LINE_STRIP: + case TOP_LINE_LIST: + case TOP_LINE_LOOP: + this->primIDIncr = 8; + this->primID = id8; + break; + case TOP_QUAD_LIST: + case TOP_QUAD_STRIP: + case TOP_RECT_LIST: + this->primIDIncr = 4; + this->primID = id4; + break; + case TOP_POINT_LIST: + this->primIDIncr = 8; + this->primID = id8; + break; + case TOP_PATCHLIST_1: + case TOP_PATCHLIST_2: + case TOP_PATCHLIST_3: + case TOP_PATCHLIST_4: + case TOP_PATCHLIST_5: + case TOP_PATCHLIST_6: + case TOP_PATCHLIST_7: + case TOP_PATCHLIST_8: + case TOP_PATCHLIST_9: + case TOP_PATCHLIST_10: + case TOP_PATCHLIST_11: + case TOP_PATCHLIST_12: + case TOP_PATCHLIST_13: + case TOP_PATCHLIST_14: + case TOP_PATCHLIST_15: + case TOP_PATCHLIST_16: + case TOP_PATCHLIST_17: + case TOP_PATCHLIST_18: + case TOP_PATCHLIST_19: + case TOP_PATCHLIST_20: + case TOP_PATCHLIST_21: + case TOP_PATCHLIST_22: + case TOP_PATCHLIST_23: + case TOP_PATCHLIST_24: + case TOP_PATCHLIST_25: + case TOP_PATCHLIST_26: + case TOP_PATCHLIST_27: + case TOP_PATCHLIST_28: + case TOP_PATCHLIST_29: + case TOP_PATCHLIST_30: + case TOP_PATCHLIST_31: + case TOP_PATCHLIST_32: + // Always run KNOB_SIMD_WIDTH number of patches at a time. 
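// primID seeds the per-lane primitive IDs for one SIMD batch and primIDIncr
// advances them between batches: with one primitive per lane the seed is
// {0..7} (id8) and the increment is 8, while topologies that emit two
// triangles per input primitive (quads, rects) use {0,0,1,1,2,2,3,3} (id4)
// with an increment of 4 so both halves share one primitive ID.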
+ this->primIDIncr = 8; + this->primID = id8; + break; + + default: + SWR_ASSERT(0); + break; + }; + +} +#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp new file mode 100644 index 00000000000..587e336d87d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -0,0 +1,1393 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rasterizer.cpp +* +* @brief Implementation for the rasterizer. +* +******************************************************************************/ + +#include <vector> +#include <algorithm> + +#include "rasterizer.h" +#include "multisample.h" +#include "rdtsc_core.h" +#include "backend.h" +#include "utils.h" +#include "frontend.h" +#include "tilemgr.h" +#include "memory/tilingtraits.h" + +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t x, uint32_t y, RenderOutputBuffers &renderBuffers, + uint32_t numSamples, uint32_t renderTargetArrayIndex); +void StepRasterTileX(uint32_t MaxRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep); +void StepRasterTileY(uint32_t MaxRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, + uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep); + +#define MASKTOVEC(i3,i2,i1,i0) {-i0,-i1,-i2,-i3} +const __m128 gMaskToVec[] = { + MASKTOVEC(0,0,0,0), + MASKTOVEC(0,0,0,1), + MASKTOVEC(0,0,1,0), + MASKTOVEC(0,0,1,1), + MASKTOVEC(0,1,0,0), + MASKTOVEC(0,1,0,1), + MASKTOVEC(0,1,1,0), + MASKTOVEC(0,1,1,1), + MASKTOVEC(1,0,0,0), + MASKTOVEC(1,0,0,1), + MASKTOVEC(1,0,1,0), + MASKTOVEC(1,0,1,1), + MASKTOVEC(1,1,0,0), + MASKTOVEC(1,1,0,1), + MASKTOVEC(1,1,1,0), + MASKTOVEC(1,1,1,1), +}; + +const __m256d gMaskToVecpd[] = +{ + MASKTOVEC(0, 0, 0, 0), + MASKTOVEC(0, 0, 0, 1), + MASKTOVEC(0, 0, 1, 0), + MASKTOVEC(0, 0, 1, 1), + MASKTOVEC(0, 1, 0, 0), + MASKTOVEC(0, 1, 0, 1), + MASKTOVEC(0, 1, 1, 0), + MASKTOVEC(0, 1, 1, 1), + MASKTOVEC(1, 0, 0, 0), + MASKTOVEC(1, 0, 0, 1), + MASKTOVEC(1, 0, 1, 0), + MASKTOVEC(1, 0, 1, 1), + MASKTOVEC(1, 1, 0, 0), + MASKTOVEC(1, 1, 0, 1), + MASKTOVEC(1, 1, 1, 0), + MASKTOVEC(1, 1, 1, 1), +}; + +struct POS +{ + int32_t x, y; +}; + +struct EDGE +{ + double a, b; // a, b edge coefficients in fix8 + double stepQuadX; // step 
to adjacent horizontal quad in fix16 + double stepQuadY; // step to adjacent vertical quad in fix16 + double stepRasterTileX; // step to adjacent horizontal raster tile in fix16 + double stepRasterTileY; // step to adjacent vertical raster tile in fix16 + + __m256d vQuadOffsets; // offsets for 4 samples of a quad + __m256d vRasterTileOffsets; // offsets for the 4 corners of a raster tile +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief rasterize a raster tile partially covered by the triangle +/// @param pDC - draw context +/// @param startEdges - edge equations evaluated at the UL corner of the raster tile +/// @param pRastEdges - per-edge A & B coefs (Ax + By + C) and precomputed steps, +/// used to step between quads when sweeping over the raster tile. +template<uint32_t NumEdges> +INLINE uint64_t rasterizePartialTile(DRAW_CONTEXT *pDC, double startEdges[NumEdges], EDGE *pRastEdges) +{ + uint64_t coverageMask = 0; + + __m256d vEdges[NumEdges]; + __m256d vStepX[NumEdges]; + __m256d vStepY[NumEdges]; + + for (uint32_t e = 0; e < NumEdges; ++e) + { + // Step to the pixel sample locations of the 1st quad + vEdges[e] = _mm256_add_pd(_mm256_set1_pd(startEdges[e]), pRastEdges[e].vQuadOffsets); + + // compute step to next quad (mul by 2 in x and y direction) + vStepX[e] = _mm256_set1_pd(pRastEdges[e].stepQuadX); + vStepY[e] = _mm256_set1_pd(pRastEdges[e].stepQuadY); + } + + // fast unrolled version for 8x8 tile +#if KNOB_TILE_X_DIM == 8 && KNOB_TILE_Y_DIM == 8 + int edgeMask[NumEdges]; + uint64_t mask; + + auto eval_lambda = [&](int e){edgeMask[e] = _mm256_movemask_pd(vEdges[e]);}; + auto update_lambda = [&](int e){mask &= edgeMask[e];}; + auto incx_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]);}; + auto incy_lambda = [&](int e){vEdges[e] = _mm256_add_pd(vEdges[e], vStepY[e]);}; + auto decx_lambda = [&](int e){vEdges[e] = _mm256_sub_pd(vEdges[e], vStepX[e]);}; + +// evaluate which pixels in the quad are covered +#define EVAL \ + UnrollerL<0, NumEdges, 1>::step(eval_lambda); + + // update coverage mask +#define UPDATE_MASK(bit) \ + mask = edgeMask[0]; \ + UnrollerL<1, NumEdges, 1>::step(update_lambda); \ + coverageMask |= (mask << bit); + + // step in the +x direction to the next quad +#define INCX \ + UnrollerL<0, NumEdges, 1>::step(incx_lambda); + + // step in the +y direction to the next quad +#define INCY \ + UnrollerL<0, NumEdges, 1>::step(incy_lambda); + + // step in the -x direction to the next quad +#define DECX \ + UnrollerL<0, NumEdges, 1>::step(decx_lambda); + + // sweep 2x2 quad back and forth through the raster tile, + // computing coverage masks for the entire tile + + // raster tile + // 0 1 2 3 4 5 6 7 + // x x + // x x ------------------> + // x x | + // <-----------------x x V + // ..
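// Each UPDATE_MASK(bit) below deposits a 4-bit quad coverage mask at 'bit'
// within the 64-bit tile mask (16 quads x 4 bits for an 8x8 tile). The
// serpentine sweep (left to right across even quad rows, right to left across
// odd ones) lets every step reuse the previous quad's edge values with a
// single add or subtract, which is why row 1 updates bits 28, 24, 20, 16 in
// descending order.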
+ + // row 0 + EVAL; + UPDATE_MASK(0); + INCX; + EVAL; + UPDATE_MASK(4); + INCX; + EVAL; + UPDATE_MASK(8); + INCX; + EVAL; + UPDATE_MASK(12); + INCY; + + // row 1 + EVAL; + UPDATE_MASK(28); + DECX; + EVAL; + UPDATE_MASK(24); + DECX; + EVAL; + UPDATE_MASK(20); + DECX; + EVAL; + UPDATE_MASK(16); + INCY; + + // row 2 + EVAL; + UPDATE_MASK(32); + INCX; + EVAL; + UPDATE_MASK(36); + INCX; + EVAL; + UPDATE_MASK(40); + INCX; + EVAL; + UPDATE_MASK(44); + INCY; + + // row 3 + EVAL; + UPDATE_MASK(60); + DECX; + EVAL; + UPDATE_MASK(56); + DECX; + EVAL; + UPDATE_MASK(52); + DECX; + EVAL; + UPDATE_MASK(48); +#else + uint32_t bit = 0; + for (uint32_t y = 0; y < KNOB_TILE_Y_DIM/2; ++y) + { + __m256d vStartOfRowEdge[NumEdges]; + for (uint32_t e = 0; e < NumEdges; ++e) + { + vStartOfRowEdge[e] = vEdges[e]; + } + + for (uint32_t x = 0; x < KNOB_TILE_X_DIM/2; ++x) + { + int edgeMask[NumEdges]; + for (uint32_t e = 0; e < NumEdges; ++e) + { + edgeMask[e] = _mm256_movemask_pd(vEdges[e]); + } + + uint64_t mask = edgeMask[0]; + for (uint32_t e = 1; e < NumEdges; ++e) + { + mask &= edgeMask[e]; + } + coverageMask |= (mask << bit); + + // step to the next pixel in the x + for (uint32_t e = 0; e < NumEdges; ++e) + { + vEdges[e] = _mm256_add_pd(vEdges[e], vStepX[e]); + } + bit+=4; + } + + // step to the next row + for (uint32_t e = 0; e < NumEdges; ++e) + { + vEdges[e] = _mm256_add_pd(vStartOfRowEdge[e], vStepY[e]); + } + } +#endif + return coverageMask; + +} +// Top left rule: +// Top: if an edge is horizontal, and it is above other edges in tri pixel space, it is a 'top' edge +// Left: if an edge is not horizontal, and it is on the left side of the triangle in pixel space, it is a 'left' edge +// Top left: a sample exactly on an edge is 'in' only if that edge is a top or left edge. +// Out (by De Morgan): !top = !(horizontal && above) = !horizontal || below +// Out (by De Morgan): !left = !(!horizontal && left) = horizontal || right +INLINE __m256d adjustTopLeftRuleIntFix16(const __m128i vA, const __m128i vB, const __m256d vEdge) +{ + // if vA < 0, vC-- + // if vA == 0 && vB < 0, vC-- + + __m256d vEdgeOut = vEdge; + __m256d vEdgeAdjust = _mm256_sub_pd(vEdge, _mm256_set1_pd(1.0)); + + // if vA < 0 (line is not horizontal and below) + int msk = _mm_movemask_ps(_mm_castsi128_ps(vA)); + + // if vA == 0 && vB < 0 (line is horizontal and we're on the left edge of a tri) + __m128i vCmp = _mm_cmpeq_epi32(vA, _mm_setzero_si128()); + int msk2 = _mm_movemask_ps(_mm_castsi128_ps(vCmp)); + msk2 &= _mm_movemask_ps(_mm_castsi128_ps(vB)); + + // if either of these are true and we're on the line (edge == 0), bump it outside the line + vEdgeOut = _mm256_blendv_pd(vEdgeOut, vEdgeAdjust, gMaskToVecpd[msk | msk2]); + return vEdgeOut; +} + +// max(abs(dz/dx), abs(dz/dy)) +INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) +{ + /* + // evaluate i,j at (0,0) + float i00 = pDesc->I[0] * 0.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; + float j00 = pDesc->J[0] * 0.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; + + // evaluate i,j at (1,0) + float i10 = pDesc->I[0] * 1.0f + pDesc->I[1] * 0.0f + pDesc->I[2]; + float j10 = pDesc->J[0] * 1.0f + pDesc->J[1] * 0.0f + pDesc->J[2]; + + // compute dz/dx + float d00 = pDesc->Z[0] * i00 + pDesc->Z[1] * j00 + pDesc->Z[2]; + float d10 = pDesc->Z[0] * i10 + pDesc->Z[1] * j10 + pDesc->Z[2]; + float dzdx = abs(d10 - d00); + + // evaluate i,j at (0,1) + float i01 = pDesc->I[0] * 0.0f + pDesc->I[1] * 1.0f + pDesc->I[2]; + float j01 = pDesc->J[0] * 0.0f + pDesc->J[1] * 1.0f + pDesc->J[2]; + + float d01 = pDesc->Z[0] * i01 + pDesc->Z[1] * j01 + pDesc->Z[2]; + float dzdy = abs(d01 - d00); + */ + + // optimized version of above + float dzdx = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[0] + pDesc->Z[1] * pDesc->J[0])); + float dzdy = fabsf(pDesc->recipDet * (pDesc->Z[0] * pDesc->I[1] + pDesc->Z[1] * pDesc->J[1])); + + return std::max(dzdx, dzdy); +}
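// ComputeBiasFactor below returns one unit in the last place (ULP) of the
// depth buffer format, the granularity depth bias is specified in: a constant
// 2^-24 or 2^-16 for the UNORM formats, and 2^(exponent(zMax) - 23) for
// 32-bit float depth, where the exponent is isolated by masking the IEEE-754
// exponent bits of the largest |z| of the triangle.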
+ +INLINE float ComputeBiasFactor(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pDesc, const float* z) +{ + if (pState->depthFormat == R24_UNORM_X8_TYPELESS) + { + return (1.0f / (1 << 24)); + } + else if (pState->depthFormat == R16_UNORM) + { + return (1.0f / (1 << 16)); + } + else + { + SWR_ASSERT(pState->depthFormat == R32_FLOAT); + + // for f32 depth, factor = 2^(exponent(max(abs(z))) - 23) + float zMax = std::max(fabsf(z[0]), std::max(fabsf(z[1]), fabsf(z[2]))); + uint32_t zMaxInt = *(uint32_t*)&zMax; + zMaxInt &= 0x7f800000; + zMax = *(float*)&zMaxInt; + + return zMax * (1.0f / (1 << 23)); + } +} + +INLINE float ComputeDepthBias(const SWR_RASTSTATE* pState, const SWR_TRIANGLE_DESC* pTri, const float* z) +{ + if (pState->depthBias == 0 && pState->slopeScaledDepthBias == 0) + { + return 0.0f; + } + + float scale = pState->slopeScaledDepthBias; + if (scale != 0.0f) + { + scale *= ComputeMaxDepthSlope(pTri); + } + + float bias = pState->depthBias * ComputeBiasFactor(pState, pTri, z) + scale; + if (pState->depthBiasClamp > 0.0f) + { + bias = std::min(bias, pState->depthBiasClamp); + } + else if (pState->depthBiasClamp < 0.0f) + { + bias = std::max(bias, pState->depthBiasClamp); + } + + return bias; +} + +// Prevent DCE by writing coverage mask from rasterizer to volatile +#if KNOB_ENABLE_TOSS_POINTS +__declspec(thread) volatile uint64_t gToss; +#endif + +static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; +// try to avoid _chkstk insertions; make this thread local +static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; + +INLINE +void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) +{ + edge.a = a; + edge.b = b; + + // compute constant steps to adjacent quads + edge.stepQuadX = (double)((int64_t)a * (int64_t)(2 * FIXED_POINT_SCALE)); + edge.stepQuadY = (double)((int64_t)b * (int64_t)(2 * FIXED_POINT_SCALE)); + + // compute constant steps to adjacent raster tiles + edge.stepRasterTileX = (double)((int64_t)a * (int64_t)(KNOB_TILE_X_DIM * FIXED_POINT_SCALE)); + edge.stepRasterTileY = (double)((int64_t)b * (int64_t)(KNOB_TILE_Y_DIM * FIXED_POINT_SCALE)); + + // compute quad offsets + const __m256d vQuadOffsetsXIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, 0, FIXED_POINT_SCALE, 0); + const __m256d vQuadOffsetsYIntFix8 = _mm256_set_pd(FIXED_POINT_SCALE, FIXED_POINT_SCALE, 0, 0); + + __m256d vQuadStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vQuadOffsetsXIntFix8); + __m256d vQuadStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vQuadOffsetsYIntFix8); + edge.vQuadOffsets = _mm256_add_pd(vQuadStepXFix16, vQuadStepYFix16); + + // compute raster tile offsets + const __m256d vTileOffsetsXIntFix8 = _mm256_set_pd((KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0, (KNOB_TILE_X_DIM - 1)*FIXED_POINT_SCALE, 0); + const __m256d vTileOffsetsYIntFix8 = _mm256_set_pd((KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, (KNOB_TILE_Y_DIM - 1)*FIXED_POINT_SCALE, 0, 0); + + __m256d vTileStepXFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.a), vTileOffsetsXIntFix8); + __m256d vTileStepYFix16 = _mm256_mul_pd(_mm256_set1_pd(edge.b), vTileOffsetsYIntFix8); + edge.vRasterTileOffsets = _mm256_add_pd(vTileStepXFix16, vTileStepYFix16); +} + +INLINE +void ComputeEdgeData(const POS& p0,
const POS& p1, EDGE& edge) +{ + ComputeEdgeData(p0.y - p1.y, p1.x - p0.x, edge); +} + +template<bool RasterizeScissorEdges, SWR_MULTISAMPLE_COUNT sampleCount> +void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) +{ + const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pDesc); +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + RDTSC_START(BERasterizeTriangle); + + RDTSC_START(BETriangleSetup); + const API_STATE &state = GetApiState(pDC); + const SWR_RASTSTATE &rastState = state.rastState; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; + + __m128 vX, vY, vZ, vRecipW; + + // pTriBuffer data layout: grouped components of the 3 triangle points and 1 don't care + // eg: vX = [x0 x1 x2 dc] + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); + + // convert to fixed point + __m128i vXi = fpToFixedPoint(vX); + __m128i vYi = fpToFixedPoint(vY); + + // quantize floating point position to fixed point precision + // to prevent attribute creep around the triangle vertices + vX = _mm_mul_ps(_mm_cvtepi32_ps(vXi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); + vY = _mm_mul_ps(_mm_cvtepi32_ps(vYi), _mm_set1_ps(1.0f / FIXED_POINT_SCALE)); + + // triangle setup - A and B edge equation coefs + __m128 vA, vB; + triangleSetupAB(vX, vY, vA, vB); + + __m128i vAi, vBi; + triangleSetupABInt(vXi, vYi, vAi, vBi); + + // determinant + float det = calcDeterminantInt(vAi, vBi); + + /// @todo: This test is flipped...we have a stray '-' sign somewhere + // Convert CW triangles to CCW + if (det > 0.0) + { + vA = _mm_mul_ps(vA, _mm_set1_ps(-1)); + vB = _mm_mul_ps(vB, _mm_set1_ps(-1)); + vAi = _mm_mullo_epi32(vAi, _mm_set1_epi32(-1)); + vBi = _mm_mullo_epi32(vBi, _mm_set1_epi32(-1)); + det = -det; + } + + __m128 vC; + // Finish triangle setup - C edge coef + triangleSetupC(vX, vY, vA, vB, vC); + + // compute barycentric i and j + // i = (A1x + B1y + C1)/det + // j = (A2x + B2y + C2)/det + __m128 vDet = _mm_set1_ps(det); + __m128 vRecipDet = _mm_div_ps(_mm_set1_ps(1.0f), vDet);//_mm_rcp_ps(vDet); + _mm_store_ss(&triDesc.recipDet, vRecipDet); + + // only extract coefs for 2 of the barycentrics; the 3rd can be + // determined from the barycentric equation: + // i + j + k = 1 <=> k = 1 - j - i + _MM_EXTRACT_FLOAT(triDesc.I[0], vA, 1); + _MM_EXTRACT_FLOAT(triDesc.I[1], vB, 1); + _MM_EXTRACT_FLOAT(triDesc.I[2], vC, 1); + _MM_EXTRACT_FLOAT(triDesc.J[0], vA, 2); + _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); + _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); + + OSALIGN(float, 16) oneOverW[4]; + _mm_store_ps(oneOverW, vRecipW); + triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; + triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; + triDesc.OneOverW[2] = oneOverW[2]; + + // calculate perspective correct coefs per vertex attrib + float* pPerspAttribs = perspAttribsTLS; + float* pAttribs = workDesc.pAttribs; + triDesc.pPerspAttribs = pPerspAttribs; + triDesc.pAttribs = pAttribs; + float *pRecipW = workDesc.pTriBuffer + 12; + triDesc.pRecipW = pRecipW; + __m128 vOneOverWV0 = _mm_broadcast_ss(pRecipW); + __m128 vOneOverWV1 = _mm_broadcast_ss(pRecipW+=1); + __m128 vOneOverWV2 = _mm_broadcast_ss(pRecipW+=1); + for(uint32_t i = 0; i < workDesc.numAttribs; i++) + { + __m128 attribA = _mm_load_ps(pAttribs); + __m128 attribB = 
_mm_load_ps(pAttribs+=4); + __m128 attribC = _mm_load_ps(pAttribs+=4); + pAttribs+=4; + + attribA = _mm_mul_ps(attribA, vOneOverWV0); + attribB = _mm_mul_ps(attribB, vOneOverWV1); + attribC = _mm_mul_ps(attribC, vOneOverWV2); + + _mm_store_ps(pPerspAttribs, attribA); + _mm_store_ps(pPerspAttribs+=4, attribB); + _mm_store_ps(pPerspAttribs+=4, attribC); + pPerspAttribs+=4; + } + + // compute bary Z + // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) + OSALIGN(float, 16) a[4]; + _mm_store_ps(a, vZ); + triDesc.Z[0] = a[0] - a[2]; + triDesc.Z[1] = a[1] - a[2]; + triDesc.Z[2] = a[2]; + + // add depth bias + triDesc.Z[2] += ComputeDepthBias(&rastState, &triDesc, workDesc.pTriBuffer + 8); + + // Compute edge data + OSALIGNSIMD(int32_t) aAi[4], aBi[4]; + _mm_store_si128((__m128i*)aAi, vAi); + _mm_store_si128((__m128i*)aBi, vBi); + + const uint32_t numEdges = 3 + (RasterizeScissorEdges ? 4 : 0); + EDGE rastEdges[7]; + + // compute triangle edges + ComputeEdgeData(aAi[0], aBi[0], rastEdges[0]); + ComputeEdgeData(aAi[1], aBi[1], rastEdges[1]); + ComputeEdgeData(aAi[2], aBi[2], rastEdges[2]); + + // compute scissor edges if enabled + if (RasterizeScissorEdges) + { + POS topLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.top}; + POS bottomLeft{state.scissorInFixedPoint.left, state.scissorInFixedPoint.bottom}; + POS topRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.top}; + POS bottomRight{state.scissorInFixedPoint.right, state.scissorInFixedPoint.bottom}; + + // construct 4 scissor edges in ccw direction + ComputeEdgeData(topLeft, bottomLeft, rastEdges[3]); + ComputeEdgeData(bottomLeft, bottomRight, rastEdges[4]); + ComputeEdgeData(bottomRight, topRight, rastEdges[5]); + ComputeEdgeData(topRight, topLeft, rastEdges[6]); + } + + // Calc bounding box of triangle + OSALIGN(BBOX, 16) bbox; + calcBoundingBoxInt(vXi, vYi, bbox); + + // Intersect with scissor/viewport + bbox.left = std::max(bbox.left, state.scissorInFixedPoint.left); + bbox.right = std::min(bbox.right - 1, state.scissorInFixedPoint.right); + bbox.top = std::max(bbox.top, state.scissorInFixedPoint.top); + bbox.bottom = std::min(bbox.bottom - 1, state.scissorInFixedPoint.bottom); + + triDesc.triFlags = workDesc.triFlags; + + // further constrain backend to intersecting bounding box of macro tile and scissored triangle bbox + uint32_t macroX, macroY; + MacroTileMgr::getTileIndices(macroTile, macroX, macroY); + int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; + int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; + int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; + int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; + + OSALIGN(BBOX, 16) intersect; + intersect.left = std::max(bbox.left, macroBoxLeft); + intersect.top = std::max(bbox.top, macroBoxTop); + intersect.right = std::min(bbox.right, macroBoxRight); + intersect.bottom = std::min(bbox.bottom, macroBoxBottom); + + SWR_ASSERT(intersect.left <= intersect.right && intersect.top <= intersect.bottom && intersect.left >= 0 && intersect.right >= 0 && intersect.top >= 0 && intersect.bottom >= 0); + + RDTSC_STOP(BETriangleSetup, 0, pDC->drawId); + + // update triangle desc + uint32_t tileX = intersect.left >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t tileY = intersect.top >> (KNOB_TILE_Y_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t maxTileX = intersect.right >> (KNOB_TILE_X_DIM_SHIFT + FIXED_POINT_SHIFT); + uint32_t maxTileY = intersect.bottom >> (KNOB_TILE_Y_DIM_SHIFT + 
FIXED_POINT_SHIFT); + uint32_t numTilesX = maxTileX - tileX + 1; + uint32_t numTilesY = maxTileY - tileY + 1; + + if (numTilesX == 0 || numTilesY == 0) + { + RDTSC_EVENT(BEEmptyTriangle, 1, 0); + RDTSC_STOP(BERasterizeTriangle, 1, 0); + return; + } + + RDTSC_START(BEStepSetup); + + // Step to pixel center of top-left pixel of the triangle bbox + // Align intersect bbox (top/left) to raster tile's (top/left). + int32_t x = AlignDown(intersect.left, (FIXED_POINT_SCALE * KNOB_TILE_X_DIM)); + int32_t y = AlignDown(intersect.top, (FIXED_POINT_SCALE * KNOB_TILE_Y_DIM)); + + if(sampleCount == SWR_MULTISAMPLE_1X) + { + // Add 0.5, in fixed point, to offset to pixel center + x += (FIXED_POINT_SCALE / 2); + y += (FIXED_POINT_SCALE / 2); + } + + __m128i vTopLeftX = _mm_set1_epi32(x); + __m128i vTopLeftY = _mm_set1_epi32(y); + + // evaluate edge equations at top-left pixel using 64bit math + // all other evaluations will be 32bit steps from it + // small triangles could skip this and do all 32bit math + // edge 0 + // + // line = Ax + By + C + // solving for C: we know (x0, y0) is on the line, so plug it in: + // C = -Ax0 - By0 + // plug C back into the line equation: + // line = Ax + By - Ax0 - By0 + // line = A(x - x0) + B(y - y0) + // stepping by (dX, dY) from (x0, y0): + // line = A(x0 + dX) + B(y0 + dY) + C = Ax0 + A*dX + By0 + B*dY - Ax0 - By0 = A*dX + B*dY + + // edge 0 and 1 + // edge0 = A0(x - x0) + B0(y - y0) + // edge1 = A1(x - x1) + B1(y - y1) + __m128i vDeltaX = _mm_sub_epi32(vTopLeftX, vXi); + __m128i vDeltaY = _mm_sub_epi32(vTopLeftY, vYi); + + __m256d vEdgeFix16[7]; + + // evaluate A(dx) and B(dY) for all points + __m256d vAipd = _mm256_cvtepi32_pd(vAi); + __m256d vBipd = _mm256_cvtepi32_pd(vBi); + __m256d vDeltaXpd = _mm256_cvtepi32_pd(vDeltaX); + __m256d vDeltaYpd = _mm256_cvtepi32_pd(vDeltaY); + + __m256d vAiDeltaXFix16 = _mm256_mul_pd(vAipd, vDeltaXpd); + __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); + __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); + + // adjust for top-left rule + vEdge = adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + + // broadcast respective edge results to all lanes + double* pEdge = (double*)&vEdge; + vEdgeFix16[0] = _mm256_set1_pd(pEdge[0]); + vEdgeFix16[1] = _mm256_set1_pd(pEdge[1]); + vEdgeFix16[2] = _mm256_set1_pd(pEdge[2]); + + // evaluate edge equations for scissor edges + if (RasterizeScissorEdges) + { + const BBOX &scissor = state.scissorInFixedPoint; + vEdgeFix16[3] = _mm256_set1_pd((rastEdges[3].a * (x - scissor.left)) + (rastEdges[3].b * (y - scissor.top))); + vEdgeFix16[4] = _mm256_set1_pd((rastEdges[4].a * (x - scissor.left)) + (rastEdges[4].b * (y - scissor.bottom))); + vEdgeFix16[5] = _mm256_set1_pd((rastEdges[5].a * (x - scissor.right)) + (rastEdges[5].b * (y - scissor.bottom))); + vEdgeFix16[6] = _mm256_set1_pd((rastEdges[6].a * (x - scissor.right)) + (rastEdges[6].b * (y - scissor.top))); + } + + // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile, + // used for testing whether the entire raster tile is inside a triangle + vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets); + vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets); + vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
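// Because each edge function E(x, y) = A*x + B*y + C is affine, stepping the
// evaluation point by (dx, dy) reduces to a single add of the precomputed
// A*dx + B*dy term:
//   E(x + dx, y + dy) = E(x, y) + A*dx + B*dy
// The vQuadOffsets/vRasterTileOffsets and step* members of EDGE are exactly
// such terms, so C never needs to be re-evaluated while sweeping.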
+ + // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox + // step sample positions to the raster tile bbox of multisample points + // min(xSamples),min(ySamples) ------ max(xSamples),min(ySamples) + // | | + // | | + // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) + __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; + if (sampleCount > SWR_MULTISAMPLE_1X) + { + __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX(); + __m128i vTileSampleBBoxYh = MultisampleTraits<sampleCount>::TileSampleOffsetsY(); + + __m256d vTileSampleBBoxXFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxXh); + __m256d vTileSampleBBoxYFix8 = _mm256_cvtepi32_pd(vTileSampleBBoxYh); + + // step edge equation tests from the tile corner, + // used for testing whether the entire raster tile is inside a triangle + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8); + vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8); + vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8); + vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); + } + + RDTSC_STOP(BEStepSetup, 0, pDC->drawId); + + uint32_t tY = tileY; + uint32_t tX = tileX; + uint32_t maxY = maxTileY; + uint32_t maxX = maxTileX; + + // compute steps between raster tiles for render output buffers + static const uint32_t colorRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples}; + static const uint32_t colorRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * colorRasterTileStep}; + static const uint32_t depthRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples}; + static const uint32_t depthRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM)* depthRasterTileStep}; + static const uint32_t stencilRasterTileStep{(KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8)) * MultisampleTraits<sampleCount>::numSamples}; + static const uint32_t stencilRasterTileRowStep{(KNOB_MACROTILE_X_DIM / KNOB_TILE_X_DIM) * stencilRasterTileStep}; + RenderOutputBuffers renderBuffers, currentRenderBufferRow; + + GetRenderHotTiles(pDC, macroTile, tileX, tileY, renderBuffers, MultisampleTraits<sampleCount>::numSamples, + triDesc.triFlags.renderTargetArrayIndex); + currentRenderBufferRow = renderBuffers; + + // rasterize and generate coverage masks per sample + uint32_t maxSamples = MultisampleTraits<sampleCount>::numSamples; + for (uint32_t tileY = tY; tileY <= maxY; ++tileY) + { + __m256d vStartOfRowEdge[numEdges]; + for (uint32_t e = 0; e < numEdges; ++e) + { + vStartOfRowEdge[e] = vEdgeFix16[e]; + } + + for (uint32_t tileX = tX; tileX <= maxX; ++tileX) + { + uint64_t anyCoveredSamples = 0;
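// _mm256_movemask_pd gathers the sign bits of one edge function evaluated at
// the 4 raster tile corners, so a set bit means that corner is inside the
// edge. A mask of 0 for any edge trivially rejects the whole tile; masks of
// 0xf for all three edges trivially accept it without per-quad work; anything
// in between falls through to rasterizePartialTile below.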
+ + // test the 4 corners of the raster tile against each edge; the sign bit is set + // for a corner that is inside the edge (vEdge < 0) + int mask0, mask1, mask2; + if (sampleCount == SWR_MULTISAMPLE_1X) + { + mask0 = _mm256_movemask_pd(vEdgeFix16[0]); + mask1 = _mm256_movemask_pd(vEdgeFix16[1]); + mask2 = _mm256_movemask_pd(vEdgeFix16[2]); + } + else + { + __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; + // evaluate edge equations at the tile multisample bounding box + vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); + mask0 = _mm256_movemask_pd(vSampleBboxTest0); + mask1 = _mm256_movemask_pd(vSampleBboxTest1); + mask2 = _mm256_movemask_pd(vSampleBboxTest2); + } + + for (uint32_t sampleNum = 0; sampleNum < maxSamples; sampleNum++) + { + // trivial reject, at least one edge has all 4 corners of raster tile outside + bool trivialReject = !(mask0 && mask1 && mask2); + + if (!trivialReject) + { + // trivial accept mask + triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; + if ((mask0 & mask1 & mask2) == 0xf) + { + anyCoveredSamples = triDesc.coverageMask[sampleNum]; + // trivial accept, all 4 corners of all 3 edges are negative + // i.e. raster tile completely inside triangle + RDTSC_EVENT(BETrivialAccept, 1, 0); + } + else + { + __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; + if(sampleCount == SWR_MULTISAMPLE_1X) + { + // should get optimized out for single sample case (global value numbering or copy propagation) + vEdge0AtSample = vEdgeFix16[0]; + vEdge1AtSample = vEdgeFix16[1]; + vEdge2AtSample = vEdgeFix16[2]; + } + else + { + __m128i vSampleOffsetXh = MultisampleTraits<sampleCount>::vXi(sampleNum); + __m128i vSampleOffsetYh = MultisampleTraits<sampleCount>::vYi(sampleNum); + __m256d vSampleOffsetX = _mm256_cvtepi32_pd(vSampleOffsetXh); + __m256d vSampleOffsetY = _mm256_cvtepi32_pd(vSampleOffsetYh); + + // *note*: none of this needs to be vectorized as rasterizePartialTile just takes vEdge[0] + // for each edge and broadcasts it before offsetting to individual pixel quads + + // step edge equation tests from UL tile corner to pixel sample position + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY); + vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY); + vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); + + vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX); + vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY); + vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); + } + + double startQuadEdges[numEdges]; + const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample); + _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample); + _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample); + + for (uint32_t e = 3; e < numEdges; ++e) + { + _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]); + } + + // not trivial accept or reject, must
rasterize full tile + RDTSC_START(BERasterizePartial); + if (RasterizeScissorEdges) + { + triDesc.coverageMask[sampleNum] = rasterizePartialTile<7>(pDC, startQuadEdges, rastEdges); + } + else + { + triDesc.coverageMask[sampleNum] = rasterizePartialTile<3>(pDC, startQuadEdges, rastEdges); + } + RDTSC_STOP(BERasterizePartial, 0, 0); + + anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + } + } + else + { + // if we're calculating coverage per sample, need to store it off. otherwise no covered samples, don't need to do anything + if(sampleCount > SWR_MULTISAMPLE_1X) + { + triDesc.coverageMask[sampleNum] = 0; + } + RDTSC_EVENT(BETrivialReject, 1, 0); + } + } + +#if KNOB_ENABLE_TOSS_POINTS + if(KNOB_TOSS_RS) + { + gToss = triDesc.coverageMask[0]; + } + else +#endif + if(anyCoveredSamples) + { + RDTSC_START(BEPixelBackend); + backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); + RDTSC_STOP(BEPixelBackend, 0, 0); + } + + // step to the next tile in X + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], _mm256_set1_pd(rastEdges[e].stepRasterTileX)); + } + StepRasterTileX(state.psState.numRenderTargets, renderBuffers, colorRasterTileStep, depthRasterTileStep, stencilRasterTileStep); + } + + // step to the next tile in Y + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vStartOfRowEdge[e], _mm256_set1_pd(rastEdges[e].stepRasterTileY)); + } + StepRasterTileY(state.psState.numRenderTargets, renderBuffers, currentRenderBufferRow, colorRasterTileRowStep, depthRasterTileRowStep, stencilRasterTileRowStep); + } + + RDTSC_STOP(BERasterizeTriangle, 1, 0); +} + +void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +{ + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; + const SWR_BACKEND_STATE& backendState = pDC->pState->state.backendState; + + bool isPointSpriteTexCoordEnabled = backendState.pointSpriteTexCoordMask != 0; + + // load point vertex + float x = *workDesc.pTriBuffer; + float y = *(workDesc.pTriBuffer + 1); + float z = *(workDesc.pTriBuffer + 2); + + // create a copy of the triangle buffer to write our adjusted vertices to + OSALIGNSIMD(float) newTriBuffer[4 * 4]; + TRIANGLE_WORK_DESC newWorkDesc = workDesc; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; + + // create a copy of the attrib buffer to write our adjusted attribs to + OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; + newWorkDesc.pAttribs = &newAttribBuffer[0]; + + newWorkDesc.pUserClipBuffer = workDesc.pUserClipBuffer; + newWorkDesc.numAttribs = workDesc.numAttribs; + newWorkDesc.triFlags = workDesc.triFlags; + + // construct two tris by bloating point by point size + float halfPointSize = workDesc.triFlags.pointSize * 0.5f; + float lowerX = x - halfPointSize; + float upperX = x + halfPointSize; + float lowerY = y - halfPointSize; + float upperY = y + halfPointSize; + + // tri 0 + float *pBuf = &newTriBuffer[0]; + *pBuf++ = lowerX; + *pBuf++ = lowerX; + *pBuf++ = upperX; + pBuf++; + *pBuf++ = lowerY; + *pBuf++ = upperY; + *pBuf++ = upperY; + pBuf++; + _mm_store_ps(pBuf, _mm_set1_ps(z)); + _mm_store_ps(pBuf+=4, _mm_set1_ps(1.0f)); + + // setup triangle rasterizer function + PFN_WORK_FUNC pfnTriRast; + if (rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN) + { + pfnTriRast = gRasterizerTable[rastState.scissorEnable][rastState.sampleCount]; + } + 
else + { + // for center sample pattern, all samples are at pixel center; calculate coverage + // once at center and broadcast the results in the backend + pfnTriRast = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; + } + + // overwrite texcoords for point sprites + if (isPointSpriteTexCoordEnabled) + { + // copy original attribs + memcpy(&newAttribBuffer[0], workDesc.pAttribs, 4 * 3 * workDesc.numAttribs * sizeof(float)); + newWorkDesc.pAttribs = &newAttribBuffer[0]; + + // overwrite texcoord for point sprites + uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; + DWORD texCoordAttrib = 0; + + while (_BitScanForward(&texCoordAttrib, texCoordMask)) + { + texCoordMask &= ~(1 << texCoordAttrib); + __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; + if (rastState.pointSpriteTopOrigin) + { + pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 1, 0); + pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); + } + else + { + pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 0, 0); + pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); + } + } + } + else + { + // no texcoord overwrite, can reuse the attrib buffer from frontend + newWorkDesc.pAttribs = workDesc.pAttribs; + } + + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); + + // tri 1 + pBuf = &newTriBuffer[0]; + *pBuf++ = lowerX; + *pBuf++ = upperX; + *pBuf++ = upperX; + pBuf++; + *pBuf++ = lowerY; + *pBuf++ = upperY; + *pBuf++ = lowerY; + // z, w unchanged + + if (isPointSpriteTexCoordEnabled) + { + uint32_t texCoordMask = backendState.pointSpriteTexCoordMask; + DWORD texCoordAttrib = 0; + + while (_BitScanForward(&texCoordAttrib, texCoordMask)) + { + texCoordMask &= ~(1 << texCoordAttrib); + __m128* pTexAttrib = (__m128*)&newAttribBuffer[0] + 3 * texCoordAttrib; + if (rastState.pointSpriteTopOrigin) + { + pTexAttrib[0] = _mm_set_ps(1, 0, 0, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 1, 1); + pTexAttrib[2] = _mm_set_ps(1, 0, 0, 1); + + } + else + { + pTexAttrib[0] = _mm_set_ps(1, 0, 1, 0); + pTexAttrib[1] = _mm_set_ps(1, 0, 0, 1); + pTexAttrib[2] = _mm_set_ps(1, 0, 1, 1); + } + } + } + + pfnTriRast(pDC, workerId, macroTile, (void*)&newWorkDesc); +} + +void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void* pData) +{ +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + + const TRIANGLE_WORK_DESC& workDesc = *(const TRIANGLE_WORK_DESC*)pData; + const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + + // map x,y relative offsets from start of raster tile to bit position in + // coverage mask for the point + static const uint32_t coverageMap[8][8] = { + { 0, 1, 4, 5, 8, 9, 12, 13 }, + { 2, 3, 6, 7, 10, 11, 14, 15 }, + { 16, 17, 20, 21, 24, 25, 28, 29 }, + { 18, 19, 22, 23, 26, 27, 30, 31 }, + { 32, 33, 36, 37, 40, 41, 44, 45 }, + { 34, 35, 38, 39, 42, 43, 46, 47 }, + { 48, 49, 52, 53, 56, 57, 60, 61 }, + { 50, 51, 54, 55, 58, 59, 62, 63 } + }; + + OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + + // pull point information from triangle buffer + // @todo use structs for readability + uint32_t tileAlignedX = *(uint32_t*)workDesc.pTriBuffer; + uint32_t tileAlignedY = *(uint32_t*)(workDesc.pTriBuffer + 1); + float z = *(workDesc.pTriBuffer + 2); + + // construct triangle descriptor for point + // no interpolation, set up i,j for constant interpolation of z and attribs + // @todo implement an optimized backend that doesn't require triangle information + + // compute coverage mask from x,y packed into the 
coverageMask flag + // mask indices by the maximum valid index for x/y of coveragemap. + uint32_t tX = workDesc.triFlags.coverageMask & 0x7; + uint32_t tY = (workDesc.triFlags.coverageMask >> 4) & 0x7; + // todo: multisample points? + triDesc.coverageMask[0] = 1ULL << coverageMap[tY][tX]; + + // no persp divide needed for points + triDesc.pAttribs = triDesc.pPerspAttribs = workDesc.pAttribs; + triDesc.triFlags = workDesc.triFlags; + triDesc.recipDet = 1.0f; + triDesc.OneOverW[0] = triDesc.OneOverW[1] = triDesc.OneOverW[2] = 1.0f; + triDesc.I[0] = triDesc.I[1] = triDesc.I[2] = 0.0f; + triDesc.J[0] = triDesc.J[1] = triDesc.J[2] = 0.0f; + triDesc.Z[0] = triDesc.Z[1] = triDesc.Z[2] = z; + + RenderOutputBuffers renderBuffers; + GetRenderHotTiles(pDC, macroTile, tileAlignedX >> KNOB_TILE_X_DIM_SHIFT , tileAlignedY >> KNOB_TILE_Y_DIM_SHIFT, + renderBuffers, 1, triDesc.triFlags.renderTargetArrayIndex); + + RDTSC_START(BEPixelBackend); + backendFuncs.pfnBackend(pDC, workerId, tileAlignedX, tileAlignedY, triDesc, renderBuffers); + RDTSC_STOP(BEPixelBackend, 0, 0); +} + +// Get pointers to hot tile memory for color RT, depth, stencil +void GetRenderHotTiles(DRAW_CONTEXT *pDC, uint32_t macroID, uint32_t tileX, uint32_t tileY, RenderOutputBuffers &renderBuffers, + uint32_t numSamples, uint32_t renderTargetArrayIndex) +{ + const API_STATE& state = GetApiState(pDC); + SWR_CONTEXT *pContext = pDC->pContext; + + uint32_t mx, my; + MacroTileMgr::getTileIndices(macroID, mx, my); + tileX -= KNOB_MACROTILE_X_DIM_IN_TILES * mx; + tileY -= KNOB_MACROTILE_Y_DIM_IN_TILES * my; + + // compute tile offset for active hottile buffers + const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); + offset*=numSamples; + + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while(_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE *pColor = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, + numSamples, renderTargetArrayIndex); + pColor->state = HOTTILE_DIRTY; + renderBuffers.pColor[rtSlot] = pColor->pBuffer + offset; + + colorHottileEnableMask &= ~(1 << rtSlot); + } + if(state.depthHottileEnable) + { + const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); + offset*=numSamples; + HOTTILE *pDepth = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, + numSamples, renderTargetArrayIndex); + pDepth->state = HOTTILE_DIRTY; + SWR_ASSERT(pDepth->pBuffer != nullptr); + renderBuffers.pDepth = pDepth->pBuffer + offset; + } + if(state.stencilHottileEnable) + { + const uint32_t pitch = KNOB_MACROTILE_X_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; + uint32_t offset = ComputeTileOffset2D<TilingTraits<SWR_TILE_SWRZ, FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp> >(pitch, tileX, tileY); + offset*=numSamples; + HOTTILE* pStencil = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, + numSamples, renderTargetArrayIndex); + pStencil->state = HOTTILE_DIRTY; + SWR_ASSERT(pStencil->pBuffer != nullptr); + renderBuffers.pStencil = pStencil->pBuffer + offset; + } +} + +INLINE +void 
StepRasterTileX(uint32_t NumRT, RenderOutputBuffers &buffers, uint32_t colorTileStep, uint32_t depthTileStep, uint32_t stencilTileStep) +{ + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + buffers.pColor[rt] += colorTileStep; + } + + buffers.pDepth += depthTileStep; + buffers.pStencil += stencilTileStep; +} + +INLINE +void StepRasterTileY(uint32_t NumRT, RenderOutputBuffers &buffers, RenderOutputBuffers &startBufferRow, uint32_t colorRowStep, uint32_t depthRowStep, uint32_t stencilRowStep) +{ + for(uint32_t rt = 0; rt < NumRT; ++rt) + { + startBufferRow.pColor[rt] += colorRowStep; + buffers.pColor[rt] = startBufferRow.pColor[rt]; + } + startBufferRow.pDepth += depthRowStep; + buffers.pDepth = startBufferRow.pDepth; + + startBufferRow.pStencil += stencilRowStep; + buffers.pStencil = startBufferRow.pStencil; +} + +// initialize rasterizer function table +PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX] = +{ + RasterizeTriangle<false, SWR_MULTISAMPLE_1X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_2X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_4X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_8X>, + RasterizeTriangle<false, SWR_MULTISAMPLE_16X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_1X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_2X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_4X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_8X>, + RasterizeTriangle<true, SWR_MULTISAMPLE_16X> +}; + +void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +{ + const TRIANGLE_WORK_DESC &workDesc = *((TRIANGLE_WORK_DESC*)pData); +#if KNOB_ENABLE_TOSS_POINTS + if (KNOB_TOSS_BIN_TRIS) + { + return; + } +#endif + + // bloat line to two tris and call the triangle rasterizer twice + RDTSC_START(BERasterizeLine); + + const API_STATE &state = GetApiState(pDC); + const SWR_RASTSTATE &rastState = state.rastState; + + // macrotile dimensioning + uint32_t macroX, macroY; + MacroTileMgr::getTileIndices(macroTile, macroX, macroY); + int32_t macroBoxLeft = macroX * KNOB_MACROTILE_X_DIM_FIXED; + int32_t macroBoxRight = macroBoxLeft + KNOB_MACROTILE_X_DIM_FIXED - 1; + int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; + int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; + + // create a copy of the triangle buffer to write our adjusted vertices to + OSALIGNSIMD(float) newTriBuffer[4 * 4]; + TRIANGLE_WORK_DESC newWorkDesc = workDesc; + newWorkDesc.pTriBuffer = &newTriBuffer[0]; + + // create a copy of the attrib buffer to write our adjusted attribs to + OSALIGNSIMD(float) newAttribBuffer[4 * 3 * KNOB_NUM_ATTRIBUTES]; + newWorkDesc.pAttribs = &newAttribBuffer[0]; + + const __m128 vBloat0 = _mm_set_ps(0.5f, -0.5f, -0.5f, 0.5f); + const __m128 vBloat1 = _mm_set_ps(0.5f, 0.5f, 0.5f, -0.5f); + + __m128 vX, vY, vZ, vRecipW; + + vX = _mm_load_ps(workDesc.pTriBuffer); + vY = _mm_load_ps(workDesc.pTriBuffer + 4); + vZ = _mm_load_ps(workDesc.pTriBuffer + 8); + vRecipW = _mm_load_ps(workDesc.pTriBuffer + 12); + + // triangle 0 + // v0,v1 -> v0,v0,v1 + __m128 vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 1, 0, 0)); + __m128 vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 1, 0, 0)); + + __m128 vLineWidth = _mm_set1_ps(pDC->pState->state.rastState.lineWidth); + __m128 vAdjust = _mm_mul_ps(vLineWidth, vBloat0); + if (workDesc.triFlags.yMajor) + { + vXa = _mm_add_ps(vAdjust, vXa); + } + else + { + vYa = _mm_add_ps(vAdjust, vYa); + } + + // 
Store triangle description for rasterizer + _mm_store_ps((float*)&newTriBuffer[0], vXa); + _mm_store_ps((float*)&newTriBuffer[4], vYa); + _mm_store_ps((float*)&newTriBuffer[8], vZa); + _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); + + // binner bins 3 edges for lines as v0, v1, v1 + // tri0 needs v0, v0, v1 + for (uint32_t a = 0; a < workDesc.numAttribs; ++a) + { + __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a*12 + 0]); + __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a*12 + 4]); + + _mm_store_ps((float*)&newAttribBuffer[a*12 + 0], vAttrib0); + _mm_store_ps((float*)&newAttribBuffer[a*12 + 4], vAttrib0); + _mm_store_ps((float*)&newAttribBuffer[a*12 + 8], vAttrib1); + } + + // Store user clip distances for triangle 0 + float newClipBuffer[3 * 8]; + uint32_t numClipDist = _mm_popcnt_u32(state.rastState.clipDistanceMask); + if (numClipDist) + { + newWorkDesc.pUserClipBuffer = newClipBuffer; + + float* pOldBuffer = workDesc.pUserClipBuffer; + float* pNewBuffer = newClipBuffer; + for (uint32_t i = 0; i < numClipDist; ++i) + { + // read barycentric coeffs from binner + float a = *(pOldBuffer++); + float b = *(pOldBuffer++); + + // reconstruct original clip distance at vertices + float c0 = a + b; + float c1 = b; + + // construct triangle barycentrics + *(pNewBuffer++) = c0 - c1; + *(pNewBuffer++) = c0 - c1; + *(pNewBuffer++) = c1; + } + } + + // make sure this macrotile intersects the triangle + __m128i vXai = fpToFixedPoint(vXa); + __m128i vYai = fpToFixedPoint(vYa); + OSALIGN(BBOX, 16) bboxA; + calcBoundingBoxInt(vXai, vYai, bboxA); + + if (!(bboxA.left > macroBoxRight || + bboxA.left > state.scissorInFixedPoint.right || + bboxA.right - 1 < macroBoxLeft || + bboxA.right - 1 < state.scissorInFixedPoint.left || + bboxA.top > macroBoxBottom || + bboxA.top > state.scissorInFixedPoint.bottom || + bboxA.bottom - 1 < macroBoxTop || + bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { + // rasterize triangle + gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc); + } + + // triangle 1 + // v0,v1 -> v1,v1,v0 + vXa = _mm_shuffle_ps(vX, vX, _MM_SHUFFLE(1, 0, 1, 1)); + vYa = _mm_shuffle_ps(vY, vY, _MM_SHUFFLE(1, 0, 1, 1)); + vZa = _mm_shuffle_ps(vZ, vZ, _MM_SHUFFLE(1, 0, 1, 1)); + vRecipWa = _mm_shuffle_ps(vRecipW, vRecipW, _MM_SHUFFLE(1, 0, 1, 1)); + + vAdjust = _mm_mul_ps(vLineWidth, vBloat1); + if (workDesc.triFlags.yMajor) + { + vXa = _mm_add_ps(vAdjust, vXa); + } + else + { + vYa = _mm_add_ps(vAdjust, vYa); + } + + // Store triangle description for rasterizer + _mm_store_ps((float*)&newTriBuffer[0], vXa); + _mm_store_ps((float*)&newTriBuffer[4], vYa); + _mm_store_ps((float*)&newTriBuffer[8], vZa); + _mm_store_ps((float*)&newTriBuffer[12], vRecipWa); + + // binner bins 3 edges for lines as v0, v1, v1 + // tri1 needs v1, v1, v0 + for (uint32_t a = 0; a < workDesc.numAttribs; ++a) + { + __m128 vAttrib0 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 0]); + __m128 vAttrib1 = _mm_load_ps(&workDesc.pAttribs[a * 12 + 4]); + + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 0], vAttrib1); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 4], vAttrib1); + _mm_store_ps((float*)&newAttribBuffer[a * 12 + 8], vAttrib0); + } + + // store user clip distance for triangle 1 + if (numClipDist) + { + float* pOldBuffer = workDesc.pUserClipBuffer; + float* pNewBuffer = newClipBuffer; + for (uint32_t i = 0; i < numClipDist; ++i) + { + // read barycentric coeffs from binner + float a = *(pOldBuffer++); + float b = *(pOldBuffer++); + + // reconstruct original 
clip distance at vertices + float c0 = a + b; + float c1 = b; + + // construct triangle barycentrics + *(pNewBuffer++) = c1 - c0; + *(pNewBuffer++) = c1 - c0; + *(pNewBuffer++) = c0; + } + } + + vXai = fpToFixedPoint(vXa); + vYai = fpToFixedPoint(vYa); + calcBoundingBoxInt(vXai, vYai, bboxA); + + if (!(bboxA.left > macroBoxRight || + bboxA.left > state.scissorInFixedPoint.right || + bboxA.right - 1 < macroBoxLeft || + bboxA.right - 1 < state.scissorInFixedPoint.left || + bboxA.top > macroBoxBottom || + bboxA.top > state.scissorInFixedPoint.bottom || + bboxA.bottom - 1 < macroBoxTop || + bboxA.bottom - 1 < state.scissorInFixedPoint.top)) { + // rasterize triangle + gRasterizerTable[rastState.scissorEnable][rastState.sampleCount](pDC, workerId, macroTile, (void*)&newWorkDesc); + } + + RDTSC_STOP(BERasterizeLine, 1, 0); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.h b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h new file mode 100644 index 00000000000..bcfeef48410 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.h @@ -0,0 +1,35 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file rasterizer.h +* +* @brief Definitions for the rasterizer. +* +******************************************************************************/ +#pragma once + +#include "context.h" + +extern PFN_WORK_FUNC gRasterizerTable[2][SWR_MULTISAMPLE_TYPE_MAX]; +void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void RasterizeTriPoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp new file mode 100644 index 00000000000..4b6b536075b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.cpp @@ -0,0 +1,91 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#include "rdtsc_core.h" +#include "common/rdtsc_buckets.h" + +// must match CORE_BUCKETS enum order +BUCKET_DESC gCoreBuckets[] = { + { "APIClearRenderTarget", "", true, 0xff0b8bea }, + { "APIDraw", "", true, 0xff000066 }, + { "APIDrawWakeAllThreads", "", false, 0xffffffff }, + { "APIDrawIndexed", "", true, 0xff000066 }, + { "APIDispatch", "", true, 0xff660000 }, + { "APIStoreTiles", "", true, 0xff00ffff }, + { "APIGetDrawContext", "", false, 0xffffffff }, + { "APISync", "", true, 0xff6666ff }, + { "APIWaitForIdle", "", true, 0xff0000ff }, + { "FEProcessDraw", "", true, 0xff009900 }, + { "FEProcessDrawIndexed", "", true, 0xff009900 }, + { "FEFetchShader", "", false, 0xffffffff }, + { "FEVertexShader", "", false, 0xffffffff }, + { "FEHullShader", "", false, 0xffffffff }, + { "FETessellation", "", false, 0xffffffff }, + { "FEDomainShader", "", false, 0xffffffff }, + { "FEGeometryShader", "", false, 0xffffffff }, + { "FEStreamout", "", false, 0xffffffff }, + { "FEPAAssemble", "", false, 0xffffffff }, + { "FEBinPoints", "", false, 0xff29b854 }, + { "FEBinLines", "", false, 0xff29b854 }, + { "FEBinTriangles", "", false, 0xff29b854 }, + { "FETriangleSetup", "", false, 0xffffffff }, + { "FEViewportCull", "", false, 0xffffffff }, + { "FEGuardbandClip", "", false, 0xffffffff }, + { "FEClipPoints", "", false, 0xffffffff }, + { "FEClipLines", "", false, 0xffffffff }, + { "FEClipTriangles", "", false, 0xffffffff }, + { "FECullZeroAreaAndBackface", "", false, 0xffffffff }, + { "FECullBetweenCenters", "", false, 0xffffffff }, + { "FEProcessStoreTiles", "", true, 0xff39c864 }, + { "FEProcessInvalidateTiles", "", true, 0xffffffff }, + { "WorkerWorkOnFifoBE", "", false, 0xff40261c }, + { "WorkerFoundWork", "", false, 0xff573326 }, + { "BELoadTiles", "", true, 0xffb0e2ff }, + { "BEDispatch", "", true, 0xff00a2ff }, + { "BEClear", "", true, 0xff00ccbb }, + { "BERasterizeLine", "", true, 0xffb26a4e }, + { "BERasterizeTriangle", "", true, 0xffb26a4e }, + { "BETriangleSetup", "", false, 0xffffffff }, + { "BEStepSetup", "", false, 0xffffffff }, + { "BECullZeroArea", "", false, 0xffffffff }, + { "BEEmptyTriangle", "", false, 0xffffffff }, + { "BETrivialAccept", "", false, 0xffffffff }, + { "BETrivialReject", "", false, 0xffffffff }, + { "BERasterizePartial", "", false, 0xffffffff }, + { "BEPixelBackend", "", false, 0xffffffff }, + { "BESetup", 
"", false, 0xffffffff }, + { "BEBarycentric", "", false, 0xffffffff }, + { "BEEarlyDepthTest", "", false, 0xffffffff }, + { "BEPixelShader", "", false, 0xffffffff }, + { "BELateDepthTest", "", false, 0xffffffff }, + { "BEOutputMerger", "", false, 0xffffffff }, + { "BEStoreTiles", "", true, 0xff00cccc }, + { "BEEndTile", "", false, 0xffffffff }, + { "WorkerWaitForThreadEvent", "", false, 0xffffffff }, +}; + +/// @todo bucketmanager and mapping should probably be a part of the SWR context +std::vector<uint32_t> gBucketMap; +BucketManager gBucketMgr(KNOB_BUCKETS_ENABLE_THREADVIZ); + +uint32_t gCurrentFrame = 0; diff --git a/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h new file mode 100644 index 00000000000..5fcc40bf8ee --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/rdtsc_core.h @@ -0,0 +1,177 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+****************************************************************************/ + +#pragma once +#include "knobs.h" + +#include "common/os.h" +#include "common/rdtsc_buckets.h" + +#include <vector> + +enum CORE_BUCKETS +{ + APIClearRenderTarget, + APIDraw, + APIDrawWakeAllThreads, + APIDrawIndexed, + APIDispatch, + APIStoreTiles, + APIGetDrawContext, + APISync, + APIWaitForIdle, + FEProcessDraw, + FEProcessDrawIndexed, + FEFetchShader, + FEVertexShader, + FEHullShader, + FETessellation, + FEDomainShader, + FEGeometryShader, + FEStreamout, + FEPAAssemble, + FEBinPoints, + FEBinLines, + FEBinTriangles, + FETriangleSetup, + FEViewportCull, + FEGuardbandClip, + FEClipPoints, + FEClipLines, + FEClipTriangles, + FECullZeroAreaAndBackface, + FECullBetweenCenters, + FEProcessStoreTiles, + FEProcessInvalidateTiles, + WorkerWorkOnFifoBE, + WorkerFoundWork, + BELoadTiles, + BEDispatch, + BEClear, + BERasterizeLine, + BERasterizeTriangle, + BETriangleSetup, + BEStepSetup, + BECullZeroArea, + BEEmptyTriangle, + BETrivialAccept, + BETrivialReject, + BERasterizePartial, + BEPixelBackend, + BESetup, + BEBarycentric, + BEEarlyDepthTest, + BEPixelShader, + BELateDepthTest, + BEOutputMerger, + BEStoreTiles, + BEEndTile, + WorkerWaitForThreadEvent, + + NumBuckets +}; + +void rdtscReset(); +void rdtscInit(int threadId); +void rdtscStart(uint32_t bucketId); +void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId); +void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2); +void rdtscEndFrame(); + +#ifdef KNOB_ENABLE_RDTSC +#define RDTSC_RESET() rdtscReset() +#define RDTSC_INIT(threadId) rdtscInit(threadId) +#define RDTSC_START(bucket) rdtscStart(bucket) +#define RDTSC_STOP(bucket, count, draw) rdtscStop(bucket, count, draw) +#define RDTSC_EVENT(bucket, count1, count2) rdtscEvent(bucket, count1, count2) +#define RDTSC_ENDFRAME() rdtscEndFrame() +#else +#define RDTSC_RESET() +#define RDTSC_INIT(threadId) +#define RDTSC_START(bucket) +#define RDTSC_STOP(bucket, count, draw) +#define RDTSC_EVENT(bucket, count1, count2) +#define RDTSC_ENDFRAME() +#endif + +extern std::vector<uint32_t> gBucketMap; +extern BucketManager gBucketMgr; +extern BUCKET_DESC gCoreBuckets[]; +extern uint32_t gCurrentFrame; + +INLINE void rdtscReset() +{ + gCurrentFrame = 0; + gBucketMgr.ClearThreads(); + gBucketMgr.ClearBuckets(); +} + +INLINE void rdtscInit(int threadId) +{ + // register all the buckets once + if (threadId == 0) + { + gBucketMap.resize(NumBuckets); + for (uint32_t i = 0; i < NumBuckets; ++i) + { + gBucketMap[i] = gBucketMgr.RegisterBucket(gCoreBuckets[i]); + } + } + + std::string name = threadId == 0 ? 
"API" : "WORKER"; + gBucketMgr.RegisterThread(name); +} + +INLINE void rdtscStart(uint32_t bucketId) +{ + uint32_t id = gBucketMap[bucketId]; + gBucketMgr.StartBucket(id); +} + +INLINE void rdtscStop(uint32_t bucketId, uint32_t count, uint64_t drawId) +{ + uint32_t id = gBucketMap[bucketId]; + gBucketMgr.StopBucket(id); +} + +INLINE void rdtscEvent(uint32_t bucketId, uint32_t count1, uint32_t count2) +{ + uint32_t id = gBucketMap[bucketId]; + gBucketMgr.AddEvent(id, count1); +} + +INLINE void rdtscEndFrame() +{ + gCurrentFrame++; + + if (gCurrentFrame == KNOB_BUCKETS_START_FRAME) + { + gBucketMgr.StartCapture(); + } + + if (gCurrentFrame == KNOB_BUCKETS_END_FRAME) + { + gBucketMgr.StopCapture(); + gBucketMgr.PrintReport("rdtsc.txt"); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h new file mode 100644 index 00000000000..2758555fd4b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -0,0 +1,1027 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file state.h +* +* @brief Definitions for API state. +* +******************************************************************************/ +#pragma once + +#include "common/formats.h" +#include "common/simdintrin.h" + +// clear flags +#define SWR_CLEAR_NONE 0 +#define SWR_CLEAR_COLOR (1 << 0) +#define SWR_CLEAR_DEPTH (1 << 1) +#define SWR_CLEAR_STENCIL (1 << 2) + +enum DRIVER_TYPE +{ + DX, + GL +}; + +////////////////////////////////////////////////////////////////////////// +/// PRIMITIVE_TOPOLOGY. +////////////////////////////////////////////////////////////////////////// +enum PRIMITIVE_TOPOLOGY +{ + TOP_UNKNOWN = 0x0, + TOP_POINT_LIST = 0x1, + TOP_LINE_LIST = 0x2, + TOP_LINE_STRIP = 0x3, + TOP_TRIANGLE_LIST = 0x4, + TOP_TRIANGLE_STRIP = 0x5, + TOP_TRIANGLE_FAN = 0x6, + TOP_QUAD_LIST = 0x7, + TOP_QUAD_STRIP = 0x8, + TOP_LINE_LIST_ADJ = 0x9, + TOP_LISTSTRIP_ADJ = 0xA, + TOP_TRI_LIST_ADJ = 0xB, + TOP_TRI_STRIP_ADJ = 0xC, + TOP_TRI_STRIP_REVERSE = 0xD, + TOP_POLYGON = 0xE, + TOP_RECT_LIST = 0xF, + TOP_LINE_LOOP = 0x10, + TOP_POINT_LIST_BF = 0x11, + TOP_LINE_STRIP_CONT = 0x12, + TOP_LINE_STRIP_BF = 0x13, + TOP_LINE_STRIP_CONT_BF = 0x14, + TOP_TRIANGLE_FAN_NOSTIPPLE = 0x16, + TOP_TRIANGLE_DISC = 0x17, /// @todo What is this?? 
+ + TOP_PATCHLIST_BASE = 0x1F, // Invalid topology, used to calculate num verts for a patchlist. + TOP_PATCHLIST_1 = 0x20, // List of 1-vertex patches + TOP_PATCHLIST_2 = 0x21, + TOP_PATCHLIST_3 = 0x22, + TOP_PATCHLIST_4 = 0x23, + TOP_PATCHLIST_5 = 0x24, + TOP_PATCHLIST_6 = 0x25, + TOP_PATCHLIST_7 = 0x26, + TOP_PATCHLIST_8 = 0x27, + TOP_PATCHLIST_9 = 0x28, + TOP_PATCHLIST_10 = 0x29, + TOP_PATCHLIST_11 = 0x2A, + TOP_PATCHLIST_12 = 0x2B, + TOP_PATCHLIST_13 = 0x2C, + TOP_PATCHLIST_14 = 0x2D, + TOP_PATCHLIST_15 = 0x2E, + TOP_PATCHLIST_16 = 0x2F, + TOP_PATCHLIST_17 = 0x30, + TOP_PATCHLIST_18 = 0x31, + TOP_PATCHLIST_19 = 0x32, + TOP_PATCHLIST_20 = 0x33, + TOP_PATCHLIST_21 = 0x34, + TOP_PATCHLIST_22 = 0x35, + TOP_PATCHLIST_23 = 0x36, + TOP_PATCHLIST_24 = 0x37, + TOP_PATCHLIST_25 = 0x38, + TOP_PATCHLIST_26 = 0x39, + TOP_PATCHLIST_27 = 0x3A, + TOP_PATCHLIST_28 = 0x3B, + TOP_PATCHLIST_29 = 0x3C, + TOP_PATCHLIST_30 = 0x3D, + TOP_PATCHLIST_31 = 0x3E, + TOP_PATCHLIST_32 = 0x3F, // List of 32-vertex patches +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_SHADER_TYPE +////////////////////////////////////////////////////////////////////////// +enum SWR_SHADER_TYPE +{ + SHADER_VERTEX, + SHADER_GEOMETRY, + SHADER_DOMAIN, + SHADER_HULL, + SHADER_PIXEL, + SHADER_COMPUTE, + + NUM_SHADER_TYPES, +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_RENDERTARGET_ATTACHMENT +/// @todo It's not clear what an "attachment" means. It's not a common term. +////////////////////////////////////////////////////////////////////////// +enum SWR_RENDERTARGET_ATTACHMENT +{ + SWR_ATTACHMENT_COLOR0, + SWR_ATTACHMENT_COLOR1, + SWR_ATTACHMENT_COLOR2, + SWR_ATTACHMENT_COLOR3, + SWR_ATTACHMENT_COLOR4, + SWR_ATTACHMENT_COLOR5, + SWR_ATTACHMENT_COLOR6, + SWR_ATTACHMENT_COLOR7, + SWR_ATTACHMENT_DEPTH, + SWR_ATTACHMENT_STENCIL, + + SWR_NUM_ATTACHMENTS +}; + +#define SWR_NUM_RENDERTARGETS 8 + +#define SWR_ATTACHMENT_COLOR0_BIT 0x001 +#define SWR_ATTACHMENT_COLOR1_BIT 0x002 +#define SWR_ATTACHMENT_COLOR2_BIT 0x004 +#define SWR_ATTACHMENT_COLOR3_BIT 0x008 +#define SWR_ATTACHMENT_COLOR4_BIT 0x010 +#define SWR_ATTACHMENT_COLOR5_BIT 0x020 +#define SWR_ATTACHMENT_COLOR6_BIT 0x040 +#define SWR_ATTACHMENT_COLOR7_BIT 0x080 +#define SWR_ATTACHMENT_DEPTH_BIT 0x100 +#define SWR_ATTACHMENT_STENCIL_BIT 0x200 +#define SWR_ATTACHMENT_MASK_ALL 0x3ff +#define SWR_ATTACHMENT_MASK_COLOR 0x0ff + + +////////////////////////////////////////////////////////////////////////// +/// @brief SWR Inner Tessellation factor ID +/// See above GetTessFactorOutputPosition code for documentation +enum SWR_INNER_TESSFACTOR_ID +{ + SWR_QUAD_U_TRI_INSIDE, + SWR_QUAD_V_INSIDE, + + SWR_NUM_INNER_TESS_FACTORS, +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief SWR Outer Tessellation factor ID +/// See above GetTessFactorOutputPosition code for documentation +enum SWR_OUTER_TESSFACTOR_ID +{ + SWR_QUAD_U_EQ0_TRI_U_LINE_DETAIL, + SWR_QUAD_V_EQ0_TRI_V_LINE_DENSITY, + SWR_QUAD_U_EQ1_TRI_W, + SWR_QUAD_V_EQ1, + + SWR_NUM_OUTER_TESS_FACTORS, +}; + + +///////////////////////////////////////////////////////////////////////// +/// simdvertex +/// @brief Defines a vertex element that holds all the data for SIMD vertices.
+/// Contains position in clip space, hardcoded to attribute 0, +/// space for up to 32 attributes, as well as any SGV values generated +/// by the pipeline +///////////////////////////////////////////////////////////////////////// +#define VERTEX_POSITION_SLOT 0 +#define VERTEX_ATTRIB_START_SLOT 1 +#define VERTEX_ATTRIB_END_SLOT 32 +#define VERTEX_RTAI_SLOT 33 // GS writes RenderTargetArrayIndex here +#define VERTEX_PRIMID_SLOT 34 // GS writes PrimId here +#define VERTEX_CLIPCULL_DIST_LO_SLOT 35 // VS writes lower 4 clip/cull dist +#define VERTEX_CLIPCULL_DIST_HI_SLOT 36 // VS writes upper 4 clip/cull dist +#define VERTEX_POINT_SIZE_SLOT 37 // VS writes point size here +static_assert(VERTEX_POINT_SIZE_SLOT < KNOB_NUM_ATTRIBUTES, "Mismatched attribute slot size"); + +// SoAoSoA +struct simdvertex +{ + simdvector attrib[KNOB_NUM_ATTRIBUTES]; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_VS_CONTEXT +/// @brief Input to vertex shader +///////////////////////////////////////////////////////////////////////// +struct SWR_VS_CONTEXT +{ + simdvertex* pVin; // IN: SIMD input vertex data store + simdvertex* pVout; // OUT: SIMD output vertex data store + + uint32_t InstanceID; // IN: Instance ID, constant across all verts of the SIMD + simdscalari VertexID; // IN: Vertex ID + simdscalari mask; // IN: Active mask for shader +}; + +///////////////////////////////////////////////////////////////////////// +/// ScalarCPoint +/// @brief defines a control point element as passed from the output +/// of the hull shader to the input of the domain shader +///////////////////////////////////////////////////////////////////////// +struct ScalarAttrib +{ + float x; + float y; + float z; + float w; +}; + +struct ScalarCPoint +{ + ScalarAttrib attrib[KNOB_NUM_ATTRIBUTES]; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TESSELLATION_FACTORS +/// @brief Tessellation factors structure (non-vector) +///////////////////////////////////////////////////////////////////////// +struct SWR_TESSELLATION_FACTORS +{ + float OuterTessFactors[SWR_NUM_OUTER_TESS_FACTORS]; + float InnerTessFactors[SWR_NUM_INNER_TESS_FACTORS]; +}; + +#define MAX_NUM_VERTS_PER_PRIM 32 // support up to 32 control point patches +struct ScalarPatch +{ + SWR_TESSELLATION_FACTORS tessFactors; + ScalarCPoint cp[MAX_NUM_VERTS_PER_PRIM]; + ScalarCPoint patchData; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_HS_CONTEXT +/// @brief Input to hull shader +///////////////////////////////////////////////////////////////////////// +struct SWR_HS_CONTEXT +{ + simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: (SIMD) input primitive data + simdscalari PrimitiveID; // IN: (SIMD) primitive ID generated from the draw call + simdscalari mask; // IN: Active mask for shader + ScalarPatch* pCPout; // OUT: Output control point patch + // SIMD-sized-array of SCALAR patches +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_DS_CONTEXT +/// @brief Input to domain shader +///////////////////////////////////////////////////////////////////////// +struct SWR_DS_CONTEXT +{ + uint32_t PrimitiveID; // IN: (SCALAR) PrimitiveID for the patch associated with the DS invocation + uint32_t vectorOffset; // IN: (SCALAR) vector index offset into SIMD data. 
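Taking the vectorOffset comment above together with the vectorStride and pOutputData descriptions just below, one plausible addressing scheme for a domain shader writing component c of attribute a into SIMD vector v would be (a sketch under those assumptions only; the actual layout is defined by the DS jit and is not shown in this patch):

    simdscalar* pDst = pDsCtx->pOutputData
                     + (a * 4 + c) * pDsCtx->vectorStride  // one row per attribute-component
                     + pDsCtx->vectorOffset + v;           // vector index within the row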
+ uint32_t vectorStride; // IN: (SCALAR) stride (in vectors) of output data per attribute-component + ScalarPatch* pCpIn; // IN: (SCALAR) Control patch + simdscalar* pDomainU; // IN: (SIMD) Domain Point U coords + simdscalar* pDomainV; // IN: (SIMD) Domain Point V coords + simdscalari mask; // IN: Active mask for shader + simdscalar* pOutputData; // OUT: (SIMD) Vertex Attributes (2D array of vectors, one row per attribute-component) +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_GS_CONTEXT +/// @brief Input to geometry shader. +///////////////////////////////////////////////////////////////////////// +struct SWR_GS_CONTEXT +{ + simdvertex vert[MAX_NUM_VERTS_PER_PRIM]; // IN: input primitive data for SIMD prims + simdscalari PrimitiveID; // IN: input primitive ID generated from the draw call + uint32_t InstanceID; // IN: input instance ID + simdscalari mask; // IN: Active mask for shader + uint8_t* pStream; // OUT: output stream (contains vertices for all output streams) + uint8_t* pCutOrStreamIdBuffer; // OUT: cut or stream id buffer + simdscalari vertexCount; // OUT: num vertices emitted per SIMD lane +}; + +struct PixelPositions +{ + simdscalar UL; + simdscalar center; + simdscalar sample; + simdscalar centroid; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_PS_CONTEXT +/// @brief Input to pixel shader. +///////////////////////////////////////////////////////////////////////// +struct SWR_PS_CONTEXT +{ + PixelPositions vX; // IN: x location(s) of pixels + PixelPositions vY; // IN: y location(s) of pixels + simdscalar vZ; // INOUT: z location of pixels + simdscalari activeMask; // OUT: mask for kill + simdscalar inputMask; // IN: input coverage mask for all samples + simdscalari oMask; // OUT: mask for output coverage + + PixelPositions vI; // barycentric coords evaluated at pixel center, sample position, centroid + PixelPositions vJ; + PixelPositions vOneOverW; // IN: 1/w + + const float* pAttribs; // IN: pointer to attribute barycentric coefficients + const float* pPerspAttribs; // IN: pointer to attribute/w barycentric coefficients + const float* pRecipW; // IN: pointer to 1/w coord for each vertex + const float *I; // IN: Barycentric A, B, and C coefs used to compute I + const float *J; // IN: Barycentric A, B, and C coefs used to compute J + float recipDet; // IN: 1/Det, used when barycentric interpolating attributes + const float* pSamplePosX; // IN: array of sample positions + const float* pSamplePosY; // IN: array of sample positions + simdvector shaded[SWR_NUM_RENDERTARGETS]; + // OUT: result color per rendertarget + + uint32_t frontFace; // IN: front- 1, back- 0 + uint32_t primID; // IN: primitive ID + uint32_t sampleIndex; // IN: sampleIndex +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_CS_CONTEXT +/// @brief Input to compute shader. +///////////////////////////////////////////////////////////////////////// +struct SWR_CS_CONTEXT +{ + // The ThreadGroupId is the current thread group index relative + // to all thread groups in the Dispatch call. The ThreadId, ThreadIdInGroup, + // and ThreadIdInGroupFlattened can be derived from ThreadGroupId in the shader. + + // Compute shader accepts the following system values. + // o ThreadId - Current thread id relative to all other threads in dispatch. + // o ThreadGroupId - Current thread group id relative to all other groups in dispatch.
+ // o ThreadIdInGroup - Current thread relative to all threads in the current thread group. + // o ThreadIdInGroupFlattened - Flattened linear id derived from ThreadIdInGroup. + // + // All of these system values can be computed in the shader. They will be + // derived from the current tile counter. The tile counter is an atomic counter that + // resides in the draw context and is initialized to the product of the dispatch dims. + // + // tileCounter = dispatchDims.x * dispatchDims.y * dispatchDims.z + // + // Each CPU worker thread will atomically decrement this counter and pass the current + // count into the shader. When the count reaches 0, all thread groups in the + // dispatch call have been completed. + + uint32_t tileCounter; // The tile counter value for this thread group. + + // Dispatch dimensions used by shader to compute system values from the tile counter. + uint32_t dispatchDims[3]; + + uint8_t* pTGSM; // Thread Group Shared Memory pointer. + + uint8_t* pSpillFillBuffer; // Spill/fill buffer for barrier support +}; + +// enums +enum SWR_TILE_MODE +{ + SWR_TILE_NONE = 0x0, // Linear mode (no tiling) + SWR_TILE_MODE_WMAJOR, // W major tiling + SWR_TILE_MODE_XMAJOR, // X major tiling + SWR_TILE_MODE_YMAJOR, // Y major tiling + SWR_TILE_SWRZ, // SWR-Z tiling + + SWR_TILE_MODE_COUNT +}; + +enum SWR_SURFACE_TYPE +{ + SURFACE_1D = 0, + SURFACE_2D = 1, + SURFACE_3D = 2, + SURFACE_CUBE = 3, + SURFACE_BUFFER = 4, + SURFACE_STRUCTURED_BUFFER = 5, + SURFACE_NULL = 7 +}; + +enum SWR_ZFUNCTION +{ + ZFUNC_ALWAYS, + ZFUNC_NEVER, + ZFUNC_LT, + ZFUNC_EQ, + ZFUNC_LE, + ZFUNC_GT, + ZFUNC_NE, + ZFUNC_GE, + NUM_ZFUNC +}; + +enum SWR_STENCILOP +{ + STENCILOP_KEEP, + STENCILOP_ZERO, + STENCILOP_REPLACE, + STENCILOP_INCRSAT, + STENCILOP_DECRSAT, + STENCILOP_INCR, + STENCILOP_DECR, + STENCILOP_INVERT +}; + +enum SWR_BLEND_FACTOR +{ + BLENDFACTOR_ONE, + BLENDFACTOR_SRC_COLOR, + BLENDFACTOR_SRC_ALPHA, + BLENDFACTOR_DST_ALPHA, + BLENDFACTOR_DST_COLOR, + BLENDFACTOR_SRC_ALPHA_SATURATE, + BLENDFACTOR_CONST_COLOR, + BLENDFACTOR_CONST_ALPHA, + BLENDFACTOR_SRC1_COLOR, + BLENDFACTOR_SRC1_ALPHA, + BLENDFACTOR_ZERO, + BLENDFACTOR_INV_SRC_COLOR, + BLENDFACTOR_INV_SRC_ALPHA, + BLENDFACTOR_INV_DST_ALPHA, + BLENDFACTOR_INV_DST_COLOR, + BLENDFACTOR_INV_CONST_COLOR, + BLENDFACTOR_INV_CONST_ALPHA, + BLENDFACTOR_INV_SRC1_COLOR, + BLENDFACTOR_INV_SRC1_ALPHA +}; + +enum SWR_BLEND_OP +{ + BLENDOP_ADD, + BLENDOP_SUBTRACT, + BLENDOP_REVSUBTRACT, + BLENDOP_MIN, + BLENDOP_MAX, +}; + +enum SWR_LOGIC_OP +{ + LOGICOP_CLEAR, + LOGICOP_NOR, + LOGICOP_AND_INVERTED, + LOGICOP_COPY_INVERTED, + LOGICOP_AND_REVERSE, + LOGICOP_INVERT, + LOGICOP_XOR, + LOGICOP_NAND, + LOGICOP_AND, + LOGICOP_EQUIV, + LOGICOP_NOOP, + LOGICOP_OR_INVERTED, + LOGICOP_COPY, + LOGICOP_OR_REVERSE, + LOGICOP_OR, + LOGICOP_SET, +}; + +struct SWR_SURFACE_STATE +{ + uint8_t *pBaseAddress; + SWR_SURFACE_TYPE type; // @llvm_enum + SWR_FORMAT format; // @llvm_enum + uint32_t width; + uint32_t height; + uint32_t depth; + uint32_t numSamples; + uint32_t samplePattern; + uint32_t pitch; + uint32_t qpitch; + uint32_t minLod; // for sampled surfaces, the most detailed LOD that can be accessed by sampler + uint32_t maxLod; // for sampled surfaces, the max LOD that can be accessed + float resourceMinLod; // for sampled surfaces, the most detailed fractional mip that can be accessed by sampler + uint32_t lod; // for render targets, the lod being rendered to + uint32_t arrayIndex; // for render targets, the array index being rendered to for arrayed surfaces +
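Tying the SWR_CS_CONTEXT comment above to concrete arithmetic: since the tile counter counts down from the product of the dispatch dimensions, a shader could recover the flattened and 3D group ids roughly as follows (a sketch only; it assumes tileCounter holds the pre-decrement value for this group):

    const uint32_t* dims = csCtx.dispatchDims;
    uint32_t total = dims[0] * dims[1] * dims[2];
    uint32_t flat  = total - csCtx.tileCounter;        // ThreadGroupIdFlattened, 0 .. total-1
    uint32_t gz    = flat / (dims[0] * dims[1]);
    uint32_t gy    = (flat % (dims[0] * dims[1])) / dims[0];
    uint32_t gx    = flat % dims[0];                   // ThreadGroupId = (gx, gy, gz)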
SWR_TILE_MODE tileMode; // @llvm_enum + bool bInterleavedSamples; // are MSAA samples stored interleaved or planar + uint32_t halign; + uint32_t valign; + uint32_t xOffset; + uint32_t yOffset; + + uint32_t lodOffsets[2][15]; // lod offsets for sampled surfaces + + uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. +}; + +// vertex fetch state +// WARNING- any changes to this struct need to be reflected +// in the fetch shader jit +struct SWR_VERTEX_BUFFER_STATE +{ + uint32_t index; + uint32_t pitch; + const uint8_t *pData; + uint32_t size; + uint32_t numaNode; + uint32_t maxVertex; // size / pitch. precalculated value used by fetch shader for OOB checks + uint32_t partialInboundsSize; // size % pitch. precalculated value used by fetch shader for partially OOB vertices +}; + +struct SWR_INDEX_BUFFER_STATE +{ + // Format type for indices (e.g. UINT16, UINT32, etc.) + SWR_FORMAT format; // @llvm_enum + const void *pIndices; + uint32_t size; +}; + + +////////////////////////////////////////////////////////////////////////// +/// SWR_FETCH_CONTEXT +/// @brief Input to fetch shader. +/// @note WARNING - Changes to this struct need to be reflected in the +/// fetch shader jit. +///////////////////////////////////////////////////////////////////////// +struct SWR_FETCH_CONTEXT +{ + const SWR_VERTEX_BUFFER_STATE* pStreams; // IN: array of bound vertex buffers + const int32_t* pIndices; // IN: pointer to index buffer for indexed draws + const int32_t* pLastIndex; // IN: pointer to end of index buffer, used for bounds checking + uint32_t CurInstance; // IN: current instance + uint32_t BaseVertex; // IN: base vertex + uint32_t StartVertex; // IN: start vertex + uint32_t StartInstance; // IN: start instance + simdscalari VertexID; // OUT: vector of vertex IDs + simdscalari CutMask; // OUT: vector mask of indices which have the cut index value +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_STATS +/// +/// @brief All statistics generated by SWR go here. These are public +/// to driver. +///////////////////////////////////////////////////////////////////////// +struct SWR_STATS +{ + // Occlusion Query + uint64_t DepthPassCount; // Number of passing depth tests. Not exact. + + // Pipeline Stats + uint64_t IaVertices; // Number of Fetch Shader vertices + uint64_t IaPrimitives; // Number of PA primitives. + uint64_t VsInvocations; // Number of Vertex Shader invocations + uint64_t HsInvocations; // Number of Hull Shader invocations + uint64_t DsInvocations; // Number of Domain Shader invocations + uint64_t GsInvocations; // Number of Geometry Shader invocations + uint64_t PsInvocations; // Number of Pixel Shader invocations + uint64_t CsInvocations; // Number of Compute Shader invocations + uint64_t CInvocations; // Number of clipper invocations + uint64_t CPrimitives; // Number of clipper primitives. + uint64_t GsPrimitives; // Number of prims GS outputs. + + // Streamout Stats + uint32_t SoWriteOffset[4]; + uint64_t SoPrimStorageNeeded[4]; + uint64_t SoNumPrimsWritten[4]; +}; + +////////////////////////////////////////////////////////////////////////// +/// STREAMOUT_BUFFERS +///////////////////////////////////////////////////////////////////////// + +#define MAX_SO_STREAMS 4 +#define MAX_ATTRIBUTES 32 + +struct SWR_STREAMOUT_BUFFER +{ + bool enable; + + // Pointers to streamout buffers. + uint32_t* pBuffer; + + // Size of buffer in dwords. + uint32_t bufferSize; + + // Vertex pitch of buffer in dwords. 
+ uint32_t pitch; + + // Offset into buffer in dwords. SOS will increment this offset. + uint32_t streamOffset; + + // Offset to the SO write offset. If not null then we update offset here. + uint32_t* pWriteOffset; + +}; + +////////////////////////////////////////////////////////////////////////// +/// STREAMOUT_STATE +///////////////////////////////////////////////////////////////////////// +struct SWR_STREAMOUT_STATE +{ + // Enables/disables stream output. + bool soEnable; + + // which streams are enabled for streamout + bool streamEnable[MAX_SO_STREAMS]; + + // If set then do not send any streams to the rasterizer. + bool rasterizerDisable; + + // Specifies which stream to send to the rasterizer. + uint32_t streamToRasterizer; + + // The stream masks specify which attributes are sent to which streams. + // These masks help the FE set up the pPrimData buffer that is passed + // to the Stream Output Shader (SOS) function. + uint32_t streamMasks[MAX_SO_STREAMS]; + + // Number of attributes, including position, per vertex that are streamed out. + // This should match number of bits in stream mask. + uint32_t streamNumEntries[MAX_SO_STREAMS]; +}; + +////////////////////////////////////////////////////////////////////////// +/// STREAMOUT_CONTEXT - Passed to SOS +///////////////////////////////////////////////////////////////////////// +struct SWR_STREAMOUT_CONTEXT +{ + uint32_t* pPrimData; + SWR_STREAMOUT_BUFFER* pBuffer[MAX_SO_STREAMS]; + + // Num prims written for this stream + uint32_t numPrimsWritten; + + // Num prims that should have been written if there were no overflow. + uint32_t numPrimStorageNeeded; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_GS_STATE - Geometry shader state +///////////////////////////////////////////////////////////////////////// +struct SWR_GS_STATE +{ + bool gsEnable; + + // number of input attributes per vertex. used by the frontend to + // optimize assembling primitives for GS + uint32_t numInputAttribs; + + // output topology - can be point, tristrip, or linestrip + PRIMITIVE_TOPOLOGY outputTopology; // @llvm_enum + + // maximum number of verts that can be emitted by a single instance of the GS + uint32_t maxNumVerts; + + // instance count + uint32_t instanceCount; + + // geometry shader emits renderTargetArrayIndex + bool emitsRenderTargetArrayIndex; + + // geometry shader emits PrimitiveID + bool emitsPrimitiveID; + + // if true, geometry shader emits a single stream, with separate cut buffer. + // if false, geometry shader emits vertices for multiple streams to the stream buffer, with a separate StreamID buffer + // to map vertices to streams + bool isSingleStream; + + // when single stream is enabled, singleStreamID dictates which stream is being output.
+ // field ignored if isSingleStream is false + uint32_t singleStreamID; +}; + + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_OUTPUT_TOPOLOGY - Defines data output by the tessellator / DS +///////////////////////////////////////////////////////////////////////// +enum SWR_TS_OUTPUT_TOPOLOGY +{ + SWR_TS_OUTPUT_POINT, + SWR_TS_OUTPUT_LINE, + SWR_TS_OUTPUT_TRI_CW, + SWR_TS_OUTPUT_TRI_CCW, + + SWR_TS_OUTPUT_TOPOLOGY_COUNT +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_PARTITIONING - Defines tessellation algorithm +///////////////////////////////////////////////////////////////////////// +enum SWR_TS_PARTITIONING +{ + SWR_TS_INTEGER, + SWR_TS_ODD_FRACTIONAL, + SWR_TS_EVEN_FRACTIONAL, + + SWR_TS_PARTITIONING_COUNT +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_DOMAIN - Defines Tessellation Domain +///////////////////////////////////////////////////////////////////////// +enum SWR_TS_DOMAIN +{ + SWR_TS_QUAD, + SWR_TS_TRI, + SWR_TS_ISOLINE, + + SWR_TS_DOMAIN_COUNT +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_TS_STATE - Tessellation state +///////////////////////////////////////////////////////////////////////// +struct SWR_TS_STATE +{ + bool tsEnable; + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology; // @llvm_enum + SWR_TS_PARTITIONING partitioning; // @llvm_enum + SWR_TS_DOMAIN domain; // @llvm_enum + + PRIMITIVE_TOPOLOGY postDSTopology; // @llvm_enum + + uint32_t numHsInputAttribs; + uint32_t numHsOutputAttribs; + uint32_t numDsOutputAttribs; +}; + +// output merger state +struct SWR_RENDER_TARGET_BLEND_STATE +{ + uint8_t writeDisableRed : 1; + uint8_t writeDisableGreen : 1; + uint8_t writeDisableBlue : 1; + uint8_t writeDisableAlpha : 1; +}; +static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); + +#define SWR_MAX_NUM_MULTISAMPLES 16 +enum SWR_MULTISAMPLE_COUNT +{ + SWR_MULTISAMPLE_1X = 0, + SWR_MULTISAMPLE_2X, + SWR_MULTISAMPLE_4X, + SWR_MULTISAMPLE_8X, + SWR_MULTISAMPLE_16X, + SWR_MULTISAMPLE_TYPE_MAX +}; + +struct SWR_BLEND_STATE +{ + // constant blend factor color in RGBA float + float constantColor[4]; + + // alpha test reference value in unorm8 or float32 + uint32_t alphaTestReference; + uint32_t sampleMask; + // all RT's have the same sample count + ///@todo move this to Output Merger state when we refactor + SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum + + SWR_RENDER_TARGET_BLEND_STATE renderTarget[SWR_NUM_RENDERTARGETS]; +}; +static_assert(sizeof(SWR_BLEND_STATE) == 36, "Invalid SWR_BLEND_STATE size"); + +////////////////////////////////////////////////////////////////////////// +/// FUNCTION POINTERS FOR SHADERS + +typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out); +typedef void(__cdecl *PFN_VERTEX_FUNC)(HANDLE hPrivateData, SWR_VS_CONTEXT* pVsContext); +typedef void(__cdecl *PFN_HS_FUNC)(HANDLE hPrivateData, SWR_HS_CONTEXT* pHsContext); +typedef void(__cdecl *PFN_DS_FUNC)(HANDLE hPrivateData, SWR_DS_CONTEXT* pDsContext); +typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsContext); +typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); +typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); +typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, 
simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); + +////////////////////////////////////////////////////////////////////////// +/// FRONTEND_STATE +///////////////////////////////////////////////////////////////////////// +struct SWR_FRONTEND_STATE +{ + // skip clip test, perspective divide, and viewport transform + // intended for verts in screen space + bool vpTransformDisable; + union + { + struct + { + uint32_t triFan : 2; + uint32_t lineStripList : 1; + uint32_t triStripList : 2; + }; + uint32_t bits; + }provokingVertex; + uint32_t topologyProvokingVertex; // provoking vertex for the draw topology +}; + +////////////////////////////////////////////////////////////////////////// +/// VIEWPORT_MATRIX +///////////////////////////////////////////////////////////////////////// +struct SWR_VIEWPORT_MATRIX +{ + float m00; + float m11; + float m22; + float m30; + float m31; + float m32; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_VIEWPORT +///////////////////////////////////////////////////////////////////////// +struct SWR_VIEWPORT +{ + float x; + float y; + float width; + float height; + float minZ; + float maxZ; +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_CULLMODE +////////////////////////////////////////////////////////////////////////// +enum SWR_CULLMODE +{ + SWR_CULLMODE_BOTH, + SWR_CULLMODE_NONE, + SWR_CULLMODE_FRONT, + SWR_CULLMODE_BACK +}; + +enum SWR_FILLMODE +{ + SWR_FILLMODE_POINT, + SWR_FILLMODE_WIREFRAME, + SWR_FILLMODE_SOLID +}; + +enum SWR_FRONTWINDING +{ + SWR_FRONTWINDING_CW, + SWR_FRONTWINDING_CCW +}; + + +enum SWR_MSAA_SAMPLE_PATTERN +{ + SWR_MSAA_CENTER_PATTERN, + SWR_MSAA_STANDARD_PATTERN, + SWR_MSAA_SAMPLE_PATTERN_MAX +}; + +enum SWR_PIXEL_LOCATION +{ + SWR_PIXEL_LOCATION_CENTER, + SWR_PIXEL_LOCATION_UL, +}; + +// fixed point screen space sample locations within a pixel +struct SWR_MULTISAMPLE_POS +{ + uint32_t x; + uint32_t y; +}; + +enum SWR_MSAA_RASTMODE +{ + SWR_MSAA_RASTMODE_OFF_PIXEL, + SWR_MSAA_RASTMODE_OFF_PATTERN, + SWR_MSAA_RASTMODE_ON_PIXEL, + SWR_MSAA_RASTMODE_ON_PATTERN +}; + +////////////////////////////////////////////////////////////////////////// +/// SWR_RASTSTATE +////////////////////////////////////////////////////////////////////////// +struct SWR_RASTSTATE +{ + uint32_t cullMode : 2; + uint32_t fillMode : 2; + uint32_t frontWinding : 1; + uint32_t scissorEnable : 1; + uint32_t depthClipEnable : 1; + float pointSize; + float lineWidth; + + // point size output from the VS + bool pointParam; + + // point sprite + bool pointSpriteEnable; + bool pointSpriteTopOrigin; + + // depth bias + float depthBias; + float slopeScaledDepthBias; + float depthBiasClamp; + SWR_FORMAT depthFormat; // @llvm_enum + + ///@todo: MSAA lines + // multisample state for MSAA lines + bool msaaRastEnable; + SWR_MSAA_RASTMODE rastMode; // @llvm_enum + + // sample count the rasterizer is running at + SWR_MULTISAMPLE_COUNT sampleCount; // @llvm_enum + bool bForcedSampleCount; + uint32_t pixelLocation; // UL or Center + bool pixelOffset; // offset pixel positions by .5 in both the horizontal and vertical direction + SWR_MULTISAMPLE_POS iSamplePos[SWR_MAX_NUM_MULTISAMPLES]; + SWR_MSAA_SAMPLE_PATTERN samplePattern; // @llvm_enum + + // user clip/cull distance enables + uint8_t cullDistanceMask; + uint8_t clipDistanceMask; +}; + +// backend state +struct SWR_BACKEND_STATE +{ + uint32_t constantInterpolationMask; + uint32_t pointSpriteTexCoordMask; + 
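The depth-bias fields in SWR_RASTSTATE above follow the usual D3D-style convention, where the applied bias combines a constant term and a slope-scaled term. A conventional formulation, sketched here for reference (the patch's actual computation is not shown; maxDepthSlope, the triangle's max(|dz/dx|, |dz/dy|), and the format epsilon r are assumed inputs):

    // r = smallest representable depth delta for depthFormat (e.g. 1.0f / (1 << 24) for D24)
    float bias = rastState.depthBias * r
               + rastState.slopeScaledDepthBias * maxDepthSlope;
    if (rastState.depthBiasClamp != 0.0f)
        bias = std::min(bias, rastState.depthBiasClamp);  // clamp shown for positive bias only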
uint8_t numAttributes; + uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; +}; + +union SWR_DEPTH_STENCIL_STATE +{ + struct + { + // dword 0 + uint32_t depthWriteEnable : 1; + uint32_t depthTestEnable : 1; + uint32_t stencilWriteEnable : 1; + uint32_t stencilTestEnable : 1; + uint32_t doubleSidedStencilTestEnable : 1; + + uint32_t depthTestFunc : 3; + uint32_t stencilTestFunc : 3; + + uint32_t backfaceStencilPassDepthPassOp : 3; + uint32_t backfaceStencilPassDepthFailOp : 3; + uint32_t backfaceStencilFailOp : 3; + uint32_t backfaceStencilTestFunc : 3; + uint32_t stencilPassDepthPassOp : 3; + uint32_t stencilPassDepthFailOp : 3; + uint32_t stencilFailOp : 3; + + // dword 1 + uint8_t backfaceStencilWriteMask; + uint8_t backfaceStencilTestMask; + uint8_t stencilWriteMask; + uint8_t stencilTestMask; + + // dword 2 + uint8_t backfaceStencilRefValue; + uint8_t stencilRefValue; + }; + uint32_t value[3]; +}; + +enum SWR_SHADING_RATE +{ + SWR_SHADING_RATE_PIXEL, + SWR_SHADING_RATE_SAMPLE, + SWR_SHADING_RATE_COARSE, + SWR_SHADING_RATE_MAX, +}; + +enum SWR_INPUT_COVERAGE +{ + SWR_INPUT_COVERAGE_NONE, + SWR_INPUT_COVERAGE_NORMAL, + SWR_INPUT_COVERAGE_MAX, +}; + +enum SWR_PS_POSITION_OFFSET +{ + SWR_PS_POSITION_SAMPLE_NONE, + SWR_PS_POSITION_SAMPLE_OFFSET, + SWR_PS_POSITION_CENTROID_OFFSET, + SWR_PS_POSITION_OFFSET_MAX, +}; + +enum SWR_BARYCENTRICS_MASK +{ + SWR_BARYCENTRIC_PER_PIXEL_MASK = 0x1, + SWR_BARYCENTRIC_CENTROID_MASK = 0x2, + SWR_BARYCENTRIC_PER_SAMPLE_MASK = 0x4, + SWR_BARYCENTRICS_MASK_MAX = 0x8 +}; + +// pixel shader state +struct SWR_PS_STATE +{ + // dword 0-1 + PFN_PIXEL_KERNEL pfnPixelShader; // @llvm_pfn + + // dword 2 + uint32_t killsPixel : 1; // pixel shader can kill pixels + uint32_t inputCoverage : 1; // type of input coverage PS uses + uint32_t writesODepth : 1; // pixel shader writes to depth + uint32_t usesSourceDepth : 1; // pixel shader reads depth + uint32_t shadingRate : 2; // shading per pixel / sample / coarse pixel + uint32_t numRenderTargets : 4; // number of render target outputs in use (0-8) + uint32_t posOffset : 2; // type of offset (none, sample, centroid) to add to pixel position + uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with + uint32_t usesUAV : 1; // pixel shader accesses UAV + uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/tessellator.h b/src/gallium/drivers/swr/rasterizer/core/tessellator.h new file mode 100644 index 00000000000..915ac77897b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/tessellator.h @@ -0,0 +1,88 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file tessellator.h +* +* @brief Tessellator fixed function unit interface definition +* +******************************************************************************/ +#pragma once + +/// Allocate and initialize a new tessellation context +HANDLE SWR_API TSInitCtx( + SWR_TS_DOMAIN tsDomain, ///< [IN] Tessellation domain (isoline, quad, triangle) + SWR_TS_PARTITIONING tsPartitioning, ///< [IN] Tessellation partitioning algorithm + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, ///< [IN] Tessellation output topology + void* pContextMem, ///< [IN] Memory to use for the context + size_t& memSize); ///< [INOUT] In: Amount of memory in pContextMem. Out: Mem required + +/// Destroy & de-allocate tessellation context +void SWR_API TSDestroyCtx( + HANDLE tsCtx); ///< [IN] Tessellation context to be destroyed + +struct SWR_TS_TESSELLATED_DATA +{ + uint32_t NumPrimitives; + uint32_t NumDomainPoints; + + uint32_t* ppIndices[3]; + float* pDomainPointsU; + float* pDomainPointsV; + // For Tri: pDomainPointsW[i] = 1.0f - pDomainPointsU[i] - pDomainPointsV[i] +}; + +/// Perform Tessellation +void SWR_API TSTessellate( + HANDLE tsCtx, ///< [IN] Tessellation Context + const SWR_TESSELLATION_FACTORS& tsTessFactors, ///< [IN] Tessellation Factors + SWR_TS_TESSELLATED_DATA& tsTessellatedData); ///< [OUT] Tessellated Data + + + +/// @TODO - Implement OSS tessellator + +INLINE HANDLE SWR_API TSInitCtx( + SWR_TS_DOMAIN tsDomain, + SWR_TS_PARTITIONING tsPartitioning, + SWR_TS_OUTPUT_TOPOLOGY tsOutputTopology, + void* pContextMem, + size_t& memSize) +{ + SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); + return NULL; +} + + +INLINE void SWR_API TSDestroyCtx(HANDLE tsCtx) +{ + SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); +} + + +INLINE void SWR_API TSTessellate( + HANDLE tsCtx, + const SWR_TESSELLATION_FACTORS& tsTessFactors, + SWR_TS_TESSELLATED_DATA& tsTessellatedData) +{ + SWR_ASSERT(0, "%s: Not Implemented", __FUNCTION__); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp new file mode 100644 index 00000000000..24c5588bfec --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -0,0 +1,962 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +****************************************************************************/ + +#include <stdio.h> +#include <thread> +#include <algorithm> +#include <unordered_set> +#include <float.h> +#include <vector> +#include <utility> +#include <fstream> +#include <string> + +#if defined(__linux__) || defined(__gnu_linux__) +#include <pthread.h> +#include <sched.h> +#include <unistd.h> +#endif + +#include "common/os.h" +#include "context.h" +#include "frontend.h" +#include "backend.h" +#include "rasterizer.h" +#include "rdtsc_core.h" +#include "tilemgr.h" +#include "core/multisample.h" + + + + +// ThreadId +struct Core +{ + uint32_t procGroup = 0; + std::vector<uint32_t> threadIds; +}; + +struct NumaNode +{ + std::vector<Core> cores; +}; + +typedef std::vector<NumaNode> CPUNumaNodes; + +void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThreadsPerProcGroup) +{ + out_nodes.clear(); + out_numThreadsPerProcGroup = 0; + +#if defined(_WIN32) + + SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; + DWORD bufSize = sizeof(buffer); + + BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); + SWR_ASSERT(ret != FALSE, "Failed to get Processor Topology Information"); + + uint32_t count = bufSize / buffer->Size; + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX pBuffer = buffer; + + for (uint32_t i = 0; i < count; ++i) + { + SWR_ASSERT(pBuffer->Relationship == RelationProcessorCore); + for (uint32_t g = 0; g < pBuffer->Processor.GroupCount; ++g) + { + auto& gmask = pBuffer->Processor.GroupMask[g]; + uint32_t threadId = 0; + uint32_t procGroup = gmask.Group; + + Core* pCore = nullptr; + + uint32_t numThreads = (uint32_t)_mm_popcount_sizeT(gmask.Mask); + + while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask)) + { + // clear mask + gmask.Mask &= ~(KAFFINITY(1) << threadId); + + // Find Numa Node + PROCESSOR_NUMBER procNum = {}; + procNum.Group = WORD(procGroup); + procNum.Number = UCHAR(threadId); + + uint32_t numaId = 0; + ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId); + SWR_ASSERT(ret); + + // Store data + if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); + auto& numaNode = out_nodes[numaId]; + + uint32_t coreId = 0; + + if (nullptr == pCore) + { + numaNode.cores.push_back(Core()); + pCore = &numaNode.cores.back(); + pCore->procGroup = procGroup; +#if !defined(_WIN64) + coreId = (uint32_t)numaNode.cores.size(); + if ((coreId * numThreads) >= 32) + { + // Windows doesn't return threadIds >= 32 for a processor group correctly + // when running a 32-bit application. 
+ // Just save -1 as the threadId + threadId = uint32_t(-1); + } +#endif + } + pCore->threadIds.push_back(threadId); + if (procGroup == 0) + { + out_numThreadsPerProcGroup++; + } + } + } + pBuffer = PtrAdd(pBuffer, pBuffer->Size); + } + + +#elif defined(__linux__) || defined (__gnu_linux__) + + // Parse /proc/cpuinfo to get full topology + std::ifstream input("/proc/cpuinfo"); + std::string line; + char* c; + uint32_t threadId = uint32_t(-1); + uint32_t coreId = uint32_t(-1); + uint32_t numaId = uint32_t(-1); + + while (std::getline(input, line)) + { + if (line.find("processor") != std::string::npos) + { + if (threadId != uint32_t(-1)) + { + // Save information. + if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); + auto& numaNode = out_nodes[numaId]; + if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); + auto& core = numaNode.cores[coreId]; + + core.procGroup = coreId; + core.threadIds.push_back(threadId); + + out_numThreadsPerProcGroup++; + } + + auto data_start = line.find(": ") + 2; + threadId = std::strtoul(&line.c_str()[data_start], &c, 10); + continue; + } + if (line.find("core id") != std::string::npos) + { + auto data_start = line.find(": ") + 2; + coreId = std::strtoul(&line.c_str()[data_start], &c, 10); + continue; + } + if (line.find("physical id") != std::string::npos) + { + auto data_start = line.find(": ") + 2; + numaId = std::strtoul(&line.c_str()[data_start], &c, 10); + continue; + } + } + + if (threadId != uint32_t(-1)) + { + // Save information. + if (out_nodes.size() <= numaId) out_nodes.resize(numaId + 1); + auto& numaNode = out_nodes[numaId]; + if (numaNode.cores.size() <= coreId) numaNode.cores.resize(coreId + 1); + auto& core = numaNode.cores[coreId]; + + core.procGroup = coreId; + core.threadIds.push_back(threadId); + out_numThreadsPerProcGroup++; + } + + for (uint32_t node = 0; node < out_nodes.size(); node++) { + auto& numaNode = out_nodes[node]; + auto it = numaNode.cores.begin(); + for ( ; it != numaNode.cores.end(); ) { + if (it->threadIds.size() == 0) + numaNode.cores.erase(it); + else + ++it; + } + } + +#else + +#error Unsupported platform + +#endif +} + + +void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=false) +{ + // Only bind threads when MAX_WORKER_THREADS isn't set. + if (KNOB_MAX_WORKER_THREADS && bindProcGroup == false) + { + return; + } + +#if defined(_WIN32) + { + GROUP_AFFINITY affinity = {}; + affinity.Group = procGroupId; + +#if !defined(_WIN64) + if (threadId >= 32) + { + // In a 32-bit process on Windows it is impossible to bind + // to logical processors 32-63 within a processor group. + // In this case set the mask to 0 and let the system assign + // the processor. Hopefully it will make smart choices. + affinity.Mask = 0; + } + else +#endif + { + // If KNOB_MAX_WORKER_THREADS is set, only bind to the proc group, + // Not the individual HW thread. 
+ if (!KNOB_MAX_WORKER_THREADS) + { + affinity.Mask = KAFFINITY(1) << threadId; + } + } + + SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr); + } +#else + cpu_set_t cpuset; + pthread_t thread = pthread_self(); + CPU_ZERO(&cpuset); + CPU_SET(threadId, &cpuset); + + pthread_setaffinity_np(thread, sizeof(cpu_set_t), &cpuset); +#endif +} + +INLINE +uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) +{ + //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); + //return result; + return pContext->DrawEnqueued; +} + +INLINE +DRAW_CONTEXT *GetDC(SWR_CONTEXT *pContext, uint64_t drawId) +{ + return &pContext->dcRing[(drawId-1) % KNOB_MAX_DRAWS_IN_FLIGHT]; +} + +// returns true if dependency not met +INLINE +bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastRetiredDraw) +{ + return (pDC->dependency > lastRetiredDraw); +} + +void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valR = _simd_broadcast_ss(&pClearData[0]); + simdscalar valG = _simd_broadcast_ss(&pClearData[1]); + simdscalar valB = _simd_broadcast_ss(&pClearData[2]); + simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) + { + _simd_store_ps(pfBuf, valR); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valG); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valB); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valA); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + { + _simd_store_ps(pfBuf, valZ); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void ClearStencilHotTile(const HOTTILE* pHotTile) +{ + // convert from F32 to U8. + uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); + //broadcast 32x into __m256i... + simdscalari valS = _simd_set1_epi8(clearVal); + + simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. 
+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) + { + _simd_store_si(pBuf, valS); + pBuf += 1; + } + } + } +} + +// For draw calls, we initialize the active hot tiles and perform a deferred +// load on them if the tile is in an invalid state. We do this in the outer thread loop instead of inside +// the draw routine itself, mainly for performance, to avoid unnecessary setup +// for every triangle +// @todo support deferred clear +INLINE +void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) +{ + const API_STATE& state = GetApiState(pDC); + HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + x *= KNOB_MACROTILE_X_DIM; + y *= KNOB_MACROTILE_Y_DIM; + + uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); + + // check RT if enabled + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while(_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); + + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearColorHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + colorHottileEnableMask &= ~(1 << rtSlot); + } + + // check depth if enabled + if (state.depthHottileEnable) + { + HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearDepthHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } + + // check stencil if enabled + if (state.stencilHottileEnable) + { + HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile.
+            ClearStencilHotTile(pHotTile);
+            pHotTile->state = HOTTILE_DIRTY;
+            RDTSC_STOP(BELoadTiles, 0, 0);
+        }
+    }
+}
+
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+{
+    // increment our current draw id to the first incomplete draw
+    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    while (curDrawBE < drawEnqueued)
+    {
+        DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+        // If it's not compute and the FE is not done, then break out of the loop.
+        if (!pDC->doneFE && !pDC->isCompute) break;
+
+        bool isWorkComplete = (pDC->isCompute) ?
+            pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+
+        if (isWorkComplete)
+        {
+            curDrawBE++;
+            InterlockedIncrement(&pDC->threadsDoneBE);
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    // If there are no more incomplete draws then return false.
+    return curDrawBE < drawEnqueued;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief If there is any BE work then go work on it.
+/// @param pContext - pointer to SWR context.
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
+///                    has its own curDrawBE counter and this ensures that each worker processes all the
+///                    draws in order.
+/// @param lockedTiles - This is the set of tiles locked by other threads. Each thread maintains its
+///                      own set, and each time it fails to lock a macrotile because it's already locked,
+///                      it adds that tile to the lockedTiles set. As a worker begins to work
+///                      on future draws, the lockedTiles set ensures that it doesn't work on tiles that may
+///                      still have work pending in a previous draw. Additionally, lockedTiles is a
+///                      heuristic that can steer a worker back to the same macrotile that it had been
+///                      working on in a previous draw.
+void WorkOnFifoBE(
+    SWR_CONTEXT *pContext,
+    uint32_t workerId,
+    uint64_t &curDrawBE,
+    std::unordered_set<uint32_t>& lockedTiles)
+{
+    // Find the first incomplete draw that has pending work. If no such draw is found then
+    // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
+    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    {
+        return;
+    }
+
+    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+
+    // Reset our history for locked tiles. We'll have to re-learn which tiles are locked.
+    lockedTiles.clear();
+
+    // Try to work on each draw in order of the available draws in flight.
+    //   1. If we're on curDrawBE, we can work on any macrotile that is available.
+    //   2. If we're trying to work on draws after curDrawBE, we are restricted to
+    //      working on those macrotiles that are known to be complete in the prior draw to
+    //      maintain order. The locked tiles provide the history that ensures this.
+    for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+    {
+        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+
+        if (pDC->isCompute) return; // We don't look at compute work.
+
+        // First wait for the FE to be finished with this draw. This keeps the threading model
+        // simple, but if there are lots of bubbles between draws then serializing FE and BE may
+        // need to be revisited.
+        if (!pDC->doneFE) return;
+
+        // If this draw is dependent on a previous draw then we need to bail.
+        if (CheckDependency(pContext, pDC, lastRetiredDraw))
+        {
+            return;
+        }
+
+        // Grab the list of all dirty macrotiles. A tile is dirty if it has work queued to it.
+        std::vector<uint32_t> &macroTiles = pDC->pTileMgr->getDirtyTiles();
+
+        for (uint32_t tileID : macroTiles)
+        {
+            MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID);
+
+            // can only work on this draw if it's not in use by other threads
+            if (lockedTiles.find(tileID) == lockedTiles.end())
+            {
+                if (tile.getNumQueued())
+                {
+                    if (tile.tryLock())
+                    {
+                        BE_WORK *pWork;
+
+                        RDTSC_START(WorkerFoundWork);
+
+                        uint32_t numWorkItems = tile.getNumQueued();
+
+                        if (numWorkItems != 0)
+                        {
+                            pWork = tile.peek();
+                            SWR_ASSERT(pWork);
+                            if (pWork->type == DRAW)
+                            {
+                                InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
+                            }
+                        }
+
+                        while ((pWork = tile.peek()) != nullptr)
+                        {
+                            pWork->pfnWork(pDC, workerId, tileID, &pWork->desc);
+                            tile.dequeue();
+                        }
+                        RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId);
+
+                        _ReadWriteBarrier();
+
+                        pDC->pTileMgr->markTileComplete(tileID);
+
+                        // Optimization: If the draw is complete and we're the last one to have
+                        // worked on it, then we can reset the locked list, since all draws
+                        // before the next one are guaranteed to be complete.
+                        if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete())
+                        {
+                            // We can increment the current BE and safely move to the next draw since we know this draw is complete.
+                            curDrawBE++;
+                            InterlockedIncrement(&pDC->threadsDoneBE);
+
+                            lastRetiredDraw++;
+
+                            lockedTiles.clear();
+                            break;
+                        }
+                    }
+                    else
+                    {
+                        // This tile is already locked, so add it to our locked tiles set. This way we don't try locking it again.
+                        lockedTiles.insert(tileID);
+                    }
+                }
+            }
+        }
+    }
+}
+
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+{
+    // Try to grab the next DC from the ring
+    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    while (curDrawFE < drawEnqueued)
+    {
+        uint32_t dcSlot = curDrawFE % KNOB_MAX_DRAWS_IN_FLIGHT;
+        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
+        if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
+        {
+            curDrawFE++;
+            InterlockedIncrement(&pDC->threadsDoneFE);
+        }
+        else
+        {
+            break;
+        }
+    }
+
+    uint64_t curDraw = curDrawFE;
+    while (curDraw < drawEnqueued)
+    {
+        uint32_t dcSlot = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+        DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
+
+        if (!pDC->isCompute && !pDC->FeLock)
+        {
+            uint32_t initial = InterlockedCompareExchange((volatile uint32_t*)&pDC->FeLock, 1, 0);
+            if (initial == 0)
+            {
+                // successfully grabbed the DC, now run the FE
+                pDC->FeWork.pfnWork(pContext, pDC, workerId, &pDC->FeWork.desc);
+
+                _ReadWriteBarrier();
+                pDC->doneFE = true;
+            }
+        }
+        curDraw++;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief If there is any compute work then go work on it.
+/// @param pContext - pointer to SWR context.
+/// @param workerId - The unique worker ID that is assigned to this thread.
+/// @param curDrawBE - This tracks the draw contexts that this thread has processed. Each worker thread
+///                    has its own curDrawBE counter and this ensures that each worker processes all the
+///                    draws in order.
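+///                  Compute work is claimed through the draw's DispatchQueue:
+///                  workers atomically decrement the available-task count to
+///                  claim a thread group, and whichever worker drops the
+///                  outstanding count to zero marks the dispatch complete
+///                  (see DispatchQueue in tilemgr.h).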
+void WorkOnCompute(
+    SWR_CONTEXT *pContext,
+    uint32_t workerId,
+    uint64_t& curDrawBE)
+{
+    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    {
+        return;
+    }
+
+    uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
+
+    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
+    if (pDC->isCompute == false) return;
+
+    // check dependencies
+    if (CheckDependency(pContext, pDC, lastRetiredDraw))
+    {
+        return;
+    }
+
+    SWR_ASSERT(pDC->pDispatch != nullptr);
+    DispatchQueue& queue = *pDC->pDispatch;
+
+    // Is there any work remaining?
+    if (queue.getNumQueued() > 0)
+    {
+        bool lastToComplete = false;
+
+        uint32_t threadGroupId = 0;
+        while (queue.getWork(threadGroupId))
+        {
+            ProcessComputeBE(pDC, workerId, threadGroupId);
+
+            lastToComplete = queue.finishedWork();
+        }
+
+        _ReadWriteBarrier();
+
+        if (lastToComplete)
+        {
+            SWR_ASSERT(queue.isWorkComplete() == true);
+            pDC->doneCompute = true;
+        }
+    }
+}
+
+DWORD workerThreadMain(LPVOID pData)
+{
+    THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
+    SWR_CONTEXT *pContext = pThreadData->pContext;
+    uint32_t threadId = pThreadData->threadId;
+    uint32_t workerId = pThreadData->workerId;
+
+    bindThread(threadId, pThreadData->procGroupId, pThreadData->forceBindProcGroup);
+
+    RDTSC_INIT(threadId);
+
+    int numaNode = (int)pThreadData->numaId;
+
+    // flush denormals to 0
+    _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
+
+    // Track tiles locked by other threads. If we try to lock a macrotile and find it's already
+    // locked, then we'll add it to this list so that we don't try to lock it again.
+    std::unordered_set<uint32_t> lockedTiles;
+
+    // Each worker has the ability to work on any of the queued draws as long as certain
+    // conditions are met. The data associated
+    // with a draw is guaranteed to be active as long as a worker hasn't signaled that it
+    // has moved on to the next draw when it determines there is no more work to do. The API
+    // thread will not increment the head of the dc ring until all workers have moved past the
+    // current head.
+    // The logic to determine what to work on is:
+    //    1- try to work on the FE of any draw that is queued. For now there are no dependencies
+    //       on the FE work, so any worker can grab any FE and process it in parallel. Eventually
+    //       we'll need dependency tracking to force serialization on FEs. The worker will try
+    //       to pick an FE by atomically incrementing a counter in the swr context. It'll keep
+    //       trying until it reaches the tail.
+    //    2- BE work must be done in strict order. We accomplish this today by pulling work off
+    //       the oldest draw (i.e. the head) of the dcRing. The worker can determine if there is
+    //       any work left by comparing the total # of binned work items and the total # of completed
+    //       work items. If they are equal, then there is no more work to do for this draw, and
+    //       the worker can safely increment its oldestDraw counter and move on to the next draw.
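+    // For example, with draws 1..3 in flight, two workers may run the FEs of
+    // draws 2 and 3 concurrently, but BE work for a macrotile in draw 2 can
+    // only start once that macrotile is complete in draw 1; the curDrawFE and
+    // curDrawBE counters below track each worker's progress through that
+    // ordering.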
+ std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); + + auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; + + uint64_t curDrawBE = 1; + uint64_t curDrawFE = 1; + + while (pContext->threadPool.inThreadShutdown == false) + { + uint32_t loop = 0; + while (loop++ < KNOB_WORKER_SPIN_LOOP_COUNT && !threadHasWork(curDrawBE)) + { + _mm_pause(); + } + + if (!threadHasWork(curDrawBE)) + { + lock.lock(); + + // check for thread idle condition again under lock + if (threadHasWork(curDrawBE)) + { + lock.unlock(); + continue; + } + + if (pContext->threadPool.inThreadShutdown) + { + lock.unlock(); + break; + } + + RDTSC_START(WorkerWaitForThreadEvent); + + pContext->FifosNotEmpty.wait(lock); + lock.unlock(); + + RDTSC_STOP(WorkerWaitForThreadEvent, 0, 0); + + if (pContext->threadPool.inThreadShutdown) + { + break; + } + } + + RDTSC_START(WorkerWorkOnFifoBE); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles); + RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); + + WorkOnCompute(pContext, workerId, curDrawBE); + + WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode); + } + + return 0; +} + +DWORD workerThreadInit(LPVOID pData) +{ +#if defined(_WIN32) + __try +#endif // _WIN32 + { + return workerThreadMain(pData); + } + +#if defined(_WIN32) + __except(EXCEPTION_CONTINUE_SEARCH) + { + } + +#endif // _WIN32 + + return 1; +} + +void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) +{ + bindThread(0); + + CPUNumaNodes nodes; + uint32_t numThreadsPerProcGroup = 0; + CalculateProcessorTopology(nodes, numThreadsPerProcGroup); + + uint32_t numHWNodes = (uint32_t)nodes.size(); + uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size(); + uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size(); + + uint32_t numNodes = numHWNodes; + uint32_t numCoresPerNode = numHWCoresPerNode; + uint32_t numHyperThreads = numHWHyperThreads; + + if (KNOB_MAX_NUMA_NODES) + { + numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); + } + + if (KNOB_MAX_CORES_PER_NUMA_NODE) + { + numCoresPerNode = std::min(numCoresPerNode, KNOB_MAX_CORES_PER_NUMA_NODE); + } + + if (KNOB_MAX_THREADS_PER_CORE) + { + numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); + } + + // Calculate numThreads + uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; + + if (KNOB_MAX_WORKER_THREADS) + { + uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads; + numThreads = std::min(KNOB_MAX_WORKER_THREADS, maxHWThreads); + } + + if (numThreads > KNOB_MAX_NUM_THREADS) + { + printf("WARNING: system thread count %u exceeds max %u, " + "performance will be degraded\n", + numThreads, KNOB_MAX_NUM_THREADS); + } + + if (numThreads == 1) + { + // If only 1 worker thread, try to move it to an available + // HW thread. If that fails, use the API thread. + if (numCoresPerNode < numHWCoresPerNode) + { + numCoresPerNode++; + } + else if (numHyperThreads < numHWHyperThreads) + { + numHyperThreads++; + } + else if (numNodes < numHWNodes) + { + numNodes++; + } + else + { + pPool->numThreads = 0; + SET_KNOB(SINGLE_THREADED, true); + return; + } + } + else + { + // Save a HW thread for the API thread. 
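+        // For example, on a single-node 4-core / 8-thread CPU with the
+        // default knob settings, numThreads computes to 8 and the decrement
+        // below leaves 7 worker threads plus the API thread.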
+ numThreads--; + } + + pPool->numThreads = numThreads; + pContext->NumWorkerThreads = pPool->numThreads; + + pPool->inThreadShutdown = false; + pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + + if (KNOB_MAX_WORKER_THREADS) + { + bool bForceBindProcGroup = (numThreads > numThreadsPerProcGroup); + uint32_t numProcGroups = (numThreads + numThreadsPerProcGroup - 1) / numThreadsPerProcGroup; + // When MAX_WORKER_THREADS is set we don't bother to bind to specific HW threads + // But Windows will still require binding to specific process groups + for (uint32_t workerId = 0; workerId < numThreads; ++workerId) + { + pPool->pThreadData[workerId].workerId = workerId; + pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; + pPool->pThreadData[workerId].threadId = 0; + pPool->pThreadData[workerId].numaId = 0; + pPool->pThreadData[workerId].pContext = pContext; + pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + } + } + else + { + uint32_t workerId = 0; + for (uint32_t n = 0; n < numNodes; ++n) + { + auto& node = nodes[n]; + + uint32_t numCores = numCoresPerNode; + for (uint32_t c = 0; c < numCores; ++c) + { + auto& core = node.cores[c]; + for (uint32_t t = 0; t < numHyperThreads; ++t) + { + if (c == 0 && n == 0 && t == 0) + { + // Skip core 0, thread0 on node 0 to reserve for API thread + continue; + } + + pPool->pThreadData[workerId].workerId = workerId; + pPool->pThreadData[workerId].procGroupId = core.procGroup; + pPool->pThreadData[workerId].threadId = core.threadIds[t]; + pPool->pThreadData[workerId].numaId = n; + pPool->pThreadData[workerId].pContext = pContext; + pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + + ++workerId; + } + } + } + } +} + +void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) +{ + if (!KNOB_SINGLE_THREADED) + { + // Inform threads to finish up + std::unique_lock<std::mutex> lock(pContext->WaitLock); + pPool->inThreadShutdown = true; + _mm_mfence(); + pContext->FifosNotEmpty.notify_all(); + lock.unlock(); + + // Wait for threads to finish and destroy them + for (uint32_t t = 0; t < pPool->numThreads; ++t) + { + pPool->threads[t]->join(); + delete(pPool->threads[t]); + } + + // Clean up data used by threads + free(pPool->pThreadData); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h new file mode 100644 index 00000000000..0fa7196f5ac --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -0,0 +1,63 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file threads.h
+*
+* @brief Definitions for SWR threading model.
+*
+******************************************************************************/
+#pragma once
+
+#include "knobs.h"
+
+#include <unordered_set>
+#include <thread>
+typedef std::thread* THREAD_PTR;
+
+struct SWR_CONTEXT;
+
+struct THREAD_DATA
+{
+    uint32_t procGroupId;       // Will always be 0 for non-Windows OS
+    uint32_t threadId;          // within the procGroup for Windows
+    uint32_t numaId;            // NUMA node id
+    uint32_t workerId;
+    SWR_CONTEXT *pContext;
+    bool forceBindProcGroup;    // Only useful when KNOB_MAX_WORKER_THREADS is set.
+};
+
+
+struct THREAD_POOL
+{
+    THREAD_PTR threads[KNOB_MAX_NUM_THREADS];
+    uint32_t numThreads;
+    volatile bool inThreadShutdown;
+    THREAD_DATA *pThreadData;
+};
+
+void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
+
+// Expose FE and BE worker functions to the API thread if single threaded
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
new file mode 100644
index 00000000000..860393661e2
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -0,0 +1,105 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilemgr.cpp
+*
+* @brief Implementation for the Macro Tile Manager, which provides the facilities
+* for threads to work on a macro tile.
+*
+******************************************************************************/
+#include <unordered_map>
+
+#include "fifo.hpp"
+#include "tilemgr.h"
+
+#define TILE_ID(x,y) (((x) << 16) | (y))
+
+// override new/delete for alignment
+void *MacroTileMgr::operator new(size_t size)
+{
+    return _aligned_malloc(size, 64);
+}
+
+void MacroTileMgr::operator delete(void *p)
+{
+    _aligned_free(p);
+}
+
+void* DispatchQueue::operator new(size_t size)
+{
+    return _aligned_malloc(size, 64);
+}
+
+void DispatchQueue::operator delete(void *p)
+{
+    _aligned_free(p);
+}
+
+MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+{
+}
+
+void MacroTileMgr::initialize()
+{
+    mWorkItemsProduced = 0;
+    mWorkItemsConsumed = 0;
+
+    mDirtyTiles.clear();
+}
+
+void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
+{
+    // Should not enqueue more than what we have backing for in the hot tile manager.
+    SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+    SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+    uint32_t id = TILE_ID(x, y);
+
+    MacroTileQueue &tile = mTiles[id];
+    tile.mWorkItemsFE++;
+
+    if (tile.mWorkItemsFE == 1)
+    {
+        tile.clear(mArena);
+        mDirtyTiles.push_back(id);
+    }
+
+    mWorkItemsProduced++;
+    tile.enqueue_try_nosync(mArena, pWork);
+}
+
+void MacroTileMgr::markTileComplete(uint32_t id)
+{
+    SWR_ASSERT(mTiles.find(id) != mTiles.end());
+    MacroTileQueue &tile = mTiles[id];
+    uint32_t numTiles = tile.mWorkItemsFE;
+    InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
+
+    _ReadWriteBarrier();
+    tile.mWorkItemsBE += numTiles;
+    SWR_ASSERT(tile.mWorkItemsFE == tile.mWorkItemsBE);
+
+    // Clear out the tile, but defer the fifo clear until the next DC first queues to it.
+    // This prevents worker threads from constantly locking a completed macro tile.
+    tile.mWorkItemsFE = 0;
+    tile.mWorkItemsBE = 0;
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
new file mode 100644
index 00000000000..9137941bad4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -0,0 +1,390 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tilemgr.h
+*
+* @brief Definitions for the Macro Tile Manager, which provides the facilities
+* for threads to work on a macro tile.
+* +******************************************************************************/ +#pragma once + +#include <set> +#include <unordered_map> +#include "common/formats.h" +#include "fifo.hpp" +#include "context.h" +#include "format_traits.h" + +////////////////////////////////////////////////////////////////////////// +/// MacroTile - work queue for a tile. +////////////////////////////////////////////////////////////////////////// +struct MacroTileQueue +{ + MacroTileQueue() { } + ~MacroTileQueue() { } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Returns number of work items queued for this tile. + uint32_t getNumQueued() + { + return mFifo.getNumQueued(); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Attempt to lock the work fifo. If already locked then return false. + bool tryLock() + { + return mFifo.tryLock(); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Clear fifo and unlock it. + void clear(Arena& arena) + { + mFifo.clear(arena); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Peek at work sitting at the front of the fifo. + BE_WORK* peek() + { + return mFifo.peek(); + } + + bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry) + { + return mFifo.enqueue_try_nosync(arena, entry); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Move to next work item + void dequeue() + { + mFifo.dequeue_noinc(); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Destroy fifo + void destroy() + { + mFifo.destroy(); + } + + ///@todo This will all be private. + uint32_t mWorkItemsFE = 0; + uint32_t mWorkItemsBE = 0; + +private: + QUEUE<BE_WORK> mFifo; +}; + +////////////////////////////////////////////////////////////////////////// +/// MacroTileMgr - Manages macrotiles for a draw. +////////////////////////////////////////////////////////////////////////// +class MacroTileMgr +{ +public: + MacroTileMgr(Arena& arena); + ~MacroTileMgr() + { + for (auto &tile : mTiles) + { + tile.second.destroy(); + } + } + + void initialize(); + INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; } + INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } + void markTileComplete(uint32_t id); + + INLINE bool isWorkComplete() + { + return mWorkItemsProduced == mWorkItemsConsumed; + } + + void enqueue(uint32_t x, uint32_t y, BE_WORK *pWork); + + static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y) + { + y = tileID & 0xffff; + x = (tileID >> 16) & 0xffff; + } + + void *operator new(size_t size); + void operator delete (void *p); + +private: + Arena& mArena; + SWR_FORMAT mFormat; + std::unordered_map<uint32_t, MacroTileQueue> mTiles; + + // Any tile that has work queued to it is a dirty tile. + std::vector<uint32_t> mDirtyTiles; + + OSALIGNLINE(LONG) mWorkItemsProduced; + OSALIGNLINE(volatile LONG) mWorkItemsConsumed; +}; + +////////////////////////////////////////////////////////////////////////// +/// DispatchQueue - work queue for dispatch +////////////////////////////////////////////////////////////////////////// +class DispatchQueue +{ +public: + DispatchQueue() {} + + ////////////////////////////////////////////////////////////////////////// + /// @brief Setup the producer consumer counts. 
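+    /// For example, a dispatch of 4 thread groups starts with
+    /// available = outstanding = 4; getWork() then hands out groupIds
+    /// 3, 2, 1, 0 as workers decrement the available count, and
+    /// finishedWork() returns true for whichever worker drops the
+    /// outstanding count to 0.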
+    void initialize(uint32_t totalTasks, void* pTaskData)
+    {
+        // The available and outstanding counts start with total tasks.
+        // At the start there are N tasks available and outstanding.
+        // When both the available and outstanding counts have reached 0 then all work has completed.
+        // When a worker starts on a threadgroup then it decrements the available count.
+        // When a worker completes a threadgroup then it decrements the outstanding count.
+
+        mTasksAvailable = totalTasks;
+        mTasksOutstanding = totalTasks;
+
+        mpTaskData = pTaskData;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Returns the number of tasks available for this dispatch.
+    uint32_t getNumQueued()
+    {
+        return (mTasksAvailable > 0) ? mTasksAvailable : 0;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Atomically decrement the work available count. If the result
+    ///        is greater than or equal to 0 then we can work on the associated
+    ///        thread group. Otherwise, there is no more work to do.
+    bool getWork(uint32_t& groupId)
+    {
+        LONG result = InterlockedDecrement(&mTasksAvailable);
+
+        if (result >= 0)
+        {
+            groupId = result;
+            return true;
+        }
+
+        return false;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Atomically decrement the outstanding count. A worker calls this
+    ///        to signal that it just finished some work. Also, return true if
+    ///        we're the last worker to complete this dispatch.
+    bool finishedWork()
+    {
+        LONG result = InterlockedDecrement(&mTasksOutstanding);
+        SWR_ASSERT(result >= 0, "Should never oversubscribe work");
+
+        return (result == 0) ? true : false;
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Work is complete once both the available/outstanding counts have reached 0.
+    bool isWorkComplete()
+    {
+        return ((mTasksAvailable <= 0) &&
+                (mTasksOutstanding <= 0));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Return pointer to task data.
+    const void* GetTasksData()
+    {
+        return mpTaskData;
+    }
+
+    void *operator new(size_t size);
+    void operator delete (void *p);
+
+    void* mpTaskData;       // The API thread will set this up and the callback task function will interpret this.
+
+    OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
+    OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
+};
+
+
+enum HOTTILE_STATE
+{
+    HOTTILE_INVALID,        // tile is in an uninitialized state and should be loaded with surface contents before rendering
+    HOTTILE_CLEAR,          // tile should be cleared
+    HOTTILE_DIRTY,          // tile has been rendered to
+    HOTTILE_RESOLVED,       // tile has been stored to memory
+};
+
+struct HOTTILE
+{
+    BYTE *pBuffer;
+    HOTTILE_STATE state;
+    DWORD clearData[4];     // May need to change based on pfnClearTile implementation. Reorder for alignment?
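+                            // Color clears read four floats from clearData, the depth
+                            // clear reads clearData[0] as a float, and the stencil clear
+                            // truncates clearData[0] to a uint8_t (see the
+                            // Clear*HotTile functions in threads.cpp).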
+ uint32_t numSamples; + uint32_t renderTargetArrayIndex; // current render target array index loaded +}; + +union HotTileSet +{ + struct + { + HOTTILE Color[SWR_NUM_RENDERTARGETS]; + HOTTILE Depth; + HOTTILE Stencil; + }; + HOTTILE Attachment[SWR_NUM_ATTACHMENTS]; +}; + +class HotTileMgr +{ +public: + HotTileMgr() + { + memset(&mHotTiles[0][0], 0, sizeof(mHotTiles)); + + // cache hottile size + for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) + { + mHotTileSize[i] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_COLOR_HOT_TILE_FORMAT>::bpp / 8; + } + mHotTileSize[SWR_ATTACHMENT_DEPTH] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp / 8; + mHotTileSize[SWR_ATTACHMENT_STENCIL] = KNOB_MACROTILE_X_DIM * KNOB_MACROTILE_Y_DIM * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp / 8; + } + + ~HotTileMgr() + { + for (int x = 0; x < KNOB_NUM_HOT_TILES_X; ++x) + { + for (int y = 0; y < KNOB_NUM_HOT_TILES_Y; ++y) + { + for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) + { + if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) + { + _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); + mHotTiles[x][y].Attachment[a].pBuffer = NULL; + } + } + } + } + } + + HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, + uint32_t renderTargetArrayIndex = 0) + { + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + assert(x < KNOB_NUM_HOT_TILES_X); + assert(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + } + else + { + return NULL; + } + } + else + { + // free the old tile and create a new one with enough space to hold all samples + if (numSamples > hotTile.numSamples) + { + // tile should be either uninitialized or resolved if we're deleting and switching to a + // new sample count + assert((hotTile.state == HOTTILE_INVALID) || + (hotTile.state == HOTTILE_RESOLVED) || + (hotTile.state == HOTTILE_CLEAR)); + _aligned_free(hotTile.pBuffer); + + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + } + + // if requested render target array index isn't currently loaded, need to store out the current hottile + // and load the requested array slice + if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) + { + SWR_FORMAT format; + switch (attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + if (hotTile.state == HOTTILE_DIRTY) + { + pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, 
+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); + } + + pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); + + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + hotTile.state = HOTTILE_DIRTY; + } + } + return &tile.Attachment[attachment]; + } + + HotTileSet &GetHotTile(uint32_t macroID) + { + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + assert(x < KNOB_NUM_HOT_TILES_X); + assert(y < KNOB_NUM_HOT_TILES_Y); + + return mHotTiles[x][y]; + } + +private: + HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; + uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; +}; + diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp new file mode 100644 index 00000000000..f36452f2cec --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp @@ -0,0 +1,148 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file utils.cpp +* +* @brief Utilities used by SWR core. +* +******************************************************************************/ +#if defined(_WIN32) + +#include<Windows.h> +#include <Gdiplus.h> +#include <Gdiplusheaders.h> +#include <cstdint> + +using namespace Gdiplus; + +int GetEncoderClsid(const WCHAR* format, CLSID* pClsid) +{ + uint32_t num = 0; // number of image encoders + uint32_t size = 0; // size of the image encoder array in bytes + + ImageCodecInfo* pImageCodecInfo = nullptr; + + GetImageEncodersSize(&num, &size); + if(size == 0) + return -1; // Failure + + pImageCodecInfo = (ImageCodecInfo*)(malloc(size)); + if(pImageCodecInfo == nullptr) + return -1; // Failure + + GetImageEncoders(num, size, pImageCodecInfo); + + for(uint32_t j = 0; j < num; ++j) + { + if( wcscmp(pImageCodecInfo[j].MimeType, format) == 0 ) + { + *pClsid = pImageCodecInfo[j].Clsid; + free(pImageCodecInfo); + return j; // Success + } + } + + free(pImageCodecInfo); + return -1; // Failure +} + +void SaveImageToPNGFile( + const WCHAR *pFilename, + void *pBuffer, + uint32_t width, + uint32_t height) +{ + // dump pixels to a png + // Initialize GDI+. 
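+    // GDI+ requires GdiplusStartup before any Bitmap is created; the matching
+    // GdiplusShutdown at the end of this function releases it.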
+ GdiplusStartupInput gdiplusStartupInput; + ULONG_PTR gdiplusToken; + GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); + + Bitmap *bitmap = new Bitmap(width, height); + BYTE *pBytes = (BYTE*)pBuffer; + static const uint32_t bytesPerPixel = 4; + for (uint32_t y = 0; y < height; ++y) + for (uint32_t x = 0; x < width; ++x) + { + uint32_t pixel = *(uint32_t*)pBytes; + if (pixel == 0xcdcdcdcd) + { + pixel = 0xFFFF00FF; + } + else if (pixel == 0xdddddddd) + { + pixel = 0x80FF0000; + } + else + { + pixel |= 0xFF000000; + } + Color color(pixel); + bitmap->SetPixel(x, y, color); + pBytes += bytesPerPixel; + } + + // Save image. + CLSID pngClsid; + GetEncoderClsid(L"image/png", &pngClsid); + bitmap->Save(pFilename, &pngClsid, nullptr); + + delete bitmap; + + GdiplusShutdown(gdiplusToken); +} + +void OpenBitmapFromFile( + const WCHAR *pFilename, + void **pBuffer, + uint32_t *width, + uint32_t *height) +{ + GdiplusStartupInput gdiplusStartupInput; + ULONG_PTR gdiplusToken; + GdiplusStartup(&gdiplusToken, &gdiplusStartupInput, nullptr); + + Bitmap *bitmap = new Bitmap(pFilename); + + *width = bitmap->GetWidth(); + *height = bitmap->GetHeight(); + *pBuffer = new BYTE[*width * *height * 4]; // width * height * |RGBA| + + // The folder 'stb_image' contains a PNG open/close module which + // is far less painful than this is, yo. + Gdiplus::Color clr; + for (uint32_t y = 0, idx = 0; y < *height; ++y) + { + for (uint32_t x = 0; x < *width; ++x, idx += 4) + { + bitmap->GetPixel(x, *height - y - 1, &clr); + ((BYTE*)*pBuffer)[idx + 0] = clr.GetBlue(); + ((BYTE*)*pBuffer)[idx + 1] = clr.GetGreen(); + ((BYTE*)*pBuffer)[idx + 2] = clr.GetRed(); + ((BYTE*)*pBuffer)[idx + 3] = clr.GetAlpha(); + } + } + + delete bitmap; + bitmap = 0; +} +#endif diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h new file mode 100644 index 00000000000..b9dc48c4fd7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -0,0 +1,831 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file utils.h +* +* @brief Utilities used by SWR core. 
+* +******************************************************************************/ +#pragma once + +#include <string.h> +#include "common/os.h" +#include "common/simdintrin.h" +#include "common/swr_assert.h" + +#if defined(_WIN32) +void SaveImageToPNGFile( + const WCHAR *pFilename, + void *pBuffer, + uint32_t width, + uint32_t height); + +void OpenBitmapFromFile( + const WCHAR *pFilename, + void **pBuffer, + uint32_t *width, + uint32_t *height); +#endif + +/// @todo assume linux is always 64 bit +#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) +#define _MM_INSERT_EPI64 _mm_insert_epi64 +#define _MM_EXTRACT_EPI64 _mm_extract_epi64 +#else +INLINE INT64 _MM_EXTRACT_EPI64(__m128i a, const int32_t ndx) +{ + OSALIGNLINE(uint32_t) elems[4]; + _mm_store_si128((__m128i*)elems, a); + if (ndx == 0) + { + uint64_t foo = elems[0]; + foo |= (uint64_t)elems[1] << 32; + return foo; + } + else + { + uint64_t foo = elems[2]; + foo |= (uint64_t)elems[3] << 32; + return foo; + } +} + +INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) +{ + OSALIGNLINE(int64_t) elems[2]; + _mm_store_si128((__m128i*)elems, a); + if (ndx == 0) + { + elems[0] = b; + } + else + { + elems[1] = b; + } + __m128i out; + out = _mm_load_si128((const __m128i*)elems); + return out; +} +#endif + +OSALIGNLINE(struct) BBOX +{ + int top, bottom, left, right; + + BBOX() {} + BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} + + bool operator==(const BBOX& rhs) + { + return (this->top == rhs.top && + this->bottom == rhs.bottom && + this->left == rhs.left && + this->right == rhs.right); + } + + bool operator!=(const BBOX& rhs) + { + return !(*this == rhs); + } +}; + +struct simdBBox +{ + simdscalari top, bottom, left, right; +}; + +INLINE +void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3) +{ + __m128i row0i = _mm_castps_si128(row0); + __m128i row1i = _mm_castps_si128(row1); + __m128i row2i = _mm_castps_si128(row2); + __m128i row3i = _mm_castps_si128(row3); + + __m128i vTemp = row2i; + row2i = _mm_unpacklo_epi32(row2i, row3i); + vTemp = _mm_unpackhi_epi32(vTemp, row3i); + + row3i = row0i; + row0i = _mm_unpacklo_epi32(row0i, row1i); + row3i = _mm_unpackhi_epi32(row3i, row1i); + + row1i = row0i; + row0i = _mm_unpacklo_epi64(row0i, row2i); + row1i = _mm_unpackhi_epi64(row1i, row2i); + + row2i = row3i; + row2i = _mm_unpacklo_epi64(row2i, vTemp); + row3i = _mm_unpackhi_epi64(row3i, vTemp); + + row0 = _mm_castsi128_ps(row0i); + row1 = _mm_castsi128_ps(row1i); + row2 = _mm_castsi128_ps(row2i); + row3 = _mm_castsi128_ps(row3i); +} + +INLINE +void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) +{ + __m128i vTemp = row2; + row2 = _mm_unpacklo_epi32(row2, row3); + vTemp = _mm_unpackhi_epi32(vTemp, row3); + + row3 = row0; + row0 = _mm_unpacklo_epi32(row0, row1); + row3 = _mm_unpackhi_epi32(row3, row1); + + row1 = row0; + row0 = _mm_unpacklo_epi64(row0, row2); + row1 = _mm_unpackhi_epi64(row1, row2); + + row2 = row3; + row2 = _mm_unpacklo_epi64(row2, vTemp); + row3 = _mm_unpackhi_epi64(row3, vTemp); +} + +#define GCC_VERSION (__GNUC__ * 10000 \ + + __GNUC_MINOR__ * 100 \ + + __GNUC_PATCHLEVEL__) + +#if defined(__GNUC__) && (GCC_VERSION < 40900) +#define _mm_undefined_ps _mm_setzero_ps +#define _mm_undefined_si128 _mm_setzero_si128 +#if KNOB_SIMD_WIDTH == 8 +#define _mm256_undefined_ps _mm256_setzero_ps +#endif +#endif + +#if KNOB_SIMD_WIDTH == 8 +INLINE +void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) +{ + __m256 
r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                           //x0z0x1z1 x4z4x5z5
+    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps());   //y0w0y1w1 y4w4y5w5
+    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);               //x0y0z0w0 x4y4z4w4
+    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);               //x1y1z1w1 x5y5z5w5
+
+    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                          //x2z2x3z3 x6z6x7z7
+    r1rx = _mm256_unpackhi_ps(vSrc1, _mm256_undefined_ps());          //y2w2y3w3 y6w6y7w7
+    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);               //x2y2z2w2 x6y6z6w6
+    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);               //x3y3z3w3 x7y7z7w7
+
+    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
+    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
+    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
+    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+
+    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
+    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+}
+
+INLINE
+void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
+{
+    __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2);                   //x0z0x1z1 x4z4x5z5
+    __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3);                   //y0w0y1w1 y4w4y5w5
+    __m256 r02r1xlolo = _mm256_unpacklo_ps(r0r2, r1rx);               //x0y0z0w0 x4y4z4w4
+    __m256 r02r1xlohi = _mm256_unpackhi_ps(r0r2, r1rx);               //x1y1z1w1 x5y5z5w5
+
+    r0r2 = _mm256_unpackhi_ps(vSrc0, vSrc2);                          //x2z2x3z3 x6z6x7z7
+    r1rx = _mm256_unpackhi_ps(vSrc1, vSrc3);                          //y2w2y3w3 y6w6y7w7
+    __m256 r02r1xhilo = _mm256_unpacklo_ps(r0r2, r1rx);               //x2y2z2w2 x6y6z6w6
+    __m256 r02r1xhihi = _mm256_unpackhi_ps(r0r2, r1rx);               //x3y3z3w3 x7y7z7w7
+
+    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
+    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
+    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
+    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+
+    vDst[4] = _mm256_extractf128_ps(r02r1xlolo, 1);
+    vDst[5] = _mm256_extractf128_ps(r02r1xlohi, 1);
+    vDst[6] = _mm256_extractf128_ps(r02r1xhilo, 1);
+    vDst[7] = _mm256_extractf128_ps(r02r1xhihi, 1);
+}
+
+INLINE
+void vTranspose8x8(__m256 (&vDst)[8], const __m256 &vMask0, const __m256 &vMask1, const __m256 &vMask2, const __m256 &vMask3, const __m256 &vMask4, const __m256 &vMask5, const __m256 &vMask6, const __m256 &vMask7)
+{
+    __m256 __t0 = _mm256_unpacklo_ps(vMask0, vMask1);
+    __m256 __t1 = _mm256_unpackhi_ps(vMask0, vMask1);
+    __m256 __t2 = _mm256_unpacklo_ps(vMask2, vMask3);
+    __m256 __t3 = _mm256_unpackhi_ps(vMask2, vMask3);
+    __m256 __t4 = _mm256_unpacklo_ps(vMask4, vMask5);
+    __m256 __t5 = _mm256_unpackhi_ps(vMask4, vMask5);
+    __m256 __t6 = _mm256_unpacklo_ps(vMask6, vMask7);
+    __m256 __t7 = _mm256_unpackhi_ps(vMask6, vMask7);
+    __m256 __tt0 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt1 = _mm256_shuffle_ps(__t0, __t2, _MM_SHUFFLE(3,2,3,2));
+    __m256 __tt2 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt3 = _mm256_shuffle_ps(__t1, __t3, _MM_SHUFFLE(3,2,3,2));
+    __m256 __tt4 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt5 = _mm256_shuffle_ps(__t4, __t6, _MM_SHUFFLE(3,2,3,2));
+    __m256 __tt6 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(1,0,1,0));
+    __m256 __tt7 = _mm256_shuffle_ps(__t5, __t7, _MM_SHUFFLE(3,2,3,2));
+    vDst[0] = _mm256_permute2f128_ps(__tt0, __tt4, 0x20);
+    vDst[1] = _mm256_permute2f128_ps(__tt1, __tt5, 0x20);
+    vDst[2] = _mm256_permute2f128_ps(__tt2, __tt6, 0x20);
+    vDst[3] = _mm256_permute2f128_ps(__tt3, __tt7, 0x20);
+    vDst[4] = _mm256_permute2f128_ps(__tt0, __tt4, 0x31);
+    vDst[5] = _mm256_permute2f128_ps(__tt1, __tt5, 0x31);
+    vDst[6] = _mm256_permute2f128_ps(__tt2, __tt6, 0x31);
+    vDst[7] = _mm256_permute2f128_ps(__tt3, __tt7, 0x31);
+}
+
+INLINE
+void vTranspose8x8(__m256 (&vDst)[8], const __m256i &vMask0, const __m256i &vMask1, const __m256i &vMask2, const __m256i &vMask3, const __m256i &vMask4, const __m256i &vMask5, const __m256i &vMask6, const __m256i &vMask7)
+{
+    vTranspose8x8(vDst, _mm256_castsi256_ps(vMask0), _mm256_castsi256_ps(vMask1), _mm256_castsi256_ps(vMask2), _mm256_castsi256_ps(vMask3),
+        _mm256_castsi256_ps(vMask4), _mm256_castsi256_ps(vMask5), _mm256_castsi256_ps(vMask6), _mm256_castsi256_ps(vMask7));
}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// TransposeSingleComponent
+//////////////////////////////////////////////////////////////////////////
+template<uint32_t bpp>
+struct TransposeSingleComponent
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Pass-thru for single component.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8_8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
+    /// @param pSrc - source data in SOA form
+    /// @param pDst - output data in AOS form
+    INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+    {
+        simdscalari src = _simd_load_si((const simdscalari*)pSrc);
+#if KNOB_SIMD_WIDTH == 8
+#if KNOB_ARCH == KNOB_ARCH_AVX
+        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
+        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
+        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
+        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
+        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
+        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
+        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
+        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
+        _mm_store_si128((__m128i*)pDst, c0123lo);
+        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+        simdscalari dst01 = _mm256_shuffle_epi8(src,
+            _mm256_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
+        simdscalari dst23 = _mm256_permute2x128_si256(src, src, 0x01);
+        dst23 = _mm256_shuffle_epi8(dst23,
+            _mm256_set_epi32(0x80800f07, 0x80800e06, 0x80800d05, 0x80800c04, 0x0b038080, 0x0a028080, 0x09018080, 0x08008080));
+        simdscalari dst = _mm256_or_si256(dst01, dst23);
+        _simd_store_si((simdscalari*)pDst, dst);
+#endif
+#else
+#error Unsupported vector width
+#endif
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// Transpose8_8_8
+//////////////////////////////////////////////////////////////////////////
+struct Transpose8_8_8
+{
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
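+    /// (Left deleted below, so any attempted use fails at compile time;
+    /// presumably no current format path needs a packed 24-bit transpose.)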
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose8_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose8_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 8_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { + simdscalari src = _simd_load_si((const simdscalari*)pSrc); + +#if KNOB_SIMD_WIDTH == 8 + __m128i rg = _mm256_castsi256_si128(src); // rrrrrrrr gggggggg + __m128i g = _mm_unpackhi_epi64(rg, rg); // gggggggg gggggggg + rg = _mm_unpacklo_epi8(rg, g); + _mm_store_si128((__m128i*)pDst, rg); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32_32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32_32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src0 = _simd_load_ps((const float*)pSrc); + simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); + simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); + simdscalar src3 = _simd_load_ps((const float*)pSrc + 24); + + __m128 vDst[8]; + vTranspose4x8(vDst, src0, src1, src2, src3); + _mm_store_ps((float*)pDst, vDst[0]); + _mm_store_ps((float*)pDst+4, vDst[1]); + _mm_store_ps((float*)pDst+8, vDst[2]); + _mm_store_ps((float*)pDst+12, vDst[3]); + _mm_store_ps((float*)pDst+16, vDst[4]); + _mm_store_ps((float*)pDst+20, vDst[5]); + _mm_store_ps((float*)pDst+24, vDst[6]); + _mm_store_ps((float*)pDst+28, vDst[7]); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. 
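+    /// (Three 8-wide float channels in; vTranspose3x8 pads the missing w
+    /// lane with undefined data, so each output pixel is still written as a
+    /// full 16-byte vec4.)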
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalar src0 = _simd_load_ps((const float*)pSrc); + simdscalar src1 = _simd_load_ps((const float*)pSrc + 8); + simdscalar src2 = _simd_load_ps((const float*)pSrc + 16); + + __m128 vDst[8]; + vTranspose3x8(vDst, src0, src1, src2); + _mm_store_ps((float*)pDst, vDst[0]); + _mm_store_ps((float*)pDst + 4, vDst[1]); + _mm_store_ps((float*)pDst + 8, vDst[2]); + _mm_store_ps((float*)pDst + 12, vDst[3]); + _mm_store_ps((float*)pDst + 16, vDst[4]); + _mm_store_ps((float*)pDst + 20, vDst[5]); + _mm_store_ps((float*)pDst + 24, vDst[6]); + _mm_store_ps((float*)pDst + 28, vDst[7]); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_32 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_32 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_32 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { + const float* pfSrc = (const float*)pSrc; + __m128 src_r0 = _mm_load_ps(pfSrc + 0); + __m128 src_r1 = _mm_load_ps(pfSrc + 4); + __m128 src_g0 = _mm_load_ps(pfSrc + 8); + __m128 src_g1 = _mm_load_ps(pfSrc + 12); + + __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0); + __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0); + __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1); + __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1); + + float* pfDst = (float*)pDst; + _mm_store_ps(pfDst + 0, dst0); + _mm_store_ps(pfDst + 4, dst1); + _mm_store_ps(pfDst + 8, dst2); + _mm_store_ps(pfDst + 12, dst3); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16_16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16_16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. 
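+    /// (Four 8-wide 16-bit channels are interleaved with epi16/epi32 unpacks
+    /// into 8 RGBA16 pixels, 64 bytes total.)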
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); + simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari))); + + __m128i src_r = _mm256_extractf128_si256(src_rg, 0); + __m128i src_g = _mm256_extractf128_si256(src_rg, 1); + __m128i src_b = _mm256_extractf128_si256(src_ba, 0); + __m128i src_a = _mm256_extractf128_si256(src_ba, 1); + + __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); + __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); + __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); + __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); + + __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); + __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); + __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); + __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); + + _mm_store_si128(((__m128i*)pDst) + 0, dst0); + _mm_store_si128(((__m128i*)pDst) + 1, dst1); + _mm_store_si128(((__m128i*)pDst) + 2, dst2); + _mm_store_si128(((__m128i*)pDst) + 3, dst3); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { +#if KNOB_SIMD_WIDTH == 8 + simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); + + __m128i src_r = _mm256_extractf128_si256(src_rg, 0); + __m128i src_g = _mm256_extractf128_si256(src_rg, 1); + __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari))); + __m128i src_a = _mm_undefined_si128(); + + __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g); + __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g); + __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a); + __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a); + + __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0); + __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0); + __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1); + __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1); + + _mm_store_si128(((__m128i*)pDst) + 0, dst0); + _mm_store_si128(((__m128i*)pDst) + 1, dst1); + _mm_store_si128(((__m128i*)pDst) + 2, dst2); + _mm_store_si128(((__m128i*)pDst) + 3, dst3); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose16_16 +////////////////////////////////////////////////////////////////////////// +struct Transpose16_16 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + { + simdscalar src = _simd_load_ps((const float*)pSrc); + +#if KNOB_SIMD_WIDTH == 8 + __m128 comp0 = _mm256_castps256_ps128(src); + __m128 comp1 = _mm256_extractf128_ps(src, 1); + + __m128i comp0i = _mm_castps_si128(comp0); + __m128i comp1i = _mm_castps_si128(comp1); + + __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i); + __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i); + + _mm_store_si128((__m128i*)pDst, resLo); + _mm_store_si128((__m128i*)pDst + 1, resHi); +#else +#error Unsupported vector width +#endif + } +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose24_8 +////////////////////////////////////////////////////////////////////////// +struct Transpose24_8 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 24_8 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose32_8_24 +////////////////////////////////////////////////////////////////////////// +struct Transpose32_8_24 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + + + +////////////////////////////////////////////////////////////////////////// +/// Transpose4_4_4_4 +////////////////////////////////////////////////////////////////////////// +struct Transpose4_4_4_4 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose5_6_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose5_6_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose9_9_9_5 +////////////////////////////////////////////////////////////////////////// +struct Transpose9_9_9_5 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. 
+ /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose5_5_5_1 +////////////////////////////////////////////////////////////////////////// +struct Transpose5_5_5_1 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose10_10_10_2 +////////////////////////////////////////////////////////////////////////// +struct Transpose10_10_10_2 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// Transpose11_11_10 +////////////////////////////////////////////////////////////////////////// +struct Transpose11_11_10 +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. + /// @param pSrc - source data in SOA form + /// @param pDst - output data in AOS form + static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; +}; + +// helper function to unroll loops +template<int Begin, int End, int Step = 1> +struct UnrollerL { + template<typename Lambda> + INLINE static void step(Lambda& func) { + func(Begin); + UnrollerL<Begin + Step, End, Step>::step(func); + } +}; + +template<int End, int Step> +struct UnrollerL<End, End, Step> { + template<typename Lambda> + static void step(Lambda& func) { + } +}; + +// general CRC compute +INLINE +uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) +{ +#if defined(_WIN64) || defined(__x86_64__) + uint32_t sizeInQwords = size / sizeof(uint64_t); + uint32_t sizeRemainderBytes = size % sizeof(uint64_t); + uint64_t* pDataWords = (uint64_t*)pData; + for (uint32_t i = 0; i < sizeInQwords; ++i) + { + crc = (uint32_t)_mm_crc32_u64(crc, *pDataWords++); + } +#else + uint32_t sizeInDwords = size / sizeof(uint32_t); + uint32_t sizeRemainderBytes = size % sizeof(uint32_t); + uint32_t* pDataWords = (uint32_t*)pData; + for (uint32_t i = 0; i < sizeInDwords; ++i) + { + crc = _mm_crc32_u32(crc, *pDataWords++); + } +#endif + + BYTE* pRemainderBytes = (BYTE*)pDataWords; + for (uint32_t i = 0; i < sizeRemainderBytes; ++i) + { + crc = _mm_crc32_u8(crc, *pRemainderBytes++); + } + + return crc; +} + +////////////////////////////////////////////////////////////////////////// +/// Add byte offset to any-type pointer +////////////////////////////////////////////////////////////////////////// +template <typename T> +INLINE +static T* PtrAdd(T* p, intptr_t offset) +{ + intptr_t intp = reinterpret_cast<intptr_t>(p); + return reinterpret_cast<T*>(intp + offset); +} + +////////////////////////////////////////////////////////////////////////// +/// Is a power-of-2? 
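Before the power-of-2 helper below, a quick aside on the UnrollerL template just defined: its recursion is easiest to see expanded at a call site. The following is a hedged, hypothetical usage sketch (the array and lambda are invented for illustration); UnrollerL<0, 4>::step(func) expands into func(0); func(1); func(2); func(3), each call receiving a compile-time-constant index the optimizer can fold:

    // Hypothetical usage sketch for UnrollerL (not part of the change itself)
    float accum[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    auto func = [&](int i) { accum[i] *= 2.0f; };  // 'i' is a compile-time constant per call
    UnrollerL<0, 4>::step(func);                   // func(0); func(1); func(2); func(3);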
+////////////////////////////////////////////////////////////////////////// +template <typename T> +INLINE +static bool IsPow2(T value) +{ + return value == (value & (0 - value)); +} + +////////////////////////////////////////////////////////////////////////// +/// Align down to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignDownPow2(T1 value, T2 alignment) +{ + SWR_ASSERT(IsPow2(alignment)); + return value & ~T1(alignment - 1); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignUpPow2(T1 value, T2 alignment) +{ + return AlignDownPow2(value + T1(alignment - 1), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up ptr to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1* AlignUpPow2(T1* value, T2 alignment) +{ + return reinterpret_cast<T1*>( + AlignDownPow2(reinterpret_cast<uintptr_t>(value) + uintptr_t(alignment - 1), alignment)); +} + +////////////////////////////////////////////////////////////////////////// +/// Align down to specified alignment +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignDown(T1 value, T2 alignment) +{ + if (IsPow2(alignment)) { return AlignDownPow2(value, alignment); } + return value - T1(value % alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align down to specified alignment +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1* AlignDown(T1* value, T2 alignment) +{ + return (T1*)AlignDown(uintptr_t(value), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1 AlignUp(T1 value, T2 alignment) +{ + return AlignDown(value + T1(alignment - 1), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Align up to specified alignment +/// Note: IsPow2(alignment) MUST be true +////////////////////////////////////////////////////////////////////////// +template <typename T1, typename T2> +INLINE +static T1* AlignUp(T1* value, T2 alignment) +{ + return AlignDown(PtrAdd(value, alignment - 1), alignment); +} + +////////////////////////////////////////////////////////////////////////// +/// Helper structure used to access an array of elements that don't +/// correspond to a typical word size. 
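As a hedged aside before the bit-array helper: a few worked values make the alignment routines above concrete. These assertions are illustrative only and assume the templates above plus <cassert> are in scope:

    assert(AlignDownPow2(13u, 8u) == 8u);  // 13 & ~7
    assert(AlignUpPow2(13u, 8u) == 16u);   // (13 + 7) & ~7
    assert(AlignUp(10u, 6u) == 12u);       // non-power-of-2 path: 15 - (15 % 6)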
+//////////////////////////////////////////////////////////////////////////
+template<typename T, size_t BitsPerElementT, size_t ArrayLenT>
+class BitsArray
+{
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t ELEMENTS_PER_WORD = BITS_PER_WORD / BitsPerElementT;
+    static const size_t NUM_WORDS = (ArrayLenT + ELEMENTS_PER_WORD - 1) / ELEMENTS_PER_WORD;
+    static const size_t ELEMENT_MASK = (size_t(1) << BitsPerElementT) - 1;
+
+    static_assert(ELEMENTS_PER_WORD * BitsPerElementT == BITS_PER_WORD,
+        "Element size must be an integral fraction of pointer size");
+
+    size_t m_words[NUM_WORDS] = {};
+
+public:
+
+    T operator[] (size_t elementIndex) const
+    {
+        size_t word = m_words[elementIndex / ELEMENTS_PER_WORD];
+        word >>= ((elementIndex % ELEMENTS_PER_WORD) * BitsPerElementT);
+        return T(word & ELEMENT_MASK);
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
new file mode 100644
index 00000000000..734c89792f0
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -0,0 +1,313 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file JitManager.cpp
+*
+* @brief Implementation of the Jit Manager.
+*
+* Notes:
+*
+******************************************************************************/
+#if defined(_WIN32)
+#pragma warning(disable: 4800 4146 4244 4267 4355 4996)
+#endif
+
+#include "jit_api.h"
+#include "JitManager.h"
+#include "fetch_jit.h"
+
+#if defined(_WIN32)
+#include "llvm/ADT/Triple.h"
+#endif
+#include "llvm/IR/Function.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
+
+#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/IRReader/IRReader.h"
+
+#include "core/state.h"
+#include "common/containers.hpp"
+
+#include "state_llvm.h"
+
+#include <sstream>
+#if defined(_WIN32)
+#include <psapi.h>
+#include <cstring>
+
+#define INTEL_OUTPUT_DIR "c:\\Intel"
+#define SWR_OUTPUT_DIR INTEL_OUTPUT_DIR "\\SWR"
+#define JITTER_OUTPUT_DIR SWR_OUTPUT_DIR "\\Jitter"
+#endif
+
+using namespace llvm;
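Stepping back briefly to the BitsArray template that closed the previous file: a hedged sketch of how such a packed array is read. The parameters here are invented for illustration; the class only exposes read access, and a default-constructed instance is zeroed:

    BitsArray<uint32_t, 4, 16> table;  // sixteen 4-bit elements; one 64-bit word on x86-64
    uint32_t element = table[5];       // extracts bits 20..23 of word 0; 0 here since m_words is zeroed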
+//////////////////////////////////////////////////////////////////////////
+/// @brief Constructor for JitManager.
+/// @param simdWidth - SIMD width to be used in generated program.
+JitManager::JitManager(uint32_t simdWidth, const char *arch)
+    : mContext(), mBuilder(mContext), mIsModuleFinalized(true), mJitNumber(0), mVWidth(simdWidth), mArch(arch)
+{
+    InitializeNativeTarget();
+    InitializeNativeTargetAsmPrinter();
+    InitializeNativeTargetDisassembler();
+
+    TargetOptions tOpts;
+    tOpts.AllowFPOpFusion = FPOpFusion::Fast;
+    tOpts.NoInfsFPMath = false;
+    tOpts.NoNaNsFPMath = false;
+    tOpts.UnsafeFPMath = true;
+#if defined(_DEBUG)
+#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR < 7
+    tOpts.NoFramePointerElim = true;
+#endif
+#endif
+
+    //tOpts.PrintMachineCode = true;
+
+    std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate);
+    fnName << mJitNumber++;
+    std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext));
+    mpCurrentModule = newModule.get();
+
+    auto &&EB = EngineBuilder(std::move(newModule));
+    EB.setTargetOptions(tOpts);
+    EB.setOptLevel(CodeGenOpt::Aggressive);
+
+    StringRef hostCPUName;
+
+    // force JIT to use the same CPU arch as the rest of swr
+    if(mArch.AVX512F())
+    {
+        assert(0 && "Implement AVX512 jitter");
+        hostCPUName = sys::getHostCPUName();
+        if (mVWidth == 0)
+        {
+            mVWidth = 16;
+        }
+    }
+    else if(mArch.AVX2())
+    {
+        hostCPUName = StringRef("core-avx2");
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else if(mArch.AVX())
+    {
+        if (mArch.F16C())
+        {
+            hostCPUName = StringRef("core-avx-i");
+        }
+        else
+        {
+            hostCPUName = StringRef("corei7-avx");
+        }
+        if (mVWidth == 0)
+        {
+            mVWidth = 8;
+        }
+    }
+    else
+    {
+        hostCPUName = sys::getHostCPUName();
+        if (mVWidth == 0)
+        {
+            mVWidth = 8; // 4?
+        }
+    }
+
+    EB.setMCPU(hostCPUName);
+
+#if defined(_WIN32)
+    // Needed for MCJIT on windows
+    Triple hostTriple(sys::getProcessTriple());
+    hostTriple.setObjectFormat(Triple::ELF);
+    mpCurrentModule->setTargetTriple(hostTriple.getTriple());
+#endif // _WIN32
+
+    mpExec = EB.create();
+
+#if LLVM_USE_INTEL_JITEVENTS
+    JITEventListener *vTune = JITEventListener::createIntelJITEventListener();
+    mpExec->RegisterJITEventListener(vTune);
+#endif
+
+    mFP32Ty = Type::getFloatTy(mContext);   // float type
+    mInt8Ty = Type::getInt8Ty(mContext);
+    mInt32Ty = Type::getInt32Ty(mContext);   // int type
+    mInt64Ty = Type::getInt64Ty(mContext);   // int type
+    mV4FP32Ty = StructType::get(mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
+    mV4Int32Ty = StructType::get(mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
+
+    // fetch function signature
+    // typedef void(__cdecl *PFN_FETCH_FUNC)(SWR_FETCH_CONTEXT& fetchInfo, simdvertex& out);
+    std::vector<Type*> fsArgs;
+    fsArgs.push_back(PointerType::get(Gen_SWR_FETCH_CONTEXT(this), 0));
+    fsArgs.push_back(PointerType::get(Gen_simdvertex(this), 0));
+
+    mFetchShaderTy = FunctionType::get(Type::getVoidTy(mContext), fsArgs, false);
+
+    mSimtFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+    mSimtInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+
+    mSimdVectorTy = StructType::get(mContext, std::vector<Type*>(4, mSimtFP32Ty), false);
+    mSimdVectorInt32Ty = StructType::get(mContext, std::vector<Type*>(4, mSimtInt32Ty), false);
+
+#if defined(_WIN32)
+    // explicitly instantiate used symbols from potentially statically linked libs
+    sys::DynamicLibrary::AddSymbol("exp2f", &exp2f);
+    sys::DynamicLibrary::AddSymbol("log2f", &log2f);
+    sys::DynamicLibrary::AddSymbol("sinf", &sinf);
+    sys::DynamicLibrary::AddSymbol("cosf", &cosf);
+    sys::DynamicLibrary::AddSymbol("powf", &powf);
+#endif
+
+#if defined(_WIN32)
+    if
(KNOB_DUMP_SHADER_IR) + { + CreateDirectory(INTEL_OUTPUT_DIR, NULL); + CreateDirectory(SWR_OUTPUT_DIR, NULL); + CreateDirectory(JITTER_OUTPUT_DIR, NULL); + } + + ///@todo Figure out a better solution for this. + // Redirect stdin, stdout, and stderr to attached console. + freopen("CONIN$", "r", stdin); + freopen("CONOUT$", "w", stdout); + freopen("CONOUT$", "w", stderr); +#endif +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Create new LLVM module. +void JitManager::SetupNewModule() +{ + SWR_ASSERT(mIsModuleFinalized == true && "Current module is not finalized!"); + + std::stringstream fnName("JitModule", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << mJitNumber++; + std::unique_ptr<Module> newModule(new Module(fnName.str(), mContext)); + mpCurrentModule = newModule.get(); +#if defined(_WIN32) + // Needed for MCJIT on windows + Triple hostTriple(sys::getProcessTriple()); + hostTriple.setObjectFormat(Triple::ELF); + newModule->setTargetTriple(hostTriple.getTriple()); +#endif // _WIN32 + + mpExec->addModule(std::move(newModule)); + mIsModuleFinalized = false; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Create new LLVM module from IR. +bool JitManager::SetupModuleFromIR(const uint8_t *pIR) +{ + std::unique_ptr<MemoryBuffer> pMem = MemoryBuffer::getMemBuffer(StringRef((const char*)pIR), ""); + + SMDiagnostic Err; + std::unique_ptr<Module> newModule = parseIR(pMem.get()->getMemBufferRef(), Err, mContext); + + if (newModule == nullptr) + { + SWR_ASSERT(0, "Parse failed! Check Err for details."); + return false; + } + + mpCurrentModule = newModule.get(); +#if defined(_WIN32) + // Needed for MCJIT on windows + Triple hostTriple(sys::getProcessTriple()); + hostTriple.setObjectFormat(Triple::ELF); + newModule->setTargetTriple(hostTriple.getTriple()); +#endif // _WIN32 + + mpExec->addModule(std::move(newModule)); + mIsModuleFinalized = false; + + return true; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Dump function to file. +void JitManager::DumpToFile(Function *f, const char *fileName) +{ + if (KNOB_DUMP_SHADER_IR) + { +#if defined(_WIN32) + DWORD pid = GetCurrentProcessId(); + TCHAR procname[MAX_PATH]; + GetModuleFileName(NULL, procname, MAX_PATH); + const char* pBaseName = strrchr(procname, '\\'); + std::stringstream outDir; + outDir << JITTER_OUTPUT_DIR << pBaseName << "_" << pid << std::ends; + CreateDirectory(outDir.str().c_str(), NULL); +#endif + + std::error_code EC; + const char *funcName = f->getName().data(); + char fName[256]; +#if defined(_WIN32) + sprintf(fName, "%s\\%s.%s.ll", outDir.str().c_str(), funcName, fileName); +#else + sprintf(fName, "%s.%s.ll", funcName, fileName); +#endif + raw_fd_ostream fd(fName, EC, llvm::sys::fs::F_None); + Module* pModule = f->getParent(); + pModule->print(fd, nullptr); + +#if defined(_WIN32) + sprintf(fName, "%s\\cfg.%s.%s.dot", outDir.str().c_str(), funcName, fileName); +#else + sprintf(fName, "cfg.%s.%s.dot", funcName, fileName); +#endif + fd.flush(); + + raw_fd_ostream fd_cfg(fName, EC, llvm::sys::fs::F_Text); + WriteGraph(fd_cfg, (const Function*)f); + + fd_cfg.flush(); + } +} + +extern "C" +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Create JIT context. + /// @param simdWidth - SIMD width to be used in generated program. 
+ HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch) + { + return new JitManager(targetSimdWidth, arch); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Destroy JIT context. + void JITCALL JitDestroyContext(HANDLE hJitContext) + { + delete reinterpret_cast<JitManager*>(hJitContext); + } +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h new file mode 100644 index 00000000000..c974a611224 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -0,0 +1,186 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file JitManager.h +* +* @brief JitManager contains the LLVM data structures used for JIT generation +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "common/os.h" +#include "common/isa.hpp" + +#if defined(_WIN32) +#pragma warning(disable : 4146 4244 4267 4800 4996) +#endif + +// llvm 3.7+ reuses "DEBUG" as an enum value +#pragma push_macro("DEBUG") +#undef DEBUG + +#include "llvm/IR/DataLayout.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/IntrinsicInst.h" + +#include "llvm/Config/llvm-config.h" +#ifndef LLVM_VERSION_MAJOR +#include "llvm/Config/config.h" +#endif + +#include "llvm/IR/Verifier.h" +#include "llvm/ExecutionEngine/MCJIT.h" +#include "llvm/Support/FileSystem.h" +#define LLVM_F_NONE sys::fs::F_None + +#include "llvm/Analysis/Passes.h" + +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#include "llvm/PassManager.h" +#else +#include "llvm/IR/LegacyPassManager.h" +using namespace llvm::legacy; +#endif + +#include "llvm/CodeGen/Passes.h" +#include "llvm/ExecutionEngine/ExecutionEngine.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/TargetSelect.h" +#include "llvm/Transforms/IPO.h" +#include "llvm/Transforms/Scalar.h" +#include "llvm/Support/Host.h" + + +#pragma pop_macro("DEBUG") + +using namespace llvm; +////////////////////////////////////////////////////////////////////////// +/// JitInstructionSet +/// @brief Subclass of InstructionSet that allows users to override +/// the reporting of support for certain ISA features. This allows capping +/// the jitted code to a certain feature level, e.g. jit AVX level code on +/// a platform that supports AVX2. +////////////////////////////////////////////////////////////////////////// +class JitInstructionSet : public InstructionSet +{ +public: + JitInstructionSet(const char* requestedIsa) : isaRequest(requestedIsa) + { + std::transform(isaRequest.begin(), isaRequest.end(), isaRequest.begin(), ::tolower); + + if(isaRequest == "avx") + { + bForceAVX = true; + bForceAVX2 = false; + bForceAVX512 = false; + } + else if(isaRequest == "avx2") + { + bForceAVX = false; + bForceAVX2 = true; + bForceAVX512 = false; + } + #if 0 + else if(isaRequest == "avx512") + { + bForceAVX = false; + bForceAVX2 = false; + bForceAVX512 = true; + } + #endif + }; + + bool AVX2(void) { return bForceAVX ? 0 : InstructionSet::AVX2(); } + bool AVX512F(void) { return (bForceAVX | bForceAVX2) ? 0 : InstructionSet::AVX512F(); } + bool BMI2(void) { return bForceAVX ? 0 : InstructionSet::BMI2(); } + +private: + bool bForceAVX = false; + bool bForceAVX2 = false; + bool bForceAVX512 = false; + std::string isaRequest; +}; + + + +struct JitLLVMContext : LLVMContext +{ +}; + + +////////////////////////////////////////////////////////////////////////// +/// JitManager +////////////////////////////////////////////////////////////////////////// +struct JitManager +{ + JitManager(uint32_t w, const char *arch); + ~JitManager(){}; + + JitLLVMContext mContext; ///< LLVM compiler + IRBuilder<> mBuilder; ///< LLVM IR Builder + ExecutionEngine* mpExec; + + // Need to be rebuilt after a JIT and before building new IR + Module* mpCurrentModule; + bool mIsModuleFinalized; + uint32_t mJitNumber; + + uint32_t mVWidth; + + // Built in types. 
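One note before the built-in type members: the JitInstructionSet force flags above let a caller cap the generated ISA below what the host actually supports. A hedged sketch:

    JitInstructionSet isa("avx");  // request AVX-only codegen
    bool avx2 = isa.AVX2();        // always false: the "avx" cap masks AVX2 reporting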
+ Type* mInt8Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mFP32Ty; + StructType* mV4FP32Ty; + StructType* mV4Int32Ty; + + // helper scalar function types + FunctionType* mUnaryFPTy; + FunctionType* mBinaryFPTy; + FunctionType* mTrinaryFPTy; + FunctionType* mUnaryIntTy; + FunctionType* mBinaryIntTy; + FunctionType* mTrinaryIntTy; + + Type* mSimtFP32Ty; + Type* mSimtInt32Ty; + + Type* mSimdVectorInt32Ty; + Type* mSimdVectorTy; + + // fetch shader types + FunctionType* mFetchShaderTy; + + JitInstructionSet mArch; + + void SetupNewModule(); + bool SetupModuleFromIR(const uint8_t *pIR); + + static void DumpToFile(Function *f, const char *fileName); +}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp new file mode 100644 index 00000000000..954524afd3a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -0,0 +1,772 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file blend_jit.cpp +* +* @brief Implementation of the blend jitter +* +* Notes: +* +******************************************************************************/ +#include "jit_api.h" +#include "blend_jit.h" +#include "builder.h" +#include "state_llvm.h" +#include "common/containers.hpp" +#include "llvm/IR/DataLayout.h" + +#include <sstream> + +// components with bit-widths <= the QUANTIZE_THRESHOLD will be quantized +#define QUANTIZE_THRESHOLD 2 + +////////////////////////////////////////////////////////////////////////// +/// Interface to Jitting a blend shader +////////////////////////////////////////////////////////////////////////// +struct BlendJit : public Builder +{ + BlendJit(JitManager* pJitMgr) : Builder(pJitMgr){}; + + template<bool Color, bool Alpha> + void GenerateBlendFactor(SWR_BLEND_FACTOR factor, Value* constColor[4], Value* src[4], Value* src1[4], Value* dst[4], Value* result[4]) + { + Value* out[4]; + + switch (factor) + { + case BLENDFACTOR_ONE: + out[0] = out[1] = out[2] = out[3] = VIMMED1(1.0f); + break; + case BLENDFACTOR_SRC_COLOR: + out[0] = src[0]; + out[1] = src[1]; + out[2] = src[2]; + out[3] = src[3]; + break; + case BLENDFACTOR_SRC_ALPHA: + out[0] = out[1] = out[2] = out[3] = src[3]; + break; + case BLENDFACTOR_DST_ALPHA: + out[0] = out[1] = out[2] = out[3] = dst[3]; + break; + case BLENDFACTOR_DST_COLOR: + out[0] = dst[0]; + out[1] = dst[1]; + out[2] = dst[2]; + out[3] = dst[3]; + break; + case BLENDFACTOR_SRC_ALPHA_SATURATE: + out[0] = out[1] = out[2] = VMINPS(src[3], FSUB(VIMMED1(1.0f), dst[3])); + out[3] = VIMMED1(1.0f); + break; + case BLENDFACTOR_CONST_COLOR: + out[0] = constColor[0]; + out[1] = constColor[1]; + out[2] = constColor[2]; + out[3] = constColor[3]; + break; + case BLENDFACTOR_CONST_ALPHA: + out[0] = out[1] = out[2] = out[3] = constColor[3]; + break; + case BLENDFACTOR_SRC1_COLOR: + out[0] = src1[0]; + out[1] = src1[1]; + out[2] = src1[2]; + out[3] = src1[3]; + break; + case BLENDFACTOR_SRC1_ALPHA: + out[0] = out[1] = out[2] = out[3] = src1[3]; + break; + case BLENDFACTOR_ZERO: + out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); + break; + case BLENDFACTOR_INV_SRC_COLOR: + out[0] = FSUB(VIMMED1(1.0f), src[0]); + out[1] = FSUB(VIMMED1(1.0f), src[1]); + out[2] = FSUB(VIMMED1(1.0f), src[2]); + out[3] = FSUB(VIMMED1(1.0f), src[3]); + break; + case BLENDFACTOR_INV_SRC_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src[3]); + break; + case BLENDFACTOR_INV_DST_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), dst[3]); + break; + case BLENDFACTOR_INV_DST_COLOR: + out[0] = FSUB(VIMMED1(1.0f), dst[0]); + out[1] = FSUB(VIMMED1(1.0f), dst[1]); + out[2] = FSUB(VIMMED1(1.0f), dst[2]); + out[3] = FSUB(VIMMED1(1.0f), dst[3]); + break; + case BLENDFACTOR_INV_CONST_COLOR: + out[0] = FSUB(VIMMED1(1.0f), constColor[0]); + out[1] = FSUB(VIMMED1(1.0f), constColor[1]); + out[2] = FSUB(VIMMED1(1.0f), constColor[2]); + out[3] = FSUB(VIMMED1(1.0f), constColor[3]); + break; + case BLENDFACTOR_INV_CONST_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), constColor[3]); + break; + case BLENDFACTOR_INV_SRC1_COLOR: + out[0] = FSUB(VIMMED1(1.0f), src1[0]); + out[1] = FSUB(VIMMED1(1.0f), src1[1]); + out[2] = FSUB(VIMMED1(1.0f), src1[2]); + out[3] = FSUB(VIMMED1(1.0f), src1[3]); + break; + case BLENDFACTOR_INV_SRC1_ALPHA: + out[0] = out[1] = out[2] = out[3] = FSUB(VIMMED1(1.0f), src1[3]); + break; + default: + SWR_ASSERT(false, "Unsupported blend factor: %d", factor); + out[0] = out[1] = out[2] = out[3] = 
VIMMED1(0.0f);
+            break;
+        }
+
+        if (Color)
+        {
+            result[0] = out[0];
+            result[1] = out[1];
+            result[2] = out[2];
+        }
+
+        if (Alpha)
+        {
+            result[3] = out[3];
+        }
+    }
+
+    void Clamp(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+        SWR_TYPE type = info.type[0];
+
+        switch (type)
+        {
+        case SWR_TYPE_FLOAT:
+            break;
+
+        case SWR_TYPE_UNORM:
+            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(0.0f)), VIMMED1(1.0f));
+            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(0.0f)), VIMMED1(1.0f));
+            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(0.0f)), VIMMED1(1.0f));
+            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(0.0f)), VIMMED1(1.0f));
+            break;
+
+        case SWR_TYPE_SNORM:
+            src[0] = VMINPS(VMAXPS(src[0], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            src[1] = VMINPS(VMAXPS(src[1], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            src[2] = VMINPS(VMAXPS(src[2], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            src[3] = VMINPS(VMAXPS(src[3], VIMMED1(-1.0f)), VIMMED1(1.0f));
+            break;
+
+        default: SWR_ASSERT(false, "Unsupported format type: %d", type);
+        }
+    }
+
+    void ApplyDefaults(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        bool valid[] = { false, false, false, false };
+        for (uint32_t c = 0; c < info.numComps; ++c)
+        {
+            valid[info.swizzle[c]] = true;
+        }
+
+        for (uint32_t c = 0; c < 4; ++c)
+        {
+            if (!valid[c])
+            {
+                src[c] = BITCAST(VIMMED1((int)info.defaults[c]), mSimdFP32Ty);
+            }
+        }
+    }
+
+    void ApplyUnusedDefaults(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+
+        for (uint32_t c = 0; c < info.numComps; ++c)
+        {
+            if (info.type[c] == SWR_TYPE_UNUSED)
+            {
+                src[info.swizzle[c]] = BITCAST(VIMMED1((int)info.defaults[info.swizzle[c]]), mSimdFP32Ty);
+            }
+        }
+    }
+
+    void Quantize(SWR_FORMAT format, Value* src[4])
+    {
+        const SWR_FORMAT_INFO& info = GetFormatInfo(format);
+        for (uint32_t c = 0; c < info.numComps; ++c)
+        {
+            if (info.bpc[c] <= QUANTIZE_THRESHOLD)
+            {
+                uint32_t swizComp = info.swizzle[c];
+                float factor = (float)((1 << info.bpc[c]) - 1);
+                switch (info.type[c])
+                {
+                case SWR_TYPE_UNORM:
+                    src[swizComp] = FADD(FMUL(src[swizComp], VIMMED1(factor)), VIMMED1(0.5f));
+                    src[swizComp] = VROUND(src[swizComp], C(_MM_FROUND_TO_ZERO));
+                    src[swizComp] = FMUL(src[swizComp], VIMMED1(1.0f / factor));
+                    break;
+                default: SWR_ASSERT(false, "Unsupported format type: %d", info.type[c]);
+                }
+            }
+        }
+    }
+
+    template<bool Color, bool Alpha>
+    void BlendFunc(SWR_BLEND_OP blendOp, Value* src[4], Value* srcFactor[4], Value* dst[4], Value* dstFactor[4], Value* result[4])
+    {
+        Value* out[4];
+        Value* srcBlend[4];
+        Value* dstBlend[4];
+        for (uint32_t i = 0; i < 4; ++i)
+        {
+            srcBlend[i] = FMUL(src[i], srcFactor[i]);
+            dstBlend[i] = FMUL(dst[i], dstFactor[i]);
+        }
+
+        switch (blendOp)
+        {
+        case BLENDOP_ADD:
+            out[0] = FADD(srcBlend[0], dstBlend[0]);
+            out[1] = FADD(srcBlend[1], dstBlend[1]);
+            out[2] = FADD(srcBlend[2], dstBlend[2]);
+            out[3] = FADD(srcBlend[3], dstBlend[3]);
+            break;
+
+        case BLENDOP_SUBTRACT:
+            out[0] = FSUB(srcBlend[0], dstBlend[0]);
+            out[1] = FSUB(srcBlend[1], dstBlend[1]);
+            out[2] = FSUB(srcBlend[2], dstBlend[2]);
+            out[3] = FSUB(srcBlend[3], dstBlend[3]);
+            break;
+
+        case BLENDOP_REVSUBTRACT:
+            out[0] = FSUB(dstBlend[0], srcBlend[0]);
+            out[1] = FSUB(dstBlend[1], srcBlend[1]);
+            out[2] = FSUB(dstBlend[2], srcBlend[2]);
+            out[3] = FSUB(dstBlend[3], srcBlend[3]);
+            break;
+
+        case BLENDOP_MIN:
+            out[0] = VMINPS(src[0], dst[0]);
+            out[1] = VMINPS(src[1], dst[1]);
+            out[2] = VMINPS(src[2], dst[2]);
+            out[3] = VMINPS(src[3],
dst[3]); + break; + + case BLENDOP_MAX: + out[0] = VMAXPS(src[0], dst[0]); + out[1] = VMAXPS(src[1], dst[1]); + out[2] = VMAXPS(src[2], dst[2]); + out[3] = VMAXPS(src[3], dst[3]); + break; + + default: + SWR_ASSERT(false, "Unsupported blend operation: %d", blendOp); + out[0] = out[1] = out[2] = out[3] = VIMMED1(0.0f); + break; + } + + if (Color) + { + result[0] = out[0]; + result[1] = out[1]; + result[2] = out[2]; + } + + if (Alpha) + { + result[3] = out[3]; + } + } + + void LogicOpFunc(SWR_LOGIC_OP logicOp, Value* src[4], Value* dst[4], Value* result[4]) + { + // Op: (s == PS output, d = RT contents) + switch(logicOp) + { + case LOGICOP_CLEAR: + result[0] = VIMMED1(0); + result[1] = VIMMED1(0); + result[2] = VIMMED1(0); + result[3] = VIMMED1(0); + break; + + case LOGICOP_NOR: + // ~(s | d) + result[0] = XOR(OR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); + result[1] = XOR(OR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); + result[2] = XOR(OR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); + result[3] = XOR(OR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_AND_INVERTED: + // ~s & d + // todo: use avx andnot instr when I can find the intrinsic to call + result[0] = AND(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); + result[1] = AND(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); + result[2] = AND(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); + result[3] = AND(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); + break; + + case LOGICOP_COPY_INVERTED: + // ~s + result[0] = XOR(src[0], VIMMED1(0xFFFFFFFF)); + result[1] = XOR(src[1], VIMMED1(0xFFFFFFFF)); + result[2] = XOR(src[2], VIMMED1(0xFFFFFFFF)); + result[3] = XOR(src[3], VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_AND_REVERSE: + // s & ~d + // todo: use avx andnot instr when I can find the intrinsic to call + result[0] = AND(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); + result[1] = AND(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); + result[2] = AND(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); + result[3] = AND(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); + break; + + case LOGICOP_INVERT: + // ~d + result[0] = XOR(dst[0], VIMMED1(0xFFFFFFFF)); + result[1] = XOR(dst[1], VIMMED1(0xFFFFFFFF)); + result[2] = XOR(dst[2], VIMMED1(0xFFFFFFFF)); + result[3] = XOR(dst[3], VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_XOR: + // s ^ d + result[0] = XOR(src[0], dst[0]); + result[1] = XOR(src[1], dst[1]); + result[2] = XOR(src[2], dst[2]); + result[3] = XOR(src[3], dst[3]); + break; + + case LOGICOP_NAND: + // ~(s & d) + result[0] = XOR(AND(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); + result[1] = XOR(AND(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); + result[2] = XOR(AND(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); + result[3] = XOR(AND(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_AND: + // s & d + result[0] = AND(src[0], dst[0]); + result[1] = AND(src[1], dst[1]); + result[2] = AND(src[2], dst[2]); + result[3] = AND(src[3], dst[3]); + break; + + case LOGICOP_EQUIV: + // ~(s ^ d) + result[0] = XOR(XOR(src[0], dst[0]), VIMMED1(0xFFFFFFFF)); + result[1] = XOR(XOR(src[1], dst[1]), VIMMED1(0xFFFFFFFF)); + result[2] = XOR(XOR(src[2], dst[2]), VIMMED1(0xFFFFFFFF)); + result[3] = XOR(XOR(src[3], dst[3]), VIMMED1(0xFFFFFFFF)); + break; + + case LOGICOP_NOOP: + result[0] = dst[0]; + result[1] = dst[1]; + result[2] = dst[2]; + result[3] = dst[3]; + break; + + case LOGICOP_OR_INVERTED: + // ~s | d + result[0] = OR(XOR(src[0], VIMMED1(0xFFFFFFFF)), dst[0]); + result[1] = OR(XOR(src[1], VIMMED1(0xFFFFFFFF)), dst[1]); + result[2] = OR(XOR(src[2], VIMMED1(0xFFFFFFFF)), dst[2]); + 
result[3] = OR(XOR(src[3], VIMMED1(0xFFFFFFFF)), dst[3]); + break; + + case LOGICOP_COPY: + result[0] = src[0]; + result[1] = src[1]; + result[2] = src[2]; + result[3] = src[3]; + break; + + case LOGICOP_OR_REVERSE: + // s | ~d + result[0] = OR(XOR(dst[0], VIMMED1(0xFFFFFFFF)), src[0]); + result[1] = OR(XOR(dst[1], VIMMED1(0xFFFFFFFF)), src[1]); + result[2] = OR(XOR(dst[2], VIMMED1(0xFFFFFFFF)), src[2]); + result[3] = OR(XOR(dst[3], VIMMED1(0xFFFFFFFF)), src[3]); + break; + + case LOGICOP_OR: + // s | d + result[0] = OR(src[0], dst[0]); + result[1] = OR(src[1], dst[1]); + result[2] = OR(src[2], dst[2]); + result[3] = OR(src[3], dst[3]); + break; + + case LOGICOP_SET: + result[0] = VIMMED1(0xFFFFFFFF); + result[1] = VIMMED1(0xFFFFFFFF); + result[2] = VIMMED1(0xFFFFFFFF); + result[3] = VIMMED1(0xFFFFFFFF); + break; + + default: + SWR_ASSERT(false, "Unsupported logic operation: %d", logicOp); + result[0] = result[1] = result[2] = result[3] = VIMMED1(0.0f); + break; + } + } + + void AlphaTest(const BLEND_COMPILE_STATE& state, Value* pBlendState, Value* pAlpha, Value* ppMask) + { + // load uint32_t reference + Value* pRef = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_alphaTestReference })); + + Value* pTest = nullptr; + if (state.alphaTestFormat == ALPHA_TEST_UNORM8) + { + // convert float alpha to unorm8 + Value* pAlphaU8 = FMUL(pAlpha, VIMMED1(256.0f)); + pAlphaU8 = FP_TO_UI(pAlphaU8, mSimdInt32Ty); + + // compare + switch (state.alphaTestFunction) + { + case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; + case ZFUNC_NEVER: pTest = VIMMED1(false); break; + case ZFUNC_LT: pTest = ICMP_ULT(pAlphaU8, pRef); break; + case ZFUNC_EQ: pTest = ICMP_EQ(pAlphaU8, pRef); break; + case ZFUNC_LE: pTest = ICMP_ULE(pAlphaU8, pRef); break; + case ZFUNC_GT: pTest = ICMP_UGT(pAlphaU8, pRef); break; + case ZFUNC_NE: pTest = ICMP_NE(pAlphaU8, pRef); break; + case ZFUNC_GE: pTest = ICMP_UGE(pAlphaU8, pRef); break; + default: + SWR_ASSERT(false, "Invalid alpha test function"); + break; + } + } + else + { + // cast ref to float + pRef = BITCAST(pRef, mSimdFP32Ty); + + // compare + switch (state.alphaTestFunction) + { + case ZFUNC_ALWAYS: pTest = VIMMED1(true); break; + case ZFUNC_NEVER: pTest = VIMMED1(false); break; + case ZFUNC_LT: pTest = FCMP_OLT(pAlpha, pRef); break; + case ZFUNC_EQ: pTest = FCMP_OEQ(pAlpha, pRef); break; + case ZFUNC_LE: pTest = FCMP_OLE(pAlpha, pRef); break; + case ZFUNC_GT: pTest = FCMP_OGT(pAlpha, pRef); break; + case ZFUNC_NE: pTest = FCMP_ONE(pAlpha, pRef); break; + case ZFUNC_GE: pTest = FCMP_OGE(pAlpha, pRef); break; + default: + SWR_ASSERT(false, "Invalid alpha test function"); + break; + } + } + + // load current mask + Value* pMask = LOAD(ppMask); + + // convert to int1 mask + pMask = MASK(pMask); + + // and with alpha test result + pMask = AND(pMask, pTest); + + // convert back to vector mask + pMask = VMASK(pMask); + + // store new mask + STORE(pMask, ppMask); + } + + Function* Create(const BLEND_COMPILE_STATE& state) + { + static std::size_t jitNum = 0; + + std::stringstream fnName("BlendShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << jitNum++; + + // blend function signature + //typedef void(*PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); + + std::vector<Type*> args{ + PointerType::get(Gen_SWR_BLEND_STATE(JM()), 0), // SWR_BLEND_STATE* + PointerType::get(mSimdFP32Ty, 0), // simdvector& src + PointerType::get(mSimdFP32Ty, 0), // simdvector& src1 + 
Type::getInt32Ty(JM()->mContext), // sampleNum + PointerType::get(mSimdFP32Ty, 0), // uint8_t* pDst + PointerType::get(mSimdFP32Ty, 0), // simdvector& result + PointerType::get(mSimdInt32Ty, 0), // simdscalari* oMask + PointerType::get(mSimdInt32Ty, 0), // simdscalari* pMask + }; + + FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); + Function* blendFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + + BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", blendFunc); + + IRB()->SetInsertPoint(entry); + + // arguments + auto argitr = blendFunc->getArgumentList().begin(); + Value* pBlendState = &*argitr++; + pBlendState->setName("pBlendState"); + Value* pSrc = &*argitr++; + pSrc->setName("src"); + Value* pSrc1 = &*argitr++; + pSrc1->setName("src1"); + Value* sampleNum = &*argitr++; + sampleNum->setName("sampleNum"); + Value* pDst = &*argitr++; + pDst->setName("pDst"); + Value* pResult = &*argitr++; + pResult->setName("result"); + Value* ppoMask = &*argitr++; + ppoMask->setName("ppoMask"); + Value* ppMask = &*argitr++; + ppMask->setName("pMask"); + + static_assert(KNOB_COLOR_HOT_TILE_FORMAT == R32G32B32A32_FLOAT, "Unsupported hot tile format"); + Value* dst[4]; + Value* constantColor[4]; + Value* src[4]; + Value* src1[4]; + Value* result[4]; + for (uint32_t i = 0; i < 4; ++i) + { + // load hot tile + dst[i] = LOAD(pDst, { i }); + + // load constant color + constantColor[i] = VBROADCAST(LOAD(pBlendState, { 0, SWR_BLEND_STATE_constantColor, i })); + + // load src + src[i] = LOAD(pSrc, { i }); + + // load src1 + src1[i] = LOAD(pSrc1, { i }); + } + Value* currentMask = VIMMED1(-1); + if(state.desc.alphaToCoverageEnable) + { + currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); + } + + // alpha test + if (state.desc.alphaTestEnable) + { + AlphaTest(state, pBlendState, src[3], ppMask); + } + + // color blend + if (state.blendState.blendEnable) + { + // clamp sources + Clamp(state.format, src); + Clamp(state.format, src1); + Clamp(state.format, dst); + Clamp(state.format, constantColor); + + // apply defaults to hottile contents to take into account missing components + ApplyDefaults(state.format, dst); + + // Force defaults for unused 'X' components + ApplyUnusedDefaults(state.format, dst); + + // Quantize low precision components + Quantize(state.format, dst); + + // special case clamping for R11G11B10_float which has no sign bit + if (state.format == R11G11B10_FLOAT) + { + dst[0] = VMAXPS(dst[0], VIMMED1(0.0f)); + dst[1] = VMAXPS(dst[1], VIMMED1(0.0f)); + dst[2] = VMAXPS(dst[2], VIMMED1(0.0f)); + dst[3] = VMAXPS(dst[3], VIMMED1(0.0f)); + } + + Value* srcFactor[4]; + Value* dstFactor[4]; + if (state.desc.independentAlphaBlendEnable) + { + GenerateBlendFactor<true, false>(state.blendState.sourceBlendFactor, constantColor, src, src1, dst, srcFactor); + GenerateBlendFactor<false, true>(state.blendState.sourceAlphaBlendFactor, constantColor, src, src1, dst, srcFactor); + + GenerateBlendFactor<true, false>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor); + GenerateBlendFactor<false, true>(state.blendState.destAlphaBlendFactor, constantColor, src, src1, dst, dstFactor); + + BlendFunc<true, false>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result); + BlendFunc<false, true>(state.blendState.alphaBlendFunc, src, srcFactor, dst, dstFactor, result); + } + else + { + GenerateBlendFactor<true, true>(state.blendState.sourceBlendFactor, 
constantColor, src, src1, dst, srcFactor);
+                GenerateBlendFactor<true, true>(state.blendState.destBlendFactor, constantColor, src, src1, dst, dstFactor);
+
+                BlendFunc<true, true>(state.blendState.colorBlendFunc, src, srcFactor, dst, dstFactor, result);
+            }
+
+            // store results out
+            for (uint32_t i = 0; i < 4; ++i)
+            {
+                STORE(result[i], pResult, { i });
+            }
+        }
+
+        if(state.blendState.logicOpEnable)
+        {
+            const SWR_FORMAT_INFO& info = GetFormatInfo(state.format);
+            SWR_ASSERT(info.type[0] == SWR_TYPE_UINT);
+            Value* vMask[4];
+            for(uint32_t i = 0; i < 4; i++)
+            {
+                switch(info.bpc[i])
+                {
+                case 0: vMask[i] = VIMMED1(0x00000000); break;
+                case 2: vMask[i] = VIMMED1(0x00000003); break;
+                case 5: vMask[i] = VIMMED1(0x0000001F); break;
+                case 6: vMask[i] = VIMMED1(0x0000003F); break;
+                case 8: vMask[i] = VIMMED1(0x000000FF); break;
+                case 10: vMask[i] = VIMMED1(0x000003FF); break;
+                case 11: vMask[i] = VIMMED1(0x000007FF); break;
+                case 16: vMask[i] = VIMMED1(0x0000FFFF); break;
+                case 24: vMask[i] = VIMMED1(0x00FFFFFF); break;
+                case 32: vMask[i] = VIMMED1(0xFFFFFFFF); break;
+                default:
+                    vMask[i] = VIMMED1(0x0);
+                    SWR_ASSERT(0, "Unsupported bpc for logic op\n");
+                    break;
+                }
+                src[i] = BITCAST(src[i], mSimdInt32Ty);//, vMask[i]);
+                dst[i] = BITCAST(dst[i], mSimdInt32Ty);
+            }
+
+            LogicOpFunc(state.blendState.logicOpFunc, src, dst, result);
+
+            // store results out
+            for(uint32_t i = 0; i < 4; ++i)
+            {
+                // clear upper bits from PS output not in RT format after doing logic op
+                result[i] = AND(result[i], vMask[i]);
+
+                STORE(BITCAST(result[i], mSimdFP32Ty), pResult, {i});
+            }
+        }
+
+        if(state.desc.oMaskEnable)
+        {
+            assert(!(state.desc.alphaToCoverageEnable));
+            // load current mask
+            Value* oMask = LOAD(ppoMask);
+            Value* sampleMasked = VBROADCAST(SHL(C(1), sampleNum));
+            oMask = AND(oMask, sampleMasked);
+            currentMask = AND(oMask, currentMask);
+        }
+
+        if(state.desc.sampleMaskEnable)
+        {
+            Value* sampleMask = LOAD(pBlendState, { 0, SWR_BLEND_STATE_sampleMask});
+            Value* sampleMasked = SHL(C(1), sampleNum);
+            sampleMask = AND(sampleMask, sampleMasked);
+            sampleMask = VBROADCAST(ICMP_SGT(sampleMask, C(0)));
+            sampleMask = S_EXT(sampleMask, mSimdInt32Ty);
+            currentMask = AND(sampleMask, currentMask);
+        }
+
+        if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable ||
+           state.desc.oMaskEnable)
+        {
+            // load current mask
+            Value* pMask = LOAD(ppMask);
+            currentMask = S_EXT(ICMP_SGT(currentMask, VBROADCAST(C(0))), mSimdInt32Ty);
+            Value* outputMask = AND(pMask, currentMask);
+            // store new mask
+            STORE(outputMask, GEP(ppMask, C(0)));
+        }
+
+        RET_VOID();
+
+        JitManager::DumpToFile(blendFunc, "");
+
+        FunctionPassManager passes(JM()->mpCurrentModule);
+        passes.add(createBreakCriticalEdgesPass());
+        passes.add(createCFGSimplificationPass());
+        passes.add(createEarlyCSEPass());
+        passes.add(createPromoteMemoryToRegisterPass());
+        passes.add(createCFGSimplificationPass());
+        passes.add(createEarlyCSEPass());
+        passes.add(createInstructionCombiningPass());
+        passes.add(createInstructionSimplifierPass());
+        passes.add(createConstantPropagationPass());
+        passes.add(createSCCPPass());
+        passes.add(createAggressiveDCEPass());
+
+        passes.run(*blendFunc);
+
+        JitManager::DumpToFile(blendFunc, "optimized");
+
+        return blendFunc;
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief JITs from blend shader IR
+/// @param hJitMgr - JitManager handle
+/// @param hFunc - LLVM function handle
+/// @return PFN_BLEND_JIT_FUNC - pointer to blend code
+PFN_BLEND_JIT_FUNC
JitBlendFunc(HANDLE hJitMgr, const HANDLE hFunc) +{ + const llvm::Function *func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_BLEND_JIT_FUNC pfnBlend; + pfnBlend = (PFN_BLEND_JIT_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + pJitMgr->mIsModuleFinalized = true; + + return pfnBlend; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles blend shader +/// @param hJitMgr - JitManager handle +/// @param state - blend state to build function from +extern "C" PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitMgr, const BLEND_COMPILE_STATE& state) +{ + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + + pJitMgr->SetupNewModule(); + + BlendJit theJit(pJitMgr); + HANDLE hFunc = theJit.Create(state); + + return JitBlendFunc(hJitMgr, hFunc); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h new file mode 100644 index 00000000000..057eb92b67e --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.h @@ -0,0 +1,93 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+* +* @file blend_jit.h +* +* @brief Definition of the blend jitter +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "common/formats.h" +#include "core/context.h" +#include "core/state.h" + +struct RENDER_TARGET_BLEND_COMPILE_STATE +{ + bool blendEnable; + bool logicOpEnable; + SWR_BLEND_FACTOR sourceAlphaBlendFactor; + SWR_BLEND_FACTOR destAlphaBlendFactor; + SWR_BLEND_FACTOR sourceBlendFactor; + SWR_BLEND_FACTOR destBlendFactor; + SWR_BLEND_OP colorBlendFunc; + SWR_BLEND_OP alphaBlendFunc; + SWR_LOGIC_OP logicOpFunc; +}; + +enum ALPHA_TEST_FORMAT +{ + ALPHA_TEST_UNORM8, + ALPHA_TEST_FLOAT32 +}; + +////////////////////////////////////////////////////////////////////////// +/// BLEND_DESC +////////////////////////////////////////////////////////////////////////// +struct BLEND_DESC +{ + union + { + struct + { + uint32_t alphaTestEnable: 1; + uint32_t independentAlphaBlendEnable: 1; + uint32_t alphaToCoverageEnable: 1; + uint32_t oMaskEnable:1; + uint32_t inputCoverageEnable:1; + uint32_t sampleMaskEnable:1; + uint32_t numSamples:5; + uint32_t _reserved : 21; + }; + uint32_t bits; + }; +}; +#define BLEND_ENABLE_MASK 0x3D // a2c | oMaskEnable | inputCoverageEnable | sampleMaskEnable +////////////////////////////////////////////////////////////////////////// +/// State required for blend jit +////////////////////////////////////////////////////////////////////////// +struct BLEND_COMPILE_STATE +{ + SWR_FORMAT format; // format of render target being blended + RENDER_TARGET_BLEND_COMPILE_STATE blendState; + BLEND_DESC desc; + + SWR_ZFUNCTION alphaTestFunction; + ALPHA_TEST_FORMAT alphaTestFormat; + + bool operator==(const BLEND_COMPILE_STATE& other) const + { + return memcmp(this, &other, sizeof(BLEND_COMPILE_STATE)) == 0; + } +}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp new file mode 100644 index 00000000000..c15bdf1e756 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -0,0 +1,71 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
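Returning briefly to the BLEND_DESC bit-field in blend_jit.h above: because the flags alias a single uint32_t, a filled-in descriptor doubles as a compact key, which is presumably how BLEND_ENABLE_MASK is meant to be applied. A hedged sketch with illustrative field values:

    BLEND_DESC desc = {};            // zero all flag bits
    desc.alphaTestEnable = 1;
    desc.numSamples = 4;
    uint32_t key = desc.bits;        // packed view of the same 32 bits, e.g. for masking or caching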
+*
+* @file builder.cpp
+*
+* @brief Includes all the builder related functionality
+*
+* Notes:
+*
+******************************************************************************/
+
+#include "builder.h"
+
+using namespace llvm;
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Constructor for Builder.
+/// @param pJitMgr - JitManager which contains modules, function passes, etc.
+Builder::Builder(JitManager *pJitMgr)
+    : mpJitMgr(pJitMgr)
+{
+    mpIRBuilder = &pJitMgr->mBuilder;
+
+    mVoidTy = Type::getVoidTy(pJitMgr->mContext);
+    mFP16Ty = Type::getHalfTy(pJitMgr->mContext);
+    mFP32Ty = Type::getFloatTy(pJitMgr->mContext);
+    mDoubleTy = Type::getDoubleTy(pJitMgr->mContext);
+    mInt1Ty = Type::getInt1Ty(pJitMgr->mContext);
+    mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
+    mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
+    mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+    mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
+    mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
+    mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
+    mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
+    mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
+    mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
+    mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
+    mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+
+    if (sizeof(uint32_t*) == 4)
+    {
+        mIntPtrTy = mInt32Ty;
+        mSimdIntPtrTy = mSimdInt32Ty;
+    }
+    else
+    {
+        SWR_ASSERT(sizeof(uint32_t*) == 8);
+        mIntPtrTy = mInt64Ty;
+        mSimdIntPtrTy = mSimdInt64Ty;
+    }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
new file mode 100644
index 00000000000..49216612cc9
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -0,0 +1,71 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+* +* @file builder.h +* +* @brief Includes all the builder related functionality +* +* Notes: +* +******************************************************************************/ +#pragma once + +#include "JitManager.h" +#include "common/formats.h" + +using namespace llvm; + +struct Builder +{ + Builder(JitManager *pJitMgr); + IRBuilder<>* IRB() { return mpIRBuilder; }; + JitManager* JM() { return mpJitMgr; } + + JitManager* mpJitMgr; + IRBuilder<>* mpIRBuilder; + + // Built in types. + Type* mVoidTy; + Type* mInt1Ty; + Type* mInt8Ty; + Type* mInt16Ty; + Type* mInt32Ty; + Type* mInt64Ty; + Type* mIntPtrTy; + Type* mFP16Ty; + Type* mFP32Ty; + Type* mDoubleTy; + Type* mSimdFP16Ty; + Type* mSimdFP32Ty; + Type* mSimdInt16Ty; + Type* mSimdInt32Ty; + Type* mSimdInt64Ty; + Type* mSimdIntPtrTy; + StructType* mV4FP32Ty; + StructType* mV4Int32Ty; + +#include "builder_gen.h" +#include "builder_x86.h" +#include "builder_misc.h" +#include "builder_math.h" + +}; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h new file mode 100644 index 00000000000..92867ec9836 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_math.h @@ -0,0 +1,34 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file builder_math.h +* +* @brief math/alu builder functions +* +* Notes: +* +******************************************************************************/ +#pragma once + +Value* VLOG2PS(Value* src); +Value* VPOW24PS(Value* src); +Value* VEXP2PS(Value* src); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp new file mode 100644 index 00000000000..5394fc7bf5a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -0,0 +1,1447 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file builder_misc.cpp
+*
+* @brief Implementation for miscellaneous builder functions
+*
+* Notes:
+*
+******************************************************************************/
+#include "builder.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+void __cdecl CallPrint(const char* fmt, ...);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 32-bit single precision float to a
+/// 16-bit float with 5 exponent bits and a variable
+/// number of mantissa bits.
+/// @param val - 32-bit float
+/// @todo Maybe move this outside of this file into a header?
+static uint16_t Convert32To16Float(float val)
+{
+    uint32_t sign, exp, mant;
+    uint32_t roundBits;
+
+    // Extract the sign, exponent, and mantissa
+    uint32_t uf = *(uint32_t*)&val;
+    sign = (uf & 0x80000000) >> 31;
+    exp = (uf & 0x7F800000) >> 23;
+    mant = uf & 0x007FFFFF;
+
+    // Check for out of range
+    if (std::isnan(val))
+    {
+        exp = 0x1F;
+        mant = 0x200;
+        sign = 1; // set the sign bit for NANs
+    }
+    else if (std::isinf(val))
+    {
+        exp = 0x1f;
+        mant = 0x0;
+    }
+    else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value
+    {
+        exp = 0x1E;
+        mant = 0x3FF;
+    }
+    else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm
+    {
+        mant |= 0x00800000;
+        for (; exp <= 0x70; mant >>= 1, exp++)
+            ;
+        exp = 0;
+        mant = mant >> 13;
+    }
+    else if (exp < 0x66) // Too small to represent -> Zero
+    {
+        exp = 0;
+        mant = 0;
+    }
+    else
+    {
+        // Saves bits that will be shifted off for rounding
+        roundBits = mant & 0x1FFFu;
+        // convert exponent and mantissa to 16 bit format
+        exp = exp - 0x70;
+        mant = mant >> 13;
+
+        // Essentially RTZ, but round up if off by only 1 lsb
+        if (roundBits == 0x1FFFu)
+        {
+            mant++;
+            // check for overflow
+            if ((mant & 0xC00u) != 0)
+                exp++;
+            // make sure only the needed bits are used
+            mant &= 0x3FF;
+        }
+    }
+
+    uint32_t tmpVal = (sign << 15) | (exp << 10) | mant;
+    return (uint16_t)tmpVal;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
+/// float
+/// @param val - 16-bit float
+/// @todo Maybe move this outside of this file into a header?
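+///
+/// Worked examples, straight from the IEEE 754 binary16 encoding (illustrative):
+///   0x3C00 (sign 0, exp 15, mant 0) -> 1.0f
+///   0xC000 (sign 1, exp 16, mant 0) -> -2.0f
+///   0x7C00 (exp 0x1F, mant 0)       -> +infinity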
+static float ConvertSmallFloatTo32(UINT val) +{ + UINT result; + if ((val & 0x7fff) == 0) + { + result = ((uint32_t)(val & 0x8000)) << 16; + } + else if ((val & 0x7c00) == 0x7c00) + { + result = ((val & 0x3ff) == 0) ? 0x7f800000 : 0x7fc00000; + result |= ((uint32_t)val & 0x8000) << 16; + } + else + { + uint32_t sign = (val & 0x8000) << 16; + uint32_t mant = (val & 0x3ff) << 13; + uint32_t exp = (val >> 10) & 0x1f; + if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals + { + mant <<= 1; + while (mant < (0x400 << 13)) + { + exp--; + mant <<= 1; + } + mant &= (0x3ff << 13); + } + exp = ((exp - 15 + 127) & 0xff) << 23; + result = sign | exp | mant; + } + + return *(float*)&result; +} + +Constant *Builder::C(bool i) +{ + return ConstantInt::get(IRB()->getInt1Ty(), (i ? 1 : 0)); +} + +Constant *Builder::C(char i) +{ + return ConstantInt::get(IRB()->getInt8Ty(), i); +} + +Constant *Builder::C(uint8_t i) +{ + return ConstantInt::get(IRB()->getInt8Ty(), i); +} + +Constant *Builder::C(int i) +{ + return ConstantInt::get(IRB()->getInt32Ty(), i); +} + +Constant *Builder::C(int64_t i) +{ + return ConstantInt::get(IRB()->getInt64Ty(), i); +} + +Constant *Builder::C(uint16_t i) +{ + return ConstantInt::get(mInt16Ty,i); +} + +Constant *Builder::C(uint32_t i) +{ + return ConstantInt::get(IRB()->getInt32Ty(), i); +} + +Constant *Builder::C(float i) +{ + return ConstantFP::get(IRB()->getFloatTy(), i); +} + +Constant *Builder::PRED(bool pred) +{ + return ConstantInt::get(IRB()->getInt1Ty(), (pred ? 1 : 0)); +} + +Value *Builder::VIMMED1(int i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); +} + +Value *Builder::VIMMED1(uint32_t i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); +} + +Value *Builder::VIMMED1(float i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i))); +} + +Value *Builder::VIMMED1(bool i) +{ + return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); +} + +Value *Builder::VUNDEF_IPTR() +{ + return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); +} + +Value *Builder::VUNDEF_I() +{ + return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); +} + +Value *Builder::VUNDEF(Type *ty, uint32_t size) +{ + return UndefValue::get(VectorType::get(ty, size)); +} + +Value *Builder::VUNDEF_F() +{ + return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); +} + +Value *Builder::VUNDEF(Type* t) +{ + return UndefValue::get(VectorType::get(t, JM()->mVWidth)); +} + +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) +{ + return VINSERT(vec, val, C((int64_t)index)); +} +#endif + +Value *Builder::VBROADCAST(Value *src) +{ + // check if src is already a vector + if (src->getType()->isVectorTy()) + { + return src; + } + + return VECTOR_SPLAT(JM()->mVWidth, src); +} + +uint32_t Builder::IMMED(Value* v) +{ + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getZExtValue(); +} + +Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) +{ + std::vector<Value*> indices; + for (auto i : indexList) + indices.push_back(i); + return GEPA(ptr, indices); +} + +Value *Builder::GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList) +{ + std::vector<Value*> indices; + for (auto i : indexList) + indices.push_back(C(i)); + return GEPA(ptr, indices); +} + +LoadInst *Builder::LOAD(Value *basePtr, const 
std::initializer_list<uint32_t> &indices, const llvm::Twine& name)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(C(i));
+    return LOAD(GEPA(basePtr, valIndices), name);
+}
+
+LoadInst *Builder::LOADV(Value *basePtr, const std::initializer_list<Value*> &indices, const llvm::Twine& name)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(i);
+    return LOAD(GEPA(basePtr, valIndices), name);
+}
+
+StoreInst *Builder::STORE(Value *val, Value *basePtr, const std::initializer_list<uint32_t> &indices)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(C(i));
+    return STORE(val, GEPA(basePtr, valIndices));
+}
+
+StoreInst *Builder::STOREV(Value *val, Value *basePtr, const std::initializer_list<Value*> &indices)
+{
+    std::vector<Value*> valIndices;
+    for (auto i : indices)
+        valIndices.push_back(i);
+    return STORE(val, GEPA(basePtr, valIndices));
+}
+
+CallInst *Builder::CALL(Value *Callee, const std::initializer_list<Value*> &argsList)
+{
+    std::vector<Value*> args;
+    for (auto arg : argsList)
+        args.push_back(arg);
+    return CALLA(Callee, args);
+}
+
+Value *Builder::VRCP(Value *va)
+{
+    return FDIV(VIMMED1(1.0f), va);  // 1 / a
+}
+
+Value *Builder::VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY)
+{
+    Value* vOut = FMADDPS(vA, vX, vC);
+    vOut = FMADDPS(vB, vY, vOut);
+    return vOut;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate an i32 masked load operation in LLVM IR.  If not
+/// supported on the underlying platform, emulate it with float masked load
+/// @param src - base address pointer for the load
+/// @param vMask - SIMD wide mask that controls whether to access memory or load 0
+Value *Builder::MASKLOADD(Value* src,Value* mask)
+{
+    Value* vResult;
+    // use the avx2 masked load instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
+        vResult = CALL(func,{src,mask});
+    }
+    else
+    {
+        Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
+        Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
+        vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+    }
+    return vResult;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief insert a JIT call to CallPrint
+/// - outputs formatted string to both stdout and VS output window
+/// - DEBUG builds only
+/// Usage example:
+///   PRINT("index %d = 0x%p\n",{C(lane), pIndex});
+///   where C(lane) creates a constant value to print, and pIndex is the Value*
+///   result from a GEP, printing out the pointer to memory
+/// @param printStr - constant string to print, which includes format specifiers
+/// @param printArgs - initializer list of Value*'s to print to std out
+CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs)
+{
+    // push the arguments to CallPrint into a vector
+    std::vector<Value*> printCallArgs;
+    // save room for the format string.
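we still need to modify it for vectors
+    // e.g. (illustrative) PRINT("simd: %t\n", {vFloats}) rewrites the "%t" into
+    // one "%f" per SIMD lane before the string reaches CallPrint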
+    printCallArgs.resize(1);
+
+    // search through the format string for special processing
+    size_t pos = 0;
+    std::string tempStr(printStr);
+    pos = tempStr.find('%', pos);
+    auto v = printArgs.begin();
+
+    while ((pos != std::string::npos) && (v != printArgs.end()))
+    {
+        Value* pArg = *v;
+        Type* pType = pArg->getType();
+
+        if (tempStr[pos + 1] == 't')
+        {
+            if (pType->isVectorTy())
+            {
+                Type* pContainedType = pType->getContainedType(0);
+
+                std::string vectorFormatStr;
+
+                if (pContainedType->isFloatTy())
+                {
+                    tempStr[pos + 1] = 'f'; // Ensure it's %f
+                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(0)), mDoubleTy));
+
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "%f ";
+                        printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), mDoubleTy));
+                    }
+                }
+                else if (pContainedType->isIntegerTy())
+                {
+                    tempStr[pos + 1] = 'd'; // Ensure it's %d
+                    printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                    for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                    {
+                        vectorFormatStr += "%d ";
+                        printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                    }
+                }
+                else
+                {
+                    SWR_ASSERT(0, "Unsupported type");
+                }
+
+                tempStr.insert(pos, vectorFormatStr);
+                pos += vectorFormatStr.size();
+            }
+            else
+            {
+                if (pType->isFloatTy())
+                {
+                    tempStr[pos + 1] = 'f'; // Ensure it's %f
+                    printCallArgs.push_back(FP_EXT(pArg, mDoubleTy));
+                }
+                else if (pType->isIntegerTy())
+                {
+                    tempStr[pos + 1] = 'd'; // Ensure it's %d
+                    printCallArgs.push_back(pArg);
+                }
+            }
+        }
+        else if (toupper(tempStr[pos + 1]) == 'X')
+        {
+            if (pType->isVectorTy())
+            {
+                tempStr[pos] = '0';
+                tempStr.insert(pos + 1, "x%08");
+
+                printCallArgs.push_back(VEXTRACT(pArg, C(0)));
+
+                std::string vectorFormatStr;
+                for (uint32_t i = 1; i < pType->getVectorNumElements(); ++i)
+                {
+                    vectorFormatStr += "0x%08X ";
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
+
+                tempStr.insert(pos, vectorFormatStr);
+                pos += vectorFormatStr.size();
+            }
+            else
+            {
+                tempStr[pos] = '0';
+                tempStr.insert(pos + 1, "x%08");
+                printCallArgs.push_back(pArg);
+                pos += 3;
+            }
+        }
+        // for %f we need to cast float Values to doubles so that they print out correctly
+        else if ((tempStr[pos + 1] == 'f') && (pType->isFloatTy()))
+        {
+            printCallArgs.push_back(FP_EXT(pArg, Type::getDoubleTy(JM()->mContext)));
+            pos++;
+        }
+        // add special handling for %f and %d format specifiers to make printing llvm vector types easier
+        else if (pType->isVectorTy())
+        {
+            Type* pContainedType = pType->getContainedType(0);
+
+            if ((tempStr[pos + 1] == 'f') && (pContainedType->isFloatTy()))
+            {
+                uint32_t i = 0;
+                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                {
+                    tempStr.insert(pos, std::string("%f "));
+                    pos += 3;
+                    printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+                }
+                printCallArgs.push_back(FP_EXT(VEXTRACT(pArg, C(i)), Type::getDoubleTy(JM()->mContext)));
+            }
+            else if ((tempStr[pos + 1] == 'd') && (pContainedType->isIntegerTy()))
+            {
+                uint32_t i = 0;
+                for (; i < (pArg->getType()->getVectorNumElements()) - 1; i++)
+                {
+                    tempStr.insert(pos, std::string("%d "));
+                    pos += 3;
+                    printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+                }
+                printCallArgs.push_back(VEXTRACT(pArg, C(i)));
+            }
+            else
+            {
+                /// not a supported vector to print
+                /// @todo pointer types too
+                SWR_ASSERT(0);
+            }
+        }
+        else
+        {
+            printCallArgs.push_back(pArg);
+        }
+
+        // advance to the next argument
+        v++;
+        pos = tempStr.find('%', ++pos);
+    }
+
+    // create global variable constant string
+    Constant
*constString = ConstantDataArray::getString(JM()->mContext,tempStr,true); + GlobalVariable *gvPtr = new GlobalVariable(constString->getType(),true,GlobalValue::InternalLinkage,constString,"printStr"); + JM()->mpCurrentModule->getGlobalList().push_back(gvPtr); + + // get a pointer to the first character in the constant string array + std::vector<Constant*> geplist{C(0),C(0)}; +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); +#else + Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); +#endif + + // insert the pointer to the format string in the argument vector + printCallArgs[0] = strGEP; + + // get pointer to CallPrint function and insert decl into the module if needed + std::vector<Type*> args; + args.push_back(PointerType::get(mInt8Ty,0)); + FunctionType* callPrintTy = FunctionType::get(Type::getVoidTy(JM()->mContext),args,true); + Function *callPrintFn = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("CallPrint", callPrintTy)); + + // if we haven't yet added the symbol to the symbol table + if((sys::DynamicLibrary::SearchForAddressOfSymbol("CallPrint")) == nullptr) + { + sys::DynamicLibrary::AddSymbol("CallPrint", (void *)&CallPrint); + } + + // insert a call to CallPrint + return CALLA(callPrintFn,printCallArgs); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Wrapper around PRINT with initializer list. +CallInst* Builder::PRINT(const std::string &printStr) +{ + return PRINT(printStr, {}); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate a masked gather operation in LLVM IR. If not +/// supported on the underlying platform, emulate it with loads +/// @param vSrc - SIMD wide value that will be loaded if mask is invalid +/// @param pBase - Int8* base VB address pointer value +/// @param vIndices - SIMD wide value of VB byte offsets +/// @param vMask - SIMD wide mask that controls whether to access memory or the src values +/// @param scale - value to scale indices by +Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +{ + Value* vGather; + + // use avx2 gather instruction if available + if(JM()->mArch.AVX2()) + { + // force mask to <N x float>, required by vgather + vMask = BITCAST(vMask, mSimdFP32Ty); + vGather = VGATHERPS(vSrc,pBase,vIndices,vMask,scale); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. 
this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = VUNDEF_F(); + Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); + Value *vOffsets = MUL(vIndices,vScaleVec); + Value *mask = MASK(vMask); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets,C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase,offset); + loadAddress = BITCAST(loadAddress,PointerType::get(mFP32Ty,0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr,{C(0), C(i)}); + Value *selMask = VEXTRACT(mask,C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress); + vGather = VINSERT(vGather,val,C(i)); + } + STACKRESTORE(pStack); + } + + return vGather; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate a masked gather operation in LLVM IR. If not +/// supported on the underlying platform, emulate it with loads +/// @param vSrc - SIMD wide value that will be loaded if mask is invalid +/// @param pBase - Int8* base VB address pointer value +/// @param vIndices - SIMD wide value of VB byte offsets +/// @param vMask - SIMD wide mask that controls whether to access memory or the src values +/// @param scale - value to scale indices by +Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMask, Value* scale) +{ + Value* vGather; + + // use avx2 gather instruction if available + if(JM()->mArch.AVX2()) + { + vGather = VGATHERDD(vSrc, pBase, vIndices, vMask, scale); + } + else + { + Value* pStack = STACKSAVE(); + + // store vSrc on the stack. this way we can select between a valid load address and the vSrc address + Value* vSrcPtr = ALLOCA(vSrc->getType()); + STORE(vSrc, vSrcPtr); + + vGather = VUNDEF_I(); + Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); + Value *vOffsets = MUL(vIndices, vScaleVec); + Value *mask = MASK(vMask); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + // single component byte index + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *loadAddress = GEP(pBase, offset); + loadAddress = BITCAST(loadAddress, PointerType::get(mInt32Ty, 0)); + // pointer to the value to load if we're masking off a component + Value *maskLoadAddress = GEP(vSrcPtr, {C(0), C(i)}); + Value *selMask = VEXTRACT(mask, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, loadAddress, maskLoadAddress); + Value *val = LOAD(validAddress, C(0)); + vGather = VINSERT(vGather, val, C(i)); + } + + STACKRESTORE(pStack); + } + return vGather; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief convert x86 <N x float> mask to llvm <N x i1> mask +Value* Builder::MASK(Value* vmask) +{ + Value* src = BITCAST(vmask, mSimdInt32Ty); + return ICMP_SLT(src, VIMMED1(0)); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief convert llvm <N x i1> mask to x86 <N x i32> mask +Value* Builder::VMASK(Value* mask) +{ + return S_EXT(mask, mSimdInt32Ty); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Generate a VPSHUFB operation in LLVM IR. 
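If not
+/// supported on the underlying platform, emulate it.
+///
+/// Byte-selection example (illustrative), within each 128-bit half:
+///   mask byte b[i] == 3               -> output byte i = a[3]
+///   mask byte b[i] negative (MSB set) -> output byte i = 0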
+/// @param a - 256bit SIMD(32x8bit) of 8bit integer values
+/// @param b - 256bit SIMD(32x8bit) of 8bit integer mask values
+/// Byte masks in the lower 128-bit lane of b select 8-bit values from the
+/// lower 128 bits of a, and vice versa for the upper lanes. If the mask
+/// value is negative, '0' is inserted.
+Value *Builder::PSHUFB(Value* a, Value* b)
+{
+    Value* res;
+    // use avx2 pshufb instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPSHUFB(a, b);
+    }
+    else
+    {
+        Constant* cB = dyn_cast<Constant>(b);
+        // number of 8 bit elements in b
+        uint32_t numElms = cast<VectorType>(cB->getType())->getNumElements();
+        // output vector
+        Value* vShuf = UndefValue::get(VectorType::get(mInt8Ty, numElms));
+
+        // insert an 8 bit value from the high and low lanes of a per loop iteration
+        numElms /= 2;
+        for(uint32_t i = 0; i < numElms; i++)
+        {
+            ConstantInt* cLow128b = cast<ConstantInt>(cB->getAggregateElement(i));
+            ConstantInt* cHigh128b = cast<ConstantInt>(cB->getAggregateElement(i + numElms));
+
+            // extract values from constant mask
+            char valLow128bLane = (char)(cLow128b->getSExtValue());
+            char valHigh128bLane = (char)(cHigh128b->getSExtValue());
+
+            Value* insertValLow128b;
+            Value* insertValHigh128b;
+
+            // if the mask value is negative, insert a '0' in the respective output position
+            // otherwise, lookup the value at mask position (bits 3..0 of the respective mask byte) in a and insert in output vector
+            insertValLow128b = (valLow128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valLow128bLane & 0xF)));
+            insertValHigh128b = (valHigh128bLane < 0) ? C((char)0) : VEXTRACT(a, C((valHigh128bLane & 0xF) + numElms));
+
+            vShuf = VINSERT(vShuf, insertValLow128b, i);
+            vShuf = VINSERT(vShuf, insertValHigh128b, (i + numElms));
+        }
+        res = vShuf;
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPMOVSXBD operation (sign extend 8 8bit values to 32
+/// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(16x8bit) of 8bit integer values.  Only
+/// lower 8 values are used.
+Value *Builder::PMOVSXBD(Value* a)
+{
+    Value* res;
+    // use avx2 byte sign extend instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPMOVSXBD(a);
+    }
+    else
+    {
+        // VPMOVSXBD output type
+        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+        // Extract 8 values from 128bit lane and sign extend
+        res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPMOVSXWD operation (sign extend 8 16bit values to 32
+/// bits) in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(8x16bit) of 16bit integer values.
+Value *Builder::PMOVSXWD(Value* a)
+{
+    Value* res;
+    // use avx2 word sign extend if available
+    if(JM()->mArch.AVX2())
+    {
+        res = VPMOVSXWD(a);
+    }
+    else
+    {
+        // VPMOVSXWD output type
+        Type* v8x32Ty = VectorType::get(mInt32Ty, 8);
+        // Extract 8 values from 128bit lane and sign extend
+        res = S_EXT(VSHUFFLE(a, a, C<int>({0, 1, 2, 3, 4, 5, 6, 7})), v8x32Ty);
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMD operation (shuffle 32 bit integer values
+/// across 128 bit lanes) in LLVM IR.
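If not supported on the underlying
+/// platform, emulate it.
+///
+/// Lane-crossing example (illustrative): idx = {7,6,5,4,3,2,1,0} reverses all
+/// eight dwords of a; unlike PSHUFB, VPERMD can move data between the two
+/// 128-bit halves.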
+/// @param a - 256bit SIMD lane(8x32bit) of integer values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+Value *Builder::PERMD(Value* a, Value* idx)
+{
+    Value* res;
+    // use avx2 permute instruction if available
+    if(JM()->mArch.AVX2())
+    {
+        // llvm 3.6.0 swapped the order of the args to vpermd
+        res = VPERMD(idx, a);
+    }
+    else
+    {
+        res = VSHUFFLE(a, a, idx);
+    }
+    return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
+/// in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
+Value *Builder::CVTPH2PS(Value* a)
+{
+    if (JM()->mArch.F16C())
+    {
+        return VCVTPH2PS(a);
+    }
+    else
+    {
+        FunctionType* pFuncTy = FunctionType::get(mFP32Ty, mInt16Ty);
+        Function* pCvtPh2Ps = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("ConvertSmallFloatTo32", pFuncTy));
+
+        if (sys::DynamicLibrary::SearchForAddressOfSymbol("ConvertSmallFloatTo32") == nullptr)
+        {
+            sys::DynamicLibrary::AddSymbol("ConvertSmallFloatTo32", (void *)&ConvertSmallFloatTo32);
+        }
+
+        Value* pResult = UndefValue::get(mSimdFP32Ty);
+        for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+        {
+            Value* pSrc = VEXTRACT(a, C(i));
+            Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
+            pResult = VINSERT(pResult, pConv, C(i));
+        }
+
+        return pResult;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VCVTPS2PH operation (float32->float16 conversion)
+/// in LLVM IR.  If not supported on the underlying platform, emulate it
+/// @param a - SIMD lane(8x32bit) of float32 values.
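+/// @param rounding - rounding control for the F16C path; the scalar fallback
+///        below does its own (essentially RTZ) rounding and ignores this operand.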
+Value *Builder::CVTPS2PH(Value* a, Value* rounding) +{ + if (JM()->mArch.F16C()) + { + return VCVTPS2PH(a, rounding); + } + else + { + // call scalar C function for now + FunctionType* pFuncTy = FunctionType::get(mInt16Ty, mFP32Ty); + Function* pCvtPs2Ph = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("Convert32To16Float", pFuncTy)); + + if (sys::DynamicLibrary::SearchForAddressOfSymbol("Convert32To16Float") == nullptr) + { + sys::DynamicLibrary::AddSymbol("Convert32To16Float", (void *)&Convert32To16Float); + } + + Value* pResult = UndefValue::get(mSimdInt16Ty); + for (uint32_t i = 0; i < JM()->mVWidth; ++i) + { + Value* pSrc = VEXTRACT(a, C(i)); + Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); + pResult = VINSERT(pResult, pConv, C(i)); + } + + return pResult; + } +} + +Value *Builder::PMAXSD(Value* a, Value* b) +{ + if (JM()->mArch.AVX2()) + { + return VPMAXSD(a, b); + } + else + { + // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources + Function* pmaxsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pmaxsd); + + // low 128 + Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); + Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); + Value* resLo = CALL(pmaxsd, {aLo, bLo}); + + // high 128 + Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); + Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); + Value* resHi = CALL(pmaxsd, {aHi, bHi}); + + // combine + Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); + result = VINSERTI128(result, resHi, C((uint8_t)1)); + + return result; + } +} + +Value *Builder::PMINSD(Value* a, Value* b) +{ + if (JM()->mArch.AVX2()) + { + return VPMINSD(a, b); + } + else + { + // use 4-wide sse max intrinsic on lower/upper halves of 8-wide sources + Function* pminsd = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_sse41_pminsd); + + // low 128 + Value* aLo = VEXTRACTI128(a, C((uint8_t)0)); + Value* bLo = VEXTRACTI128(b, C((uint8_t)0)); + Value* resLo = CALL(pminsd, {aLo, bLo}); + + // high 128 + Value* aHi = VEXTRACTI128(a, C((uint8_t)1)); + Value* bHi = VEXTRACTI128(b, C((uint8_t)1)); + Value* resHi = CALL(pminsd, {aHi, bHi}); + + // combine + Value* result = VINSERTI128(VUNDEF_I(), resLo, C((uint8_t)0)); + result = VINSERTI128(result, resHi, C((uint8_t)1)); + + return result; + } +} + +void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) +{ + const SWR_FORMAT_INFO &info = GetFormatInfo(format); + if(info.type[0] == SWR_TYPE_FLOAT && info.bpc[0] == 32) + { + // ensure our mask is the correct type + mask = BITCAST(mask, mSimdFP32Ty); + GATHER4PS(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } + else + { + // ensure our mask is the correct type + mask = BITCAST(mask, mSimdInt32Ty); + GATHER4DD(info, pSrcBase, byteOffsets, mask, vGatherComponents, bPackedOutput); + } +} + +void Builder::GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) +{ + switch(info.bpp / info.numComps) + { + case 16: + { + Value* vGatherResult[2]; + Value *vMask; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((float)0); + + // always have at least one component out of x or y to fetch + + // save mask as it is zero'd out after each gather + vMask = mask; + + vGatherResult[0] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. 
result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if(info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + vMask = mask; + + vGatherResult[1] = GATHERPS(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1(*(float*)&info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // save mask as it is zero'd out after each gather + Value *vMask = mask; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERPS(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_ASSERT(0, "Invalid float format"); + break; + } +} + +void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput) +{ + switch (info.bpp / info.numComps) + { + case 8: + { + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + Value* vGatherResult = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, mask, C((char)1)); + // e.g. result of an 8x32bit integer gather for 8bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw + + Shuffle8bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + } + break; + case 16: + { + Value* vGatherResult[2]; + Value *vMask; + + // TODO: vGatherMaskedVal + Value* vGatherMaskedVal = VIMMED1((int32_t)0); + + // always have at least one component out of x or y to fetch + + // save mask as it is zero'd out after each gather + vMask = mask; + + vGatherResult[0] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. result of first 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy + // + + // if we have at least one component out of x or y to fetch + if(info.numComps > 2) + { + // offset base to the next components(zw) in the vertex to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + vMask = mask; + + vGatherResult[1] = GATHERDD(vGatherMaskedVal, pSrcBase, byteOffsets, vMask, C((char)1)); + // e.g. 
result of second 8x32bit integer gather for 16bit components + // 256i - 0 1 2 3 4 5 6 7 + // zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw + // + } + else + { + vGatherResult[1] = vGatherMaskedVal; + } + + // Shuffle gathered components into place, each row is a component + Shuffle16bpcGather4(info, vGatherResult, vGatherComponents, bPackedOutput); + + } + break; + case 32: + { + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherComponents[i] = VIMMED1((int)info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // save mask as it is zero'd out after each gather + Value *vMask = mask; + + // Gather a SIMD of components + vGatherComponents[swizzleIndex] = GATHERDD(vGatherComponents[swizzleIndex], pSrcBase, byteOffsets, vMask, C((char)1)); + + // offset base to the next component to gather + pSrcBase = GEP(pSrcBase, C((char)4)); + } + } + break; + default: + SWR_ASSERT(0, "unsupported format"); + break; + } +} + +void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) +{ + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + + // input could either be float or int vector; do shuffle work in int + vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); + vGatherInput[1] = BITCAST(vGatherInput[1], mSimdInt32Ty); + + if(bPackedOutput) + { + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + + // shuffle mask + Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, + 0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15}); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[0], v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy + + Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); + // after PERMD: move and pack xy components into each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy + + // do the same for zw components + Value* vi128ZW = nullptr; + if(info.numComps > 2) + { + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy); + vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy); + } + + for(uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fixed for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if(i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; + + // extract packed component 128 bit lanes + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } + + } + else + { + // pshufb masks for each component + Value* vConstMask[2]; + // x/z shuffle mask + vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, + 0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, }); + + // y/w shuffle mask + vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1, + 2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1}); + + + // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + + // select correct constMask for x/z or y/w pshufb + uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + uint32_t selectedGather = (i < 2) ? 0 : 1; + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy); + // after pshufb mask for x channel; z uses the same shuffle from the second gather + // 256i - 0 1 2 3 4 5 6 7 + // xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00 + } + } +} + +void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) +{ + // cast types + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + + if(bPackedOutput) + { + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + // shuffle mask + Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, + 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); + Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb: group components together in each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww + + Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty); + // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane + // 256i - 0 1 2 3 4 5 6 7 + // xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care) + + // do the same for zw components + Value* vi128ZW = nullptr; + if(info.numComps > 2) + { + vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty); + } + + // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex + for(uint32_t i = 0; i < 4; i++) + { + uint32_t swizzleIndex = info.swizzle[i]; + // todo: fix for packed + Value* vGatherMaskedVal = VIMMED1((int32_t)(info.defaults[i])); + if(i >= info.numComps) + { + // set the default component val + vGatherOutput[swizzleIndex] = vGatherMaskedVal; + continue; + } + + // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1 + uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1; + // if x or y, use vi128XY permute result, else use vi128ZW + Value* selectedPermute = (i < 2) ? 
vi128XY : vi128ZW; + + // sign extend + vGatherOutput[swizzleIndex] = VEXTRACT(selectedPermute, C(lane)); + } + } + // else zero extend + else{ + // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits + // apply defaults + for (uint32_t i = 0; i < 4; ++i) + { + vGatherOutput[i] = VIMMED1((int32_t)info.defaults[i]); + } + + for(uint32_t i = 0; i < info.numComps; i++){ + uint32_t swizzleIndex = info.swizzle[i]; + + // pshufb masks for each component + Value* vConstMask; + switch(i) + { + case 0: + // x shuffle mask + vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1, + 0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1}); + break; + case 1: + // y shuffle mask + vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1, + 1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1}); + break; + case 2: + // z shuffle mask + vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1, + 2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1}); + break; + case 3: + // w shuffle mask + vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1, + 3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1}); + break; + default: + vConstMask = nullptr; + break; + } + + vGatherOutput[swizzleIndex] = BITCAST(PSHUFB(BITCAST(vGatherInput, v32x8Ty), vConstMask), vGatherTy); + // after pshufb for x channel + // 256i - 0 1 2 3 4 5 6 7 + // x000 x000 x000 x000 x000 x000 x000 x000 + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief emulates a scatter operation. +/// @param pDst - pointer to destination +/// @param vSrc - vector of src data to scatter +/// @param vOffsets - vector of byte offsets from pDst +/// @param vMask - mask of valid lanes +void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) +{ + Value* pStack = STACKSAVE(); + + // allocate tmp stack for masked off lanes + Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); + + Value *mask = MASK(vMask); + for (uint32_t i = 0; i < JM()->mVWidth; ++i) + { + Value *offset = VEXTRACT(vOffsets, C(i)); + // byte pointer to component + Value *storeAddress = GEP(pDst, offset); + storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); + Value *selMask = VEXTRACT(mask, C(i)); + Value *srcElem = VEXTRACT(vSrc, C(i)); + // switch in a safe address to load if we're trying to access a vertex + Value *validAddress = SELECT(selMask, storeAddress, vTmpPtr); + STORE(srcElem, validAddress); + } + + STACKRESTORE(pStack); +} + +Value* Builder::VABSPS(Value* a) +{ + Value* asInt = BITCAST(a, mSimdInt32Ty); + Value* result = BITCAST(AND(asInt, VIMMED1(0x7fffffff)), mSimdFP32Ty); + return result; +} + +Value *Builder::ICLAMP(Value* src, Value* low, Value* high) +{ + Value *lowCmp = ICMP_SLT(src, low); + Value *ret = SELECT(lowCmp, low, src); + + Value *highCmp = ICMP_SGT(ret, high); + ret = SELECT(highCmp, high, ret); + + return ret; +} + +Value *Builder::FCLAMP(Value* src, Value* low, Value* high) +{ + Value *lowCmp = FCMP_OLT(src, low); + Value *ret = SELECT(lowCmp, low, src); + + Value *highCmp = FCMP_OGT(ret, high); + ret = SELECT(highCmp, high, ret); + + return ret; +} + +Value *Builder::FCLAMP(Value* src, float low, float high) +{ + Value* result = VMAXPS(src, VIMMED1(low)); + result = VMINPS(result, VIMMED1(high)); + + return result; +} + +////////////////////////////////////////////////////////////////////////// 
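+/// A common use of the constant-bounds FCLAMP is a saturate, e.g.
+/// FCLAMP(x, 0.0f, 1.0f). Note that if VMAXPS/VMINPS lower to x86 maxps/minps,
+/// a NaN input clamps to 'low' instead of propagating.
+//////////////////////////////////////////////////////////////////////////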
+/// @brief save/restore stack, providing ability to push/pop the stack and +/// reduce overall stack requirements for temporary stack use +Value* Builder::STACKSAVE() +{ + Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + return CALL(pfnStackSave); +#else + return CALLA(pfnStackSave); +#endif +} + +void Builder::STACKRESTORE(Value* pSaved) +{ + Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore); + CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved}); +} + +Value *Builder::FMADDPS(Value* a, Value* b, Value* c) +{ + Value* vOut; + // use FMADs if available + if(JM()->mArch.AVX2()) + { + vOut = VFMADDPS(a, b, c); + } + else + { + vOut = FADD(FMUL(a, b), c); + } + return vOut; +} + +Value* Builder::POPCNT(Value* a) +{ + Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() }); + return CALL(pCtPop, std::initializer_list<Value*>{a}); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief C functions called by LLVM IR +////////////////////////////////////////////////////////////////////////// + +////////////////////////////////////////////////////////////////////////// +/// @brief called in JIT code, inserted by PRINT +/// output to both stdout and visual studio debug console +void __cdecl CallPrint(const char* fmt, ...) +{ + va_list args; + va_start(args, fmt); + vprintf(fmt, args); + +#if defined( _WIN32 ) + char strBuf[1024]; + vsnprintf_s(strBuf, _TRUNCATE, fmt, args); + OutputDebugString(strBuf); +#endif +} + +Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) +{ +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + Function *func = + Intrinsic::getDeclaration(JM()->mpCurrentModule, + Intrinsic::x86_avx_vextractf128_si_256); + return CALL(func, {a, imm8}); +#else + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*,8> idx; + for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { + idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + } + return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); +#endif +} + +Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) +{ +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 + Function *func = + Intrinsic::getDeclaration(JM()->mpCurrentModule, + Intrinsic::x86_avx_vinsertf128_si_256); + return CALL(func, {a, b, imm8}); +#else + bool flag = !imm8->isZeroValue(); + SmallVector<Constant*,8> idx; + for (unsigned i = 0; i < JM()->mVWidth; i++) { + idx.push_back(C(i)); + } + Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); + + SmallVector<Constant*,8> idx2; + for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + JM()->mVWidth)); + } + for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) { + idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + } + return VSHUFFLE(a, inter, ConstantVector::get(idx2)); +#endif +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h new file mode 100644 index 00000000000..48e0558c4dd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -0,0 +1,149 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file builder_misc.h +* +* @brief miscellaneous builder functions +* +* Notes: +* +******************************************************************************/ +#pragma once + +Constant *C(bool i); +Constant *C(char i); +Constant *C(uint8_t i); +Constant *C(int i); +Constant *C(int64_t i); +Constant *C(uint16_t i); +Constant *C(uint32_t i); +Constant *C(float i); + +template<typename Ty> +Constant *C(const std::initializer_list<Ty> &constList) +{ + std::vector<Constant*> vConsts; + for(auto i : constList) { + + vConsts.push_back(C((Ty)i)); + } + return ConstantVector::get(vConsts); +} + +Constant *PRED(bool pred); +Value *VIMMED1(int i); +Value *VIMMED1(uint32_t i); +Value *VIMMED1(float i); +Value *VIMMED1(bool i); +Value *VUNDEF(Type* t); +Value *VUNDEF_F(); +Value *VUNDEF_I(); +Value *VUNDEF(Type* ty, uint32_t size); +Value *VUNDEF_IPTR(); +#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +Value *VINSERT(Value *vec, Value *val, uint64_t index); +#endif +Value *VBROADCAST(Value *src); +Value *VRCP(Value *va); +Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); + +uint32_t IMMED(Value* i); + +Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); +Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); +CallInst *CALL(Value *Callee, const std::initializer_list<Value*> &args); + +LoadInst *LOAD(Value *BasePtr, const std::initializer_list<uint32_t> &offset, const llvm::Twine& name = ""); +LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, const llvm::Twine& name = ""); +StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset); +StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset); + +Value *VCMPPS_EQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_EQ_OQ)); } +Value *VCMPPS_LT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LT_OQ)); } +Value *VCMPPS_LE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_LE_OQ)); } +Value *VCMPPS_ISNAN(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_UNORD_Q)); } +Value *VCMPPS_NEQ(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_NEQ_OQ)); } +Value *VCMPPS_GE(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GE_OQ)); } +Value *VCMPPS_GT(Value* a, Value* b) { return VCMPPS(a, b, C((uint8_t)_CMP_GT_OQ)); } +Value *VCMPPS_NOTNAN(Value* a, Value* b){ 
return VCMPPS(a, b, C((uint8_t)_CMP_ORD_Q)); } + +Value *MASK(Value* vmask); +Value *VMASK(Value* mask); + +////////////////////////////////////////////////////////////////////////// +/// @brief functions that build IR to call x86 intrinsics directly, or +/// emulate them with other instructions if not available on the host +////////////////////////////////////////////////////////////////////////// +Value *MASKLOADD(Value* src, Value* mask); + +void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +Value *GATHERPS(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +void GATHER4PS(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +Value *GATHERDD(Value* src, Value* pBase, Value* indices, Value* mask, Value* scale); +void GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byteOffsets, + Value* mask, Value* vGatherComponents[], bool bPackedOutput); + +void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask); + +void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput); +void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput); + +Value *PSHUFB(Value* a, Value* b); +Value *PMOVSXBD(Value* a); +Value *PMOVSXWD(Value* a); +Value *PERMD(Value* a, Value* idx); +Value *CVTPH2PS(Value* a); +Value *CVTPS2PH(Value* a, Value* rounding); +Value *PMAXSD(Value* a, Value* b); +Value *PMINSD(Value* a, Value* b); +Value *VABSPS(Value* a); +Value *FMADDPS(Value* a, Value* b, Value* c); + +// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior +Value *VPCMPGTD(Value* a, Value* b) +{ + Value* vIndexMask = ICMP_UGT(a,b); + + // need to set the high bit for x86 intrinsic masks + return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth)); +} + +Value *ICLAMP(Value* src, Value* low, Value* high); +Value *FCLAMP(Value* src, Value* low, Value* high); +Value *FCLAMP(Value* src, float low, float high); + +CallInst *PRINT(const std::string &printStr); +CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs); +Value* STACKSAVE(); +void STACKRESTORE(Value* pSaved); + +Value* POPCNT(Value* a); + +Value* INT3() { return INTERRUPT(C((uint8_t)3)); } + + +Value *VEXTRACTI128(Value* a, Constant* imm8); +Value *VINSERTI128(Value* a, Value* b, Constant* imm8); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp new file mode 100644 index 00000000000..c5a180e27cb --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -0,0 +1,1431 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file fetch_jit.cpp +* +* @brief Implementation of the fetch jitter +* +* Notes: +* +******************************************************************************/ +#include "jit_api.h" +#include "fetch_jit.h" +#include "builder.h" +#include "state_llvm.h" +#include "common/containers.hpp" +#include "llvm/IR/DataLayout.h" +#include <sstream> +#include <tuple> + +//#define FETCH_DUMP_VERTEX 1 + +bool isComponentEnabled(ComponentEnable enableMask, uint8_t component); + +enum ConversionType +{ + CONVERT_NONE, + CONVERT_NORMALIZED, + CONVERT_USCALED, + CONVERT_SSCALED, +}; + +////////////////////////////////////////////////////////////////////////// +/// Interface to Jitting a fetch shader +////////////////////////////////////////////////////////////////////////// +struct FetchJit : public Builder +{ + FetchJit(JitManager* pJitMgr) : Builder(pJitMgr){}; + + Function* Create(const FETCH_COMPILE_STATE& fetchState); + Value* GetSimdValid32bitIndices(Value* vIndices, Value* pLastIndex); + Value* GetSimdValid16bitIndices(Value* vIndices, Value* pLastIndex); + Value* GetSimdValid8bitIndices(Value* vIndices, Value* pLastIndex); + + // package up Shuffle*bpcGatherd args into a tuple for convenience + typedef std::tuple<Value*&, Value*, const Instruction::CastOps, const ConversionType, + uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4], + const uint32_t (&)[4]> Shuffle8bpcArgs; + void Shuffle8bpcGatherd(Shuffle8bpcArgs &args); + + typedef std::tuple<Value*(&)[2], Value*, const Instruction::CastOps, const ConversionType, + uint32_t&, uint32_t&, const ComponentEnable, const ComponentControl(&)[4], Value*(&)[4]> Shuffle16bpcArgs; + void Shuffle16bpcGather(Shuffle16bpcArgs &args); + + void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]); + + Value* GenerateCompCtrlVector(const ComponentControl ctrl); + + void JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); + void JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut); +}; + +Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) +{ + static std::size_t fetchNum = 0; + + std::stringstream fnName("FetchShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << fetchNum++; + + 
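+    // the generated function gets a unique, human-readable name, e.g.
+    // "FetchShader0", "FetchShader1", ... which shows up in the dump files below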
Function* fetch = Function::Create(JM()->mFetchShaderTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule);
+    BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", fetch);
+
+    IRB()->SetInsertPoint(entry);
+
+    auto argitr = fetch->getArgumentList().begin();
+
+    // Fetch shader arguments
+    Value* fetchInfo = &*argitr; ++argitr;
+    fetchInfo->setName("fetchInfo");
+    Value* pVtxOut = &*argitr;
+    pVtxOut->setName("vtxOutput");
+    // This is just shorthand to tell LLVM to get a pointer to the base address of simdvertex:
+    // index 0 (just the pointer to the simdvertex structure),
+    // index 1 (which element of the simdvertex structure to offset to, in this case 0),
+    // so it doesn't matter that the indices are i32's.
+    // TODO: generate this GEP with a VECTOR structure type so this makes sense.
+    std::vector<Value*> vtxInputIndices(2, C(0));
+    // GEP
+    pVtxOut = GEP(pVtxOut, C(0));
+    pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+
+    // SWR_FETCH_CONTEXT::pStreams
+    Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
+    streams->setName("pStreams");
+
+    // SWR_FETCH_CONTEXT::pIndices
+    Value* indices = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pIndices});
+    indices->setName("pIndices");
+
+    // SWR_FETCH_CONTEXT::pLastIndex
+    Value* pLastIndex = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pLastIndex});
+    pLastIndex->setName("pLastIndex");
+
+
+    Value* vIndices;
+    switch(fetchState.indexType)
+    {
+        case R8_UINT:
+            indices = BITCAST(indices, Type::getInt8PtrTy(JM()->mContext, 0));
+            if(fetchState.bDisableIndexOOBCheck){
+                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt8Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
+                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+            }
+            else{
+                pLastIndex = BITCAST(pLastIndex, Type::getInt8PtrTy(JM()->mContext, 0));
+                vIndices = GetSimdValid8bitIndices(indices, pLastIndex);
+            }
+            break;
+        case R16_UINT:
+            indices = BITCAST(indices, Type::getInt16PtrTy(JM()->mContext, 0));
+            if(fetchState.bDisableIndexOOBCheck){
+                vIndices = LOAD(BITCAST(indices, PointerType::get(VectorType::get(mInt16Ty, mpJitMgr->mVWidth), 0)), {(uint32_t)0});
+                vIndices = Z_EXT(vIndices, mSimdInt32Ty);
+            }
+            else{
+                pLastIndex = BITCAST(pLastIndex, Type::getInt16PtrTy(JM()->mContext, 0));
+                vIndices = GetSimdValid16bitIndices(indices, pLastIndex);
+            }
+            break;
+        case R32_UINT:
+            (fetchState.bDisableIndexOOBCheck) ? vIndices = LOAD(BITCAST(indices, PointerType::get(mSimdInt32Ty,0)),{(uint32_t)0})
+                                               : vIndices = GetSimdValid32bitIndices(indices, pLastIndex);
+            break; // incoming type is already 32bit int
+        default: SWR_ASSERT(0, "Unsupported index type"); vIndices = nullptr; break;
+    }
+
+    // store out vertex IDs
+    STORE(vIndices, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_VertexID }));
+
+    // store out cut mask if enabled
+    if (fetchState.bEnableCutIndex)
+    {
+        Value* vCutIndex = VIMMED1(fetchState.cutIndex);
+        Value* cutMask = VMASK(ICMP_EQ(vIndices, vCutIndex));
+        STORE(cutMask, GEP(fetchInfo, { 0, SWR_FETCH_CONTEXT_CutMask }));
+    }
+
+    // Fetch attributes from memory and output to a simdvertex struct
+    // since VGATHER has a perf penalty on HSW vs BDW, allow client to choose which fetch method to use
+    (fetchState.bDisableVGATHER) ?
JitLoadVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut) + : JitGatherVertices(fetchState, fetchInfo, streams, vIndices, pVtxOut); + + RET_VOID(); + + JitManager::DumpToFile(fetch, "src"); + + verifyFunction(*fetch); + + FunctionPassManager setupPasses(JM()->mpCurrentModule); + + ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) + setupPasses.add(createBreakCriticalEdgesPass()); + setupPasses.add(createCFGSimplificationPass()); + setupPasses.add(createEarlyCSEPass()); + setupPasses.add(createPromoteMemoryToRegisterPass()); + + setupPasses.run(*fetch); + + JitManager::DumpToFile(fetch, "se"); + + FunctionPassManager optPasses(JM()->mpCurrentModule); + + ///@todo Haven't touched these either. Need to remove some of these and add others. + optPasses.add(createCFGSimplificationPass()); + optPasses.add(createEarlyCSEPass()); + optPasses.add(createInstructionCombiningPass()); + optPasses.add(createInstructionSimplifierPass()); + optPasses.add(createConstantPropagationPass()); + optPasses.add(createSCCPPass()); + optPasses.add(createAggressiveDCEPass()); + + optPasses.run(*fetch); + optPasses.run(*fetch); + + JitManager::DumpToFile(fetch, "opt"); + + return fetch; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Loads attributes from memory using LOADs, shuffling the +/// components into SOA form. +/// *Note* currently does not support component control, +/// component packing, or instancing +/// @param fetchState - info about attributes to be fetched from memory +/// @param streams - value pointer to the current vertex stream +/// @param vIndices - vector value of indices to load +/// @param pVtxOut - value pointer to output simdvertex struct +void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo, Value* streams, Value* vIndices, Value* pVtxOut) +{ + // Zack shuffles; a variant of the Charleston. + + SWRL::UncheckedFixedVector<Value*, 16> vectors; + + std::vector<Constant*> pMask(JM()->mVWidth); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + pMask[i] = (C(i < 4 ? i : 4)); + } + Constant* promoteMask = ConstantVector::get(pMask); + Constant* uwvec = UndefValue::get(VectorType::get(mFP32Ty, 4)); + + Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex}); + + for(uint32_t nelt = 0; nelt < fetchState.numAttribs; ++nelt) + { + Value* elements[4] = {0}; + const INPUT_ELEMENT_DESC& ied = fetchState.layout[nelt]; + const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format); + uint32_t numComponents = info.numComps; + uint32_t bpc = info.bpp / info.numComps; ///@todo Code below assumes all components are same size. Need to fix. + + vectors.clear(); + + // load SWR_VERTEX_BUFFER_STATE::pData + Value *stream = LOAD(streams, {ied.StreamIndex, 2}); + + // load SWR_VERTEX_BUFFER_STATE::pitch + Value *stride = LOAD(streams, {ied.StreamIndex, 1}); + stride = Z_EXT(stride, mInt64Ty); + + // load SWR_VERTEX_BUFFER_STATE::size + Value *size = LOAD(streams, {ied.StreamIndex, 3}); + size = Z_EXT(size, mInt64Ty); + + Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); + + // Load from the stream. 
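+        // Illustration (assuming vWidth == 8, i.e. one AVX 256b register of 8 floats):
+        // each lane computes offset = index * stride + AlignedByteOffset + startVertexOffset,
+        // loads a 4-component AOS attribute from the stream at that offset, and the
+        // shuffles below then transpose the 8 AOS vectors into SOA x/y/z/w rows.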
+ for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) + { + // Get index + Value* index = VEXTRACT(vIndices, C(lane)); + index = Z_EXT(index, mInt64Ty); + + Value* offset = MUL(index, stride); + offset = ADD(offset, C((int64_t)ied.AlignedByteOffset)); + offset = ADD(offset, startVertexOffset); + + if (!fetchState.bDisableIndexOOBCheck) { + // check for out of bound access, including partial OOB, and mask them to 0 + Value *endOffset = ADD(offset, C((int64_t)info.Bpp)); + Value *oob = ICMP_ULE(endOffset, size); + offset = SELECT(oob, offset, ConstantInt::get(mInt64Ty, 0)); + } + + Value* pointer = GEP(stream, offset); + // We use a full-lane, but don't actually care. + Value* vptr = 0; + + // get a pointer to a 4 component attrib in default address space + switch(bpc) + { + case 8: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt8Ty, 4), 0)); break; + case 16: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mInt16Ty, 4), 0)); break; + case 32: vptr = BITCAST(pointer, PointerType::get(VectorType::get(mFP32Ty, 4), 0)); break; + default: SWR_ASSERT(false, "Unsupported underlying bpp!"); + } + + // load 4 components of attribute + Value* vec = ALIGNED_LOAD(vptr, 1, false); + + // Convert To FP32 internally + switch(info.type[0]) + { + case SWR_TYPE_UNORM: + switch(bpc) + { + case 8: + vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 255.0)))); + break; + case 16: + vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 65535.0)))); + break; + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_SNORM: + switch(bpc) + { + case 8: + vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 128.0)))); + break; + case 16: + vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + vec = FMUL(vec, ConstantVector::get(std::vector<Constant*>(4, ConstantFP::get(mFP32Ty, 1.0 / 32768.0)))); + break; + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_UINT: + // Zero extend uint32_t types. + switch(bpc) + { + case 8: + case 16: + vec = Z_EXT(vec, VectorType::get(mInt32Ty, 4)); + vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); + break; + case 32: + break; // Pass through unchanged. + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_SINT: + // Sign extend SINT types. + switch(bpc) + { + case 8: + case 16: + vec = S_EXT(vec, VectorType::get(mInt32Ty, 4)); + vec = BITCAST(vec, VectorType::get(mFP32Ty, 4)); + break; + case 32: + break; // Pass through unchanged. + default: + SWR_ASSERT(false, "Unsupported underlying type!"); + break; + } + break; + case SWR_TYPE_FLOAT: + switch(bpc) + { + case 32: + break; // Pass through unchanged. 
+ default: + SWR_ASSERT(false, "Unsupported underlying type!"); + } + break; + case SWR_TYPE_USCALED: + vec = UI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + break; + case SWR_TYPE_SSCALED: + vec = SI_TO_FP(vec, VectorType::get(mFP32Ty, 4)); + break; + case SWR_TYPE_UNKNOWN: + case SWR_TYPE_UNUSED: + SWR_ASSERT(false, "Unsupported type %d!", info.type[0]); + } + + // promote mask: sse(0,1,2,3) | avx(0,1,2,3,4,4,4,4) + // uwvec: 4 x F32, undef value + Value* wvec = VSHUFFLE(vec, uwvec, promoteMask); + vectors.push_back(wvec); + } + + std::vector<Constant*> v01Mask(JM()->mVWidth); + std::vector<Constant*> v23Mask(JM()->mVWidth); + std::vector<Constant*> v02Mask(JM()->mVWidth); + std::vector<Constant*> v13Mask(JM()->mVWidth); + + // Concatenate the vectors together. + elements[0] = VUNDEF_F(); + elements[1] = VUNDEF_F(); + elements[2] = VUNDEF_F(); + elements[3] = VUNDEF_F(); + for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) + { + v01Mask[4 * b + 0] = C(0 + 4 * b); + v01Mask[4 * b + 1] = C(1 + 4 * b); + v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); + v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); + + v23Mask[4 * b + 0] = C(2 + 4 * b); + v23Mask[4 * b + 1] = C(3 + 4 * b); + v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); + v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + + v02Mask[4 * b + 0] = C(0 + 4 * b); + v02Mask[4 * b + 1] = C(2 + 4 * b); + v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); + v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); + + v13Mask[4 * b + 0] = C(1 + 4 * b); + v13Mask[4 * b + 1] = C(3 + 4 * b); + v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); + v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + + std::vector<Constant*> iMask(JM()->mVWidth); + for(uint32_t i = 0; i < JM()->mVWidth; ++i) + { + if(((4 * b) <= i) && (i < (4 * (b + 1)))) + { + iMask[i] = C(i % 4 + JM()->mVWidth); + } + else + { + iMask[i] = C(i); + } + } + Constant* insertMask = ConstantVector::get(iMask); + elements[0] = VSHUFFLE(elements[0], vectors[4 * b + 0], insertMask); + elements[1] = VSHUFFLE(elements[1], vectors[4 * b + 1], insertMask); + elements[2] = VSHUFFLE(elements[2], vectors[4 * b + 2], insertMask); + elements[3] = VSHUFFLE(elements[3], vectors[4 * b + 3], insertMask); + } + + Value* x0y0x1y1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v01Mask)); + Value* x2y2x3y3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v01Mask)); + Value* z0w0z1w1 = VSHUFFLE(elements[0], elements[1], ConstantVector::get(v23Mask)); + Value* z2w3z2w3 = VSHUFFLE(elements[2], elements[3], ConstantVector::get(v23Mask)); + elements[0] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v02Mask)); + elements[1] = VSHUFFLE(x0y0x1y1, x2y2x3y3, ConstantVector::get(v13Mask)); + elements[2] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v02Mask)); + elements[3] = VSHUFFLE(z0w0z1w1, z2w3z2w3, ConstantVector::get(v13Mask)); + + switch(numComponents + 1) + { + case 1: elements[0] = VIMMED1(0.0f); + case 2: elements[1] = VIMMED1(0.0f); + case 3: elements[2] = VIMMED1(0.0f); + case 4: elements[3] = VIMMED1(1.0f); + } + + for(uint32_t c = 0; c < 4; ++c) + { + Value* dest = GEP(pVtxOut, C(nelt * 4 + c), "destGEP"); + STORE(elements[c], dest); + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Loads attributes from memory using AVX2 GATHER(s) +/// @param fetchState - info about attributes to be fetched from memory +/// @param fetchInfo - first argument passed to fetch shader +/// @param streams - value pointer to the 
current vertex stream
+/// @param vIndices - vector value of indices to gather
+/// @param pVtxOut - value pointer to output simdvertex struct
+void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState, Value* fetchInfo,
+                                 Value* streams, Value* vIndices, Value* pVtxOut)
+{
+    uint32_t currentVertexElement = 0;
+    uint32_t outputElt = 0;
+    Value* vVertexElements[4];
+
+    Value* startVertex = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartVertex});
+    Value* startInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_StartInstance});
+    Value* curInstance = LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_CurInstance});
+    Value* vBaseVertex = VBROADCAST(LOAD(fetchInfo, {0, SWR_FETCH_CONTEXT_BaseVertex}));
+    curInstance->setName("curInstance");
+
+    for(uint32_t nInputElt = 0; nInputElt < fetchState.numAttribs; ++nInputElt)
+    {
+        const INPUT_ELEMENT_DESC& ied = fetchState.layout[nInputElt];
+        const SWR_FORMAT_INFO &info = GetFormatInfo((SWR_FORMAT)ied.Format);
+        uint32_t bpc = info.bpp / info.numComps;  ///@todo Code below assumes all components are same size. Need to fix.
+
+        Value *stream = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pData});
+
+        // VGATHER* takes an *i8 src pointer
+        Value* pStreamBase = BITCAST(stream, PointerType::get(mInt8Ty, 0));
+
+        Value *stride = LOAD(streams, {ied.StreamIndex, SWR_VERTEX_BUFFER_STATE_pitch});
+        Value *vStride = VBROADCAST(stride);
+
+        // max vertex index that is fully in bounds
+        Value *maxVertex = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_maxVertex)});
+        maxVertex = LOAD(maxVertex);
+
+        Value *vCurIndices;
+        Value *startOffset;
+        if(ied.InstanceEnable)
+        {
+            Value* stepRate = C(ied.InstanceDataStepRate);
+
+            // prevent a div by 0 for 0 step rate
+            Value* isNonZeroStep = ICMP_UGT(stepRate, C(0));
+            stepRate = SELECT(isNonZeroStep, stepRate, C(1));
+
+            // calc the current offset into instanced data buffer
+            Value* calcInstance = UDIV(curInstance, stepRate);
+
+            // if step rate is 0, every instance gets instance 0
+            calcInstance = SELECT(isNonZeroStep, calcInstance, C(0));
+
+            vCurIndices = VBROADCAST(calcInstance);
+
+            startOffset = startInstance;
+        }
+        else
+        {
+            // offset indices by baseVertex
+            vCurIndices = ADD(vIndices, vBaseVertex);
+
+            startOffset = startVertex;
+        }
+
+        // All of the OOB calculations are in vertices, not VB offsets, to prevent having to
+        // do 64bit address offset calculations.
+
+        // calculate byte offset to the start of the VB
+        Value* baseOffset = MUL(Z_EXT(startOffset, mInt64Ty), Z_EXT(stride, mInt64Ty));
+        pStreamBase = GEP(pStreamBase, baseOffset);
+
+        // if we have a start offset, subtract from max vertex. Used for OOB check
+        maxVertex = SUB(Z_EXT(maxVertex, mInt64Ty), Z_EXT(startOffset, mInt64Ty));
+        Value* neg = ICMP_SLT(maxVertex, C((int64_t)0));
+        // if we have a negative value, we're already OOB. clamp at 0.
+        maxVertex = SELECT(neg, C(0), TRUNC(maxVertex, mInt32Ty));
+
+        // Load the in bounds size of a partially valid vertex
+        Value *partialInboundsSize = GEP(streams, {C(ied.StreamIndex), C(SWR_VERTEX_BUFFER_STATE_partialInboundsSize)});
+        partialInboundsSize = LOAD(partialInboundsSize);
+        Value* vPartialVertexSize = VBROADCAST(partialInboundsSize);
+        Value* vBpp = VBROADCAST(C(info.Bpp));
+        Value* vAlignmentOffsets = VBROADCAST(C(ied.AlignedByteOffset));
+
+        // is the element <= the partially valid size
+        Value* vElementInBoundsMask = ICMP_SLE(vBpp, SUB(vPartialVertexSize, vAlignmentOffsets));
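+
+        // e.g. (illustration, assuming vWidth == 8 and maxVertex == 4):
+        //   vCurIndices        0  1  2  3  4  5  6  7
+        //   fully in bounds   -1 -1 -1 -1  0  0  0  0   (vCurIndices <  maxVertex)
+        //   partially OOB      0  0  0  0 -1  0  0  0   (vCurIndices == maxVertex)
+        // lanes beyond maxVertex never gather; the lane equal to maxVertex gathers
+        // only if the element fits within partialInboundsSize (vElementInBoundsMask)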
+
+        // are vertices partially OOB?
+        Value* vMaxVertex = VBROADCAST(maxVertex);
+        Value* vPartialOOBMask = ICMP_EQ(vCurIndices, vMaxVertex);
+
+        // are vertices fully in bounds?
+        Value* vGatherMask = ICMP_ULT(vCurIndices, vMaxVertex);
+
+        // blend in any partially OOB indices that have valid elements
+        vGatherMask = SELECT(vPartialOOBMask, vElementInBoundsMask, vGatherMask);
+        vGatherMask = VMASK(vGatherMask);
+
+        // calculate the actual offsets into the VB
+        Value* vOffsets = MUL(vCurIndices, vStride);
+        vOffsets = ADD(vOffsets, vAlignmentOffsets);
+
+        // Packing and component control
+        ComponentEnable compMask = (ComponentEnable)ied.ComponentPacking;
+        const ComponentControl compCtrl[4] { (ComponentControl)ied.ComponentControl0, (ComponentControl)ied.ComponentControl1,
+                                             (ComponentControl)ied.ComponentControl2, (ComponentControl)ied.ComponentControl3};
+
+        if(info.type[0] == SWR_TYPE_FLOAT)
+        {
+            ///@todo: support 64 bit vb accesses
+            Value* gatherSrc = VIMMED1(0.0f);
+
+            // Gather components from memory to store in a simdvertex structure
+            switch(bpc)
+            {
+                case 16:
+                {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // if we have at least one component out of x or y to fetch
+                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+
+                        vGatherResult[0] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+
+                        vGatherResult[1] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if(compMask){
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, Instruction::CastOps::FPExt, CONVERT_NONE,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 32:
+                {
+                    for(uint32_t i = 0; i < 4; i++)
+                    {
+                        if(!isComponentEnabled(compMask, i)){
+                            // offset base to the next component in the vertex to gather
+                            pStreamBase = GEP(pStreamBase, C((char)4));
+                            continue;
+                        }
+
+                        // if we need to gather the component
+                        if(compCtrl[i] == StoreSrc){
+                            // save mask as it is zero'd out after each gather
+                            Value *vMask = vGatherMask;
+
+                            // Gather a SIMD of vertices
+                            vVertexElements[currentVertexElement++] = GATHERPS(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        }
+                        else{
+                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if(currentVertexElement > 3){
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                    }
+                }
+                break;
+                default:
+                    SWR_ASSERT(0, "Tried to fetch invalid FP format");
+                    break;
+            }
+        }
+        else
+        {
+            Instruction::CastOps extendCastType = Instruction::CastOps::CastOpsEnd;
+            ConversionType conversionType = CONVERT_NONE;
+
+            switch(info.type[0])
+            {
+                case SWR_TYPE_UNORM:
+                    conversionType = CONVERT_NORMALIZED;
+                case SWR_TYPE_UINT:
+                    extendCastType = Instruction::CastOps::ZExt;
+                    break;
+                case SWR_TYPE_SNORM:
+                    conversionType = CONVERT_NORMALIZED;
+                case SWR_TYPE_SINT:
+                    extendCastType = Instruction::CastOps::SExt;
+                    break;
+                case SWR_TYPE_USCALED:
+                    conversionType = CONVERT_USCALED;
+                    extendCastType = Instruction::CastOps::UIToFP;
+                    break;
+                case SWR_TYPE_SSCALED:
+                    conversionType = CONVERT_SSCALED;
+                    extendCastType = Instruction::CastOps::SIToFP;
+                    break;
+                default:
+                    break;
+            }
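+
+            // e.g. the resulting (cast, conversion) pairs (note the intentional
+            // fallthroughs above):
+            //   UNORM   -> ZExt   + CONVERT_NORMALIZED (e.g. scale by 1/255 for 8-bit)
+            //   UINT    -> ZExt   + CONVERT_NONE
+            //   SNORM   -> SExt   + CONVERT_NORMALIZED
+            //   SINT    -> SExt   + CONVERT_NONE
+            //   USCALED -> UIToFP, SSCALED -> SIToFP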
+
+            // value substituted when component of gather is masked
+            Value* gatherSrc = VIMMED1(0);
+
+            // Gather components from memory to store in a simdvertex structure
+            switch (bpc)
+            {
+                case 8:
+                {
+                    // if we have at least one component to fetch
+                    if(compMask){
+                        Value* vGatherResult = GATHERDD(gatherSrc, pStreamBase, vOffsets, vGatherMask, C((char)1));
+                        // e.g. result of an 8x32bit integer gather for 8bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        xyzw xyzw xyzw xyzw xyzw xyzw xyzw xyzw
+
+                        Shuffle8bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements, info.swizzle);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle8bpcGatherd(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 16:
+                {
+                    Value* vGatherResult[2];
+                    Value *vMask;
+
+                    // if we have at least one component out of x or y to fetch
+                    if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+                        // save mask as it is zero'd out after each gather
+                        vMask = vGatherMask;
+
+                        vGatherResult[0] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of first 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        xyxy xyxy xyxy xyxy xyxy xyxy xyxy xyxy
+                        //
+                    }
+
+                    // if we have at least one component out of z or w to fetch
+                    if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+                        // offset base to the next components(zw) in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                        vMask = vGatherMask;
+
+                        vGatherResult[1] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+                        // e.g. result of second 8x32bit integer gather for 16bit components
+                        // 256i -    0    1    2    3    4    5    6    7
+                        //        zwzw zwzw zwzw zwzw zwzw zwzw zwzw zwzw
+                        //
+                    }
+
+                    // if we have at least one component to shuffle into place
+                    if(compMask){
+                        Shuffle16bpcArgs args = std::forward_as_tuple(vGatherResult, pVtxOut, extendCastType, conversionType,
+                            currentVertexElement, outputElt, compMask, compCtrl, vVertexElements);
+                        // Shuffle gathered components into place in simdvertex struct
+                        Shuffle16bpcGather(args);  // outputs to vVertexElements ref
+                    }
+                }
+                break;
+                case 32:
+                {
+                    SWR_ASSERT(conversionType == CONVERT_NONE);
+
+                    // Gather components into place in simdvertex struct
+                    for(uint32_t i = 0; i < 4; i++)
+                    {
+                        if(!isComponentEnabled(compMask, i)){
+                            // offset base to the next component in the vertex to gather
+                            pStreamBase = GEP(pStreamBase, C((char)4));
+                            continue;
+                        }
+
+                        // if we need to gather the component
+                        if(compCtrl[i] == StoreSrc){
+                            // save mask as it is zero'd out after each gather
+                            Value *vMask = vGatherMask;
+
+                            vVertexElements[currentVertexElement++] = GATHERDD(gatherSrc, pStreamBase, vOffsets, vMask, C((char)1));
+
+                            // e.g. result of a single 8x32bit integer gather for 32bit components
+                            // 256i -    0    1    2    3    4    5    6    7
+                            //        xxxx xxxx xxxx xxxx xxxx xxxx xxxx xxxx
+                        }
+                        else{
+                            vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+                        }
+
+                        if(currentVertexElement > 3){
+                            StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                            // reset to the next vVertexElement to output
+                            currentVertexElement = 0;
+                        }
+
+                        // offset base to the next component in the vertex to gather
+                        pStreamBase = GEP(pStreamBase, C((char)4));
+                    }
+                }
+                break;
+            }
+        }
+    }
+
+    // if we have a partially filled vVertexElement struct, output it
+    if(currentVertexElement > 0){
+        StoreVertexElements(pVtxOut, outputElt++, currentVertexElement, vVertexElements);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// *Note* have to do 8bit index checking in scalar until we have AVX-512
+/// support
+/// @param pIndices - pointer to 8 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // can fit 4 8 bit integers per vWidth lane
+    Value* vIndices = VUNDEF_I();
+
+    // store 0 index on stack to be used to conditionally load from if index address is OOB
+    Value* pZeroIndex = ALLOCA(mInt8Ty);
+    STORE(C((uint8_t)0), pZeroIndex);
+
+    // Load a SIMD of index pointers
+    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    {
+        // Calculate the address of the requested index
+        Value *pIndex = GEP(pIndices, C(lane));
+
+        // check if the address is less than the max index
+        Value* mask = ICMP_ULT(pIndex, pLastIndex);
+
+        // if valid, load the index. if not, load 0 from the stack
+        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
+        Value *index = LOAD(pValid, "valid index");
+
+        // zero extend index to 32 bits and insert into the correct simd lane
+        index = Z_EXT(index, mInt32Ty);
+        vIndices = VINSERT(vIndices, index, lane);
+    }
+    return vIndices;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// *Note* have to do 16bit index checking in scalar until we have AVX-512
+/// support
+/// @param pIndices - pointer to 16 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    // can fit 2 16 bit integers per vWidth lane
+    Value* vIndices = VUNDEF_I();
+
+    // store 0 index on stack to be used to conditionally load from if index address is OOB
+    Value* pZeroIndex = ALLOCA(mInt16Ty);
+    STORE(C((uint16_t)0), pZeroIndex);
+
+    // Load a SIMD of index pointers
+    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    {
+        // Calculate the address of the requested index
+        Value *pIndex = GEP(pIndices, C(lane));
+
+        // check if the address is less than the max index
+        Value* mask = ICMP_ULT(pIndex, pLastIndex);
+
+        // if valid, load the index. if not, load 0 from the stack
+        Value* pValid = SELECT(mask, pIndex, pZeroIndex);
+        Value *index = LOAD(pValid, "valid index");
+
+        // zero extend index to 32 bits and insert into the correct simd lane
+        index = Z_EXT(index, mInt32Ty);
+        vIndices = VINSERT(vIndices, index, lane);
+    }
+    return vIndices;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Loads a simd of valid indices. OOB indices are set to 0
+/// @param pIndices - pointer to 32 bit indices
+/// @param pLastIndex - pointer to last valid index
+Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
+{
+    DataLayout dL(JM()->mpCurrentModule);
+    unsigned int ptrSize = dL.getPointerSize() * 8;  // ptr size in bits
+    Value* iLastIndex = PTR_TO_INT(pLastIndex, Type::getIntNTy(JM()->mContext, ptrSize));
+    Value* iIndices = PTR_TO_INT(pIndices, Type::getIntNTy(JM()->mContext, ptrSize));
+
+    // get the number of indices left in the buffer (endPtr - curPtr) / sizeof(index)
+    Value* numIndicesLeft = SUB(iLastIndex,iIndices);
+    numIndicesLeft = TRUNC(numIndicesLeft, mInt32Ty);
+    numIndicesLeft = SDIV(numIndicesLeft, C(4));
+
+    // create a vector of index counts from the base index ptr passed into the fetch
+    const std::vector<Constant*> vecIndices {C(0), C(1), C(2), C(3), C(4), C(5), C(6), C(7)};
+    Constant* vIndexOffsets = ConstantVector::get(vecIndices);
+
+    // compare index count to the max valid index
+    // e.g. vMaxIndex      4 4 4 4 4 4 4 4 : 4 indices left to load
+    //      vIndexOffsets  0 1 2 3 4 5 6 7
+    //      ------------------------------
+    //      vIndexMask    -1-1-1-1 0 0 0 0 : offsets < max pass
+    //      vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
+    Value* vMaxIndex = VBROADCAST(numIndicesLeft);
+    Value* vIndexMask = VPCMPGTD(vMaxIndex,vIndexOffsets);
+
+    // VMASKLOAD takes an *i8 src pointer
+    pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
+
+    // Load the indices; OOB loads 0
+    return MASKLOADD(pIndices,vIndexMask);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Takes a SIMD of gathered 8bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
+/// the proper SIMD rows to be output to the simdvertex structure
+/// @param args: (tuple of args, listed below)
+/// @param vGatherResult - 8 gathered 8bpc vertices
+/// @param pVtxOut - base pointer to output simdvertex struct
+/// @param extendType - sign extend or zero extend
+/// @param conversionType - normalization / scaling conversion to perform
+/// @param currentVertexElement - reference to the current vVertexElement
+/// @param outputElt - reference to the current offset from simdvertex we're outputting to
+/// @param compMask - component packing mask
+/// @param compCtrl - component control val
+/// @param vVertexElements[4] - vertex components to output
+/// @param swizzle[4] - component swizzle location
+void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
+{
+    // Unpack tuple args
+    Value*& vGatherResult = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl (&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+    const uint32_t (&swizzle)[4] = std::get<9>(args);
+
+    // cast types
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4);  // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
+        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2);  // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4);  // vwidth is units of 32 bits
+
+        // shuffle mask, including any swizzling
+        const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
+        const char z = (char)swizzle[2]; const char w = (char)swizzle[3];
+        Value* vConstMask = C<char>({char(x), char(x+4), char(x+8), char(x+12),
+                                     char(y), char(y+4), char(y+8), char(y+12),
+                                     char(z), char(z+4), char(z+8), char(z+12),
+                                     char(w), char(w+4), char(w+8), char(w+12),
+                                     char(x), char(x+4), char(x+8), char(x+12),
+                                     char(y), char(y+4), char(y+8), char(y+12),
+                                     char(z), char(z+4), char(z+8), char(z+12),
+                                     char(w), char(w+4), char(w+8), char(w+12)});
+
+        Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
+        // after pshufb: group components together in each 128bit lane
+        // 256i -    0    1    2    3    4    5    6    7
+        //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
+
+        Value* vi128XY = nullptr;
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 4, 0, 0, 1, 5, 0, 0})), v128Ty);
+            // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
+            // 256i -    0    1    2    3    4    5    6    7
+            //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
+        }
+
+        // do the same for zw components
+        Value* vi128ZW = nullptr;
+        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({2, 6, 0, 0, 3, 7, 0, 0})), v128Ty);
+        }
+
+        // init denormalize variables if needed
+        Instruction::CastOps fpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 127.0));
+            break;
+        case CONVERT_SSCALED:
+            fpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_ASSERT(0, "Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+                // sign extend
+                vVertexElements[currentVertexElement] = PMOVSXBD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v16x8Ty));
+
+                // denormalize if needed
+                if(conversionType != CONVERT_NONE){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // init denormalize variables if needed
+        Instruction::CastOps fpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 255.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_SSCALED:
+            SWR_ASSERT(0, "Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
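+
+        // e.g. UNORM8 after UIToFP and the 1/255 factor: 255 -> 1.0f, 128 -> ~0.502f;
+        // USCALED leaves the integer value as a float (factor 1.0)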
+
+        // shuffle enabled components into lower byte of each 32bit lane, 0 extending to 32 bits
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // pshufb masks for each component
+                Value* vConstMask;
+                switch(swizzle[i]){
+                case 0:
+                    // x shuffle mask
+                    vConstMask = C<char>({0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1,
+                                          0, -1, -1, -1, 4, -1, -1, -1, 8, -1, -1, -1, 12, -1, -1, -1});
+                    break;
+                case 1:
+                    // y shuffle mask
+                    vConstMask = C<char>({1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1,
+                                          1, -1, -1, -1, 5, -1, -1, -1, 9, -1, -1, -1, 13, -1, -1, -1});
+                    break;
+                case 2:
+                    // z shuffle mask
+                    vConstMask = C<char>({2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1,
+                                          2, -1, -1, -1, 6, -1, -1, -1, 10, -1, -1, -1, 14, -1, -1, -1});
+                    break;
+                case 3:
+                    // w shuffle mask
+                    vConstMask = C<char>({3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1,
+                                          3, -1, -1, -1, 7, -1, -1, -1, 11, -1, -1, -1, 15, -1, -1, -1});
+                    break;
+                default:
+                    vConstMask = nullptr;
+                    break;
+                }
+
+                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult, v32x8Ty), vConstMask), vGatherTy);
+                // after pshufb for x channel
+                // 256i -    0    1    2    3    4    5    6    7
+                //        x000 x000 x000 x000 x000 x000 x000 x000
+
+                // denormalize if needed
+                if (conversionType != CONVERT_NONE){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    else
+    {
+        SWR_ASSERT(0, "Unsupported conversion type");
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Takes a SIMD of gathered 16bpc verts, zero or sign extends,
+/// denormalizes if needed, converts to F32 if needed, and positions in
+/// the proper SIMD rows to be output to the simdvertex structure
+/// @param args: (tuple of args, listed below)
+/// @param vGatherResult[2] - array of gathered 16bpc vertices, 4 per index
+/// @param pVtxOut - base pointer to output simdvertex struct
+/// @param extendType - sign extend or zero extend
+/// @param conversionType - normalization / scaling conversion to perform
+/// @param currentVertexElement - reference to the current vVertexElement
+/// @param outputElt - reference to the current offset from simdvertex we're outputting to
+/// @param compMask - component packing mask
+/// @param compCtrl - component control val
+/// @param vVertexElements[4] - vertex components to output
+void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
+{
+    // Unpack tuple args
+    Value* (&vGatherResult)[2] = std::get<0>(args);
+    Value* pVtxOut = std::get<1>(args);
+    const Instruction::CastOps extendType = std::get<2>(args);
+    const ConversionType conversionType = std::get<3>(args);
+    uint32_t &currentVertexElement = std::get<4>(args);
+    uint32_t &outputElt = std::get<5>(args);
+    const ComponentEnable compMask = std::get<6>(args);
+    const ComponentControl(&compCtrl)[4] = std::get<7>(args);
+    Value* (&vVertexElements)[4] = std::get<8>(args);
+
+    // cast types
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4);  // vwidth is units of 32 bits
+
+    // have to do extra work for sign extending
+    if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
+        (extendType == Instruction::CastOps::FPExt))
+    {
+        // is this a half-precision (FP16) float?
+        bool bFP = (extendType == Instruction::CastOps::FPExt);
+
+        Type* v8x16Ty = VectorType::get(mInt16Ty, 8);  // 8x16bit in a 128bit lane
+        Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4);  // vwidth is units of 32 bits
+
+        // shuffle mask
+        Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
+                                     0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15});
+        Value* vi128XY = nullptr;
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1)){
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[0], v32x8Ty), vConstMask), vGatherTy);
+            // after pshufb: group components together in each 128bit lane
+            // 256i -    0    1    2    3    4    5    6    7
+            //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
+
+            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+            // after PERMD: move and pack xy components into each 128bit lane
+            // 256i -    0    1    2    3    4    5    6    7
+            //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
+        }
+
+        // do the same for zw components
+        Value* vi128ZW = nullptr;
+        if(isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)){
+            Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
+            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({0, 1, 4, 5, 2, 3, 6, 7})), v128bitTy);
+        }
+
+        // init denormalize variables if needed
+        Instruction::CastOps IntToFpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 32767.0));
+            break;
+        case CONVERT_SSCALED:
+            IntToFpCast = Instruction::CastOps::SIToFP;
+            conversionFactor = VIMMED1((float)(1.0));
+            break;
+        case CONVERT_USCALED:
+            SWR_ASSERT(0, "Type should not be sign extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
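+
+        // e.g. SNORM16 after SIToFP and the 1/32767 factor: 32767 -> 1.0f,
+        // -32767 -> -1.0f; SSCALED just converts the integer value (factor 1.0)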
+
+        // sign extend all enabled components. If we have a full vVertexElements, output to current simdvertex
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // if x or z, extract 128bits from lane 0, else for y or w, extract from lane 1
+                uint32_t lane = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use vi128XY permute result, else use vi128ZW
+                Value* selectedPermute = (i < 2) ? vi128XY : vi128ZW;
+
+                if(bFP) {
+                    // extract 128 bit lanes and convert each half-precision component to 32-bit float
+                    vVertexElements[currentVertexElement] = CVTPH2PS(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+                }
+                else {
+                    // extract 128 bit lanes to sign extend each component
+                    vVertexElements[currentVertexElement] = PMOVSXWD(BITCAST(VEXTRACT(selectedPermute, C(lane)), v8x16Ty));
+
+                    // denormalize if needed
+                    if(conversionType != CONVERT_NONE){
+                        vVertexElements[currentVertexElement] = FMUL(CAST(IntToFpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                    }
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+
+    }
+    // else zero extend
+    else if ((extendType == Instruction::CastOps::ZExt) || (extendType == Instruction::CastOps::UIToFP))
+    {
+        // pshufb masks for each component
+        Value* vConstMask[2];
+        if(isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 2)){
+            // x/z shuffle mask
+            vConstMask[0] = C<char>({0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1,
+                                     0, 1, -1, -1, 4, 5, -1, -1, 8, 9, -1, -1, 12, 13, -1, -1, });
+        }
+
+        if(isComponentEnabled(compMask, 1) || isComponentEnabled(compMask, 3)){
+            // y/w shuffle mask
+            vConstMask[1] = C<char>({2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1,
+                                     2, 3, -1, -1, 6, 7, -1, -1, 10, 11, -1, -1, 14, 15, -1, -1});
+        }
+
+        // init denormalize variables if needed
+        Instruction::CastOps fpCast;
+        Value* conversionFactor;
+
+        switch (conversionType)
+        {
+        case CONVERT_NORMALIZED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0 / 65535.0));
+            break;
+        case CONVERT_USCALED:
+            fpCast = Instruction::CastOps::UIToFP;
+            conversionFactor = VIMMED1((float)(1.0f));
+            break;
+        case CONVERT_SSCALED:
+            SWR_ASSERT(0, "Type should not be zero extended!");
+            conversionFactor = nullptr;
+            break;
+        default:
+            SWR_ASSERT(conversionType == CONVERT_NONE);
+            conversionFactor = nullptr;
+            break;
+        }
+
+        // shuffle enabled components into lower word of each 32bit lane, 0 extending to 32 bits
+        for(uint32_t i = 0; i < 4; i++){
+            if(!isComponentEnabled(compMask, i)){
+                continue;
+            }
+
+            if(compCtrl[i] == ComponentControl::StoreSrc){
+                // select correct constMask for x/z or y/w pshufb
+                uint32_t selectedMask = ((i == 0) || (i == 2)) ? 0 : 1;
+                // if x or y, use the first gather result, else use the second (zw) gather result
+                uint32_t selectedGather = (i < 2) ? 0 : 1;
+
+                vVertexElements[currentVertexElement] = BITCAST(PSHUFB(BITCAST(vGatherResult[selectedGather], v32x8Ty), vConstMask[selectedMask]), vGatherTy);
+                // after pshufb mask for x channel; z uses the same shuffle from the second gather
+                // 256i -    0    1    2    3    4    5    6    7
+                //        xx00 xx00 xx00 xx00 xx00 xx00 xx00 xx00
+
+                // denormalize if needed
+                if(conversionType != CONVERT_NONE){
+                    vVertexElements[currentVertexElement] = FMUL(CAST(fpCast, vVertexElements[currentVertexElement], mSimdFP32Ty), conversionFactor);
+                }
+                currentVertexElement++;
+            }
+            else{
+                vVertexElements[currentVertexElement++] = GenerateCompCtrlVector(compCtrl[i]);
+            }
+
+            if(currentVertexElement > 3){
+                StoreVertexElements(pVtxOut, outputElt++, 4, vVertexElements);
+                // reset to the next vVertexElement to output
+                currentVertexElement = 0;
+            }
+        }
+    }
+    else
+    {
+        SWR_ASSERT(0, "Unsupported conversion type");
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Output a simdvertex worth of elements to the current outputElt
+/// @param pVtxOut - base address of VIN output struct
+/// @param outputElt - simdvertex offset in VIN to write to
+/// @param numEltsToStore - number of simdvertex rows to write out
+/// @param vVertexElements - LLVM Value*[] simdvertex to write out
+void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+{
+    for(uint32_t c = 0; c < numEltsToStore; ++c)
+    {
+        // STORE expects FP32 x vWidth type, just bitcast if needed
+        if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
+#if FETCH_DUMP_VERTEX
+            PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
+#endif
+            vVertexElements[c] = BITCAST(vVertexElements[c], mSimdFP32Ty);
+        }
+#if FETCH_DUMP_VERTEX
+        else
+        {
+            PRINT("vVertexElements[%d]: %f\n", {C(c), vVertexElements[c]});
+        }
+#endif
+        // outputElt * 4 = offsetting by the size of a simdvertex
+        // + c offsets to a 32bit x vWidth row within the current vertex
+        Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
+        STORE(vVertexElements[c], dest);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Generates a constant vector of values based on the
+/// ComponentControl value
+/// @param ctrl - ComponentControl value
+Value* FetchJit::GenerateCompCtrlVector(const ComponentControl ctrl)
+{
+    switch(ctrl)
+    {
+        case NoStore: return VUNDEF_I();
+        case Store0: return VIMMED1(0);
+        case Store1Fp: return VIMMED1(1.0f);
+        case Store1Int: return VIMMED1(1);
+        case StoreSrc:
+        default: SWR_ASSERT(0, "Invalid component control"); return VUNDEF_I();
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Returns the enable mask for the specified component.
+/// @param enableMask - enable bits
+/// @param component - component to check if enabled.
+bool isComponentEnabled(ComponentEnable enableMask, uint8_t component) +{ + switch (component) + { + // X + case 0: return (enableMask & ComponentEnable::X); + // Y + case 1: return (enableMask & ComponentEnable::Y); + // Z + case 2: return (enableMask & ComponentEnable::Z); + // W + case 3: return (enableMask & ComponentEnable::W); + + default: return false; + } +} + + +////////////////////////////////////////////////////////////////////////// +/// @brief JITs from fetch shader IR +/// @param hJitMgr - JitManager handle +/// @param func - LLVM function IR +/// @return PFN_FETCH_FUNC - pointer to fetch code +PFN_FETCH_FUNC JitFetchFunc(HANDLE hJitMgr, const HANDLE hFunc) +{ + const llvm::Function* func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_FETCH_FUNC pfnFetch; + + pfnFetch = (PFN_FETCH_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + pJitMgr->mIsModuleFinalized = true; + +#if defined(KNOB_SWRC_TRACING) + char fName[1024]; + const char *funcName = func->getName().data(); + sprintf(fName, "%s.bin", funcName); + FILE *fd = fopen(fName, "wb"); + fwrite((void *)pfnFetch, 1, 2048, fd); + fclose(fd); +#endif + + return pfnFetch; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles fetch shader +/// @param hJitMgr - JitManager handle +/// @param state - fetch state to build function from +extern "C" PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitMgr, const FETCH_COMPILE_STATE& state) +{ + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + + pJitMgr->SetupNewModule(); + + FetchJit theJit(pJitMgr); + HANDLE hFunc = theJit.Create(state); + + return JitFetchFunc(hJitMgr, hFunc); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h new file mode 100644 index 00000000000..ea3625d2fde --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.h @@ -0,0 +1,128 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. 
+*
+* @file fetch_jit.h
+*
+* @brief Definition of the fetch jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/state.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// INPUT_ELEMENT_DESC
+//////////////////////////////////////////////////////////////////////////
+struct INPUT_ELEMENT_DESC
+{
+    union
+    {
+        struct
+        {
+            uint32_t AlignedByteOffset : 12;
+            uint32_t Format : 10;
+            uint32_t StreamIndex : 6;
+            uint32_t InstanceEnable : 1;
+            uint32_t ComponentControl0 : 3;
+            uint32_t ComponentControl1 : 3;
+            uint32_t ComponentControl2 : 3;
+            uint32_t ComponentControl3 : 3;
+            uint32_t ComponentPacking : 4;
+            uint32_t _reserved : 19;
+        };
+        uint64_t bits;
+    };
+    uint32_t InstanceDataStepRate;
+};
+
+// used to set ComponentPacking
+enum ComponentEnable
+{
+    NONE = 0x0,
+    X = 0x1,
+    Y = 0x2,
+    XY = 0x3,
+    Z = 0x4,
+    XZ = 0x5,
+    YZ = 0x6,
+    XYZ = 0x7,
+    W = 0x8,
+    XW = 0x9,
+    YW = 0xA,
+    XYW = 0xB,
+    ZW = 0xC,
+    XZW = 0xD,
+    YZW = 0xE,
+    XYZW = 0xF,
+};
+
+enum ComponentControl
+{
+    NoStore = 0,
+    StoreSrc = 1,
+    Store0 = 2,
+    Store1Fp = 3,
+    Store1Int = 4,
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// State required for fetch shader jit compile.
+//////////////////////////////////////////////////////////////////////////
+struct FETCH_COMPILE_STATE
+{
+    uint32_t numAttribs;
+    INPUT_ELEMENT_DESC layout[KNOB_NUM_ATTRIBUTES];
+    SWR_FORMAT indexType;
+    uint32_t cutIndex{ 0xffffffff };
+
+    // Options that affect the JIT'd code
+    bool bDisableVGATHER;           // if enabled, FetchJit will generate loads/shuffles instead of VGATHERs
+    bool bDisableIndexOOBCheck;     // if enabled, FetchJit will exclude index OOB check
+    bool bEnableCutIndex{ false };  // compares indices with the cut index and returns a cut mask
+
+    FETCH_COMPILE_STATE(bool disableVGATHER = false, bool disableIndexOOBCheck = false) :
+        bDisableVGATHER(disableVGATHER), bDisableIndexOOBCheck(disableIndexOOBCheck){};
+
+    bool operator==(const FETCH_COMPILE_STATE &other) const
+    {
+        if (numAttribs != other.numAttribs) return false;
+        if (indexType != other.indexType) return false;
+        if (bDisableVGATHER != other.bDisableVGATHER) return false;
+        if (bDisableIndexOOBCheck != other.bDisableIndexOOBCheck) return false;
+        if (bEnableCutIndex != other.bEnableCutIndex) return false;
+        if (cutIndex != other.cutIndex) return false;
+
+        for(uint32_t i = 0; i < numAttribs; ++i)
+        {
+            if((layout[i].bits != other.layout[i].bits) ||
+               ((layout[i].InstanceEnable == 1) &&
+                (layout[i].InstanceDataStepRate != other.layout[i].InstanceDataStepRate))){
+                return false;
+            }
+        }
+
+        return true;
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
new file mode 100644
index 00000000000..39d63836673
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/jitter/jit_api.h
@@ -0,0 +1,108 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file jit_api.h +* +* @brief Platform independent JIT interface +* +* Notes: +* +******************************************************************************/ +#pragma once +#include "common/os.h" + +#include "fetch_jit.h" +#include "streamout_jit.h" +#include "blend_jit.h" + +#if defined(_WIN32) +#define EXCEPTION_PRINT_STACK(ret) ret +#endif // _WIN32 + +#if defined(_WIN32) +#define JITCALL __stdcall +#else +#define JITCALL +#endif + +extern "C" +{ + +struct ShaderInfo; + +////////////////////////////////////////////////////////////////////////// +/// Jit Compile Info Input +////////////////////////////////////////////////////////////////////////// +struct JIT_COMPILE_INPUT +{ + SWR_SHADER_TYPE type; + + const void* pIR; ///< Pointer to LLVM IR text. + + bool enableJitSampler; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Create JIT context. +HANDLE JITCALL JitCreateContext(uint32_t targetSimdWidth, const char* arch); + +////////////////////////////////////////////////////////////////////////// +/// @brief Destroy JIT context. +void JITCALL JitDestroyContext(HANDLE hJitContext); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compile shader. +/// @param hJitContext - Jit Context +/// @param input - Input containing LLVM IR and other information +/// @param output - Output containing information about JIT shader +ShaderInfo* JITCALL JitCompileShader( + HANDLE hJitContext, + const JIT_COMPILE_INPUT& input); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT destroy shader. +/// @param hJitContext - Jit Context +/// @param pShaderInfo - pointer to shader object. 
+void JITCALL JitDestroyShader( + HANDLE hJitContext, + ShaderInfo*& pShaderInfo); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles fetch shader +/// @param hJitContext - Jit Context +/// @param state - Fetch state to build function from +PFN_FETCH_FUNC JITCALL JitCompileFetch(HANDLE hJitContext, const FETCH_COMPILE_STATE& state); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles streamout shader +/// @param hJitContext - Jit Context +/// @param state - SO state to build function from +PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitContext, const STREAMOUT_COMPILE_STATE& state); + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles blend shader +/// @param hJitContext - Jit Context +/// @param state - blend state to build function from +PFN_BLEND_JIT_FUNC JITCALL JitCompileBlend(HANDLE hJitContext, const BLEND_COMPILE_STATE& state); + + +}; // extern "C" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py new file mode 100644 index 00000000000..1814b7c8d5f --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py @@ -0,0 +1,401 @@ +# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +#!deps/python32/python.exe + +import os, sys, re +import argparse +import json as JSON +import operator + +header = r"""/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. 
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file %s +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +""" + +""" +""" +def gen_file_header(filename): + global header + headerStr = header % filename + return headerStr.splitlines() + + +inst_aliases = { + 'SHUFFLE_VECTOR': 'VSHUFFLE', + 'INSERT_ELEMENT': 'VINSERT', + 'EXTRACT_ELEMENT': 'VEXTRACT', + 'MEM_SET': 'MEMSET', + 'MEM_CPY': 'MEMCPY', + 'MEM_MOVE': 'MEMMOVE', + 'L_SHR': 'LSHR', + 'A_SHR': 'ASHR', + 'BIT_CAST': 'BITCAST', + 'U_DIV': 'UDIV', + 'S_DIV': 'SDIV', + 'U_REM': 'UREM', + 'S_REM': 'SREM', + 'BIN_OP': 'BINOP', +} + +intrinsics = [ + ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]], + ["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]], + ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]], + ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]], + ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]], + ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]], + ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]], + ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]], + ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]], + ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]], + ["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]], + ["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]], + ["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]], + ["VMASKLOADD", "x86_avx2_maskload_d_256", ["src", "mask"]], + ["VMASKMOVPS", "x86_avx_maskload_ps_256", ["src", "mask"]], + ["VPSHUFB", "x86_avx2_pshuf_b", ["a", "b"]], + ["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components + ["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components + ["VPERMD", "x86_avx2_permd", ["idx", "a"]], + ["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]], + ["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]], + ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]], + ["VPTESTC", "x86_avx_ptestc_256", ["a", "b"]], + ["VPTESTZ", "x86_avx_ptestz_256", ["a", "b"]], + ["VFMADDPS", "x86_fma_vfmadd_ps_256", ["a", "b", "c"]], + ["VCVTTPS2DQ", "x86_avx_cvtt_ps2dq_256", ["a"]], + ["VMOVMSKPS", "x86_avx_movmsk_ps_256", ["a"]], + ["INTERRUPT", "x86_int", ["a"]], + ] + +def convert_uppercamel(name): + s1 = re.sub('(.)([A-Z][a-z]+)', r'\1_\2', name) + return re.sub('([a-z0-9])([A-Z])', r'\1_\2', s1).upper() + +""" + Given an input file (e.g. IRBuilder.h) generates function dictionary. 
+""" +def parse_ir_builder(input_file): + + functions = [] + + lines = input_file.readlines() + + idx = 0 + while idx < len(lines) - 1: + line = lines[idx].rstrip() + idx += 1 + + #match = re.search(r"\*Create", line) + match = re.search(r"[\*\s]Create(\w*)\(", line) + if match is not None: + #print("Line: %s" % match.group(1)) + + if re.search(r"^\s*Create", line) is not None: + func_sig = lines[idx-2].rstrip() + line + else: + func_sig = line + + end_of_args = False + while not end_of_args: + end_paren = re.search(r"\)", line) + if end_paren is not None: + end_of_args = True + else: + line = lines[idx].rstrip() + func_sig += line + idx += 1 + + delfunc = re.search(r"LLVM_DELETED_FUNCTION|= delete;", func_sig) + + if not delfunc: + func = re.search(r"(.*?)\*[\n\s]*(Create\w*)\((.*?)\)", func_sig) + if func is not None: + + return_type = func.group(1).lstrip() + '*' + func_name = func.group(2) + arguments = func.group(3) + + func_args = '' + func_args_nodefs = '' + + num_args = arguments.count(',') + + arg_names = [] + num_args = 0 + args = arguments.split(',') + for arg in args: + arg = arg.lstrip() + if arg: + if num_args > 0: + func_args += ', ' + func_args_nodefs += ', ' + func_args += arg + func_args_nodefs += arg.split(' =')[0] + + split_args = arg.split('=') + arg_name = split_args[0].rsplit(None, 1)[-1] + + #print("Before ArgName = %s" % arg_name) + + reg_arg = re.search(r"[\&\*]*(\w*)", arg_name) + if reg_arg: + #print("Arg Name = %s" % reg_arg.group(1)) + arg_names += [reg_arg.group(1)] + + num_args += 1 + + ignore = False + + # The following functions need to be ignored. + if func_name == 'CreateInsertNUWNSWBinOp': + ignore = True + + if func_name == 'CreateMaskedIntrinsic': + ignore = True + + # Convert CamelCase to CAMEL_CASE + func_mod = re.search(r"Create(\w*)", func_name) + if func_mod: + func_mod = func_mod.group(1) + func_mod = convert_uppercamel(func_mod) + if func_mod[0:2] == 'F_' or func_mod[0:2] == 'I_': + func_mod = func_mod[0] + func_mod[2:] + + # Substitute alias based on CAMEL_CASE name. 
+ func_alias = inst_aliases.get(func_mod) + if not func_alias: + func_alias = func_mod + + if func_name == 'CreateCall' or func_name == 'CreateGEP': + arglist = re.search(r'ArrayRef', func_args) + if arglist: + func_alias = func_alias + 'A' + + if not ignore: + functions.append({ + "name": func_name, + "alias": func_alias, + "return": return_type, + "args": func_args, + "args_nodefs": func_args_nodefs, + "arg_names": arg_names + }) + + return functions + +""" + Auto-generates macros for LLVM IR +""" +def generate_gen_h(functions, output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#pragma once', + '', + '//////////////////////////////////////////////////////////////////////////', + '/// Auto-generated Builder IR declarations', + '//////////////////////////////////////////////////////////////////////////', + ] + + for func in functions: + name = func['name'] + if func['alias']: + name = func['alias'] + output_lines += [ + '%s%s(%s);' % (func['return'], name, func['args']) + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Auto-generates macros for LLVM IR +""" +def generate_gen_cpp(functions, output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#include \"builder.h\"', + '' + ] + + for func in functions: + name = func['name'] + if func['alias']: + name = func['alias'] + + args = func['arg_names'] + func_args = '' + first_arg = True + for arg in args: + if not first_arg: + func_args += ', ' + func_args += arg + first_arg = False + + output_lines += [ + '//////////////////////////////////////////////////////////////////////////', + '%sBuilder::%s(%s)' % (func['return'], name, func['args_nodefs']), + '{', + ' return IRB()->%s(%s);' % (func['name'], func_args), + '}', + '', + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Auto-generates macros for LLVM IR +""" +def generate_x86_h(output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#pragma once', + '', + '//////////////////////////////////////////////////////////////////////////', + '/// Auto-generated x86 intrinsics', + '//////////////////////////////////////////////////////////////////////////', + ] + + for inst in intrinsics: + #print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2]))) + + args = '' + first = True + for arg in inst[2]: + if not first: + args += ', ' + args += ("Value* %s" % arg) + first = False + + output_lines += [ + 'Value *%s(%s);' % (inst[0], args) + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Auto-generates macros for LLVM IR +""" +def generate_x86_cpp(output_file): + output_lines = gen_file_header(os.path.basename(output_file.name)) + + output_lines += [ + '#include \"builder.h\"', + '' + ] + + for inst in intrinsics: + #print("Inst: %s, x86: %s numArgs: %d" % (inst[0], inst[1], len(inst[2]))) + + args = '' + pass_args = '' + first = True + for arg in inst[2]: + if not first: + args += ', ' + pass_args += ', ' + args += ("Value* %s" % arg) + pass_args += arg + first = False + + output_lines += [ + '//////////////////////////////////////////////////////////////////////////', + 'Value *Builder::%s(%s)' % (inst[0], args), + '{', + ' Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::%s);' % inst[1], + ' return CALL(func, std::initializer_list<Value*>{%s});' % pass_args, + '}', + '', + ] + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Function which 
is invoked when this script is started from a command line. + Will present and consume a set of arguments which will tell this script how + to behave +""" +def main(): + + # Parse args... + parser = argparse.ArgumentParser() + parser.add_argument("--input", "-i", type=argparse.FileType('r'), help="Path to IRBuilder.h", required=False) + parser.add_argument("--output", "-o", type=argparse.FileType('w'), help="Path to output file", required=True) + parser.add_argument("--gen_h", "-gen_h", help="Generate builder_gen.h", action="store_true", default=False) + parser.add_argument("--gen_cpp", "-gen_cpp", help="Generate builder_gen.cpp", action="store_true", default=False) + parser.add_argument("--gen_x86_h", "-gen_x86_h", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False) + parser.add_argument("--gen_x86_cpp", "-gen_x86_cpp", help="Generate x86 intrinsics. No input is needed.", action="store_true", default=False) + args = parser.parse_args() + + if args.input: + functions = parse_ir_builder(args.input) + + if args.gen_h: + generate_gen_h(functions, args.output) + + if args.gen_cpp: + generate_gen_cpp(functions, args.output) + else: + if args.gen_x86_h: + generate_x86_h(args.output) + + if args.gen_x86_cpp: + generate_x86_cpp(args.output) + + if args.gen_h: + print("Need to specify --input for --gen_h!") + + if args.gen_cpp: + print("Need to specify --input for --gen_cpp!") + +if __name__ == '__main__': + main() +# END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py new file mode 100644 index 00000000000..7bba435467b --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -0,0 +1,341 @@ +# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +#!deps/python32/python.exe + +import os, sys, re +import argparse +import json as JSON +import operator + +header = r""" +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file %s +* +* @brief auto-generated file +* +* DO NOT EDIT +* +******************************************************************************/ + +#pragma once + +""" + +""" +""" +def gen_file_header(filename): + global header + headerStr = header % filename + return headerStr.splitlines() + +""" +""" +def gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file): + + llvm_type = '' + + if is_llvm_struct: + if is_pointer or is_pointer_pointer: + llvm_type = 'Type::getInt32Ty(ctx)' + else: + llvm_type = 'ArrayType::get(Type::getInt8Ty(ctx), sizeof(%s))' % type + elif is_llvm_enum: + llvm_type = 'Type::getInt32Ty(ctx)' + elif is_llvm_pfn: + llvm_type = 'PointerType::get(Type::getInt8Ty(ctx), 0)' + else: + if type == "BYTE" or type == "char" or type == "uint8_t" or type == "int8_t" or type == 'bool': + llvm_type = 'Type::getInt8Ty(ctx)' + elif type == 'UINT64' or type == 'INT64' or type == 'uint64_t' or type == 'int64_t': + llvm_type = 'Type::getInt64Ty(ctx)' + elif type == 'UINT16' or type == 'int16_t' or type == 'uint16_t': + llvm_type = 'Type::getInt16Ty(ctx)' + elif type == 'UINT' or type == 'INT' or type == 'int' or type == 'BOOL' or type == 'uint32_t' or type == 'int32_t': + llvm_type = 'Type::getInt32Ty(ctx)' + elif type == 'float' or type == 'FLOAT': + llvm_type = 'Type::getFloatTy(ctx)' + elif type == 'double' or type == 'DOUBLE': + llvm_type = 'Type::getDoubleTy(ctx)' + elif type == 'void' or type == 'VOID': + llvm_type = 'Type::getInt32Ty(ctx)' + elif type == 'HANDLE': + llvm_type = 'PointerType::get(Type::getInt32Ty(ctx), 0)' + elif type == 'simdscalar': + llvm_type = 'VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth)' + elif type == 'simdscalari': + llvm_type = 'VectorType::get(Type::getInt32Ty(ctx), pJitMgr->mVWidth)' + elif type == 'simdvector': + llvm_type = 'ArrayType::get(VectorType::get(Type::getFloatTy(ctx), pJitMgr->mVWidth), 4)' + else: + llvm_type = 'Gen_%s%s(pJitMgr)' % (type, postfix_name) + + if is_pointer: + llvm_type = 'PointerType::get(%s, 0)' % llvm_type + + if is_pointer_pointer: + llvm_type = 'PointerType::get(%s, 0)' % llvm_type + + if is_array_array: + llvm_type = 'ArrayType::get(ArrayType::get(%s, %s), %s)' % (llvm_type, array_count1, array_count) + elif is_array: + llvm_type = 'ArrayType::get(%s, %s)' % (llvm_type, array_count) + + 
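+    # e.g. a field declared 'float foo[2][4]' becomes
+    # ArrayType::get(ArrayType::get(Type::getFloatTy(ctx), 4), 2)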
return [' members.push_back( %s ); // %s' % (llvm_type, name)] + +""" +""" +def gen_llvm_types(input_file, output_file): + + output_lines = gen_file_header(os.path.basename(output_file.name)) + + lines = input_file.readlines() + + postfix_name = "" + + for idx in range(len(lines)): + line = lines[idx].rstrip() + + match = re.match(r"(\s*)struct(\s*)(\w+)", line) + if match: + llvm_args = [] + + # Detect start of structure + is_fwd_decl = re.search(r";", line) + + if not is_fwd_decl: + + # Extract the command name + struct_name = match.group(3).strip() + + output_lines += [ + '//////////////////////////////////////////////////////////////////////////', + '/// Generate LLVM type information for %s' % struct_name, + 'INLINE static StructType *Gen_%s%s(JitManager* pJitMgr)' % (struct_name, postfix_name), + '{', + ' LLVMContext& ctx = pJitMgr->mContext;', + ' std::vector<Type*> members;', + '', + ] + + end_of_struct = False + + while not end_of_struct and idx < len(lines)-1: + idx += 1 + line = lines[idx].rstrip() + + is_llvm_typedef = re.search(r"@llvm_typedef", line) + if is_llvm_typedef is not None: + is_llvm_typedef = True + else: + is_llvm_typedef = False + + ########################################### + # Is field a llvm struct? Tells script to treat type as array of bytes that is size of structure. + is_llvm_struct = re.search(r"@llvm_struct", line) + + if is_llvm_struct is not None: + is_llvm_struct = True + else: + is_llvm_struct = False + + ########################################### + # Is field a llvm enum? Tells script to treat type as an enum and replaced with uint32 type. + is_llvm_enum = re.search(r"@llvm_enum", line) + + if is_llvm_enum is not None: + is_llvm_enum = True + else: + is_llvm_enum = False + + ########################################### + # Is field a llvm function pointer? Tells script to treat type as an enum and replaced with uint32 type. + is_llvm_pfn = re.search(r"@llvm_pfn", line) + + if is_llvm_pfn is not None: + is_llvm_pfn = True + else: + is_llvm_pfn = False + + ########################################### + # Is field const? + is_const = re.search(r"\s+const\s+", line) + + if is_const is not None: + is_const = True + else: + is_const = False + + ########################################### + # Is field a pointer? + is_pointer_pointer = re.search("\*\*", line) + + if is_pointer_pointer is not None: + is_pointer_pointer = True + else: + is_pointer_pointer = False + + ########################################### + # Is field a pointer? + is_pointer = re.search("\*", line) + + if is_pointer is not None: + is_pointer = True + else: + is_pointer = False + + ########################################### + # Is field an array of arrays? + # TODO: Can add this to a list. + is_array_array = re.search("\[(\w*)\]\[(\w*)\]", line) + array_count = '0' + array_count1 = '0' + + if is_array_array is not None: + array_count = is_array_array.group(1) + array_count1 = is_array_array.group(2) + is_array_array = True + else: + is_array_array = False + + ########################################### + # Is field an array? 
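+                    # (this also matches the first bracket of a 2-D field; harmless,
+                    # since gen_llvm_type tests is_array_array before is_array)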
+ is_array = re.search("\[(\w*)\]", line) + + if is_array is not None: + array_count = is_array.group(1) + is_array = True + else: + is_array = False + + is_scoped = re.search("::", line) + + if is_scoped is not None: + is_scoped = True + else: + is_scoped = False + + type = None + name = None + if is_const and is_pointer: + + if is_scoped: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+::)(\w+)(\s*\**\s*)(\w+)", line) + + type = "%s%s" % (field_match.group(4), field_match.group(5)) + name = field_match.group(7) + else: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*\**\s*)(\w+)", line) + + type = field_match.group(4) + name = field_match.group(6) + + elif is_pointer: + field_match = re.match(r"(\s*)(\s+)(\w+\<*\w*\>*)(\s*\**\s*)(\w+)", line) + + if field_match: + type = field_match.group(3) + name = field_match.group(5) + elif is_const: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)(\s*)(\w+)", line) + + if field_match: + type = field_match.group(4) + name = field_match.group(6) + else: + if is_scoped: + field_match = re.match(r"\s*(\w+\<*\w*\>*)\s*::\s*(\w+\<*\w*\>*)\s+(\w+)", line) + + if field_match: + type = field_match.group(1) + '::' + field_match.group(2) + name = field_match.group(3) + else: + field_match = re.match(r"(\s*)(\w+\<*\w*\>*)(\s+)(\w+)", line) + + if field_match: + type = field_match.group(2) + name = field_match.group(4) + + if is_llvm_typedef is False: + if type is not None: + output_lines += gen_llvm_type(type, name, postfix_name, is_pointer, is_pointer_pointer, is_array, is_array_array, array_count, array_count1, is_llvm_struct, is_llvm_enum, is_llvm_pfn, output_file) + llvm_args.append(name) + + # Detect end of structure + end_of_struct = re.match(r"(\s*)};", line) + + if (end_of_struct): + output_lines += [ + '', + ' return StructType::get(ctx, members, false);', + '}', + '', + ] + + for i in range(len(llvm_args)): + output_lines.append('static const uint32_t %s%s_%s = %s;' % (struct_name, postfix_name, llvm_args[i], i)) + + output_lines.append('') + + output_file.write('\n'.join(output_lines) + '\n') + +""" + Function which is invoked when this script is started from a command line. + Will present and consume a set of arguments which will tell this script how + to behave +""" +def main(): + + # Parse args... + parser = argparse.ArgumentParser() + parser.add_argument("--input", "-i", type=argparse.FileType('r'), + help="Path to input file containing structs", required=True) + parser.add_argument("--output", "-o", type=argparse.FileType('w'), + help="Path to output file", required=True) + parser.add_argument("--scalar", "-scalar", help="Generates scalar files with all enums", action="store_true", default=False) + args = parser.parse_args() + + gen_llvm_types(args.input, args.output) + +if __name__ == '__main__': + main() +# END OF FILE diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp new file mode 100644 index 00000000000..6c5f22bc47c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -0,0 +1,357 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.cpp
+*
+* @brief Implementation of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#include "jit_api.h"
+#include "streamout_jit.h"
+#include "builder.h"
+#include "state_llvm.h"
+#include "common/containers.hpp"
+#include "llvm/IR/DataLayout.h"
+
+#include <sstream>
+#include <unordered_set>
+
+//////////////////////////////////////////////////////////////////////////
+/// Interface to jitting a streamout shader
+//////////////////////////////////////////////////////////////////////////
+struct StreamOutJit : public Builder
+{
+    StreamOutJit(JitManager* pJitMgr) : Builder(pJitMgr){};
+
+    // returns pointer to SWR_STREAMOUT_BUFFER
+    Value* getSOBuffer(Value* pSoCtx, uint32_t buffer)
+    {
+        return LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pBuffer, buffer });
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief checks if streamout buffer is oob
+    // @return <i1> true/false
+    Value* oob(const STREAMOUT_COMPILE_STATE& state, Value* pSoCtx, uint32_t buffer)
+    {
+        Value* returnMask = C(false);
+
+        Value* pBuf = getSOBuffer(pSoCtx, buffer);
+
+        // load enable
+        // @todo bool data types should generate <i1> llvm type
+        Value* enabled = TRUNC(LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_enable }), IRB()->getInt1Ty());
+
+        // load buffer size
+        Value* bufferSize = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_bufferSize });
+
+        // load current streamOffset
+        Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset });
+
+        // load buffer pitch
+        Value* pitch = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch });
+
+        // buffer is considered oob if in use in a decl but not enabled
+        returnMask = OR(returnMask, NOT(enabled));
+
+        // buffer is oob if it cannot fit a prim's worth of verts
+        Value* newOffset = ADD(streamOffset, MUL(pitch, C(state.numVertsPerPrim)));
+        returnMask = OR(returnMask, ICMP_SGT(newOffset, bufferSize));
+
+        return returnMask;
+    }
+
+
+    //////////////////////////////////////////////////////////////////////////
+    // @brief converts scalar bitmask to <4 x i32> suitable for shuffle vector,
+    //        packing the active mask bits
+    // ex.
bitmask 0011 -> (0, 1, 0, 0) + // bitmask 1000 -> (3, 0, 0, 0) + // bitmask 1100 -> (2, 3, 0, 0) + Value* PackMask(uint32_t bitmask) + { + std::vector<Constant*> indices(4, C(0)); + DWORD index; + uint32_t elem = 0; + while (_BitScanForward(&index, bitmask)) + { + indices[elem++] = C((int)index); + bitmask &= ~(1 << index); + } + + return ConstantVector::get(indices); + } + + ////////////////////////////////////////////////////////////////////////// + // @brief convert scalar bitmask to <4xfloat> bitmask + Value* ToMask(uint32_t bitmask) + { + std::vector<Constant*> indices; + for (uint32_t i = 0; i < 4; ++i) + { + if (bitmask & (1 << i)) + { + indices.push_back(C(-1.0f)); + } + else + { + indices.push_back(C(0.0f)); + } + } + return ConstantVector::get(indices); + } + + ////////////////////////////////////////////////////////////////////////// + // @brief processes a single decl from the streamout stream. Reads 4 components from the input + // stream and writes N components to the output buffer given the componentMask or if + // a hole, just increments the buffer pointer + // @param pStream - pointer to current attribute + // @param pOutBuffers - pointers to the current location of each output buffer + // @param decl - input decl + void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl) + { + // @todo add this to x86 macros + Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps); + + uint32_t numComponents = _mm_popcnt_u32(decl.componentMask); + uint32_t packedMask = (1 << numComponents) - 1; + if (!decl.hole) + { + // increment stream pointer to correct slot + Value* pAttrib = GEP(pStream, C(4 * decl.attribSlot)); + + // load 4 components from stream + Type* simd4Ty = VectorType::get(IRB()->getFloatTy(), 4); + Type* simd4PtrTy = PointerType::get(simd4Ty, 0); + pAttrib = BITCAST(pAttrib, simd4PtrTy); + Value *vattrib = LOAD(pAttrib); + + // shuffle/pack enabled components + Value* vpackedAttrib = VSHUFFLE(vattrib, vattrib, PackMask(decl.componentMask)); + + // store to output buffer + // cast SO buffer to i8*, needed by maskstore + Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0)); + + // cast input to <4xfloat> + Value* src = BITCAST(vpackedAttrib, simd4Ty); + CALL(maskStore, {pOut, ToMask(packedMask), src}); + } + + // increment SO buffer + pOutBuffers[decl.bufferIndex] = GEP(pOutBuffers[decl.bufferIndex], C(numComponents)); + } + + ////////////////////////////////////////////////////////////////////////// + // @brief builds a single vertex worth of data for the given stream + // @param streamState - state for this stream + // @param pCurVertex - pointer to src stream vertex data + // @param pOutBuffer - pointers to up to 4 SO buffers + void buildVertex(const STREAMOUT_STREAM& streamState, Value* pCurVertex, Value* pOutBuffer[4]) + { + for (uint32_t d = 0; d < streamState.numDecls; ++d) + { + const STREAMOUT_DECL& decl = streamState.decl[d]; + buildDecl(pCurVertex, pOutBuffer, decl); + } + } + + void buildStream(const STREAMOUT_COMPILE_STATE& state, const STREAMOUT_STREAM& streamState, Value* pSoCtx, BasicBlock* returnBB, Function* soFunc) + { + // get list of active SO buffers + std::unordered_set<uint32_t> activeSOBuffers; + for (uint32_t d = 0; d < streamState.numDecls; ++d) + { + const STREAMOUT_DECL& decl = streamState.decl[d]; + activeSOBuffers.insert(decl.bufferIndex); + } + + // always increment numPrimStorageNeeded + Value *numPrimStorageNeeded = LOAD(pSoCtx, { 0, 
SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); + numPrimStorageNeeded = ADD(numPrimStorageNeeded, C(1)); + STORE(numPrimStorageNeeded, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimStorageNeeded }); + + // check OOB on active SO buffers. If any buffer is out of bound, don't write + // the primitive to any buffer + Value* oobMask = C(false); + for (uint32_t buffer : activeSOBuffers) + { + oobMask = OR(oobMask, oob(state, pSoCtx, buffer)); + } + + BasicBlock* validBB = BasicBlock::Create(JM()->mContext, "valid", soFunc); + + // early out if OOB + COND_BR(oobMask, returnBB, validBB); + + IRB()->SetInsertPoint(validBB); + + Value* numPrimsWritten = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); + numPrimsWritten = ADD(numPrimsWritten, C(1)); + STORE(numPrimsWritten, pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_numPrimsWritten }); + + // compute start pointer for each output buffer + Value* pOutBuffer[4]; + Value* pOutBufferStartVertex[4]; + Value* outBufferPitch[4]; + for (uint32_t b: activeSOBuffers) + { + Value* pBuf = getSOBuffer(pSoCtx, b); + Value* pData = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pBuffer }); + Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + pOutBuffer[b] = GEP(pData, streamOffset); + pOutBufferStartVertex[b] = pOutBuffer[b]; + + outBufferPitch[b] = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_pitch }); + } + + // loop over the vertices of the prim + Value* pStreamData = LOAD(pSoCtx, { 0, SWR_STREAMOUT_CONTEXT_pPrimData }); + for (uint32_t v = 0; v < state.numVertsPerPrim; ++v) + { + buildVertex(streamState, pStreamData, pOutBuffer); + + // increment stream and output buffer pointers + // stream verts are always 32*4 dwords apart + pStreamData = GEP(pStreamData, C(KNOB_NUM_ATTRIBUTES * 4)); + + // output buffers offset using pitch in buffer state + for (uint32_t b : activeSOBuffers) + { + pOutBufferStartVertex[b] = GEP(pOutBufferStartVertex[b], outBufferPitch[b]); + pOutBuffer[b] = pOutBufferStartVertex[b]; + } + } + + // update each active buffer's streamOffset + for (uint32_t b : activeSOBuffers) + { + Value* pBuf = getSOBuffer(pSoCtx, b); + Value* streamOffset = LOAD(pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + streamOffset = ADD(streamOffset, MUL(C(state.numVertsPerPrim), outBufferPitch[b])); + STORE(streamOffset, pBuf, { 0, SWR_STREAMOUT_BUFFER_streamOffset }); + } + } + + Function* Create(const STREAMOUT_COMPILE_STATE& state) + { + static std::size_t soNum = 0; + + std::stringstream fnName("SOShader", std::ios_base::in | std::ios_base::out | std::ios_base::ate); + fnName << soNum++; + + // SO function signature + // typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT*) + + std::vector<Type*> args{ + PointerType::get(Gen_SWR_STREAMOUT_CONTEXT(JM()), 0), // SWR_STREAMOUT_CONTEXT* + }; + + FunctionType* fTy = FunctionType::get(IRB()->getVoidTy(), args, false); + Function* soFunc = Function::Create(fTy, GlobalValue::ExternalLinkage, fnName.str(), JM()->mpCurrentModule); + + // create return basic block + BasicBlock* entry = BasicBlock::Create(JM()->mContext, "entry", soFunc); + BasicBlock* returnBB = BasicBlock::Create(JM()->mContext, "return", soFunc); + + IRB()->SetInsertPoint(entry); + + // arguments + auto argitr = soFunc->getArgumentList().begin(); + Value* pSoCtx = &*argitr++; + pSoCtx->setName("pSoCtx"); + + const STREAMOUT_STREAM& streamState = state.stream; + buildStream(state, streamState, pSoCtx, returnBB, soFunc); + + BR(returnBB); + + IRB()->SetInsertPoint(returnBB); + RET_VOID(); + + JitManager::DumpToFile(soFunc, "SoFunc"); + + 
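+        // Optimize the generated streamout function: promote stack slots to
+        // SSA values, then let the CSE/instcombine/SCCP/DCE passes below fold
+        // the redundant context loads and pointer arithmetic emitted by the
+        // LOAD/STORE/GEP helpers above.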
FunctionPassManager passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); + passes.add(createCFGSimplificationPass()); + passes.add(createEarlyCSEPass()); + passes.add(createPromoteMemoryToRegisterPass()); + passes.add(createCFGSimplificationPass()); + passes.add(createEarlyCSEPass()); + passes.add(createInstructionCombiningPass()); + passes.add(createInstructionSimplifierPass()); + passes.add(createConstantPropagationPass()); + passes.add(createSCCPPass()); + passes.add(createAggressiveDCEPass()); + + passes.run(*soFunc); + + JitManager::DumpToFile(soFunc, "SoFunc_optimized"); + + return soFunc; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief JITs from streamout shader IR +/// @param hJitMgr - JitManager handle +/// @param func - LLVM function IR +/// @return PFN_SO_FUNC - pointer to SOS function +PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc) +{ + const llvm::Function *func = (const llvm::Function*)hFunc; + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + PFN_SO_FUNC pfnStreamOut; + pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str())); + // MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module + pJitMgr->mIsModuleFinalized = true; + + return pfnStreamOut; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief JIT compiles streamout shader +/// @param hJitMgr - JitManager handle +/// @param state - SO state to build function from +extern "C" PFN_SO_FUNC JITCALL JitCompileStreamout(HANDLE hJitMgr, const STREAMOUT_COMPILE_STATE& state) +{ + JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr); + + STREAMOUT_COMPILE_STATE soState = state; + if (soState.offsetAttribs) + { + for (uint32_t i = 0; i < soState.stream.numDecls; ++i) + { + soState.stream.decl[i].attribSlot -= soState.offsetAttribs; + } + } + + pJitMgr->SetupNewModule(); + + StreamOutJit theJit(pJitMgr); + HANDLE hFunc = theJit.Create(soState); + + return JitStreamoutFunc(hJitMgr, hFunc); +} diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h new file mode 100644 index 00000000000..097f8ab44d9 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.h @@ -0,0 +1,94 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file streamout_jit.h
+*
+* @brief Definition of the streamout jitter
+*
+* Notes:
+*
+******************************************************************************/
+#pragma once
+
+#include "common/formats.h"
+#include "core/state.h"
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_DECL - Stream decl
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_DECL
+{
+    // Buffer that stream maps to.
+    DWORD bufferIndex;
+
+    // attribute to stream
+    uint32_t attribSlot;
+
+    // attribute component mask
+    uint32_t componentMask;
+
+    // indicates this decl is a hole
+    bool hole;
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// STREAMOUT_STREAM - Stream decls
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_STREAM
+{
+    // number of decls for this stream
+    uint32_t numDecls;
+
+    // array of numDecls decls
+    STREAMOUT_DECL decl[128];
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// State required for streamout jit
+//////////////////////////////////////////////////////////////////////////
+struct STREAMOUT_COMPILE_STATE
+{
+    // number of verts per primitive
+    uint32_t numVertsPerPrim;
+    uint32_t offsetAttribs; ///< attrib offset to subtract from all STREAMOUT_DECL::attribSlot values.
+
+    uint64_t streamMask;
+
+    // stream decls
+    STREAMOUT_STREAM stream;
+
+    bool operator==(const STREAMOUT_COMPILE_STATE &other) const
+    {
+        if (numVertsPerPrim != other.numVertsPerPrim) return false;
+        if (stream.numDecls != other.stream.numDecls) return false;
+
+        for (uint32_t i = 0; i < stream.numDecls; ++i)
+        {
+            if (stream.decl[i].bufferIndex != other.stream.decl[i].bufferIndex) return false;
+            if (stream.decl[i].attribSlot != other.stream.decl[i].attribSlot) return false;
+            if (stream.decl[i].componentMask != other.stream.decl[i].componentMask) return false;
+            if (stream.decl[i].hole != other.stream.decl[i].hole) return false;
+        }
+
+        return true;
+    }
+};
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
new file mode 100644
index 00000000000..ad73cd840a7
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -0,0 +1,287 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file ClearTile.cpp +* +* @brief Functionality for ClearTile. StoreHotTileClear clears a single macro +* tile in the destination. +* +******************************************************************************/ +#include "common/os.h" +#include "core/context.h" +#include "common/formats.h" +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" + +typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); + +////////////////////////////////////////////////////////////////////////// +/// Clear Raster Tile Function Tables. +////////////////////////////////////////////////////////////////////////// +static PFN_STORE_TILES_CLEAR sStoreTilesClearColorTable[NUM_SWR_FORMATS]; + +static PFN_STORE_TILES_CLEAR sStoreTilesClearDepthTable[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// StoreRasterTileClear +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreRasterTileClear +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pColor - Pointer to clear color. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void StoreClear( + const BYTE* dstFormattedColor, + UINT dstBytesPerPixel, + SWR_SURFACE_STATE* pDstSurface, + UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. + { + // Compute destination address for raster tile. + BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress + + (y * pDstSurface->pitch) + (x * dstBytesPerPixel); + + // start of first row + BYTE* pDst = pDstTile; + UINT dstBytesPerRow = 0; + + // For each raster tile pixel in row 0 (rx, 0) + for (UINT rx = 0; (rx < KNOB_TILE_X_DIM) && ((x + rx) < pDstSurface->width); ++rx) + { + memcpy(pDst, dstFormattedColor, dstBytesPerPixel); + + // Increment pointer to next pixel in row. + pDst += dstBytesPerPixel; + dstBytesPerRow += dstBytesPerPixel; + } + + // start of second row + pDst = pDstTile + pDstSurface->pitch; + + // For each remaining row in the rest of the raster tile + for (UINT ry = 1; (ry < KNOB_TILE_Y_DIM) && ((y + ry) < pDstSurface->height); ++ry) + { + // copy row + memcpy(pDst, pDstTile, dstBytesPerRow); + + // Increment pointer to first pixel in next row. + pDst += pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreMacroTileClear - Stores a macro tile clear to its raster tiles. +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreMacroTileClear +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a macrotile to the destination surface. + /// @param pColor - Pointer to color to write to pixels. 
+    /// @param pDstSurface - Destination surface state
+    /// @param x, y - Coordinates to macro tile
+    static void StoreClear(
+        const FLOAT *pColor,
+        SWR_SURFACE_STATE* pDstSurface,
+        UINT x, UINT y)
+    {
+        UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
+
+        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+
+        FLOAT srcColor[4];
+
+        for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
+        {
+            srcColor[comp] = pColor[FormatTraits<DstFormat>::swizzle(comp)];
+        }
+
+        // using this helper function, but the tiling traits are unused inside it, so a dummy value suffices
+        ConvertPixelFromFloat<DstFormat>(dstFormattedColor, srcColor);
+
+        // Store each raster tile from the hot tile to the destination surface.
+        // TODO: Put in check for partial coverage on x/y -- SWR_ASSERT if it happens.
+        //       Intent is for this function to only handle full tiles.
+        for (UINT row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+        {
+            for (UINT col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+            {
+                StoreRasterTileClear<SrcFormat, DstFormat>::StoreClear(dstFormattedColor, dstBytesPerPixel, pDstSurface, (x + col), (y + row));
+            }
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Writes clear color to every pixel of a render surface
+/// @param hPrivateContext - Handle to private DC
+/// @param renderTargetIndex - Index to destination render target
+/// @param x, y - Coordinates to raster tile.
+/// @param pClearColor - Pointer to clear color
+void StoreHotTileClear(
+    SWR_SURFACE_STATE *pDstSurface,
+    SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
+    UINT x,
+    UINT y,
+    const float* pClearColor)
+{
+    PFN_STORE_TILES_CLEAR pfnStoreTilesClear = NULL;
+
+    SWR_ASSERT(renderTargetIndex != SWR_ATTACHMENT_STENCIL); ///@todo Not supported yet.
+
+    if (renderTargetIndex != SWR_ATTACHMENT_DEPTH)
+    {
+        pfnStoreTilesClear = sStoreTilesClearColorTable[pDstSurface->format];
+    }
+    else
+    {
+        pfnStoreTilesClear = sStoreTilesClearDepthTable[pDstSurface->format];
+    }
+
+    SWR_ASSERT(pfnStoreTilesClear != NULL);
+
+    // Store a macro tile.
+    /// @todo Once all formats are supported, the if check can go away. This is to help us near term to make progress.
+    if (pfnStoreTilesClear != NULL)
+    {
+        pfnStoreTilesClear(pClearColor, pDstSurface, x, y);
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_STORE_TILES_CLEAR_COLOR_TABLE - Helper macro for setting up the tables.
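+/// The hot tile holds clear colors as four floats, so every entry below
+/// converts from R32G32B32A32_FLOAT to the bound destination surface format.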
+#define INIT_STORE_TILES_CLEAR_COLOR_TABLE() \ + memset(sStoreTilesClearColorTable, 0, sizeof(sStoreTilesClearColorTable)); \ + \ + sStoreTilesClearColorTable[R32G32B32A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32A32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32A32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32A32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32X32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32B32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32B32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32G32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32G32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16X16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16X16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8A8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8A8_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UNORM>::StoreClear; \ + 
sStoreTilesClearColorTable[R16G16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R11G11B10_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R32_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R32_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[A32_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A32_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B8G8R8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8X8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8X8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10X2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G6R5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G6R5_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5A1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5A1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[B4G4R4A4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B4G4R4A4_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R8G8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, 
R16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[A16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[A16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, A16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5X1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[B5G5R5X1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8_UINT>::StoreClear; \ + sStoreTilesClearColorTable[A8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, A8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC1_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC2_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC3_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC4_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC5_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC1_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[BC2_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[BC3_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC4_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC4_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[BC5_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, BC5_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_FLOAT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_FLOAT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_UNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UNORM>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R8G8B8_UNORM_SRGB] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_UINT>::StoreClear; \ + sStoreTilesClearColorTable[R16G16B16_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R16G16B16_SINT>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[R10G10B10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_SNORM] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreClear; \ + sStoreTilesClearColorTable[B10G10R10A2_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, 
B10G10R10A2_UINT>::StoreClear; \
+    sStoreTilesClearColorTable[B10G10R10A2_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreClear; \
+    sStoreTilesClearColorTable[R8G8B8_UINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_UINT>::StoreClear; \
+    sStoreTilesClearColorTable[R8G8B8_SINT] = StoreMacroTileClear<R32G32B32A32_FLOAT, R8G8B8_SINT>::StoreClear; \
+
+//////////////////////////////////////////////////////////////////////////
+/// INIT_STORE_TILES_CLEAR_DEPTH_TABLE - Helper macro for setting up the tables.
+#define INIT_STORE_TILES_CLEAR_DEPTH_TABLE() \
+    memset(sStoreTilesClearDepthTable, 0, sizeof(sStoreTilesClearDepthTable)); \
+    \
+    sStoreTilesClearDepthTable[R32_FLOAT] = StoreMacroTileClear<R32_FLOAT, R32_FLOAT>::StoreClear; \
+    sStoreTilesClearDepthTable[R24_UNORM_X8_TYPELESS] = StoreMacroTileClear<R32_FLOAT, R24_UNORM_X8_TYPELESS>::StoreClear; \
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up tables for ClearTile
+void InitSimClearTilesTable()
+{
+    INIT_STORE_TILES_CLEAR_COLOR_TABLE();
+    INIT_STORE_TILES_CLEAR_DEPTH_TABLE();
+}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
new file mode 100644
index 00000000000..0f9e0ad4bd8
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -0,0 +1,698 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file Convert.h
+*
+* @brief Conversion utility functions
+*
+******************************************************************************/
+#pragma once
+
+#if defined(_WIN32)
+// disable "potential divide by 0"
+#pragma warning(disable: 4723)
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Convert an IEEE 754 16-bit float to a 32-bit single precision
+///        float
+/// @param val - 16-bit float
+/// @todo Maybe move this outside of this file into a header?
+static float ConvertSmallFloatTo32(UINT val)
+{
+    UINT result;
+    if ((val & 0x7fff) == 0)
+    {
+        result = ((uint32_t)(val & 0x8000)) << 16;
+    }
+    else if ((val & 0x7c00) == 0x7c00)
+    {
+        result = ((val & 0x3ff) == 0) ?
0x7f800000 : 0x7fc00000; + result |= ((uint32_t)val & 0x8000) << 16; + } + else + { + uint32_t sign = (val & 0x8000) << 16; + uint32_t mant = (val & 0x3ff) << 13; + uint32_t exp = (val >> 10) & 0x1f; + if ((exp == 0) && (mant != 0)) // Adjust exponent and mantissa for denormals + { + mant <<= 1; + while (mant < (0x400 << 13)) + { + exp--; + mant <<= 1; + } + mant &= (0x3ff << 13); + } + exp = ((exp - 15 + 127) & 0xff) << 23; + result = sign | exp | mant; + } + + return *(float*)&result; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert an IEEE 754 32-bit single precision float to an +/// unsigned small float with 5 exponent bits and a variable +/// number of mantissa bits. +/// @param val - 32-bit float +/// @todo Maybe move this outside of this file into a header? +template<UINT numMantissaBits> +static UINT Convert32ToSmallFloat(float val) +{ + uint32_t sign, exp, mant; + uint32_t roundBits; + + // Extract the sign, exponent, and mantissa + UINT uf = *(UINT*)&val; + + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; + + // 10/11 bit floats are unsigned. Negative values are clamped to 0. + if (sign != 0) + { + exp = mant = 0; + } + // Check for out of range + else if ((exp == 0xFF) && (mant != 0)) // NaN + { + exp = 0x1F; + mant = 1 << numMantissaBits; + } + else if ((exp == 0xFF) && (mant == 0)) // INF + { + exp = 0x1F; + mant = 0; + } + else if (exp > (0x70 + 0x1E)) // Too big to represent + { + exp = 0x1Eu; + mant = (1 << numMantissaBits) - 1; // 0x3F for 6 bit mantissa. + } + else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm + { + mant |= 0x00800000; + for (; exp <= 0x70; mant >>= 1, exp++) + ; + exp = 0; + mant = mant >> (23 - numMantissaBits); + } + else if (exp < 0x66) // Too small to represent -> Zero + { + exp = 0; + mant = 0; + } + else + { + // Saves bits that will be shifted off for rounding + roundBits = mant & 0x1FFFu; + // convert exponent and mantissa to 16 bit format + exp = exp - 0x70u; + mant = mant >> (23 - numMantissaBits); + + // Essentially RTZ, but round up if off by only 1 lsb + if (roundBits == 0x1FFFu) + { + mant++; + // check for overflow + if ((mant & (0x3 << numMantissaBits)) != 0) // 0x60 = 0x3 << (num Mantissa Bits) + exp++; + // make sure only the needed bits are used + mant &= (1 << numMantissaBits) - 1; + } + } + + UINT tmpVal = (exp << numMantissaBits) | mant; + return tmpVal; +} + +#if KNOB_ARCH == KNOB_ARCH_AVX +////////////////////////////////////////////////////////////////////////// +/// @brief Convert an IEEE 754 32-bit single precision float to a +/// 16-bit half-precision float with 5 exponent bits and +/// 10 mantissa bits. +/// @param val - 32-bit float +/// @todo Maybe move this outside of this file into a header?
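An illustrative sketch (not part of the patch) of the core of Convert32ToSmallFloat above for the easy case: a normal-range, non-negative value is encoded by rebiasing the IEEE exponent from 127 to 15 and truncating the 23-bit mantissa down to numMantissaBits. The full function additionally handles sign clamping, NaN/INF, denorms, and rounding.

#include <cassert>
#include <cstdint>
#include <cstring>

// Editorial sketch: normal-range, non-negative inputs only, truncating.
template <unsigned N> // N = mantissa bits (6 for the 11-bit, 5 for the 10-bit component)
static uint32_t EncodeSmallFloat(float f)
{
    uint32_t bits;
    std::memcpy(&bits, &f, sizeof(bits));   // well-defined type pun
    uint32_t exp  = (bits >> 23) & 0xFF;    // IEEE-754 exponent, bias 127
    uint32_t mant = bits & 0x007FFFFF;      // 23-bit mantissa
    return ((exp - 127 + 15) << N) | (mant >> (23 - N)); // rebias, truncate
}

int main()
{
    assert(EncodeSmallFloat<6>(1.0f) == 0x3C0); // 11-bit component of R11G11B10_FLOAT
    assert(EncodeSmallFloat<5>(1.0f) == 0x1E0); // 10-bit component
    return 0;
}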
+static uint16_t Convert32To16Float(float val) +{ + uint32_t sign, exp, mant; + uint32_t roundBits; + + // Extract the sign, exponent, and mantissa + uint32_t uf = *(uint32_t*)&val; + sign = (uf & 0x80000000) >> 31; + exp = (uf & 0x7F800000) >> 23; + mant = uf & 0x007FFFFF; + + // Check for out of range + if (std::isnan(val)) + { + exp = 0x1F; + mant = 0x200; + sign = 1; // set the sign bit for NaNs + } + else if (std::isinf(val)) + { + exp = 0x1f; + mant = 0x0; + } + else if (exp > (0x70 + 0x1E)) // Too big to represent -> max representable value + { + exp = 0x1E; + mant = 0x3FF; + } + else if ((exp <= 0x70) && (exp >= 0x66)) // It's a denorm + { + mant |= 0x00800000; + for (; exp <= 0x70; mant >>= 1, exp++) + ; + exp = 0; + mant = mant >> 13; + } + else if (exp < 0x66) // Too small to represent -> Zero + { + exp = 0; + mant = 0; + } + else + { + // Saves bits that will be shifted off for rounding + roundBits = mant & 0x1FFFu; + // convert exponent and mantissa to 16 bit format + exp = exp - 0x70; + mant = mant >> 13; + + // Essentially RTZ, but round up if off by only 1 lsb + if (roundBits == 0x1FFFu) + { + mant++; + // check for overflow + if ((mant & 0xC00u) != 0) + exp++; + // make sure only the needed bits are used + mant &= 0x3FF; + } + } + + uint32_t tmpVal = (sign << 15) | (exp << 10) | mant; + return (uint16_t)tmpVal; +} +#endif + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert a pixel from the float hot tile format to the destination format. +/// @param pDstPixel - Pointer to destination pixel. +/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest). +template<SWR_FORMAT DstFormat> +static void ConvertPixelFromFloat( + BYTE* pDstPixel, + const float srcPixel[4]) +{ + UINT outColor[4]; // typeless bits + + // Store component + for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) + { + SWR_TYPE type = FormatTraits<DstFormat>::GetType(comp); + + float src = srcPixel[comp]; + + switch (type) + { + case SWR_TYPE_UNORM: + { + // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. + src = (src != src) ? 0.0f : src; + + // Clamp [0, 1] + src = std::max(src, 0.0f); + src = std::min(src, 1.0f); + + // SRGB + if (FormatTraits<DstFormat>::isSRGB && comp != 3) + { + src = (src <= 0.0031308f) ? (12.92f * src) : (1.055f * powf(src, (1.0f / 2.4f)) - 0.055f); + } + + // Float scale to integer scale. + UINT scale = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; + src = (float)scale * src; + src = roundf(src); + outColor[comp] = (UINT)src; // Drop fractional part. + break; + } + case SWR_TYPE_SNORM: + { + SWR_ASSERT(!FormatTraits<DstFormat>::isSRGB); + + // Force NaN to 0. IEEE standard, comparisons involving NaN always evaluate to false. + src = (src != src) ? 0.0f : src; + + // Clamp [-1, 1] + src = std::max(src, -1.0f); + src = std::min(src, 1.0f); + + // Float scale to integer scale. + UINT scale = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1; + src = (float)scale * src; + + // Round + src += (src >= 0) ? 0.5f : -0.5f; + + INT out = (INT)src; + + outColor[comp] = *(UINT*)&out; + + break; + } + case SWR_TYPE_UINT: + { + ///@note The *(UINT*)& is currently necessary as the hot tile appears to always be float. + // However, the number in the hot tile should be unsigned integer. So doing this + // to preserve bits instead of doing a float -> integer conversion.
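A scalar sketch (editorial illustration, not part of the patch) of the UNORM path above, reduced to a standalone helper: flush NaN to zero, clamp to [0, 1], scale by 2^bpc - 1, and round. The sRGB branch is omitted here.

#include <algorithm>
#include <cassert>
#include <cmath>
#include <cstdint>

// Editorial sketch of the UNORM quantization (no sRGB, bpc <= 23).
static uint32_t QuantizeUnorm(float src, uint32_t bpc)
{
    src = (src != src) ? 0.0f : src;              // NaN compares false to itself -> force 0
    src = std::min(std::max(src, 0.0f), 1.0f);    // clamp [0, 1]
    const float scale = (float)((1u << bpc) - 1); // e.g. 255 for 8-bit
    return (uint32_t)std::roundf(src * scale);
}

int main()
{
    assert(QuantizeUnorm(0.5f, 8) == 128);    // roundf(127.5) rounds away from zero
    assert(QuantizeUnorm(-2.0f, 8) == 0);     // clamped
    assert(QuantizeUnorm(1.0f, 16) == 65535);
    return 0;
}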
+ if (FormatTraits<DstFormat>::GetBPC(comp) == 32) + { + outColor[comp] = *(UINT*)&src; + } + else + { + outColor[comp] = *(UINT*)&src; + UINT max = (1 << FormatTraits<DstFormat>::GetBPC(comp)) - 1; // 2^numBits - 1 + + outColor[comp] = std::min(max, outColor[comp]); + } + break; + } + case SWR_TYPE_SINT: + { + if (FormatTraits<DstFormat>::GetBPC(comp) == 32) + { + outColor[comp] = *(UINT*)&src; + } + else + { + INT out = *(INT*)&src; // Hot tile format is SINT? + INT max = (1 << (FormatTraits<DstFormat>::GetBPC(comp) - 1)) - 1; + INT min = -1 - max; + + ///@note The output is unsigned integer (bag of bits) and so performing + // the clamping here based on range of output component. Also, manually adding + // the sign bit in the appropriate spot. Maybe a better way? + out = std::max(out, min); + out = std::min(out, max); + + outColor[comp] = *(UINT*)&out; + } + break; + } + case SWR_TYPE_FLOAT: + { + if (FormatTraits<DstFormat>::GetBPC(comp) == 16) + { + // Convert from 32-bit float to 16-bit float using _mm_cvtps_ph + // @todo 16bit float instruction support is orthogonal to avx support. need to + // add check for F16C support instead. +#if KNOB_ARCH == KNOB_ARCH_AVX2 + __m128 src128 = _mm_set1_ps(src); + __m128i srci128 = _mm_cvtps_ph(src128, _MM_FROUND_TRUNC); + UINT value = _mm_extract_epi16(srci128, 0); +#else + UINT value = Convert32To16Float(src); +#endif + + outColor[comp] = value; + } + else if (FormatTraits<DstFormat>::GetBPC(comp) == 11) + { + outColor[comp] = Convert32ToSmallFloat<6>(src); + } + else if (FormatTraits<DstFormat>::GetBPC(comp) == 10) + { + outColor[comp] = Convert32ToSmallFloat<5>(src); + } + else + { + outColor[comp] = *(UINT*)&src; + } + + break; + } + default: + SWR_ASSERT(0); + break; + } + } + + typename FormatTraits<DstFormat>::FormatT* pPixel = (typename FormatTraits<DstFormat>::FormatT*)pDstPixel; + + switch (FormatTraits<DstFormat>::numComps) + { + case 4: + pPixel->a = outColor[3]; + case 3: + pPixel->b = outColor[2]; + case 2: + pPixel->g = outColor[1]; + case 1: + pPixel->r = outColor[0]; + break; + default: + SWR_ASSERT(0); + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief Convert pixel in any format to float32 +/// @param pDstPixel - Pointer to destination pixel. 
+/// @param srcPixel - Pointer to source pixel +template<SWR_FORMAT SrcFormat> +INLINE static void ConvertPixelToFloat( + float dstPixel[4], + const BYTE* pSrc) +{ + UINT srcColor[4]; // typeless bits + + // unpack src pixel + typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc; + + // apply format defaults + for (uint32_t comp = 0; comp < 4; ++comp) + { + uint32_t def = FormatTraits<SrcFormat>::GetDefault(comp); + dstPixel[comp] = *(float*)&def; + } + + // load format data + switch (FormatTraits<SrcFormat>::numComps) + { + case 4: + srcColor[3] = pPixel->a; + case 3: + srcColor[2] = pPixel->b; + case 2: + srcColor[1] = pPixel->g; + case 1: + srcColor[0] = pPixel->r; + break; + default: + SWR_ASSERT(0); + } + + // Convert components + for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp) + { + SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp); + + UINT src = srcColor[comp]; + + switch (type) + { + case SWR_TYPE_UNORM: + { + float dst; + if (FormatTraits<SrcFormat>::isSRGB && comp != 3) + { + dst = *(float*)&srgb8Table[src]; + } + else + { + // component sizes > 16 must use fp divide to maintain ulp requirements + if (FormatTraits<SrcFormat>::GetBPC(comp) > 16) + { + dst = (float)src / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1); + } + else + { + const float scale = (1.0f / (float)((1 << FormatTraits<SrcFormat>::GetBPC(comp)) - 1)); + dst = (float)src * scale; + } + } + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst; + break; + } + case SWR_TYPE_SNORM: + { + SWR_ASSERT(!FormatTraits<SrcFormat>::isSRGB); + + float dst; + if (src == 0x10) + { + dst = -1.0f; + } + else + { + switch (FormatTraits<SrcFormat>::GetBPC(comp)) + { + case 8: + dst = (float)((int8_t)src); + break; + case 16: + dst = (float)((int16_t)src); + break; + case 32: + dst = (float)((int32_t)src); + break; + default: + assert(0 && "attempted to load from SNORM with unsupported bpc"); + dst = 0.0f; + break; + } + dst = dst * (1.0f / ((1 << (FormatTraits<SrcFormat>::GetBPC(comp) - 1)) - 1)); + } + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = dst; + break; + } + case SWR_TYPE_UINT: + { + UINT dst = (UINT)src; + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; + break; + } + case SWR_TYPE_SINT: + { + int dst; + switch (FormatTraits<SrcFormat>::GetBPC(comp)) + { + case 8: + dst = (int8_t)src; + break; + case 16: + dst = (int16_t)src; + break; + case 32: + dst = (int32_t)src; + break; + default: + assert(0 && "attempted to load from SINT with unsupported bpc"); + dst = 0; + break; + } + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; + break; + } + case SWR_TYPE_FLOAT: + { + float dst; + if (FormatTraits<SrcFormat>::GetBPC(comp) == 16) + { +#if KNOB_ARCH == KNOB_ARCH_AVX2 + // Convert from 16-bit float to 32-bit float using _mm_cvtph_ps + // @todo 16bit float instruction support is orthogonal to avx support. need to + // add check for F16C support instead. 
+ __m128i src128 = _mm_set1_epi32(src); + __m128 res = _mm_cvtph_ps(src128); + _mm_store_ss(&dst, res); +#else + dst = ConvertSmallFloatTo32(src); +#endif + } + else if (FormatTraits<SrcFormat>::GetBPC(comp) == 11) + { + dst = ConvertSmallFloatTo32(src << 4); + } + else if (FormatTraits<SrcFormat>::GetBPC(comp) == 10) + { + dst = ConvertSmallFloatTo32(src << 5); + } + else + { + dst = *(float*)&src; + } + + dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst; + break; + } + default: + SWR_ASSERT(0); + break; + } + } +} + +// non-templated version of conversion functions +INLINE static void ConvertPixelFromFloat( + SWR_FORMAT format, + uint8_t* pDst, + const float srcPixel[4]) +{ + switch (format) + { + case R32G32B32A32_FLOAT: ConvertPixelFromFloat<R32G32B32A32_FLOAT>(pDst, srcPixel); break; + case R32G32B32A32_SINT: ConvertPixelFromFloat<R32G32B32A32_SINT>(pDst, srcPixel); break; + case R32G32B32A32_UINT: ConvertPixelFromFloat<R32G32B32A32_UINT>(pDst, srcPixel); break; + case R32G32B32X32_FLOAT: ConvertPixelFromFloat<R32G32B32X32_FLOAT>(pDst, srcPixel); break; + case R32G32B32A32_SSCALED: ConvertPixelFromFloat<R32G32B32A32_SSCALED>(pDst, srcPixel); break; + case R32G32B32A32_USCALED: ConvertPixelFromFloat<R32G32B32A32_USCALED>(pDst, srcPixel); break; + case R32G32B32_FLOAT: ConvertPixelFromFloat<R32G32B32_FLOAT>(pDst, srcPixel); break; + case R32G32B32_SINT: ConvertPixelFromFloat<R32G32B32_SINT>(pDst, srcPixel); break; + case R32G32B32_UINT: ConvertPixelFromFloat<R32G32B32_UINT>(pDst, srcPixel); break; + case R32G32B32_SSCALED: ConvertPixelFromFloat<R32G32B32_SSCALED>(pDst, srcPixel); break; + case R32G32B32_USCALED: ConvertPixelFromFloat<R32G32B32_USCALED>(pDst, srcPixel); break; + case R16G16B16A16_UNORM: ConvertPixelFromFloat<R16G16B16A16_UNORM>(pDst, srcPixel); break; + case R16G16B16A16_SNORM: ConvertPixelFromFloat<R16G16B16A16_SNORM>(pDst, srcPixel); break; + case R16G16B16A16_SINT: ConvertPixelFromFloat<R16G16B16A16_SINT>(pDst, srcPixel); break; + case R16G16B16A16_UINT: ConvertPixelFromFloat<R16G16B16A16_UINT>(pDst, srcPixel); break; + case R16G16B16A16_FLOAT: ConvertPixelFromFloat<R16G16B16A16_FLOAT>(pDst, srcPixel); break; + case R32G32_FLOAT: ConvertPixelFromFloat<R32G32_FLOAT>(pDst, srcPixel); break; + case R32G32_SINT: ConvertPixelFromFloat<R32G32_SINT>(pDst, srcPixel); break; + case R32G32_UINT: ConvertPixelFromFloat<R32G32_UINT>(pDst, srcPixel); break; + case R32_FLOAT_X8X24_TYPELESS: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS>(pDst, srcPixel); break; + case R16G16B16X16_UNORM: ConvertPixelFromFloat<R16G16B16X16_UNORM>(pDst, srcPixel); break; + case R16G16B16X16_FLOAT: ConvertPixelFromFloat<R16G16B16X16_FLOAT>(pDst, srcPixel); break; + case R16G16B16A16_SSCALED: ConvertPixelFromFloat<R16G16B16A16_SSCALED>(pDst, srcPixel); break; + case R16G16B16A16_USCALED: ConvertPixelFromFloat<R16G16B16A16_USCALED>(pDst, srcPixel); break; + case R32G32_SSCALED: ConvertPixelFromFloat<R32G32_SSCALED>(pDst, srcPixel); break; + case R32G32_USCALED: ConvertPixelFromFloat<R32G32_USCALED>(pDst, srcPixel); break; + case R32_FLOAT_X8X24_TYPELESS_LD: ConvertPixelFromFloat<R32_FLOAT_X8X24_TYPELESS_LD>(pDst, srcPixel); break; + case B8G8R8A8_UNORM: ConvertPixelFromFloat<B8G8R8A8_UNORM>(pDst, srcPixel); break; + case B8G8R8A8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8A8_UNORM_SRGB>(pDst, srcPixel); break; + case R10G10B10A2_UNORM: ConvertPixelFromFloat<R10G10B10A2_UNORM>(pDst, srcPixel); break; + case R10G10B10A2_UNORM_SRGB: ConvertPixelFromFloat<R10G10B10A2_UNORM_SRGB>(pDst, srcPixel); 
break; + case R10G10B10A2_UINT: ConvertPixelFromFloat<R10G10B10A2_UINT>(pDst, srcPixel); break; + case R8G8B8A8_UNORM: ConvertPixelFromFloat<R8G8B8A8_UNORM>(pDst, srcPixel); break; + case R8G8B8A8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8A8_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8B8A8_SNORM: ConvertPixelFromFloat<R8G8B8A8_SNORM>(pDst, srcPixel); break; + case R8G8B8A8_SINT: ConvertPixelFromFloat<R8G8B8A8_SINT>(pDst, srcPixel); break; + case R8G8B8A8_UINT: ConvertPixelFromFloat<R8G8B8A8_UINT>(pDst, srcPixel); break; + case R16G16_UNORM: ConvertPixelFromFloat<R16G16_UNORM>(pDst, srcPixel); break; + case R16G16_SNORM: ConvertPixelFromFloat<R16G16_SNORM>(pDst, srcPixel); break; + case R16G16_SINT: ConvertPixelFromFloat<R16G16_SINT>(pDst, srcPixel); break; + case R16G16_UINT: ConvertPixelFromFloat<R16G16_UINT>(pDst, srcPixel); break; + case R16G16_FLOAT: ConvertPixelFromFloat<R16G16_FLOAT>(pDst, srcPixel); break; + case B10G10R10A2_UNORM: ConvertPixelFromFloat<B10G10R10A2_UNORM>(pDst, srcPixel); break; + case B10G10R10A2_UNORM_SRGB: ConvertPixelFromFloat<B10G10R10A2_UNORM_SRGB>(pDst, srcPixel); break; + case R11G11B10_FLOAT: ConvertPixelFromFloat<R11G11B10_FLOAT>(pDst, srcPixel); break; + case R32_SINT: ConvertPixelFromFloat<R32_SINT>(pDst, srcPixel); break; + case R32_UINT: ConvertPixelFromFloat<R32_UINT>(pDst, srcPixel); break; + case R32_FLOAT: ConvertPixelFromFloat<R32_FLOAT>(pDst, srcPixel); break; + case R24_UNORM_X8_TYPELESS: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS>(pDst, srcPixel); break; + case R24_UNORM_X8_TYPELESS_LD: ConvertPixelFromFloat<R24_UNORM_X8_TYPELESS_LD>(pDst, srcPixel); break; + case A32_FLOAT: ConvertPixelFromFloat<A32_FLOAT>(pDst, srcPixel); break; + case B8G8R8X8_UNORM: ConvertPixelFromFloat<B8G8R8X8_UNORM>(pDst, srcPixel); break; + case B8G8R8X8_UNORM_SRGB: ConvertPixelFromFloat<B8G8R8X8_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8B8X8_UNORM: ConvertPixelFromFloat<R8G8B8X8_UNORM>(pDst, srcPixel); break; + case R8G8B8X8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8X8_UNORM_SRGB>(pDst, srcPixel); break; + case R9G9B9E5_SHAREDEXP: ConvertPixelFromFloat<R9G9B9E5_SHAREDEXP>(pDst, srcPixel); break; + case B10G10R10X2_UNORM: ConvertPixelFromFloat<B10G10R10X2_UNORM>(pDst, srcPixel); break; + case R10G10B10X2_USCALED: ConvertPixelFromFloat<R10G10B10X2_USCALED>(pDst, srcPixel); break; + case R8G8B8A8_SSCALED: ConvertPixelFromFloat<R8G8B8A8_SSCALED>(pDst, srcPixel); break; + case R8G8B8A8_USCALED: ConvertPixelFromFloat<R8G8B8A8_USCALED>(pDst, srcPixel); break; + case R16G16_SSCALED: ConvertPixelFromFloat<R16G16_SSCALED>(pDst, srcPixel); break; + case R16G16_USCALED: ConvertPixelFromFloat<R16G16_USCALED>(pDst, srcPixel); break; + case R32_SSCALED: ConvertPixelFromFloat<R32_SSCALED>(pDst, srcPixel); break; + case R32_USCALED: ConvertPixelFromFloat<R32_USCALED>(pDst, srcPixel); break; + case B5G6R5_UNORM: ConvertPixelFromFloat<B5G6R5_UNORM>(pDst, srcPixel); break; + case B5G6R5_UNORM_SRGB: ConvertPixelFromFloat<B5G6R5_UNORM_SRGB>(pDst, srcPixel); break; + case B5G5R5A1_UNORM: ConvertPixelFromFloat<B5G5R5A1_UNORM>(pDst, srcPixel); break; + case B5G5R5A1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5A1_UNORM_SRGB>(pDst, srcPixel); break; + case B4G4R4A4_UNORM: ConvertPixelFromFloat<B4G4R4A4_UNORM>(pDst, srcPixel); break; + case B4G4R4A4_UNORM_SRGB: ConvertPixelFromFloat<B4G4R4A4_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8_UNORM: ConvertPixelFromFloat<R8G8_UNORM>(pDst, srcPixel); break; + case R8G8_SNORM: ConvertPixelFromFloat<R8G8_SNORM>(pDst, srcPixel); break; + case 
R8G8_SINT: ConvertPixelFromFloat<R8G8_SINT>(pDst, srcPixel); break; + case R8G8_UINT: ConvertPixelFromFloat<R8G8_UINT>(pDst, srcPixel); break; + case R16_UNORM: ConvertPixelFromFloat<R16_UNORM>(pDst, srcPixel); break; + case R16_SNORM: ConvertPixelFromFloat<R16_SNORM>(pDst, srcPixel); break; + case R16_SINT: ConvertPixelFromFloat<R16_SINT>(pDst, srcPixel); break; + case R16_UINT: ConvertPixelFromFloat<R16_UINT>(pDst, srcPixel); break; + case R16_FLOAT: ConvertPixelFromFloat<R16_FLOAT>(pDst, srcPixel); break; + case A16_UNORM: ConvertPixelFromFloat<A16_UNORM>(pDst, srcPixel); break; + case A16_FLOAT: ConvertPixelFromFloat<A16_FLOAT>(pDst, srcPixel); break; + case B5G5R5X1_UNORM: ConvertPixelFromFloat<B5G5R5X1_UNORM>(pDst, srcPixel); break; + case B5G5R5X1_UNORM_SRGB: ConvertPixelFromFloat<B5G5R5X1_UNORM_SRGB>(pDst, srcPixel); break; + case R8G8_SSCALED: ConvertPixelFromFloat<R8G8_SSCALED>(pDst, srcPixel); break; + case R8G8_USCALED: ConvertPixelFromFloat<R8G8_USCALED>(pDst, srcPixel); break; + case R16_SSCALED: ConvertPixelFromFloat<R16_SSCALED>(pDst, srcPixel); break; + case R16_USCALED: ConvertPixelFromFloat<R16_USCALED>(pDst, srcPixel); break; + case R8_UNORM: ConvertPixelFromFloat<R8_UNORM>(pDst, srcPixel); break; + case R8_SNORM: ConvertPixelFromFloat<R8_SNORM>(pDst, srcPixel); break; + case R8_SINT: ConvertPixelFromFloat<R8_SINT>(pDst, srcPixel); break; + case R8_UINT: ConvertPixelFromFloat<R8_UINT>(pDst, srcPixel); break; + case A8_UNORM: ConvertPixelFromFloat<A8_UNORM>(pDst, srcPixel); break; + case R8_SSCALED: ConvertPixelFromFloat<R8_SSCALED>(pDst, srcPixel); break; + case R8_USCALED: ConvertPixelFromFloat<R8_USCALED>(pDst, srcPixel); break; + case YCRCB_SWAPUVY: ConvertPixelFromFloat<YCRCB_SWAPUVY>(pDst, srcPixel); break; + case BC1_UNORM: ConvertPixelFromFloat<BC1_UNORM>(pDst, srcPixel); break; + case BC2_UNORM: ConvertPixelFromFloat<BC2_UNORM>(pDst, srcPixel); break; + case BC3_UNORM: ConvertPixelFromFloat<BC3_UNORM>(pDst, srcPixel); break; + case BC4_UNORM: ConvertPixelFromFloat<BC4_UNORM>(pDst, srcPixel); break; + case BC5_UNORM: ConvertPixelFromFloat<BC5_UNORM>(pDst, srcPixel); break; + case BC1_UNORM_SRGB: ConvertPixelFromFloat<BC1_UNORM_SRGB>(pDst, srcPixel); break; + case BC2_UNORM_SRGB: ConvertPixelFromFloat<BC2_UNORM_SRGB>(pDst, srcPixel); break; + case BC3_UNORM_SRGB: ConvertPixelFromFloat<BC3_UNORM_SRGB>(pDst, srcPixel); break; + case YCRCB_SWAPUV: ConvertPixelFromFloat<YCRCB_SWAPUV>(pDst, srcPixel); break; + case R8G8B8_UNORM: ConvertPixelFromFloat<R8G8B8_UNORM>(pDst, srcPixel); break; + case R8G8B8_SNORM: ConvertPixelFromFloat<R8G8B8_SNORM>(pDst, srcPixel); break; + case R8G8B8_SSCALED: ConvertPixelFromFloat<R8G8B8_SSCALED>(pDst, srcPixel); break; + case R8G8B8_USCALED: ConvertPixelFromFloat<R8G8B8_USCALED>(pDst, srcPixel); break; + case BC4_SNORM: ConvertPixelFromFloat<BC4_SNORM>(pDst, srcPixel); break; + case BC5_SNORM: ConvertPixelFromFloat<BC5_SNORM>(pDst, srcPixel); break; + case R16G16B16_FLOAT: ConvertPixelFromFloat<R16G16B16_FLOAT>(pDst, srcPixel); break; + case R16G16B16_UNORM: ConvertPixelFromFloat<R16G16B16_UNORM>(pDst, srcPixel); break; + case R16G16B16_SNORM: ConvertPixelFromFloat<R16G16B16_SNORM>(pDst, srcPixel); break; + case R16G16B16_SSCALED: ConvertPixelFromFloat<R16G16B16_SSCALED>(pDst, srcPixel); break; + case R16G16B16_USCALED: ConvertPixelFromFloat<R16G16B16_USCALED>(pDst, srcPixel); break; + case BC7_UNORM: ConvertPixelFromFloat<BC7_UNORM>(pDst, srcPixel); break; + case BC7_UNORM_SRGB: ConvertPixelFromFloat<BC7_UNORM_SRGB>(pDst, srcPixel); 
break; + case R8G8B8_UNORM_SRGB: ConvertPixelFromFloat<R8G8B8_UNORM_SRGB>(pDst, srcPixel); break; + case R16G16B16_UINT: ConvertPixelFromFloat<R16G16B16_UINT>(pDst, srcPixel); break; + case R16G16B16_SINT: ConvertPixelFromFloat<R16G16B16_SINT>(pDst, srcPixel); break; + case R10G10B10A2_SNORM: ConvertPixelFromFloat<R10G10B10A2_SNORM>(pDst, srcPixel); break; + case R10G10B10A2_USCALED: ConvertPixelFromFloat<R10G10B10A2_USCALED>(pDst, srcPixel); break; + case R10G10B10A2_SSCALED: ConvertPixelFromFloat<R10G10B10A2_SSCALED>(pDst, srcPixel); break; + case R10G10B10A2_SINT: ConvertPixelFromFloat<R10G10B10A2_SINT>(pDst, srcPixel); break; + case B10G10R10A2_SNORM: ConvertPixelFromFloat<B10G10R10A2_SNORM>(pDst, srcPixel); break; + case B10G10R10A2_USCALED: ConvertPixelFromFloat<B10G10R10A2_USCALED>(pDst, srcPixel); break; + case B10G10R10A2_SSCALED: ConvertPixelFromFloat<B10G10R10A2_SSCALED>(pDst, srcPixel); break; + case B10G10R10A2_UINT: ConvertPixelFromFloat<B10G10R10A2_UINT>(pDst, srcPixel); break; + case B10G10R10A2_SINT: ConvertPixelFromFloat<B10G10R10A2_SINT>(pDst, srcPixel); break; + case R8G8B8_UINT: ConvertPixelFromFloat<R8G8B8_UINT>(pDst, srcPixel); break; + case R8G8B8_SINT: ConvertPixelFromFloat<R8G8B8_SINT>(pDst, srcPixel); break; + default: + break; + } +} + + diff --git a/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp new file mode 100644 index 00000000000..5d9c0045a8a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/LoadTile.cpp @@ -0,0 +1,396 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file LoadTile.cpp +* +* @brief Functionality for Load +* +******************************************************************************/ +#include "common/os.h" +#include "common/formats.h" +#include "core/context.h" +#include "core/rdtsc_core.h" +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" + +typedef void(*PFN_LOAD_TILES)(SWR_SURFACE_STATE*, uint8_t*, uint32_t, uint32_t, uint32_t); + +////////////////////////////////////////////////////////////////////////// +/// Load Raster Tile Function Tables. 
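Before the LoadTile implementation, a brief usage sketch of the non-templated dispatch wrapper Convert.h just defined (editorial illustration; it assumes Convert.h and the SWR format headers are included, and makes no claim about the exact byte layout produced):

float   hotTilePixel[4] = { 1.0f, 0.5f, 0.25f, 1.0f }; // RGBA floats from the hot tile
uint8_t packed[4];                                     // room for one 32bpp texel
ConvertPixelFromFloat(B8G8R8A8_UNORM, packed, hotTilePixel);
// 'packed' now holds the four 8-bit UNORM channels in the format's memory layout.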
+////////////////////////////////////////////////////////////////////////// +static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; +static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_NONE[NUM_SWR_FORMATS]; + +static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; +static PFN_LOAD_TILES sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[NUM_SWR_FORMATS]; + +static PFN_LOAD_TILES sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[NUM_SWR_FORMATS]; + +////////////////////////////////////////////////////////////////////////// +/// LoadRasterTile +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct LoadRasterTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a converted color into the destination hot tile, which is always float. + /// @param srcColor - Source color to store. + /// @param x, y - Coordinates within the raster tile. + /// @param pDst - Pointer to destination hot tile. + INLINE static void SetSwizzledDstColor( + const float srcColor[4], + uint32_t x, uint32_t y, + uint8_t* pDst) + { + typedef SimdTile<DstFormat, SrcFormat> SimdT; + + SimdT* pDstSimdTiles = (SimdT*)pDst; + + // Compute which simd tile we're accessing within 8x8 tile. + // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. + uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); + + SimdT* pSimdTile = &pDstSimdTiles[simdIndex]; + + uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); + + pSimdTile->SetSwizzledColor(simdOffset, srcColor); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Loads an 8x8 raster tile from the src surface. + /// @param pSrcSurface - Src surface state + /// @param pDst - Destination hot tile pointer + /// @param x, y - Coordinates to raster tile. + INLINE static void Load( + SWR_SURFACE_STATE* pSrcSurface, + uint8_t* pDst, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. + { + uint32_t lodWidth = (pSrcSurface->width == 1) ? 1 : pSrcSurface->width >> pSrcSurface->lod; + uint32_t lodHeight = (pSrcSurface->height == 1) ? 1 : pSrcSurface->height >> pSrcSurface->lod; + + // For each raster tile pixel (rx, ry) + for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) + { + for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) + { + if (((x + rx) < lodWidth) && + ((y + ry) < lodHeight)) + { + uint8_t* pSrc = (uint8_t*)ComputeSurfaceAddress<false>(x + rx, y + ry, pSrcSurface->arrayIndex + renderTargetArrayIndex, + pSrcSurface->arrayIndex + renderTargetArrayIndex, sampleNum, + pSrcSurface->lod, pSrcSurface); + + float srcColor[4]; + ConvertPixelToFloat<SrcFormat>(srcColor, pSrc); + + // store pixel to hottile + SetSwizzledDstColor(srcColor, rx, ry, pDst); + } + } + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// LoadMacroTile - Loads a macro tile which consists of raster tiles. +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct LoadMacroTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Load a macro tile from the source surface into the hot tile. + /// @param pSrcSurface - Source surface state
+ /// @param pDstHotTile - Pointer to destination hot tile + /// @param x, y - Coordinates to macro tile + static void Load( + SWR_SURFACE_STATE* pSrcSurface, + uint8_t *pDstHotTile, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + // Load each raster tile from the source surface into the hot tile. + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t sampleNum = 0; sampleNum < pSrcSurface->numSamples; sampleNum++) + { + LoadRasterTile<TTraits, SrcFormat, DstFormat>::Load(pSrcSurface, pDstHotTile, + (x + col), (y + row), sampleNum, renderTargetArrayIndex); + pDstHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<DstFormat>::bpp / 8); + } + } + } + } +}; + +static void BUCKETS_START(UINT id) +{ +#ifdef KNOB_ENABLE_RDTSC + gBucketMgr.StartBucket(id); +#endif +} + +static void BUCKETS_STOP(UINT id) +{ +#ifdef KNOB_ENABLE_RDTSC + gBucketMgr.StopBucket(id); +#endif +} + +// on demand buckets for load tiles +static std::vector<int> sBuckets(NUM_SWR_FORMATS, -1); +static std::mutex sBucketMutex; + +////////////////////////////////////////////////////////////////////////// +/// @brief Loads a full hottile from a render surface +/// @param pSrcSurface - Source surface state +/// @param dstFormat - Format for hot tile. +/// @param renderTargetIndex - Index to src render target +/// @param x, y - Coordinates to macro tile. +/// @param renderTargetArrayIndex - Render target array slice +/// @param pDstHotTile - Pointer to Hot Tile +void LoadHotTile( + SWR_SURFACE_STATE *pSrcSurface, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, + uint8_t *pDstHotTile) +{ + PFN_LOAD_TILES pfnLoadTiles = NULL; + + // don't need to load null surfaces + if (pSrcSurface->type == SURFACE_NULL) + { + return; + } + + // force 0 if requested renderTargetArrayIndex is OOB + if (renderTargetArrayIndex >= pSrcSurface->depth) + { + renderTargetArrayIndex = 0; + } + + if (renderTargetIndex < SWR_ATTACHMENT_DEPTH) + { + switch (pSrcSurface->tileMode) + { + case SWR_TILE_NONE: + pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_NONE[pSrcSurface->format]; + break; + case SWR_TILE_MODE_YMAJOR: + pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; + break; + case SWR_TILE_MODE_XMAJOR: + pfnLoadTiles = sLoadTilesColorTable_SWR_TILE_MODE_XMAJOR[pSrcSurface->format]; + break; + case SWR_TILE_MODE_WMAJOR: + SWR_ASSERT(pSrcSurface->format == R8_UINT); + pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load; + break; + default: + SWR_ASSERT(0, "Unsupported tiling mode"); + break; + } + } + else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) + { + // Currently depth can map to linear and tile-y.
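As a worked example of the simd-tile indexing used by SetSwizzledDstColor above (editorial sketch; the knob values KNOB_TILE_X_DIM = 8, SIMD_TILE_X_DIM = 4, SIMD_TILE_Y_DIM = 2 are assumed, matching an 8-wide AVX simd), pixel (5, 3) of an 8x8 raster tile lands in the fourth 4x2 simd tile, lane 5:

#include <cassert>
#include <cstdint>

int main()
{
    const uint32_t KNOB_TILE_X_DIM = 8, SIMD_TILE_X_DIM = 4, SIMD_TILE_Y_DIM = 2;
    uint32_t x = 5, y = 3; // pixel coordinates inside the 8x8 raster tile
    uint32_t simdIndex  = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM)
                        + (x / SIMD_TILE_X_DIM);
    uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM
                        + (x % SIMD_TILE_X_DIM);
    assert(simdIndex == 3 && simdOffset == 5); // simd tile #3 (0-based), lane 5
    return 0;
}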
+ switch (pSrcSurface->tileMode) + { + case SWR_TILE_NONE: + pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_NONE[pSrcSurface->format]; + break; + case SWR_TILE_MODE_YMAJOR: + pfnLoadTiles = sLoadTilesDepthTable_SWR_TILE_MODE_YMAJOR[pSrcSurface->format]; + break; + default: + SWR_ASSERT(0, "Unsupported tiling mode"); + break; + } + } + else + { + SWR_ASSERT(renderTargetIndex == SWR_ATTACHMENT_STENCIL); + SWR_ASSERT(pSrcSurface->format == R8_UINT); + switch (pSrcSurface->tileMode) + { + case SWR_TILE_NONE: + pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_NONE, 8>, R8_UINT, R8_UINT>::Load; + break; + case SWR_TILE_MODE_WMAJOR: + pfnLoadTiles = LoadMacroTile<TilingTraits<SWR_TILE_MODE_WMAJOR, 8>, R8_UINT, R8_UINT>::Load; + break; + default: + SWR_ASSERT(0, "Unsupported tiling mode"); + break; + } + } + + if (pfnLoadTiles == nullptr) + { + SWR_ASSERT(false, "Unsupported format for load tile"); + return; + } + + // Load a macro tile. +#ifdef KNOB_ENABLE_RDTSC + if (sBuckets[pSrcSurface->format] == -1) + { + // guard sBuckets update since load tiles is called by multiple threads + sBucketMutex.lock(); + if (sBuckets[pSrcSurface->format] == -1) + { + const SWR_FORMAT_INFO& info = GetFormatInfo(pSrcSurface->format); + BUCKET_DESC desc{ info.name, "", false, 0xffffffff }; + sBuckets[pSrcSurface->format] = gBucketMgr.RegisterBucket(desc); + } + sBucketMutex.unlock(); + } +#endif + + BUCKETS_START(sBuckets[pSrcSurface->format]); + pfnLoadTiles(pSrcSurface, pDstHotTile, x, y, renderTargetArrayIndex); + BUCKETS_STOP(sBuckets[pSrcSurface->format]); +} + +////////////////////////////////////////////////////////////////////////// +/// INIT_LOAD_TILES_COLOR_TABLE - Helper macro for setting up the tables. +#define INIT_LOAD_TILES_COLOR_TABLE(tilemode) \ + memset(sLoadTilesColorTable_##tilemode, 0, sizeof(sLoadTilesColorTable_##tilemode)); \ + \ + sLoadTilesColorTable_##tilemode[R32G32B32A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32A32_SINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32A32_UINT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32A32_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32X32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 128>, R32G32B32X32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32_SINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32B32_UINT] = LoadMacroTile<TilingTraits<tilemode, 96>, R32G32B32_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16A16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>,
R16G16B16A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32_SINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32G32_UINT] = LoadMacroTile<TilingTraits<tilemode, 64>, R32G32_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16X16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16X16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 64>, R16G16B16X16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8A8_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8A8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R16G16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R11G11B10_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R11G11B10_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R32_UINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_UINT, R32G32B32A32_FLOAT>::Load; \ + 
sLoadTilesColorTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, A32_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B8G8R8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, B8G8R8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8X8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 32>, R8G8B8X8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10X2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10X2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G6R5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G6R5_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G6R5_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5A1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5A1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B4G4R4A4_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B4G4R4A4_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R8G8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_SINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_UINT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 16>, A16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B5G5R5X1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 16>, B5G5R5X1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + 
sLoadTilesColorTable_##tilemode[R8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8_SINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8_UINT] = LoadMacroTile<TilingTraits<tilemode, 8>, R8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[A8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 8>, A8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC1_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC2_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC3_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC4_UNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC5_UNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC1_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 64>, BC1_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC2_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC2_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC3_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 128>, BC3_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_UNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_SNORM] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC4_SNORM] = LoadMacroTile<TilingTraits<tilemode, 64>, BC4_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[BC5_SNORM] = LoadMacroTile<TilingTraits<tilemode, 128>, BC5_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_FLOAT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_SNORM] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_UNORM_SRGB] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UNORM_SRGB, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_UINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R16G16B16_SINT] = LoadMacroTile<TilingTraits<tilemode, 48>, R16G16B16_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R10G10B10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, R10G10B10A2_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_SNORM] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SNORM, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_UINT] = 
LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[B10G10R10A2_SINT] = LoadMacroTile<TilingTraits<tilemode, 32>, B10G10R10A2_SINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_UINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_UINT, R32G32B32A32_FLOAT>::Load; \ + sLoadTilesColorTable_##tilemode[R8G8B8_SINT] = LoadMacroTile<TilingTraits<tilemode, 24>, R8G8B8_SINT, R32G32B32A32_FLOAT>::Load; \ + +////////////////////////////////////////////////////////////////////////// +/// INIT_LOAD_TILES_DEPTH_TABLE - Helper macro for setting up the tables. +#define INIT_LOAD_TILES_DEPTH_TABLE(tilemode) \ + memset(sLoadTilesDepthTable_##tilemode, 0, sizeof(sLoadTilesDepthTable_##tilemode)); \ + \ + sLoadTilesDepthTable_##tilemode[R16_UNORM] = LoadMacroTile<TilingTraits<tilemode, 16>, R16_UNORM, R32_FLOAT>::Load; \ + sLoadTilesDepthTable_##tilemode[R32_FLOAT] = LoadMacroTile<TilingTraits<tilemode, 32>, R32_FLOAT, R32_FLOAT>::Load; \ + sLoadTilesDepthTable_##tilemode[R24_UNORM_X8_TYPELESS] = LoadMacroTile<TilingTraits<tilemode, 32>, R24_UNORM_X8_TYPELESS, R32_FLOAT>::Load; \ + +////////////////////////////////////////////////////////////////////////// +/// @brief Sets up tables for LoadTile +void InitSimLoadTilesTable() +{ + INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_NONE); + INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_NONE); + + INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_YMAJOR); + INIT_LOAD_TILES_COLOR_TABLE(SWR_TILE_MODE_XMAJOR); + + INIT_LOAD_TILES_DEPTH_TABLE(SWR_TILE_MODE_YMAJOR); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp new file mode 100644 index 00000000000..9ed1d0bd0ec --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.cpp @@ -0,0 +1,1717 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file StoreTile.cpp +* +* @brief Functionality for Store.
+* +******************************************************************************/ +#include "common/os.h" +#include "common/formats.h" +#include "core/context.h" +#include "core/rdtsc_core.h" +#include "core/format_conversion.h" + +#include "memory/TilingFunctions.h" +#include "memory/tilingtraits.h" +#include "memory/Convert.h" +#include "core/multisample.h" + +#include <array> +#include <sstream> + +typedef void(*PFN_STORE_TILES)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t); + +////////////////////////////////////////////////////////////////////////// +/// Store Raster Tile Function Tables. +////////////////////////////////////////////////////////////////////////// +static PFN_STORE_TILES sStoreTilesTableColor[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; +static PFN_STORE_TILES sStoreTilesTableDepth[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; +static PFN_STORE_TILES sStoreTilesTableStencil[SWR_TILE_MODE_COUNT][NUM_SWR_FORMATS] = {}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <size_t PixelSize, size_t NumDests> +struct StorePixels +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) = delete; +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (8-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<8, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 4 bytes. + const uint16_t* pPixSrc = (const uint16_t*)pSrc; + + // Unswizzle from SWR-Z order + uint16_t* pRow = (uint16_t*)ppDsts[0]; + pRow[0] = pPixSrc[0]; + pRow[1] = pPixSrc[2]; + + pRow = (uint16_t*)ppDsts[1]; + pRow[0] = pPixSrc[1]; + pRow[1] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (16-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<16, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 8 bytes.
+ const uint32_t* pPixSrc = (const uint32_t*)pSrc; + + // Unswizzle from SWR-Z order + uint32_t* pRow = (uint32_t*)ppDsts[0]; + pRow[0] = pPixSrc[0]; + pRow[1] = pPixSrc[2]; + + pRow = (uint32_t*)ppDsts[1]; + pRow[0] = pPixSrc[1]; + pRow[1] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (32-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<32, 2> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2]) + { + // Each 4-pixel row is 16-bytes + __m128i *pZRow01 = (__m128i*)pSrc; + __m128i vQuad00 = _mm_load_si128(pZRow01); + __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); + + __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); + __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); + + _mm_storeu_si128((__m128i*)ppDsts[0], vRow00); + _mm_storeu_si128((__m128i*)ppDsts[1], vRow10); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (64-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<64, 4> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) + { + // Each 4-pixel row is 32 bytes. + const __m128i* pPixSrc = (const __m128i*)pSrc; + + // order of pointers matches SWR-Z layout + __m128i** pvDsts = (__m128i**)&ppDsts[0]; + *pvDsts[0] = pPixSrc[0]; + *pvDsts[1] = pPixSrc[1]; + *pvDsts[2] = pPixSrc[2]; + *pvDsts[3] = pPixSrc[3]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StorePixels (128-bit pixel specialization) +/// @brief Stores a 4x2 (AVX) raster-tile to two rows. +/// @param pSrc - Pointer to source raster tile in SWRZ pixel order +/// @param ppDsts - Array of destination pointers. Each pointer is +/// to a single row of at most 16B. +/// @tparam NumDests - Number of destination pointers. Each pair of +/// pointers is for a 16-byte column of two rows. +////////////////////////////////////////////////////////////////////////// +template <> +struct StorePixels<128, 8> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) + { + // Each 4-pixel row is 64 bytes.
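What the _mm_unpacklo/unpackhi_epi64 pair in the 32-bit specialization above accomplishes, emulated with scalars (editorial sketch, not part of the patch): SWR-Z order packs a 4x2 block as two 2x2 quads, so the low 64-bit halves of both quads form the top row and the high halves form the bottom row.

#include <cassert>
#include <cstdint>

int main()
{
    uint32_t src[8] = { 0, 1, 2, 3, 4, 5, 6, 7 }; // SWR-Z pixel order: two 2x2 quads
    uint32_t row0[4], row1[4];
    // unpacklo_epi64: low 64-bit half of each quad -> top row of the 4x2
    row0[0] = src[0]; row0[1] = src[1]; row0[2] = src[4]; row0[3] = src[5];
    // unpackhi_epi64: high 64-bit half of each quad -> bottom row
    row1[0] = src[2]; row1[1] = src[3]; row1[2] = src[6]; row1[3] = src[7];
    assert(row0[2] == 4 && row1[0] == 2); // quads interleave across the rows
    return 0;
}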
+ const __m128i* pPixSrc = (const __m128i*)pSrc; + + // Unswizzle from SWR-Z order + __m128i** pvDsts = (__m128i**)&ppDsts[0]; + *pvDsts[0] = pPixSrc[0]; + *pvDsts[1] = pPixSrc[2]; + *pvDsts[2] = pPixSrc[1]; + *pvDsts[3] = pPixSrc[3]; + *pvDsts[4] = pPixSrc[4]; + *pvDsts[5] = pPixSrc[6]; + *pvDsts[6] = pPixSrc[5]; + *pvDsts[7] = pPixSrc[7]; + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct ConvertPixelsSOAtoAOS +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SrcFormat --> DstFormat + simdvector src; + LoadSOA<SrcFormat>(pSrc, src); + StoreSOA<DstFormat>(src, soaTile); + + // Convert from SOA --> AOS + FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile); + + // Store data into destination + StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) +/// Specialization for no format conversion +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT Format> +struct ConvertPixelsSOAtoAOS<Format, Format> +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer. + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel + + OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Convert from SOA --> AOS + FormatTraits<Format>::TransposeT::Transpose(pSrc, aosTile); + + // Store data into destination + StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// ConvertPixelsSOAtoAOS - Specialization conversion for B5G6R5_UNORM +////////////////////////////////////////////////////////////////////////// +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Converts a SIMD from the Hot Tile to the destination format + /// and converts from SOA to AOS. + /// @param pSrc - Pointer to raster tile. + /// @param pDst - Pointer to destination surface or deswizzling buffer.
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
+ static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Load hot-tile
+ simdvector src, dst;
+ LoadSOA<SrcFormat>(pSrc, src);
+
+ // deswizzle
+ dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
+ dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
+ dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
+
+ // clamp
+ dst.x = Clamp<DstFormat>(dst.x, 0);
+ dst.y = Clamp<DstFormat>(dst.y, 1);
+ dst.z = Clamp<DstFormat>(dst.z, 2);
+
+ // normalize
+ dst.x = Normalize<DstFormat>(dst.x, 0);
+ dst.y = Normalize<DstFormat>(dst.y, 1);
+ dst.z = Normalize<DstFormat>(dst.z, 2);
+
+ // pack
+ simdscalari packed = _simd_castps_si(dst.x);
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.y), FormatTraits<DstFormat>::GetBPC(0)));
+ packed = _simd_or_si(packed, _simd_slli_epi32(_simd_castps_si(dst.z), FormatTraits<DstFormat>::GetBPC(0) +
+ FormatTraits<DstFormat>::GetBPC(1)));
+
+ // pack low 16 bits of each 32-bit lane into the low 128 bits of the dst tile
+ uint32_t *pPacked = (uint32_t*)&packed;
+ uint16_t *pAosTile = (uint16_t*)&aosTile[0];
+ for (uint32_t t = 0; t < KNOB_SIMD_WIDTH; ++t)
+ {
+ *pAosTile++ = *pPacked++;
+ }
+
+ // Store data into destination
+ StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// ConvertPixelsSOAtoAOS - Specialized conversion for R24_UNORM_X8_TYPELESS
+//////////////////////////////////////////////////////////////////////////
+template<>
+struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
+{
+ static const SWR_FORMAT SrcFormat = R32_FLOAT;
+ static const SWR_FORMAT DstFormat = R24_UNORM_X8_TYPELESS;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Converts a SIMD from the Hot Tile to the destination format
+ /// and converts from SOA to AOS.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param ppDsts - Array of destination pointers (rows of the destination
+ /// surface or a deswizzling buffer).
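+ /// @note Only the low 24 bits of each destination pixel are rewritten; the
+ /// X8 bits are preserved by the read-modify-write against a
+ /// 0x00FFFFFF mask in the body below.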
+ template <size_t NumDests>
+ INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
+ {
+ static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD(uint8_t) soaTile[MAX_RASTER_TILE_BYTES];
+ OSALIGNSIMD(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Convert from SrcFormat --> DstFormat
+ simdvector src;
+ LoadSOA<SrcFormat>(pSrc, src);
+ StoreSOA<DstFormat>(src, soaTile);
+
+ // Convert from SOA --> AOS
+ FormatTraits<DstFormat>::TransposeT::Transpose(soaTile, aosTile);
+
+ // Store data into destination but don't overwrite the X8 bits
+ // Each 4-pixel row is 16 bytes
+ __m128i *pZRow01 = (__m128i*)aosTile;
+ __m128i vQuad00 = _mm_load_si128(pZRow01);
+ __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+
+ __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
+ __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+
+ __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
+ __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
+
+ __m128i vMask = _mm_set1_epi32(0xFFFFFF);
+
+ vDst0 = _mm_andnot_si128(vMask, vDst0);
+ vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
+ vDst1 = _mm_andnot_si128(vMask, vDst1);
+ vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
+
+ _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
+ _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
+ }
+};
+
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
+{
+ static const uint32_t offset = sizeof(simdscalar);
+
+ // swizzle rgba -> bgra while we load
+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb
+ simdscalar vComp3 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(3))*offset)); // float32 aaaaaaaa
+
+ // clamp
+ vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps());
+ vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f));
+
+ vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps());
+ vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f));
+
+ vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps());
+ vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f));
+
+ vComp3 = _simd_max_ps(vComp3, _simd_setzero_ps());
+ vComp3 = _simd_min_ps(vComp3, _simd_set1_ps(1.0f));
+
+ if (FormatTraits<DstFormat>::isSRGB)
+ {
+ // Gamma-correct only rgb
+ vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0);
+ vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1);
+ vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2);
+ }
+
+ // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
+ vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+ vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+ vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+ vComp3 = _simd_mul_ps(vComp3, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(3)));
+
+ // moving to 8 wide integer vector types
+ __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr
+ __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg
+ __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb
+ __m256i src3 = _simd_cvtps_epi32(vComp3); // padded byte aaaaaaaa
+
+#if KNOB_ARCH == KNOB_ARCH_AVX
+
+ // splitting into two sets of 4 wide integer vector types
+ // because AVX doesn't have instructions to support this operation at 8 wide
+ __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+ __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+ __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+ __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
+
+ __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+ __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+ __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+ __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
+
+ srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
+ srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
+ srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
+ srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
+ srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
+ srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+ srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
+
+ srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+ srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
+
+ srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
+ srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
+
+ // unpack into rows that get the tiling order correct
+ __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
+ __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+
+ __m256i final = _mm256_castsi128_si256(vRow00);
+ final = _mm256_insertf128_si256(final, vRow10, 1);
+
+#elif KNOB_ARCH == KNOB_ARCH_AVX2
+
+ // logic is as above, only wider
+ src1 = _mm256_slli_si256(src1, 1);
+ src2 = _mm256_slli_si256(src2, 2);
+ src3 = _mm256_slli_si256(src3, 3);
+
+ src0 = _mm256_or_si256(src0, src1);
+ src2 = _mm256_or_si256(src2, src3);
+
+ __m256i final = _mm256_or_si256(src0, src2);
+
+ // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3
+ final = _mm256_permute4x64_epi64(final, 0xD8);
+
+#endif
+
+ _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
+}
+
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
+{
+ static const uint32_t offset = sizeof(simdscalar);
+
+ // swizzle rgba -> bgra while we load
+ simdscalar vComp0 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(0))*offset)); // float32 rrrrrrrr
+ simdscalar vComp1 = _simd_load_ps((const float*)(pSrc + (FormatTraits<DstFormat>::swizzle(1))*offset)); // float32 gggggggg
+ simdscalar vComp2 = _simd_load_ps((const float*)(pSrc +
(FormatTraits<DstFormat>::swizzle(2))*offset)); // float32 bbbbbbbb + // clamp + vComp0 = _simd_max_ps(vComp0, _simd_setzero_ps()); + vComp0 = _simd_min_ps(vComp0, _simd_set1_ps(1.0f)); + + vComp1 = _simd_max_ps(vComp1, _simd_setzero_ps()); + vComp1 = _simd_min_ps(vComp1, _simd_set1_ps(1.0f)); + + vComp2 = _simd_max_ps(vComp2, _simd_setzero_ps()); + vComp2 = _simd_min_ps(vComp2, _simd_set1_ps(1.0f)); + + if (FormatTraits<DstFormat>::isSRGB) + { + // Gamma-correct only rgb + vComp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, vComp0); + vComp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, vComp1); + vComp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, vComp2); + } + + // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format + vComp0 = _simd_mul_ps(vComp0, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); + vComp1 = _simd_mul_ps(vComp1, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); + vComp2 = _simd_mul_ps(vComp2, _simd_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); + + // moving to 8 wide integer vector types + __m256i src0 = _simd_cvtps_epi32(vComp0); // padded byte rrrrrrrr + __m256i src1 = _simd_cvtps_epi32(vComp1); // padded byte gggggggg + __m256i src2 = _simd_cvtps_epi32(vComp2); // padded byte bbbbbbbb + +#if KNOB_ARCH == KNOB_ARCH_AVX + + // splitting into two sets of 4 wide integer vector types + // because AVX doesn't have instructions to support this operation at 8 wide + __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r + __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g + __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b + + __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r + __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g + __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b + + srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 + srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 + srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 + srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 + + srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr + + srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr + + srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr + srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr + + // unpack into rows that get the tiling order correct + __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr + __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); + + __m256i final = _mm256_castsi128_si256(vRow00); + final = _mm256_insertf128_si256(final, vRow10, 1); + +#elif KNOB_ARCH == KNOB_ARCH_AVX2 + + // logic is as above, only wider + src1 = _mm256_slli_si256(src1, 1); + src2 = _mm256_slli_si256(src2, 2); + + src0 = _mm256_or_si256(src0, src1); + + __m256i final = _mm256_or_si256(src0, src2); + + // adjust the data to get the tiling order correct 0 1 2 3 -> 0 2 1 3 + final = _mm256_permute4x64_epi64(final, 0xD8); + +#endif + + _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); +} + +template<> +struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, 
uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<R8G8B8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvert<R8G8B8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +template<> +struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > +{ + template <size_t NumDests> + INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) + { + FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreRasterTile +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreRasterTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Retrieve color from hot tile source which is always float. + /// @param pSrc - Pointer to raster tile. + /// @param x, y - Coordinates to raster tile. + /// @param output - output color + INLINE static void GetSwizzledSrcColor( + uint8_t* pSrc, + uint32_t x, uint32_t y, + float outputColor[4]) + { + typedef SimdTile<SrcFormat, DstFormat> SimdT; + + SimdT* pSrcSimdTiles = (SimdT*)pSrc; + + // Compute which simd tile we're accessing within 8x8 tile. + // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. + uint32_t simdIndex = (y / SIMD_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD_TILE_X_DIM) + (x / SIMD_TILE_X_DIM); + + SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; + + uint32_t simdOffset = (y % SIMD_TILE_Y_DIM) * SIMD_TILE_X_DIM + (x % SIMD_TILE_X_DIM); + + pSimdTile->GetSwizzledColor(simdOffset, outputColor); + } + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) // (x, y) pixel coordinate to start of raster tile. 
+ { + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + + // For each raster tile pixel (rx, ry) + for (uint32_t ry = 0; ry < KNOB_TILE_Y_DIM; ++ry) + { + for (uint32_t rx = 0; rx < KNOB_TILE_X_DIM; ++rx) + { + // Perform bounds checking. + if (((x + rx) < lodWidth) && + ((y + ry) < lodHeight)) + { + float srcColor[4]; + GetSwizzledSrcColor(pSrc, rx, ry, srcColor); + + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false>((x + rx), (y + ry), + pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, + sampleNum, pDstSurface->lod, pDstSurface); + ConvertPixelFromFloat<DstFormat>(pDst, srcColor); + } + } + } + } +}; + +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile : StoreRasterTile<TTraits, SrcFormat, DstFormat> +{}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 8bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. 
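+ // Each Convert() consumes one SIMD tile (KNOB_SIMD_WIDTH pixels laid out as
+ // two rows of KNOB_SIMD_WIDTH / 2), which is why each row pointer advances
+ // by half a SIMD's worth of destination bytes per column iteration.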
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+ }
+
+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 16bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat >
+{
+ typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores an 8x8 raster tile to the destination surface.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to raster tile.
+ INLINE static void Store(
+ uint8_t *pSrc,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+ {
+ // Punt non-full tiles to generic store
+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+ if (x + KNOB_TILE_X_DIM > lodWidth ||
+ y + KNOB_TILE_Y_DIM > lodHeight)
+ {
+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+ }
+
+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
+
+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+ {
+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
+
+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+ {
+ // Format conversion and convert from SOA to AOS, and store the rows.
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+ }
+
+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 32bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat >
+{
+ typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores an 8x8 raster tile to the destination surface.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to raster tile.
+ INLINE static void Store(
+ uint8_t *pSrc,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
+ {
+ // Punt non-full tiles to generic store
+ uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
+ uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
+ if (x + KNOB_TILE_X_DIM > lodWidth ||
+ y + KNOB_TILE_Y_DIM > lodHeight)
+ {
+ return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
+ }
+
+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+ uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
+
+ for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
+ {
+ uint8_t* ppStartRows[] = { ppRows[0], ppRows[1] };
+
+ for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col)
+ {
+ // Format conversion and convert from SOA to AOS, and store the rows.
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
+
+ ppRows[0] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ ppRows[1] += KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+ pSrc += SRC_BYTES_PER_PIXEL * KNOB_SIMD_WIDTH;
+ }
+
+ ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
+ ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
+ }
+ }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat >
+{
+ typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t MAX_DST_COLUMN_BYTES = 16;
+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+ static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores an 8x8 raster tile to the destination surface.
+ /// @param pSrc - Pointer to raster tile.
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to raster tile.
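+ /// @note At 64bpp a SIMD row of 4 pixels spans 32B, twice MAX_DST_COLUMN_BYTES,
+ /// so each of the two destination rows needs two column pointers (4 total).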
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + }; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + uint8_t* ppStartRows[] = + { + ppDsts[0], + ppDsts[1], + ppDsts[2], + ppDsts[3], + }; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ppDsts[0] = ppStartRows[0] + 2 * pDstSurface->pitch; + ppDsts[1] = ppStartRows[1] + 2 * pDstSurface->pitch; + ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; + ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 128bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. 
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + struct DstPtrs + { + uint8_t* ppDsts[8]; + } ptrs; + + // Need 8 pointers, 4 columns of 2 rows each + for (uint32_t y = 0; y < 2; ++y) + { + for (uint32_t x = 0; x < 4; ++x) + { + ptrs.ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; + } + } + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + DstPtrs startPtrs = ptrs; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); + + ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * pDstSurface->pitch; + ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * pDstSurface->pitch; + ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * pDstSurface->pitch; + ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * pDstSurface->pitch; + ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * pDstSurface->pitch; + ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * pDstSurface->pitch; + ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; + ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 8bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. 
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestRowWidthBytes / 4; + ppDsts[1] += DestRowWidthBytes / 4; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 16bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. 
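+ // For 16bpp a 4-pixel SIMD row covers only 8B, so both SIMD tiles of a raster
+ // row land in the same 16B TileY column; the second Convert below writes at a
+ // DestRowWidthBytes / 2 offset instead of stepping to the next column.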
+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestRowWidthBytes / 2; + ppDsts[1] += DestRowWidthBytes / 2; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_XMAJOR specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 512; // 512B rows + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* pRow1 = pRow0 + DestRowWidthBytes; + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_TILE_X_DIM; col += SIMD_TILE_X_DIM) + { + uint32_t xRowOffset = col * (FormatTraits<DstFormat>::bpp / 8); + + uint8_t* ppDsts[] = { pRow0 + xRowOffset, pRow1 + xRowOffset }; + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. 
+ pSrc += (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + } + + pRow0 += (DestRowWidthBytes * 2); + pRow1 += (DestRowWidthBytes * 2); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 32bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + + uint8_t* pRow = pCol0 + rowOffset; + uint8_t* ppDsts[] = { pRow, pRow + DestRowWidthBytes }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestColumnBytes; + ppDsts[1] += DestColumnBytes; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - TILE_MODE_YMAJOR specialization for 64bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. 
+ /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. + INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. + + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. + // We can compute the offsets to each column within the raster tile once and increment from these. + // There will be 2 x 4-wide columns in an 8x8 raster tile. + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + uint8_t* pCol1 = pCol0 + DestColumnBytes; + + // There are 4 columns, each 2 pixels wide when we have 64bpp pixels. + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. + uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD_TILE_Y_DIM) + { + uint32_t rowOffset = row * DestRowWidthBytes; + uint8_t* ppDsts[] = + { + pCol0 + rowOffset, + pCol0 + rowOffset + DestRowWidthBytes, + pCol1 + rowOffset, + pCol1 + rowOffset + DestRowWidthBytes, + }; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + + ppDsts[0] += DestColumnBytes * 2; + ppDsts[1] += DestColumnBytes * 2; + ppDsts[2] += DestColumnBytes * 2; + ppDsts[3] += DestColumnBytes * 2; + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + pSrc += pSrcInc; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// OptStoreRasterTile - SWR_TILE_MODE_YMAJOR specialization for 128bpp +////////////////////////////////////////////////////////////////////////// +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat > +{ + typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; + + static const size_t TILE_Y_COL_WIDTH_BYTES = 16; + static const size_t TILE_Y_ROWS = 32; + static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; + + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; + static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; + + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores an 8x8 raster tile to the destination surface. + /// @param pSrc - Pointer to raster tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to raster tile. 
+ INLINE static void Store( + uint8_t *pSrc, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) + { + // Punt non-full tiles to generic store + uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); + uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); + if (x + KNOB_TILE_X_DIM > lodWidth || + y + KNOB_TILE_Y_DIM > lodHeight) + { + return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); + } + + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + struct DstPtrs + { + uint8_t* ppDsts[8]; + } ptrs; + + // Need 8 pointers, 4 columns of 2 rows each + for (uint32_t y = 0; y < 2; ++y) + { + for (uint32_t x = 0; x < 4; ++x) + { + ptrs.ppDsts[x * 2 + y] = pDst + y * TILE_Y_COL_WIDTH_BYTES + x * TILE_Y_COL_BYTES; + } + } + + for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) + { + DstPtrs startPtrs = ptrs; + + for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD_TILE_X_DIM; ++col) + { + // Format conversion and convert from SOA to AOS, and store the rows. + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ptrs.ppDsts); + + ptrs.ppDsts[0] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[1] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[2] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[3] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[4] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[5] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[6] += DST_COLUMN_BYTES_PER_SRC; + ptrs.ppDsts[7] += DST_COLUMN_BYTES_PER_SRC; + pSrc += SRC_COLUMN_BYTES; + } + + ptrs.ppDsts[0] = startPtrs.ppDsts[0] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[1] = startPtrs.ppDsts[1] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[2] = startPtrs.ppDsts[2] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[3] = startPtrs.ppDsts[3] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[4] = startPtrs.ppDsts[4] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[5] = startPtrs.ppDsts[5] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; + ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; + } + } +}; + +////////////////////////////////////////////////////////////////////////// +/// StoreMacroTile - Stores a macro tile which consists of raster tiles. +////////////////////////////////////////////////////////////////////////// +template<typename TTraits, SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +struct StoreMacroTile +{ + ////////////////////////////////////////////////////////////////////////// + /// @brief Stores a macrotile to the destination surface using safe implementation. + /// @param pSrc - Pointer to macro tile. + /// @param pDstSurface - Destination surface state + /// @param x, y - Coordinates to macro tile + static void StoreGeneric( + uint8_t *pSrcHotTile, + SWR_SURFACE_STATE* pDstSurface, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) + { + // Store each raster tile from the hot tile to the destination surface. 
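+ // Raster tiles (and each sample's copy of them) are contiguous in the hot
+ // tile, so pSrcHotTile advances linearly while (x + col, y + row) walks the
+ // macro tile in raster order.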
+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
+ {
+ StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store(pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum,
+ renderTargetArrayIndex);
+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
+ }
+ }
+ }
+ }
+
+ typedef void(*PFN_STORE_TILES_INTERNAL)(uint8_t*, SWR_SURFACE_STATE*, uint32_t, uint32_t, uint32_t, uint32_t);
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief Stores a macrotile to the destination surface.
+ /// @param pSrcHotTile - Pointer to macro tile (hot tile).
+ /// @param pDstSurface - Destination surface state
+ /// @param x, y - Coordinates to macro tile
+ static void Store(
+ uint8_t *pSrcHotTile,
+ SWR_SURFACE_STATE* pDstSurface,
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
+ {
+ PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
+ {
+ size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false>(
+ 0,
+ 0,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, // z for 3D surfaces
+ pDstSurface->arrayIndex + renderTargetArrayIndex, // array index for 2D arrays
+ sampleNum,
+ pDstSurface->lod,
+ pDstSurface);
+
+ // Force the generic (safe) store-tile path when a tiled LOD surface doesn't start
+ // on a 4KB page boundary, or when the surface uses interleaved samples.
+ bool bForceGeneric = ((pDstSurface->tileMode != SWR_TILE_NONE) && (0 != (dstSurfAddress & 0xfff))) || (pDstSurface->bInterleavedSamples);
+
+ pfnStore[sampleNum] = (bForceGeneric || KNOB_USE_GENERIC_STORETILE) ? StoreRasterTile<TTraits, SrcFormat, DstFormat>::Store : OptStoreRasterTile<TTraits, SrcFormat, DstFormat>::Store;
+ }
+
+ // Store each raster tile from the hot tile to the destination surface.
+ for(uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for(uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ for(uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
+ {
+ pfnStore[sampleNum](pSrcHotTile, pDstSurface, (x + col), (y + row), sampleNum, renderTargetArrayIndex);
+ pSrcHotTile += KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * (FormatTraits<SrcFormat>::bpp / 8);
+ }
+ }
+ }
+ }
+};
+
+static void BUCKETS_START(UINT id)
+{
+#ifdef KNOB_ENABLE_RDTSC
+ gBucketMgr.StartBucket(id);
+#endif
+}
+
+static void BUCKETS_STOP(UINT id)
+{
+#ifdef KNOB_ENABLE_RDTSC
+ gBucketMgr.StopBucket(id);
+#endif
+}
+
+// on demand buckets for store tiles
+static std::mutex sBucketMutex;
+static std::vector<int32_t> sBuckets(NUM_SWR_FORMATS, -1);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Deswizzles and stores a full hottile to a render surface
+/// @param pDstSurface - Destination surface state
+/// @param srcFormat - Format for hot tile.
+/// @param renderTargetIndex - Index to destination render target
+/// @param x, y - Coordinates to raster tile.
+/// @param renderTargetArrayIndex - Render target array slice
+/// @param pSrcHotTile - Pointer to Hot Tile +void StoreHotTile( + SWR_SURFACE_STATE *pDstSurface, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, + uint8_t *pSrcHotTile) +{ + if (pDstSurface->type == SURFACE_NULL) + { + return; + } + + // force 0 if requested renderTargetArrayIndex is OOB + if (renderTargetArrayIndex >= pDstSurface->depth) + { + renderTargetArrayIndex = 0; + } + + PFN_STORE_TILES pfnStoreTiles = nullptr; + + if ((renderTargetIndex <= SWR_ATTACHMENT_COLOR7) && (pDstSurface->tileMode != SWR_TILE_MODE_WMAJOR)) + { + pfnStoreTiles = sStoreTilesTableColor[pDstSurface->tileMode][pDstSurface->format]; + } + else if (renderTargetIndex == SWR_ATTACHMENT_DEPTH) + { + pfnStoreTiles = sStoreTilesTableDepth[pDstSurface->tileMode][pDstSurface->format]; + } + else + { + pfnStoreTiles = sStoreTilesTableStencil[pDstSurface->tileMode][pDstSurface->format]; + } + + if(nullptr == pfnStoreTiles) + { + SWR_ASSERT(false, "Invalid pixel format / tile mode for store tiles"); + return; + } + + // Store a macro tile +#ifdef KNOB_ENABLE_RDTSC + if (sBuckets[pDstSurface->format] == -1) + { + // guard sBuckets update since storetiles is called by multiple threads + sBucketMutex.lock(); + if (sBuckets[pDstSurface->format] == -1) + { + const SWR_FORMAT_INFO& info = GetFormatInfo(pDstSurface->format); + BUCKET_DESC desc{info.name, "", false, 0xffffffff}; + sBuckets[pDstSurface->format] = gBucketMgr.RegisterBucket(desc); + } + sBucketMutex.unlock(); + } +#endif + + BUCKETS_START(sBuckets[pDstSurface->format]); + pfnStoreTiles(pSrcHotTile, pDstSurface, x, y, renderTargetArrayIndex); + BUCKETS_STOP(sBuckets[pDstSurface->format]); +} + +////////////////////////////////////////////////////////////////////////// +/// InitStoreTilesTable - Helper for setting up the tables. 
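+/// Tables are indexed as table[tileMode][format]. Entries left null by the
+/// memsets in InitSimStoreTilesTable() cause StoreHotTile to assert rather
+/// than store through an unsupported format / tile-mode combination.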
+template <SWR_TILE_MODE TileModeT, size_t NumTileModesT, size_t ArraySizeT> +void InitStoreTilesTableColor( + PFN_STORE_TILES (&table)[NumTileModesT][ArraySizeT]) +{ + table[TileModeT][R32G32B32A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_FLOAT>::Store; + table[TileModeT][R32G32B32A32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_SINT>::Store; + table[TileModeT][R32G32B32A32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32A32_UINT>::Store; + table[TileModeT][R32G32B32X32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, R32G32B32X32_FLOAT>::Store; + table[TileModeT][R32G32B32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_FLOAT>::Store; + table[TileModeT][R32G32B32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_SINT>::Store; + table[TileModeT][R32G32B32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 96>, R32G32B32A32_FLOAT, R32G32B32_UINT>::Store; + table[TileModeT][R16G16B16A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UNORM>::Store; + table[TileModeT][R16G16B16A16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SNORM>::Store; + table[TileModeT][R16G16B16A16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_SINT>::Store; + table[TileModeT][R16G16B16A16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_UINT>::Store; + table[TileModeT][R16G16B16A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16A16_FLOAT>::Store; + table[TileModeT][R32G32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_FLOAT>::Store; + table[TileModeT][R32G32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_SINT>::Store; + table[TileModeT][R32G32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R32G32_UINT>::Store; + table[TileModeT][R16G16B16X16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_UNORM>::Store; + table[TileModeT][R16G16B16X16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, R16G16B16X16_FLOAT>::Store; + table[TileModeT][B8G8R8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM>::Store; + table[TileModeT][B8G8R8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][R10G10B10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM>::StoreGeneric; + table[TileModeT][R10G10B10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UNORM_SRGB>::StoreGeneric; + table[TileModeT][R10G10B10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_UINT>::StoreGeneric; + + table[TileModeT][R8G8B8A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM>::Store; + table[TileModeT][R8G8B8A8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8A8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SNORM>::Store; + table[TileModeT][R8G8B8A8_SINT] = 
StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_SINT>::Store; + table[TileModeT][R8G8B8A8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8A8_UINT>::Store; + table[TileModeT][R16G16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UNORM>::Store; + table[TileModeT][R16G16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SNORM>::Store; + table[TileModeT][R16G16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_SINT>::Store; + table[TileModeT][R16G16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_UINT>::Store; + table[TileModeT][R16G16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R16G16_FLOAT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B10G10R10A2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM>::StoreGeneric; + table[TileModeT][B10G10R10A2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UNORM_SRGB>::StoreGeneric; + table[TileModeT][R11G11B10_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R11G11B10_FLOAT>::StoreGeneric; + + table[TileModeT][R32_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_SINT>::Store; + table[TileModeT][R32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_UINT>::Store; + table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R32_FLOAT>::Store; + table[TileModeT][A32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, A32_FLOAT>::Store; + table[TileModeT][B8G8R8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM>::Store; + table[TileModeT][B8G8R8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8X8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM>::Store; + table[TileModeT][R8G8B8X8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B10G10R10X2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10X2_UNORM>::StoreGeneric; + table[TileModeT][B5G6R5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM>::Store; + table[TileModeT][B5G6R5_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G6R5_UNORM_SRGB>::StoreGeneric; + table[TileModeT][B5G5R5A1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM>::StoreGeneric; + table[TileModeT][B5G5R5A1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5A1_UNORM_SRGB>::StoreGeneric; + table[TileModeT][B4G4R4A4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM>::StoreGeneric; + table[TileModeT][B4G4R4A4_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B4G4R4A4_UNORM_SRGB>::StoreGeneric; + + table[TileModeT][R8G8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UNORM>::Store; + table[TileModeT][R8G8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SNORM>::Store; + 
table[TileModeT][R8G8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_SINT>::Store; + table[TileModeT][R8G8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R8G8_UINT>::Store; + table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UNORM>::Store; + table[TileModeT][R16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SNORM>::Store; + table[TileModeT][R16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_SINT>::Store; + table[TileModeT][R16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_UINT>::Store; + table[TileModeT][R16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, R16_FLOAT>::Store; + table[TileModeT][A16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_UNORM>::Store; + table[TileModeT][A16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, A16_FLOAT>::Store; + + // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now + table[TileModeT][B5G5R5X1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM>::StoreGeneric; + table[TileModeT][B5G5R5X1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32G32B32A32_FLOAT, B5G5R5X1_UNORM_SRGB>::StoreGeneric; + + table[TileModeT][R8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UNORM>::Store; + table[TileModeT][R8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SNORM>::Store; + table[TileModeT][R8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_SINT>::Store; + table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, R8_UINT>::Store; + table[TileModeT][A8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 8>, R32G32B32A32_FLOAT, A8_UNORM>::Store; + table[TileModeT][BC1_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM>::Store; + table[TileModeT][BC2_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM>::Store; + table[TileModeT][BC3_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM>::Store; + table[TileModeT][BC4_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_UNORM>::Store; + table[TileModeT][BC5_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_UNORM>::Store; + table[TileModeT][BC1_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC1_UNORM_SRGB>::Store; + table[TileModeT][BC2_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC2_UNORM_SRGB>::Store; + table[TileModeT][BC3_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC3_UNORM_SRGB>::Store; + table[TileModeT][R8G8B8_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM>::Store; + table[TileModeT][R8G8B8_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SNORM>::Store; + table[TileModeT][BC4_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 64>, R32G32B32A32_FLOAT, BC4_SNORM>::Store; + table[TileModeT][BC5_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 128>, R32G32B32A32_FLOAT, BC5_SNORM>::Store; + table[TileModeT][R16G16B16_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_FLOAT>::Store; + table[TileModeT][R16G16B16_UNORM] = 
StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UNORM>::Store;
+    table[TileModeT][R16G16B16_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SNORM>::Store;
+    table[TileModeT][R8G8B8_UNORM_SRGB] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UNORM_SRGB>::Store;
+    table[TileModeT][R16G16B16_UINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_UINT>::Store;
+    table[TileModeT][R16G16B16_SINT] = StoreMacroTile<TilingTraits<TileModeT, 48>, R32G32B32A32_FLOAT, R16G16B16_SINT>::Store;
+
+    // 101010_2, 565, 555_1, and 444_4 formats force generic store tile for now
+    table[TileModeT][R10G10B10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SNORM>::StoreGeneric;
+    table[TileModeT][R10G10B10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, R10G10B10A2_SINT>::StoreGeneric;
+    table[TileModeT][B10G10R10A2_SNORM] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SNORM>::StoreGeneric;
+    table[TileModeT][B10G10R10A2_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_UINT>::StoreGeneric;
+    table[TileModeT][B10G10R10A2_SINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32G32B32A32_FLOAT, B10G10R10A2_SINT>::StoreGeneric;
+
+    table[TileModeT][R8G8B8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_UINT>::Store;
+    table[TileModeT][R8G8B8_SINT] = StoreMacroTile<TilingTraits<TileModeT, 24>, R32G32B32A32_FLOAT, R8G8B8_SINT>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper template for setting up the depth entries of the
+///        StoreTile tables for one tile mode.
+template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
+void InitStoreTilesTableDepth(
+    PFN_STORE_TILES (&table)[NumTileModes][ArraySizeT])
+{
+    table[TileModeT][R32_FLOAT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R32_FLOAT>::Store;
+    table[TileModeT][R24_UNORM_X8_TYPELESS] = StoreMacroTile<TilingTraits<TileModeT, 32>, R32_FLOAT, R24_UNORM_X8_TYPELESS>::Store;
+    table[TileModeT][R16_UNORM] = StoreMacroTile<TilingTraits<TileModeT, 16>, R32_FLOAT, R16_UNORM>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Helper template for setting up the stencil entries of the
+///        StoreTile tables for one tile mode.
+template <SWR_TILE_MODE TileModeT, size_t NumTileModes, size_t ArraySizeT>
+void InitStoreTilesTableStencil(
+    PFN_STORE_TILES (&table)[NumTileModes][ArraySizeT])
+{
+    table[TileModeT][R32_UINT] = StoreMacroTile<TilingTraits<TileModeT, 32>, R8_UINT, R32_UINT>::Store;
+    table[TileModeT][R8_UINT] = StoreMacroTile<TilingTraits<TileModeT, 8>, R8_UINT, R8_UINT>::Store;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Sets up tables for StoreTile
+void InitSimStoreTilesTable()
+{
+    memset(sStoreTilesTableColor, 0, sizeof(sStoreTilesTableColor));
+    memset(sStoreTilesTableDepth, 0, sizeof(sStoreTilesTableDepth));
+    // zero the stencil table as well so unhandled mode/format pairs stay null
+    memset(sStoreTilesTableStencil, 0, sizeof(sStoreTilesTableStencil));
+
+    InitStoreTilesTableColor<SWR_TILE_NONE>(sStoreTilesTableColor);
+    InitStoreTilesTableDepth<SWR_TILE_NONE>(sStoreTilesTableDepth);
+    InitStoreTilesTableStencil<SWR_TILE_NONE>(sStoreTilesTableStencil);
+
+    InitStoreTilesTableColor<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableColor);
+    InitStoreTilesTableColor<SWR_TILE_MODE_XMAJOR>(sStoreTilesTableColor);
+
+    InitStoreTilesTableDepth<SWR_TILE_MODE_YMAJOR>(sStoreTilesTableDepth);
+    InitStoreTilesTableStencil<SWR_TILE_MODE_WMAJOR>(sStoreTilesTableStencil);
+}
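The tables above are filled once at init time so the hot path can dispatch a store on (tileMode, format) with a single indexed load and an indirect call. Purely as an illustration of that pattern, and using toy enums rather than SWR's actual PFN_STORE_TILES signature or format list, a minimal self-contained sketch:

    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    enum Mode { MODE_A, MODE_B, NUM_MODES };
    enum Fmt  { FMT_X, FMT_Y, NUM_FMTS };

    typedef void (*PFN_OP)(uint32_t);

    // One template instantiation per (mode, format) pair, mirroring
    // StoreMacroTile<TilingTraits<...>, ...>::Store in shape only.
    template <Mode M, Fmt F>
    struct Op
    {
        static void Apply(uint32_t v) { std::printf("mode %d fmt %d: %u\n", (int)M, (int)F, v); }
    };

    static PFN_OP sTable[NUM_MODES][NUM_FMTS];

    // Fills one row of the table, like InitStoreTilesTableColor<TileModeT>.
    template <Mode M>
    void InitRow(PFN_OP (&table)[NUM_MODES][NUM_FMTS])
    {
        table[M][FMT_X] = Op<M, FMT_X>::Apply;
        table[M][FMT_Y] = Op<M, FMT_Y>::Apply;
    }

    int main()
    {
        std::memset(sTable, 0, sizeof(sTable));
        InitRow<MODE_A>(sTable);
        InitRow<MODE_B>(sTable);
        sTable[MODE_B][FMT_Y](42); // hot-path dispatch: indexed load + indirect call
        return 0;
    }

The template helper keeps each (mode, format) combination a distinct, fully specialized instantiation, so the compiler can optimize each store path independently.

diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h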
b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
new file mode 100644
index 00000000000..a14f3bf3f7c
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
@@ -0,0 +1,581 @@
+/****************************************************************************
+* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file TilingFunctions.h
+*
+* @brief Tiling functions.
+*
+******************************************************************************/
+#pragma once
+
+#include "core/state.h"
+#include "core/format_traits.h"
+#include "memory/tilingtraits.h"
+
+#include <algorithm>
+
+#define MAX_NUM_LOD 15
+
+#define GFX_ALIGN(x, a) (((x) + ((a) - 1)) - (((x) + ((a) - 1)) & ((a) - 1))) // Alt implementation with bitwise not (~) has an issue when a uint32 align is used with a 64-bit value, since the ~'ed mask remains 32-bit.
+
+//////////////////////////////////////////////////////////////////////////
+/// SimdTile SSE(2x2), AVX(4x2), or AVX-512(4x4?)
+//////////////////////////////////////////////////////////////////////////
+template<SWR_FORMAT HotTileFormat, SWR_FORMAT SrcOrDstFormat>
+struct SimdTile
+{
+    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
+    float color[FormatTraits<HotTileFormat>::numComps][KNOB_SIMD_WIDTH];
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Retrieve color from simd.
+    /// @param index - linear index to color within simd.
+    /// @param outputColor - output color
+    INLINE void GetSwizzledColor(
+        uint32_t index,
+        float outputColor[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
+        {
+            outputColor[i] = this->color[FormatTraits<SrcOrDstFormat>::swizzle(i)][offset[index]];
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Set color within simd.
+    /// @param index - linear index to color within simd.
+    /// @param src - input color
+    INLINE void SetSwizzledColor(
+        uint32_t index,
+        const float src[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        // Only loop over the components needed for destination.
+        for (uint32_t i = 0; i < FormatTraits<SrcOrDstFormat>::numComps; ++i)
+        {
+            this->color[i][offset[index]] = src[i];
+        }
+    }
+};
+
+template<>
+struct SimdTile <R8_UINT,R8_UINT>
+{
+    // SimdTile is SOA (e.g. rrrrrrrr gggggggg bbbbbbbb aaaaaaaa )
+    uint8_t color[FormatTraits<R8_UINT>::numComps][KNOB_SIMD_WIDTH];
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Retrieve color from simd.
+    /// @param index - linear index to color within simd.
+    /// @param outputColor - output color
+    INLINE void GetSwizzledColor(
+        uint32_t index,
+        float outputColor[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
+        {
+            uint32_t src = this->color[FormatTraits<R8_UINT>::swizzle(i)][offset[index]];
+            outputColor[i] = *(float*)&src;
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Set color within simd.
+    /// @param index - linear index to color within simd.
+    /// @param src - input color
+    INLINE void SetSwizzledColor(
+        uint32_t index,
+        const float src[4])
+    {
+        // SOA pattern for 2x2 is a subset of 4x2.
+        //   0 1 4 5
+        //   2 3 6 7
+        // The offset converts pattern to linear
+#if (SIMD_TILE_X_DIM == 4)
+        static const uint32_t offset[] = { 0, 1, 4, 5, 2, 3, 6, 7 };
+#elif (SIMD_TILE_X_DIM == 2)
+        static const uint32_t offset[] = { 0, 1, 2, 3 };
+#endif
+
+        // Only loop over the components needed for destination.
+        for (uint32_t i = 0; i < FormatTraits<R8_UINT>::numComps; ++i)
+        {
+            this->color[i][offset[index]] = *(uint8_t*)&src[i];
+        }
+    }
+};
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes lod offset for 1D surface at specified lod.
+/// @param baseWidth - width of basemip (mip 0).
+/// @param hAlign - horizontal alignment per mip, in texels
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffset1D(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseWidth,
+    uint32_t hAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod == 0)
+    {
+        offset = 0;
+    }
+    else
+    {
+        uint32_t curWidth = baseWidth;
+        // translate mip width from pixels to blocks for block compressed formats
+        // @note hAlign is already in blocks for compressed formats so no need to convert
+        if (info.isBC) curWidth /= info.bcWidth;
+
+        offset = GFX_ALIGN(curWidth, hAlign);
+        for (uint32_t l = 1; l < lod; ++l)
+        {
+            curWidth = GFX_ALIGN(std::max<uint32_t>(curWidth >> 1, 1U), hAlign);
+            offset += curWidth;
+        }
+
+        if (info.isSubsampled)
+        {
+            offset /= info.bcWidth;
+        }
+    }
+}
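To make GFX_ALIGN and the mip-offset walk concrete, here is a small standalone check that mirrors (but does not reuse) the code above. It uses assumed values only: a baseWidth of 100 with hAlign 16 places lod1 at offset 112 and lod2 at 176, and the second pair of asserts demonstrates the 64-bit pitfall the comment on GFX_ALIGN warns about:

    #include <cassert>
    #include <cstdint>

    // Same rounding as GFX_ALIGN: round x up to a multiple of a.
    static uint64_t align_sub(uint64_t x, uint32_t a)
    {
        return ((x + (a - 1)) - ((x + (a - 1)) & (a - 1)));
    }

    // The rejected alternative: the 32-bit ~ mask silently truncates 64-bit x.
    static uint64_t align_not(uint64_t x, uint32_t a)
    {
        return (x + (a - 1)) & ~(a - 1);
    }

    int main()
    {
        assert(align_sub(13, 8) == 16);

        // Mip walk as in ComputeLODOffset1D: baseWidth = 100, hAlign = 16.
        // lod1 starts at align(100,16) = 112; lod2 at 112 + align(50,16) = 176.
        uint32_t offset = (uint32_t)align_sub(100, 16);
        uint32_t curWidth = (uint32_t)align_sub(100 >> 1, 16);
        assert(offset == 112 && offset + curWidth == 176);

        // 64-bit pitfall: ~(a-1) is evaluated in 32 bits and zero-extends,
        // wiping the upper half of x when the mask is applied.
        uint64_t big = 0x100000005ULL;
        assert(align_sub(big, 8) == 0x100000008ULL);
        assert(align_not(big, 8) == 0x8ULL);
        return 0;
    }

+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes x lod offset for 2D surface at specified lod.
+/// @param baseWidth - width of basemip (mip 0).
+/// @param hAlign - horizontal alignment per mip, in texels
+/// @param lod - lod index
+/// @param offset - output offset.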
+INLINE void ComputeLODOffsetX(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseWidth,
+    uint32_t hAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod < 2)
+    {
+        offset = 0;
+    }
+    else
+    {
+        uint32_t curWidth = baseWidth;
+        // convert mip width from pixels to blocks for block compressed formats
+        // @note hAlign is already in blocks for compressed formats so no need to convert
+        if (info.isBC) curWidth /= info.bcWidth;
+
+        curWidth = std::max<uint32_t>(curWidth >> 1, 1U);
+        curWidth = GFX_ALIGN(curWidth, hAlign);
+
+        if (info.isSubsampled)
+        {
+            curWidth /= info.bcWidth;
+        }
+
+        offset = curWidth;
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes y lod offset for 2D surface at specified lod.
+/// @param baseHeight - height of basemip (mip 0).
+/// @param vAlign - vertical alignment per mip, in rows
+/// @param lod - lod index
+/// @param offset - output offset.
+INLINE void ComputeLODOffsetY(
+    const SWR_FORMAT_INFO& info,
+    uint32_t baseHeight,
+    uint32_t vAlign,
+    uint32_t lod,
+    uint32_t &offset)
+{
+    if (lod == 0)
+    {
+        offset = 0;
+    }
+    else
+    {
+        offset = 0;
+        uint32_t mipHeight = baseHeight;
+
+        // translate mip height from pixels to blocks for block compressed formats
+        // @note VAlign is already in blocks for compressed formats so no need to convert
+        if (info.isBC) mipHeight /= info.bcHeight;
+
+        for (uint32_t l = 1; l <= lod; ++l)
+        {
+            uint32_t alignedMipHeight = GFX_ALIGN(mipHeight, vAlign);
+            // mip 2 is placed to the right of mip 1 rather than below it,
+            // so it contributes no additional rows
+            offset += ((l != 2) ? alignedMipHeight : 0);
+            mipHeight = std::max<uint32_t>(mipHeight >> 1, 1U);
+        }
+    }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 1D surface offset
+/// @param x - offset from start of array slice at given lod.
+/// @param array - array slice index
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output offset in bytes.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset1D(
+    uint32_t x,
+    uint32_t array,
+    uint32_t lod,
+    const SWR_SURFACE_STATE *pState,
+    uint32_t &xOffsetBytes)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+    uint32_t lodOffset;
+
+    if (UseCachedOffsets)
+    {
+        lodOffset = pState->lodOffsets[0][lod];
+    }
+    else
+    {
+        ComputeLODOffset1D(info, pState->width, pState->halign, lod, lodOffset);
+    }
+
+    xOffsetBytes = (array * pState->qpitch + lodOffset + x) * info.Bpp;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Adjusts the x,y coordinates (interleaved-sample tiling) or the
+///        array index (array-sliced samples) to address the requested sample.
+/// @param pState - surface state
+/// @param x - x coordinate, adjusted in place
+/// @param y - y coordinate, adjusted in place
+/// @param arrayIndex - array slice index, adjusted in place
+/// @param sampleNum - requested sample
+INLINE void AdjustCoordsForMSAA(const SWR_SURFACE_STATE *pState, uint32_t& x, uint32_t& y, uint32_t& arrayIndex, uint32_t sampleNum)
+{
+    /// @todo: might want to templatize adjusting for sample slices when we support tileYS/tileYF.
+    if((pState->tileMode == SWR_TILE_MODE_YMAJOR ||
+        pState->tileMode == SWR_TILE_MODE_WMAJOR) &&
+        pState->bInterleavedSamples)
+    {
+        uint32_t newX, newY, newSampleX, newSampleY;
+        switch(pState->numSamples)
+        {
+        case 1:
+            newX = x;
+            newY = y;
+            newSampleX = newSampleY = 0;
+            break;
+        case 2:
+        {
+            assert(pState->type == SURFACE_2D);
+            // pdep opens a hole at bit 1 of x; the sample bit is OR'd in there below
+            static const uint32_t xMask = 0xFFFFFFFD;
+            static const uint32_t sampleMaskX = 0x1;
+            newX = pdep_u32(x, xMask);
+            newY = y;
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = 0;
+        }
+        break;
+        case 4:
+        {
+            assert(pState->type == SURFACE_2D);
+            static const uint32_t mask = 0xFFFFFFFD;
+            static const uint32_t sampleMaskX = 0x1;
+            static const uint32_t sampleMaskY = 0x2;
+            newX = pdep_u32(x, mask);
+            newY = pdep_u32(y, mask);
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = pext_u32(sampleNum, sampleMaskY);
+        }
+        break;
+        case 8:
+        {
+            assert(pState->type == SURFACE_2D);
+            static const uint32_t xMask = 0xFFFFFFF9;
+            static const uint32_t yMask = 0xFFFFFFFD;
+            static const uint32_t sampleMaskX = 0x5;
+            static const uint32_t sampleMaskY = 0x2;
+            newX = pdep_u32(x, xMask);
+            newY = pdep_u32(y, yMask);
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = pext_u32(sampleNum, sampleMaskY);
+        }
+        break;
+        case 16:
+        {
+            assert(pState->type == SURFACE_2D);
+            static const uint32_t mask = 0xFFFFFFF9;
+            static const uint32_t sampleMaskX = 0x5;
+            static const uint32_t sampleMaskY = 0xA;
+            newX = pdep_u32(x, mask);
+            newY = pdep_u32(y, mask);
+            newSampleX = pext_u32(sampleNum, sampleMaskX);
+            newSampleY = pext_u32(sampleNum, sampleMaskY);
+        }
+        break;
+        default:
+            assert(0 && "Unsupported sample count");
+            newX = newY = 0;
+            newSampleX = newSampleY = 0;
+            break;
+        }
+        x = newX | (newSampleX << 1);
+        y = newY | (newSampleY << 1);
+    }
+    else if(pState->tileMode == SWR_TILE_MODE_YMAJOR ||
+            pState->tileMode == SWR_TILE_NONE)
+    {
+        uint32_t sampleShift;
+        switch(pState->numSamples)
+        {
+        case 1:
+            assert(sampleNum == 0);
+            sampleShift = 0;
+            break;
+        case 2:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 1;
+            break;
+        case 4:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 2;
+            break;
+        case 8:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 3;
+            break;
+        case 16:
+            assert(pState->type == SURFACE_2D);
+            sampleShift = 4;
+            break;
+        default:
+            assert(0 && "Unsupported sample count");
+            sampleShift = 0;
+            break;
+        }
+        arrayIndex = (arrayIndex << sampleShift) | sampleNum;
+    }
+}
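The masks above interleave sample bits into the texel coordinates: pdep scatters the coordinate bits around "holes" where the sample bits land, and pext pulls the sample bits out of sampleNum. A hedged sketch with a software pdep (mirroring the fallback in tilingtraits.h, added later in this patch; the real code uses the BMI2 instruction when available) for the 2x/4x mask 0xFFFFFFFD:

    #include <cassert>
    #include <cstdint>

    // Deposit a's low bits into the set bits of mask, low to high.
    static uint32_t pdep32(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        for (uint32_t bit = 1; bit; bit <<= 1)
            if (mask & bit) { if (a & 1) result |= bit; a >>= 1; }
        return result;
    }

    int main()
    {
        // Mask 0xFFFFFFFD keeps bit 1 clear as a hole for one sample bit.
        uint32_t x = 5;                        // 0b101
        uint32_t newX = pdep32(x, 0xFFFFFFFD); // 0b1001: x's bits skip over bit 1
        assert(newX == 9);
        uint32_t sampleX = 1;                  // sample bit extracted from sampleNum
        assert((newX | (sampleX << 1)) == 11); // final interleaved x coordinate
        return 0;
    }

+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 2D surface offset
+/// @param x - horizontal offset from start of array slice and lod.
+/// @param y - vertical offset from start of array slice and lod.
+/// @param array - array slice index
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output x offset in bytes.
+/// @param yOffsetRows - output y offset in rows.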
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset2D(uint32_t x, uint32_t y, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+    uint32_t lodOffsetX, lodOffsetY;
+
+    if (UseCachedOffsets)
+    {
+        lodOffsetX = pState->lodOffsets[0][lod];
+        lodOffsetY = pState->lodOffsets[1][lod];
+    }
+    else
+    {
+        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
+        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
+    }
+
+    AdjustCoordsForMSAA(pState, x, y, array, sampleNum);
+    xOffsetBytes = (x + lodOffsetX + pState->xOffset) * info.Bpp;
+    yOffsetRows = (array * pState->qpitch) + lodOffsetY + y + pState->yOffset;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes 3D surface offset
+/// @param x - horizontal offset from start of array slice and lod.
+/// @param y - vertical offset from start of array slice and lod.
+/// @param z - depth offset from start of array slice and lod.
+/// @param lod - lod index
+/// @param pState - surface state
+/// @param xOffsetBytes - output x offset in bytes.
+/// @param yOffsetRows - output y offset in rows.
+/// @param zOffsetSlices - output z offset in slices.
+template<bool UseCachedOffsets>
+INLINE void ComputeSurfaceOffset3D(uint32_t x, uint32_t y, uint32_t z, uint32_t lod, const SWR_SURFACE_STATE *pState, uint32_t &xOffsetBytes, uint32_t &yOffsetRows, uint32_t &zOffsetSlices)
+{
+    const SWR_FORMAT_INFO &info = GetFormatInfo(pState->format);
+    uint32_t lodOffsetX, lodOffsetY;
+
+    if (UseCachedOffsets)
+    {
+        lodOffsetX = pState->lodOffsets[0][lod];
+        lodOffsetY = pState->lodOffsets[1][lod];
+    }
+    else
+    {
+        ComputeLODOffsetX(info, pState->width, pState->halign, lod, lodOffsetX);
+        ComputeLODOffsetY(info, pState->height, pState->valign, lod, lodOffsetY);
+    }
+
+    xOffsetBytes = (x + lodOffsetX) * info.Bpp;
+    yOffsetRows = lodOffsetY + y;
+    zOffsetSlices = z;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+template<typename TTraits>
+INLINE uint32_t ComputeTileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
+{
+    return ComputeOffset2D<TTraits>(pState->pitch, xOffsetBytes, yOffsetRows);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param zOffsetSlices - z offset from base of surface in slices
+/// @param pState - pointer to the surface state
+template<typename TTraits>
+INLINE uint32_t ComputeTileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
+{
+    return ComputeOffset3D<TTraits>(pState->qpitch, pState->pitch, xOffsetBytes, yOffsetRows, zOffsetSlices);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param pState - pointer to the surface state
+INLINE
+uint32_t TileSwizzle2D(uint32_t xOffsetBytes, uint32_t yOffsetRows, const SWR_SURFACE_STATE *pState)
+{
+    switch (pState->tileMode)
+    {
+    case SWR_TILE_NONE: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_SWRZ: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_MODE_XMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_XMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, pState);
+    case SWR_TILE_MODE_WMAJOR: return ComputeTileSwizzle2D<TilingTraits<SWR_TILE_MODE_WMAJOR, 8> >(xOffsetBytes, yOffsetRows, pState);
+    default: SWR_ASSERT(0, "Unsupported tiling mode");
+    }
+    return (uint32_t) NULL;
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Swizzles the linear x,y,z offsets depending on surface tiling mode
+///        and returns final surface address
+/// @param xOffsetBytes - x offset from base of surface in bytes
+/// @param yOffsetRows - y offset from base of surface in rows
+/// @param zOffsetSlices - z offset from base of surface in slices
+/// @param pState - pointer to the surface state
+INLINE
+uint32_t TileSwizzle3D(uint32_t xOffsetBytes, uint32_t yOffsetRows, uint32_t zOffsetSlices, const SWR_SURFACE_STATE *pState)
+{
+    switch (pState->tileMode)
+    {
+    case SWR_TILE_NONE: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_NONE, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+    case SWR_TILE_SWRZ: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_SWRZ, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+    case SWR_TILE_MODE_YMAJOR: return ComputeTileSwizzle3D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(xOffsetBytes, yOffsetRows, zOffsetSlices, pState);
+    default: SWR_ASSERT(0, "Unsupported tiling mode");
+    }
+    return (uint32_t) NULL;
+}
+
+template<bool UseCachedOffsets>
+INLINE
+uint32_t ComputeSurfaceOffset(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState)
+{
+    uint32_t offsetX = 0, offsetY = 0, offsetZ = 0;
+    switch (pState->type)
+    {
+    case SURFACE_BUFFER:
+    case SURFACE_STRUCTURED_BUFFER:
+        offsetX = x * pState->pitch;
+        return offsetX;
+        break;
+    case SURFACE_1D:
+        ComputeSurfaceOffset1D<UseCachedOffsets>(x, array, lod, pState, offsetX);
+        return TileSwizzle2D(offsetX, 0, pState);
+        break;
+    case SURFACE_2D:
+        ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
+        return TileSwizzle2D(offsetX, offsetY, pState);
+    case SURFACE_3D:
+        ComputeSurfaceOffset3D<UseCachedOffsets>(x, y, z, lod, pState, offsetX, offsetY, offsetZ);
+        return TileSwizzle3D(offsetX, offsetY, offsetZ, pState);
+        break;
+    case SURFACE_CUBE:
+        ComputeSurfaceOffset2D<UseCachedOffsets>(x, y, array, sampleNum, lod, pState, offsetX, offsetY);
+        return TileSwizzle2D(offsetX, offsetY, pState);
+        break;
+    default: SWR_ASSERT(0, "Unsupported surface type");
+    }
+
+    return (uint32_t) NULL;
+}
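For SWR_TILE_NONE the swizzle degenerates to the familiar linear math: with Cu = Cv = 0 and zero pdep masks, ComputeOffset2D reduces to tileID = yOffsetRows * pitch + xOffsetBytes. A toy check of the SURFACE_2D path under assumed values (not SWR's structs), for lod 0, slice 0, no MSAA:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        const uint32_t pitch = 4096;     // bytes per row (assumed)
        const uint32_t Bpp = 4;          // e.g. an R8G8B8A8 format
        uint32_t x = 10, y = 7;          // texel coordinates
        uint32_t xOffsetBytes = x * Bpp; // no lod/array/xOffset terms in this toy case
        uint32_t yOffsetRows = y;
        // linear (untiled) final offset: row * pitch + byte offset within row
        assert(yOffsetRows * pitch + xOffsetBytes == 7u * 4096u + 40u);
        return 0;
    }

+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes surface address at the given location and lod
+/// @param x - x location in pixels
+/// @param y - y location in rows
+/// @param z - z location for 3D surfaces
+/// @param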
array - array slice for 1D and 2D surfaces +/// @param lod - level of detail +/// @param pState - pointer to the surface state +template<bool UseCachedOffsets> +INLINE +void* ComputeSurfaceAddress(uint32_t x, uint32_t y, uint32_t z, uint32_t array, uint32_t sampleNum, uint32_t lod, const SWR_SURFACE_STATE *pState) +{ + return pState->pBaseAddress + ComputeSurfaceOffset<UseCachedOffsets>(x, y, z, array, sampleNum, lod, pState); +} diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h new file mode 100644 index 00000000000..50f8e57c22a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h @@ -0,0 +1,263 @@ +/**************************************************************************** +* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file tilingtraits.h +* +* @brief Tiling traits. +* +******************************************************************************/ +#pragma once + +#include "core/state.h" + +template<SWR_TILE_MODE mode, int> +struct TilingTraits +{ + static const SWR_TILE_MODE TileMode{ mode }; + static UINT GetCu() { SWR_ASSERT(0); return 0; } + static UINT GetCv() { SWR_ASSERT(0); return 0; } + static UINT GetCr() { SWR_ASSERT(0); return 0; } + static UINT GetTileIDShift() { SWR_ASSERT(0); return 0; } + + /// @todo correct pdep shifts for all rastertile dims. Unused for now + static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } + static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } +}; + +template<int X> struct TilingTraits <SWR_TILE_NONE, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_NONE }; + static UINT GetCu() { return 0; } + static UINT GetCv() { return 0; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 0; } + static UINT GetPdepX() { return 0x00; } + static UINT GetPdepY() { return 0x00; } +}; + +template<> struct TilingTraits <SWR_TILE_SWRZ, 8> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; + static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT; } + static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT; } + + /// @todo correct pdep shifts for all rastertile dims. 
Unused for now + static UINT GetPdepX() { SWR_ASSERT(0); return 0x00; } + static UINT GetPdepY() { SWR_ASSERT(0); return 0x00; } +}; + +template<> struct TilingTraits <SWR_TILE_SWRZ, 32> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; + static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 2; } + static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 2; } + + static UINT GetPdepX() { return 0x37; } + static UINT GetPdepY() { return 0xC8; } +}; + +template<> struct TilingTraits <SWR_TILE_SWRZ, 128> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_SWRZ }; + static UINT GetCu() { return KNOB_TILE_X_DIM_SHIFT + 4; } + static UINT GetCv() { return KNOB_TILE_Y_DIM_SHIFT; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return KNOB_TILE_X_DIM_SHIFT + KNOB_TILE_Y_DIM_SHIFT + 4; } + + /// @todo correct pdep shifts for all rastertile dims. Unused for now + static UINT GetPdepX() { SWR_ASSERT(0); return 0x37; } + static UINT GetPdepY() { SWR_ASSERT(0); return 0xC8; } +}; + +// y-major tiling layout unaffected by element size +template<int X> struct TilingTraits <SWR_TILE_MODE_YMAJOR, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_YMAJOR }; + static UINT GetCu() { return 7; } + static UINT GetCv() { return 5; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 12; } + + static UINT GetPdepX() { return 0xe0f; } + static UINT GetPdepY() { return 0x1f0; } +}; + +// x-major tiling layout unaffected by element size +template<int X> struct TilingTraits <SWR_TILE_MODE_XMAJOR, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_XMAJOR }; + static UINT GetCu() { return 9; } + static UINT GetCv() { return 3; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 12; } + + static UINT GetPdepX() { return 0x1ff; } + static UINT GetPdepY() { return 0xe00; } +}; + +template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X> +{ + static const SWR_TILE_MODE TileMode{ SWR_TILE_MODE_WMAJOR }; + static UINT GetCu() { return 6; } + static UINT GetCv() { return 6; } + static UINT GetCr() { return 0; } + static UINT GetTileIDShift() { return 12; } + + static UINT GetPdepX() { return 0xe15; } + static UINT GetPdepY() { return 0x1ea; } +}; + +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. 
clear lowest bit
+        mask &= ~lowest;
+    }
+    return result;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the tileID for 2D tiled surfaces
+/// @param pitch - surface pitch in bytes
+/// @param tileX - x offset in tiles
+/// @param tileY - y offset in tiles
+template<typename TTraits>
+INLINE UINT ComputeTileOffset2D(UINT pitch, UINT tileX, UINT tileY)
+{
+    UINT tileID = tileY * (pitch >> TTraits::GetCu()) + tileX;
+    return tileID << TTraits::GetTileIDShift();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the tileID for 3D tiled surfaces
+/// @param qpitch - surface qpitch in rows
+/// @param pitch - surface pitch in bytes
+/// @param tileX - x offset in tiles
+/// @param tileY - y offset in tiles
+/// @param tileZ - z offset in tiles
+template<typename TTraits>
+INLINE UINT ComputeTileOffset3D(UINT qpitch, UINT pitch, UINT tileX, UINT tileY, UINT tileZ)
+{
+    UINT tileID = (tileZ * (qpitch >> TTraits::GetCv()) + tileY) * (pitch >> TTraits::GetCu()) + tileX;
+    return tileID << TTraits::GetTileIDShift();
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 2D tiled surfaces
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+template<typename TTraits>
+INLINE UINT ComputeOffset2D(UINT pitch, UINT x, UINT y)
+{
+    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
+    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
+    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
+    return (tileID | xSwizzle | ySwizzle);
+}
+
+#if KNOB_ARCH <= KNOB_ARCH_AVX
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 2D tiled surfaces. Specialization
+///        for tile-y surfaces that uses bit twiddling instead of pdep emulation.
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+template<>
+INLINE UINT ComputeOffset2D<TilingTraits<SWR_TILE_MODE_YMAJOR, 32> >(UINT pitch, UINT x, UINT y)
+{
+    typedef TilingTraits<SWR_TILE_MODE_YMAJOR, 32> TTraits;
+
+    UINT tileID = ComputeTileOffset2D<TTraits>(pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv());
+    UINT xSwizzle = ((x << 5) & 0xe00) | (x & 0xf);
+    UINT ySwizzle = (y << 4) & 0x1f0;
+    return (tileID | xSwizzle | ySwizzle);
+}
+#endif
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Computes the byte offset for 3D tiled surfaces
+/// @param qpitch - depth pitch in rows
+/// @param pitch - surface pitch in bytes
+/// @param x - x offset in bytes
+/// @param y - y offset in rows
+/// @param z - z offset in slices
+template<typename TTraits>
+INLINE UINT ComputeOffset3D(UINT qpitch, UINT pitch, UINT x, UINT y, UINT z)
+{
+    UINT tileID = ComputeTileOffset3D<TTraits>(qpitch, pitch, x >> TTraits::GetCu(), y >> TTraits::GetCv(), z >> TTraits::GetCr());
+    UINT xSwizzle = pdep_u32(x, TTraits::GetPdepX());
+    UINT ySwizzle = pdep_u32(y, TTraits::GetPdepY());
+    return (tileID | xSwizzle | ySwizzle);
+}
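The TileY specialization above replaces pdep emulation with fixed shifts and masks; the twiddling is exactly pdep with the masks from TilingTraits<SWR_TILE_MODE_YMAJOR, 32> (0xe0f for x, 0x1f0 for y). A small exhaustive check over one 128-byte x 32-row TileY tile, using a software pdep like the fallback above:

    #include <cassert>
    #include <cstdint>

    // Deposit a's low bits into the set bits of mask, low to high.
    static uint32_t pdep32(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        for (uint32_t bit = 1; bit; bit <<= 1)
            if (mask & bit) { if (a & 1) result |= bit; a >>= 1; }
        return result;
    }

    int main()
    {
        // Within one TileY tile: x in [0,128) bytes, y in [0,32) rows.
        for (uint32_t x = 0; x < 128; ++x)
            for (uint32_t y = 0; y < 32; ++y)
            {
                uint32_t xs = ((x << 5) & 0xe00) | (x & 0xf); // bit-twiddled fast path
                uint32_t ys = (y << 4) & 0x1f0;
                assert(xs == pdep32(x, 0xe0f));  // x bits 0-3 stay, bits 4-6 -> 9-11
                assert(ys == pdep32(y, 0x1f0));  // y bits 0-4 -> 4-8
            }
        return 0;
    }

On AVX2 builds the generic template already compiles down to two native pdep instructions, which is why the specialization is only enabled for KNOB_ARCH <= KNOB_ARCH_AVX.

diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
new file mode 100644
index 00000000000..44ab69815b1
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -0,0 +1,79 @@
+# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.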
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# Python source +from __future__ import print_function +import os +import sys +import knob_defs +from mako.template import Template +from mako.exceptions import RichTraceback + +def write_template_to_string(template_filename, **kwargs): + try: + template = Template(filename=template_filename) + # Split + Join fixes line-endings for whatever platform you are using + return '\n'.join(template.render(**kwargs).splitlines()) + except: + traceback = RichTraceback() + for (filename, lineno, function, line) in traceback.traceback: + print("File %s, line %s, in %s" % (filename, lineno, function)) + print(line, "\n") + print("%s: %s" % (str(traceback.error.__class__.__name__), traceback.error)) + +def write_template_to_file(template_filename, output_filename, **kwargs): + with open(output_filename, "w") as outfile: + print(write_template_to_string(template_filename, **kwargs), file=outfile) + +def main(args=sys.argv[1:]): + if len(args) != 1: + print('Usage:', sys.argv[0], '<output_directory>', file=sys.stderr) + return 1 + + output_dir = args[0] + if not os.path.isdir(output_dir): + if os.path.exists(output_dir): + print('ERROR: Invalid output directory:', output_dir, file=sys.stderr) + return 1 + + try: + os.makedirs(output_dir) + except: + print('ERROR: Could not create output directory:', output_dir, file=sys.stderr) + return 1 + + # Output path exists, now just run the template + template_file = os.sep.join([sys.path[0], 'templates', 'knobs.template']) + output_file = os.sep.join([output_dir, 'gen_knobs.cpp']) + output_header = os.sep.join([output_dir, 'gen_knobs.h']) + + for f in [output_header, output_file]: + write_template_to_file(template_file, f, + filename='gen_knobs', + knobs=knob_defs.KNOBS, + includes=['core/knobs_init.h', 'common/os.h', 'sstream', 'iomanip'], + gen_header=True if f == output_header else False) + + return 0 + +if __name__ == '__main__': + sys.exit(main()) + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py new file mode 100644 index 00000000000..8c51e1e8e73 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -0,0 +1,226 @@ +# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. 
+# +# Permission is hereby granted, free of charge, to any person obtaining a +# copy of this software and associated documentation files (the "Software"), +# to deal in the Software without restriction, including without limitation +# the rights to use, copy, modify, merge, publish, distribute, sublicense, +# and/or sell copies of the Software, and to permit persons to whom the +# Software is furnished to do so, subject to the following conditions: +# +# The above copyright notice and this permission notice (including the next +# paragraph) shall be included in all copies or substantial portions of the +# Software. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +# THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +# FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +# IN THE SOFTWARE. + +# Python source +KNOBS = [ + ['ENABLE_ASSERT_DIALOGS', { + 'type' : 'bool', + 'default' : 'true', + 'desc' : ['Use dialogs when asserts fire.', + 'Asserts are only enabled in debug builds'], + }], + + ['SINGLE_THREADED', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['If enabled will perform all rendering on the API thread.', + 'This is useful mainly for debugging purposes.'], + }], + + ['DUMP_SHADER_IR', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'], + }], + + ['USE_GENERIC_STORETILE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Always use generic function for performing StoreTile.', + 'Will be slightly slower than using optimized (jitted) path'], + }], + + ['FAST_CLEAR', { + 'type' : 'bool', + 'default' : 'true', + 'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and', + 'defer clear execution to first backend op on hottile, or hottile store'], + }], + + ['MAX_NUMA_NODES', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Maximum # of NUMA-nodes per system used for worker threads', + ' 0 == ALL NUMA-nodes in the system', + ' N == Use at most N NUMA-nodes for rendering'], + }], + + ['MAX_CORES_PER_NUMA_NODE', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Maximum # of cores per NUMA-node used for worker threads.', + ' 0 == ALL non-API thread cores per NUMA-node', + ' N == Use at most N cores per NUMA-node'], + }], + + ['MAX_THREADS_PER_CORE', { + 'type' : 'uint32_t', + 'default' : '1', + 'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.', + ' 0 == ALL hyper-threads per core', + ' N == Use at most N hyper-threads per physical core'], + }], + + ['MAX_WORKER_THREADS', { + 'type' : 'uint32_t', + 'default' : '0', + 'desc' : ['Maximum worker threads to spawn.', + '', + 'IMPORTANT: If this is non-zero, no worker threads will be bound to', + 'specific HW threads. 
They will all be "floating" SW threads.', + 'In this case, the above 3 KNOBS will be ignored.'], + }], + + ['BUCKETS_START_FRAME', { + 'type' : 'uint32_t', + 'default' : '1200', + 'desc' : ['Frame from when to start saving buckets data.', + '', + 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', + 'for this to have an effect.'], + }], + + ['BUCKETS_END_FRAME', { + 'type' : 'uint32_t', + 'default' : '1400', + 'desc' : ['Frame at which to stop saving buckets data.', + '', + 'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h', + 'for this to have an effect.'], + }], + + ['WORKER_SPIN_LOOP_COUNT', { + 'type' : 'uint32_t', + 'default' : '5000', + 'desc' : ['Number of spin-loop iterations worker threads will perform', + 'before going to sleep when waiting for work'], + }], + + ['MAX_DRAWS_IN_FLIGHT', { + 'type' : 'uint32_t', + 'default' : '160', + 'desc' : ['Maximum number of draws outstanding before API thread blocks.'], + }], + + ['MAX_PRIMS_PER_DRAW', { + 'type' : 'uint32_t', + 'default' : '2040', + 'desc' : ['Maximum primitives in a single Draw().', + 'Larger primitives are split into smaller Draw calls.', + 'Should be a multiple of (3 * vectorWidth).'], + }], + + ['MAX_TESS_PRIMS_PER_DRAW', { + 'type' : 'uint32_t', + 'default' : '16', + 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.', + 'Larger primitives are split into smaller Draw calls.', + 'Should be a multiple of (vectorWidth).'], + }], + + ['MAX_FRAC_ODD_TESS_FACTOR', { + 'type' : 'float', + 'default' : '63.0f', + 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'], + }], + + ['MAX_FRAC_EVEN_TESS_FACTOR', { + 'type' : 'float', + 'default' : '64.0f', + 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'], + }], + + ['MAX_INTEGER_TESS_FACTOR', { + 'type' : 'uint32_t', + 'default' : '64', + 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'], + }], + + + ['BUCKETS_ENABLE_THREADVIZ', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Enable threadviz output.'], + }], + + ['TOSS_DRAW', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Disable per-draw/dispatch execution'], + }], + + ['TOSS_QUEUE_FE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at worker FE', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_FETCH', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at vertex fetch', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_IA', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at input assembler', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_VS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at vertex shader', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_SETUP_TRIS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at primitive setup', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_BIN_TRIS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at primitive binning', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + + ['TOSS_RS', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['Stop per-draw execution at 
rasterizer', + '', + 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], + }], + +] diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py new file mode 100644 index 00000000000..d9638481889 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/__init__.py @@ -0,0 +1,8 @@ +# mako/__init__.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + + +__version__ = '1.0.1' diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py new file mode 100644 index 00000000000..efbc4fc245d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/_ast_util.py @@ -0,0 +1,845 @@ +# mako/_ast_util.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +""" + ast + ~~~ + + The `ast` module helps Python applications to process trees of the Python + abstract syntax grammar. The abstract syntax itself might change with + each Python release; this module helps to find out programmatically what + the current grammar looks like and allows modifications of it. + + An abstract syntax tree can be generated by passing `ast.PyCF_ONLY_AST` as + a flag to the `compile()` builtin function or by using the `parse()` + function from this module. The result will be a tree of objects whose + classes all inherit from `ast.AST`. + + A modified abstract syntax tree can be compiled into a Python code object + using the built-in `compile()` function. + + Additionally various helper functions are provided that make working with + the trees simpler. The main intention of the helper functions and this + module in general is to provide an easy to use interface for libraries + that work tightly with the python syntax (template engines for example). + + + :copyright: Copyright 2008 by Armin Ronacher. + :license: Python License. +""" +from _ast import * +from mako.compat import arg_stringname + +BOOLOP_SYMBOLS = { + And: 'and', + Or: 'or' +} + +BINOP_SYMBOLS = { + Add: '+', + Sub: '-', + Mult: '*', + Div: '/', + FloorDiv: '//', + Mod: '%', + LShift: '<<', + RShift: '>>', + BitOr: '|', + BitAnd: '&', + BitXor: '^' +} + +CMPOP_SYMBOLS = { + Eq: '==', + Gt: '>', + GtE: '>=', + In: 'in', + Is: 'is', + IsNot: 'is not', + Lt: '<', + LtE: '<=', + NotEq: '!=', + NotIn: 'not in' +} + +UNARYOP_SYMBOLS = { + Invert: '~', + Not: 'not', + UAdd: '+', + USub: '-' +} + +ALL_SYMBOLS = {} +ALL_SYMBOLS.update(BOOLOP_SYMBOLS) +ALL_SYMBOLS.update(BINOP_SYMBOLS) +ALL_SYMBOLS.update(CMPOP_SYMBOLS) +ALL_SYMBOLS.update(UNARYOP_SYMBOLS) + + +def parse(expr, filename='<unknown>', mode='exec'): + """Parse an expression into an AST node.""" + return compile(expr, filename, mode, PyCF_ONLY_AST) + + +def to_source(node, indent_with=' ' * 4): + """ + This function can convert a node tree back into python sourcecode. This + is useful for debugging purposes, especially if you're dealing with custom + asts not generated by python itself. + + It could be that the sourcecode is evaluable when the AST itself is not + compilable / evaluable. The reason for this is that the AST contains some + more data than regular sourcecode does, which is dropped during + conversion. 
+
+    Each level of indentation is replaced with `indent_with`. By default this
+    parameter is equal to four spaces as suggested by PEP 8, but it might be
+    adjusted to match the application's styleguide.
+    """
+    generator = SourceGenerator(indent_with)
+    generator.visit(node)
+    return ''.join(generator.result)
+
+
+def dump(node):
+    """
+    A very verbose representation of the node passed. This is useful for
+    debugging purposes.
+    """
+    def _format(node):
+        if isinstance(node, AST):
+            return '%s(%s)' % (node.__class__.__name__,
+                               ', '.join('%s=%s' % (a, _format(b))
+                                         for a, b in iter_fields(node)))
+        elif isinstance(node, list):
+            return '[%s]' % ', '.join(_format(x) for x in node)
+        return repr(node)
+    if not isinstance(node, AST):
+        raise TypeError('expected AST, got %r' % node.__class__.__name__)
+    return _format(node)
+
+
+def copy_location(new_node, old_node):
+    """
+    Copy the source location hint (`lineno` and `col_offset`) from the
+    old to the new node if possible and return the new one.
+    """
+    for attr in 'lineno', 'col_offset':
+        if attr in old_node._attributes and attr in new_node._attributes \
+           and hasattr(old_node, attr):
+            setattr(new_node, attr, getattr(old_node, attr))
+    return new_node
+
+
+def fix_missing_locations(node):
+    """
+    Some nodes require a line number and the column offset. Without that
+    information the compiler will abort the compilation. Because it can be
+    a dull task to add appropriate line numbers and column offsets when
+    adding new nodes this function can help. It copies the line number and
+    column offset of the parent node to the child nodes without this
+    information.
+
+    Unlike `copy_location` this works recursively and won't touch nodes that
+    already have location information.
+    """
+    def _fix(node, lineno, col_offset):
+        if 'lineno' in node._attributes:
+            if not hasattr(node, 'lineno'):
+                node.lineno = lineno
+            else:
+                lineno = node.lineno
+        if 'col_offset' in node._attributes:
+            if not hasattr(node, 'col_offset'):
+                node.col_offset = col_offset
+            else:
+                col_offset = node.col_offset
+        for child in iter_child_nodes(node):
+            _fix(child, lineno, col_offset)
+    _fix(node, 1, 0)
+    return node
+
+
+def increment_lineno(node, n=1):
+    """
+    Increment the line numbers of all nodes by `n` if they have line number
+    attributes. This is useful to "move code" to a different location in a
+    file.
+    """
+    # walk() yields the root node first, then all of its children
+    for node in walk(node):
+        if 'lineno' in node._attributes:
+            node.lineno = getattr(node, 'lineno', 0) + n
+
+
+def iter_fields(node):
+    """Iterate over all fields of a node, only yielding existing fields."""
+    # CPython 2.5 compat
+    if not hasattr(node, '_fields') or not node._fields:
+        return
+    for field in node._fields:
+        try:
+            yield field, getattr(node, field)
+        except AttributeError:
+            pass
+
+
+def get_fields(node):
+    """Like `iter_fields` but returns a dict."""
+    return dict(iter_fields(node))
+
+
+def iter_child_nodes(node):
+    """Iterate over all child nodes of a node."""
+    for name, field in iter_fields(node):
+        if isinstance(field, AST):
+            yield field
+        elif isinstance(field, list):
+            for item in field:
+                if isinstance(item, AST):
+                    yield item
+
+
+def get_child_nodes(node):
+    """Like `iter_child_nodes` but returns a list."""
+    return list(iter_child_nodes(node))
+
+
+def get_compile_mode(node):
+    """
+    Get the mode for `compile` of a given node. If the node is not a `mod`
+    node (`Expression`, `Module` etc.) a `TypeError` is thrown.
+ """ + if not isinstance(node, mod): + raise TypeError('expected mod node, got %r' % node.__class__.__name__) + return { + Expression: 'eval', + Interactive: 'single' + }.get(node.__class__, 'expr') + + +def get_docstring(node): + """ + Return the docstring for the given node or `None` if no docstring can be + found. If the node provided does not accept docstrings a `TypeError` + will be raised. + """ + if not isinstance(node, (FunctionDef, ClassDef, Module)): + raise TypeError("%r can't have docstrings" % node.__class__.__name__) + if node.body and isinstance(node.body[0], Str): + return node.body[0].s + + +def walk(node): + """ + Iterate over all nodes. This is useful if you only want to modify nodes in + place and don't care about the context or the order the nodes are returned. + """ + from collections import deque + todo = deque([node]) + while todo: + node = todo.popleft() + todo.extend(iter_child_nodes(node)) + yield node + + +class NodeVisitor(object): + """ + Walks the abstract syntax tree and call visitor functions for every node + found. The visitor functions may return values which will be forwarded + by the `visit` method. + + Per default the visitor functions for the nodes are ``'visit_'`` + + class name of the node. So a `TryFinally` node visit function would + be `visit_TryFinally`. This behavior can be changed by overriding + the `get_visitor` function. If no visitor function exists for a node + (return value `None`) the `generic_visit` visitor is used instead. + + Don't use the `NodeVisitor` if you want to apply changes to nodes during + traversing. For this a special visitor exists (`NodeTransformer`) that + allows modifications. + """ + + def get_visitor(self, node): + """ + Return the visitor function for this node or `None` if no visitor + exists for this node. In that case the generic visit function is + used instead. + """ + method = 'visit_' + node.__class__.__name__ + return getattr(self, method, None) + + def visit(self, node): + """Visit a node.""" + f = self.get_visitor(node) + if f is not None: + return f(node) + return self.generic_visit(node) + + def generic_visit(self, node): + """Called if no explicit visitor function exists for a node.""" + for field, value in iter_fields(node): + if isinstance(value, list): + for item in value: + if isinstance(item, AST): + self.visit(item) + elif isinstance(value, AST): + self.visit(value) + + +class NodeTransformer(NodeVisitor): + """ + Walks the abstract syntax tree and allows modifications of nodes. + + The `NodeTransformer` will walk the AST and use the return value of the + visitor functions to replace or remove the old node. If the return + value of the visitor function is `None` the node will be removed + from the previous location otherwise it's replaced with the return + value. The return value may be the original node in which case no + replacement takes place. + + Here an example transformer that rewrites all `foo` to `data['foo']`:: + + class RewriteName(NodeTransformer): + + def visit_Name(self, node): + return copy_location(Subscript( + value=Name(id='data', ctx=Load()), + slice=Index(value=Str(s=node.id)), + ctx=node.ctx + ), node) + + Keep in mind that if the node you're operating on has child nodes + you must either transform the child nodes yourself or call the generic + visit function for the node first. + + Nodes that were part of a collection of statements (that applies to + all statement nodes) may also return a list of nodes rather than just + a single node. 
+ + Usually you use the transformer like this:: + + node = YourTransformer().visit(node) + """ + + def generic_visit(self, node): + for field, old_value in iter_fields(node): + old_value = getattr(node, field, None) + if isinstance(old_value, list): + new_values = [] + for value in old_value: + if isinstance(value, AST): + value = self.visit(value) + if value is None: + continue + elif not isinstance(value, AST): + new_values.extend(value) + continue + new_values.append(value) + old_value[:] = new_values + elif isinstance(old_value, AST): + new_node = self.visit(old_value) + if new_node is None: + delattr(node, field) + else: + setattr(node, field, new_node) + return node + + +class SourceGenerator(NodeVisitor): + """ + This visitor is able to transform a well formed syntax tree into python + sourcecode. For more details have a look at the docstring of the + `node_to_source` function. + """ + + def __init__(self, indent_with): + self.result = [] + self.indent_with = indent_with + self.indentation = 0 + self.new_lines = 0 + + def write(self, x): + if self.new_lines: + if self.result: + self.result.append('\n' * self.new_lines) + self.result.append(self.indent_with * self.indentation) + self.new_lines = 0 + self.result.append(x) + + def newline(self, n=1): + self.new_lines = max(self.new_lines, n) + + def body(self, statements): + self.new_line = True + self.indentation += 1 + for stmt in statements: + self.visit(stmt) + self.indentation -= 1 + + def body_or_else(self, node): + self.body(node.body) + if node.orelse: + self.newline() + self.write('else:') + self.body(node.orelse) + + def signature(self, node): + want_comma = [] + def write_comma(): + if want_comma: + self.write(', ') + else: + want_comma.append(True) + + padding = [None] * (len(node.args) - len(node.defaults)) + for arg, default in zip(node.args, padding + node.defaults): + write_comma() + self.visit(arg) + if default is not None: + self.write('=') + self.visit(default) + if node.vararg is not None: + write_comma() + self.write('*' + arg_stringname(node.vararg)) + if node.kwarg is not None: + write_comma() + self.write('**' + arg_stringname(node.kwarg)) + + def decorators(self, node): + for decorator in node.decorator_list: + self.newline() + self.write('@') + self.visit(decorator) + + # Statements + + def visit_Assign(self, node): + self.newline() + for idx, target in enumerate(node.targets): + if idx: + self.write(', ') + self.visit(target) + self.write(' = ') + self.visit(node.value) + + def visit_AugAssign(self, node): + self.newline() + self.visit(node.target) + self.write(BINOP_SYMBOLS[type(node.op)] + '=') + self.visit(node.value) + + def visit_ImportFrom(self, node): + self.newline() + self.write('from %s%s import ' % ('.' 
* node.level, node.module))
+        for idx, item in enumerate(node.names):
+            if idx:
+                self.write(', ')
+            # each name is an `alias` node, so it has to be visited
+            self.visit(item)
+
+    def visit_Import(self, node):
+        for item in node.names:
+            self.newline()
+            self.write('import ')
+            self.visit(item)
+
+    def visit_Expr(self, node):
+        self.newline()
+        self.generic_visit(node)
+
+    def visit_FunctionDef(self, node):
+        self.newline(n=2)
+        self.decorators(node)
+        self.newline()
+        self.write('def %s(' % node.name)
+        self.signature(node.args)
+        self.write('):')
+        self.body(node.body)
+
+    def visit_ClassDef(self, node):
+        have_args = []
+        def paren_or_comma():
+            if have_args:
+                self.write(', ')
+            else:
+                have_args.append(True)
+                self.write('(')
+
+        self.newline(n=3)
+        self.decorators(node)
+        self.newline()
+        self.write('class %s' % node.name)
+        for base in node.bases:
+            paren_or_comma()
+            self.visit(base)
+        # XXX: the if here is used to keep this module compatible
+        #      with python 2.6.
+        if hasattr(node, 'keywords'):
+            for keyword in node.keywords:
+                paren_or_comma()
+                self.write(keyword.arg + '=')
+                self.visit(keyword.value)
+        if node.starargs is not None:
+            paren_or_comma()
+            self.write('*')
+            self.visit(node.starargs)
+        if node.kwargs is not None:
+            paren_or_comma()
+            self.write('**')
+            self.visit(node.kwargs)
+        self.write(have_args and '):' or ':')
+        self.body(node.body)
+
+    def visit_If(self, node):
+        self.newline()
+        self.write('if ')
+        self.visit(node.test)
+        self.write(':')
+        self.body(node.body)
+        while True:
+            else_ = node.orelse
+            if len(else_) == 1 and isinstance(else_[0], If):
+                node = else_[0]
+                self.newline()
+                self.write('elif ')
+                self.visit(node.test)
+                self.write(':')
+                self.body(node.body)
+            else:
+                # only emit an else block if one actually exists
+                if else_:
+                    self.newline()
+                    self.write('else:')
+                    self.body(else_)
+                break
+
+    def visit_For(self, node):
+        self.newline()
+        self.write('for ')
+        self.visit(node.target)
+        self.write(' in ')
+        self.visit(node.iter)
+        self.write(':')
+        self.body_or_else(node)
+
+    def visit_While(self, node):
+        self.newline()
+        self.write('while ')
+        self.visit(node.test)
+        self.write(':')
+        self.body_or_else(node)
+
+    def visit_With(self, node):
+        self.newline()
+        self.write('with ')
+        self.visit(node.context_expr)
+        if node.optional_vars is not None:
+            self.write(' as ')
+            self.visit(node.optional_vars)
+        self.write(':')
+        self.body(node.body)
+
+    def visit_Pass(self, node):
+        self.newline()
+        self.write('pass')
+
+    def visit_Print(self, node):
+        # XXX: python 2.6 only
+        self.newline()
+        self.write('print ')
+        want_comma = False
+        if node.dest is not None:
+            self.write(' >> ')
+            self.visit(node.dest)
+            want_comma = True
+        for value in node.values:
+            if want_comma:
+                self.write(', ')
+            self.visit(value)
+            want_comma = True
+        if not node.nl:
+            self.write(',')
+
+    def visit_Delete(self, node):
+        self.newline()
+        self.write('del ')
+        for idx, target in enumerate(node.targets):
+            if idx:
+                self.write(', ')
+            self.visit(target)
+
+    def visit_TryExcept(self, node):
+        self.newline()
+        self.write('try:')
+        self.body(node.body)
+        for handler in node.handlers:
+            self.visit(handler)
+
+    def visit_TryFinally(self, node):
+        self.newline()
+        self.write('try:')
+        self.body(node.body)
+        self.newline()
+        self.write('finally:')
+        self.body(node.finalbody)
+
+    def visit_Global(self, node):
+        self.newline()
+        self.write('global ' + ', '.join(node.names))
+
+    def visit_Nonlocal(self, node):
+        self.newline()
+        self.write('nonlocal ' + ', '.join(node.names))
+
+    def visit_Return(self, node):
+        self.newline()
+        if node.value is None:
+            self.write('return')
+        else:
+            self.write('return ')
+            self.visit(node.value)
+
+    def visit_Break(self, node):
+        self.newline()
+ self.write('break') + + def visit_Continue(self, node): + self.newline() + self.write('continue') + + def visit_Raise(self, node): + # XXX: Python 2.6 / 3.0 compatibility + self.newline() + self.write('raise') + if hasattr(node, 'exc') and node.exc is not None: + self.write(' ') + self.visit(node.exc) + if node.cause is not None: + self.write(' from ') + self.visit(node.cause) + elif hasattr(node, 'type') and node.type is not None: + self.visit(node.type) + if node.inst is not None: + self.write(', ') + self.visit(node.inst) + if node.tback is not None: + self.write(', ') + self.visit(node.tback) + + # Expressions + + def visit_Attribute(self, node): + self.visit(node.value) + self.write('.' + node.attr) + + def visit_Call(self, node): + want_comma = [] + def write_comma(): + if want_comma: + self.write(', ') + else: + want_comma.append(True) + + self.visit(node.func) + self.write('(') + for arg in node.args: + write_comma() + self.visit(arg) + for keyword in node.keywords: + write_comma() + self.write(keyword.arg + '=') + self.visit(keyword.value) + if node.starargs is not None: + write_comma() + self.write('*') + self.visit(node.starargs) + if node.kwargs is not None: + write_comma() + self.write('**') + self.visit(node.kwargs) + self.write(')') + + def visit_Name(self, node): + self.write(node.id) + + def visit_NameConstant(self, node): + self.write(str(node.value)) + + def visit_arg(self, node): + self.write(node.arg) + + def visit_Str(self, node): + self.write(repr(node.s)) + + def visit_Bytes(self, node): + self.write(repr(node.s)) + + def visit_Num(self, node): + self.write(repr(node.n)) + + def visit_Tuple(self, node): + self.write('(') + idx = -1 + for idx, item in enumerate(node.elts): + if idx: + self.write(', ') + self.visit(item) + self.write(idx and ')' or ',)') + + def sequence_visit(left, right): + def visit(self, node): + self.write(left) + for idx, item in enumerate(node.elts): + if idx: + self.write(', ') + self.visit(item) + self.write(right) + return visit + + visit_List = sequence_visit('[', ']') + visit_Set = sequence_visit('{', '}') + del sequence_visit + + def visit_Dict(self, node): + self.write('{') + for idx, (key, value) in enumerate(zip(node.keys, node.values)): + if idx: + self.write(', ') + self.visit(key) + self.write(': ') + self.visit(value) + self.write('}') + + def visit_BinOp(self, node): + self.write('(') + self.visit(node.left) + self.write(' %s ' % BINOP_SYMBOLS[type(node.op)]) + self.visit(node.right) + self.write(')') + + def visit_BoolOp(self, node): + self.write('(') + for idx, value in enumerate(node.values): + if idx: + self.write(' %s ' % BOOLOP_SYMBOLS[type(node.op)]) + self.visit(value) + self.write(')') + + def visit_Compare(self, node): + self.write('(') + self.visit(node.left) + for op, right in zip(node.ops, node.comparators): + self.write(' %s ' % CMPOP_SYMBOLS[type(op)]) + self.visit(right) + self.write(')') + + def visit_UnaryOp(self, node): + self.write('(') + op = UNARYOP_SYMBOLS[type(node.op)] + self.write(op) + if op == 'not': + self.write(' ') + self.visit(node.operand) + self.write(')') + + def visit_Subscript(self, node): + self.visit(node.value) + self.write('[') + self.visit(node.slice) + self.write(']') + + def visit_Slice(self, node): + if node.lower is not None: + self.visit(node.lower) + self.write(':') + if node.upper is not None: + self.visit(node.upper) + if node.step is not None: + self.write(':') + if not (isinstance(node.step, Name) and node.step.id == 'None'): + self.visit(node.step) + + def visit_ExtSlice(self, 
node): + for idx, item in node.dims: + if idx: + self.write(', ') + self.visit(item) + + def visit_Yield(self, node): + self.write('yield ') + self.visit(node.value) + + def visit_Lambda(self, node): + self.write('lambda ') + self.signature(node.args) + self.write(': ') + self.visit(node.body) + + def visit_Ellipsis(self, node): + self.write('Ellipsis') + + def generator_visit(left, right): + def visit(self, node): + self.write(left) + self.visit(node.elt) + for comprehension in node.generators: + self.visit(comprehension) + self.write(right) + return visit + + visit_ListComp = generator_visit('[', ']') + visit_GeneratorExp = generator_visit('(', ')') + visit_SetComp = generator_visit('{', '}') + del generator_visit + + def visit_DictComp(self, node): + self.write('{') + self.visit(node.key) + self.write(': ') + self.visit(node.value) + for comprehension in node.generators: + self.visit(comprehension) + self.write('}') + + def visit_IfExp(self, node): + self.visit(node.body) + self.write(' if ') + self.visit(node.test) + self.write(' else ') + self.visit(node.orelse) + + def visit_Starred(self, node): + self.write('*') + self.visit(node.value) + + def visit_Repr(self, node): + # XXX: python 2.6 only + self.write('`') + self.visit(node.value) + self.write('`') + + # Helper Nodes + + def visit_alias(self, node): + self.write(node.name) + if node.asname is not None: + self.write(' as ' + node.asname) + + def visit_comprehension(self, node): + self.write(' for ') + self.visit(node.target) + self.write(' in ') + self.visit(node.iter) + if node.ifs: + for if_ in node.ifs: + self.write(' if ') + self.visit(if_) + + def visit_excepthandler(self, node): + self.newline() + self.write('except') + if node.type is not None: + self.write(' ') + self.visit(node.type) + if node.name is not None: + self.write(' as ') + self.visit(node.name) + self.write(':') + self.body(node.body) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py new file mode 100644 index 00000000000..65fd84dfe15 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/ast.py @@ -0,0 +1,178 @@ +# mako/ast.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""utilities for analyzing expressions and blocks of Python +code, as well as generating Python from AST nodes""" + +from mako import exceptions, pyparser, compat +import re + +class PythonCode(object): + """represents information about a string containing Python code""" + def __init__(self, code, **exception_kwargs): + self.code = code + + # represents all identifiers which are assigned to at some point in + # the code + self.declared_identifiers = set() + + # represents all identifiers which are referenced before their + # assignment, if any + self.undeclared_identifiers = set() + + # note that an identifier can be in both the undeclared and declared + # lists. 
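+        # e.g. (illustrative): for PythonCode("x = x + 1"), "x" is assigned
+        # and also read before that assignment, so it lands in both sets.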
+
+        # using AST to parse instead of using code.co_varnames,
+        # code.co_names has several advantages:
+        # - we can locate an identifier as "undeclared" even if
+        #   it's declared later in the same block of code
+        # - AST is less likely to break with version changes
+        #   (for example, the behavior of co_names changed a little bit
+        #   in python version 2.5)
+        if isinstance(code, compat.string_types):
+            expr = pyparser.parse(code.lstrip(), "exec", **exception_kwargs)
+        else:
+            expr = code
+
+        f = pyparser.FindIdentifiers(self, **exception_kwargs)
+        f.visit(expr)
+
+class ArgumentList(object):
+    """parses a fragment of code as a comma-separated list of expressions"""
+    def __init__(self, code, **exception_kwargs):
+        self.codeargs = []
+        self.args = []
+        self.declared_identifiers = set()
+        self.undeclared_identifiers = set()
+        if isinstance(code, compat.string_types):
+            if re.match(r"\S", code) and not re.match(r",\s*$", code):
+                # if there's text and no trailing comma, ensure it's parsed
+                # as a tuple by adding a trailing comma
+                code += ","
+            expr = pyparser.parse(code, "exec", **exception_kwargs)
+        else:
+            expr = code
+
+        f = pyparser.FindTuple(self, PythonCode, **exception_kwargs)
+        f.visit(expr)
+
+class PythonFragment(PythonCode):
+    """extends PythonCode to provide identifier lookups in partial control
+    statements, e.g.::
+
+        for x in 5:
+        elif y==9:
+        except (MyException, e):
+
+    """
+    def __init__(self, code, **exception_kwargs):
+        m = re.match(r'^(\w+)(?:\s+(.*?))?:\s*(#|$)', code.strip(), re.S)
+        if not m:
+            raise exceptions.CompileException(
+                "Fragment '%s' is not a partial control statement" %
+                code, **exception_kwargs)
+        if m.group(3):
+            code = code[:m.start(3)]
+        (keyword, expr) = m.group(1, 2)
+        if keyword in ['for', 'if', 'while']:
+            code = code + "pass"
+        elif keyword == 'try':
+            code = code + "pass\nexcept:pass"
+        elif keyword == 'elif' or keyword == 'else':
+            code = "if False:pass\n" + code + "pass"
+        elif keyword == 'except':
+            code = "try:pass\n" + code + "pass"
+        elif keyword == 'with':
+            code = code + "pass"
+        else:
+            raise exceptions.CompileException(
+                "Unsupported control keyword: '%s'" %
+                keyword, **exception_kwargs)
+        super(PythonFragment, self).__init__(code, **exception_kwargs)
+
+
+class FunctionDecl(object):
+    """function declaration"""
+    def __init__(self, code, allow_kwargs=True, **exception_kwargs):
+        self.code = code
+        expr = pyparser.parse(code, "exec", **exception_kwargs)
+
+        f = pyparser.ParseFunc(self, **exception_kwargs)
+        f.visit(expr)
+        if not hasattr(self, 'funcname'):
+            raise exceptions.CompileException(
+                "Code '%s' is not a function declaration" % code,
+                **exception_kwargs)
+        if not allow_kwargs and self.kwargs:
+            raise exceptions.CompileException(
+                "'**%s' keyword argument not allowed here" %
+                self.kwargnames[-1], **exception_kwargs)
+
+    def get_argument_expressions(self, as_call=False):
+        """Return the argument declarations of this FunctionDecl as a
+        printable list.
+
+        By default the return value is appropriate for writing in a ``def``;
+        set `as_call` to true to build arguments to be passed to the function
+        instead (assuming locals with the same names as the arguments exist).
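+
+        For example (illustrative), for ``def foo(a, b=1, *args)`` this
+        returns roughly ``['a', 'b=1', '*args']``, while ``as_call=True``
+        yields ``['a', 'b', '*args']``.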
+ """ + + namedecls = [] + + # Build in reverse order, since defaults and slurpy args come last + argnames = self.argnames[::-1] + kwargnames = self.kwargnames[::-1] + defaults = self.defaults[::-1] + kwdefaults = self.kwdefaults[::-1] + + # Named arguments + if self.kwargs: + namedecls.append("**" + kwargnames.pop(0)) + + for name in kwargnames: + # Keyword-only arguments must always be used by name, so even if + # this is a call, print out `foo=foo` + if as_call: + namedecls.append("%s=%s" % (name, name)) + elif kwdefaults: + default = kwdefaults.pop(0) + if default is None: + # The AST always gives kwargs a default, since you can do + # `def foo(*, a=1, b, c=3)` + namedecls.append(name) + else: + namedecls.append("%s=%s" % ( + name, pyparser.ExpressionGenerator(default).value())) + else: + namedecls.append(name) + + # Positional arguments + if self.varargs: + namedecls.append("*" + argnames.pop(0)) + + for name in argnames: + if as_call or not defaults: + namedecls.append(name) + else: + default = defaults.pop(0) + namedecls.append("%s=%s" % ( + name, pyparser.ExpressionGenerator(default).value())) + + namedecls.reverse() + return namedecls + + @property + def allargnames(self): + return tuple(self.argnames) + tuple(self.kwargnames) + +class FunctionArgs(FunctionDecl): + """the argument portion of a function declaration""" + + def __init__(self, code, **kwargs): + super(FunctionArgs, self).__init__("def ANON(%s):pass" % code, + **kwargs) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py new file mode 100644 index 00000000000..c405c5171d7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cache.py @@ -0,0 +1,238 @@ +# mako/cache.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +from mako import compat, util + +_cache_plugins = util.PluginLoader("mako.cache") + +register_plugin = _cache_plugins.register +register_plugin("beaker", "mako.ext.beaker_cache", "BeakerCacheImpl") + + +class Cache(object): + """Represents a data content cache made available to the module + space of a specific :class:`.Template` object. + + .. versionadded:: 0.6 + :class:`.Cache` by itself is mostly a + container for a :class:`.CacheImpl` object, which implements + a fixed API to provide caching services; specific subclasses exist to + implement different + caching strategies. Mako includes a backend that works with + the Beaker caching system. Beaker itself then supports + a number of backends (i.e. file, memory, memcached, etc.) + + The construction of a :class:`.Cache` is part of the mechanics + of a :class:`.Template`, and programmatic access to this + cache is typically via the :attr:`.Template.cache` attribute. + + """ + + impl = None + """Provide the :class:`.CacheImpl` in use by this :class:`.Cache`. + + This accessor allows a :class:`.CacheImpl` with additional + methods beyond that of :class:`.Cache` to be used programmatically. + + """ + + id = None + """Return the 'id' that identifies this cache. + + This is a value that should be globally unique to the + :class:`.Template` associated with this cache, and can + be used by a caching system to name a local container + for data specific to this template. + + """ + + starttime = None + """Epochal time value for when the owning :class:`.Template` was + first compiled. 
+ + A cache implementation may wish to invalidate data earlier than + this timestamp; this has the effect of the cache for a specific + :class:`.Template` starting clean any time the :class:`.Template` + is recompiled, such as when the original template file changed on + the filesystem. + + """ + + def __init__(self, template, *args): + # check for a stale template calling the + # constructor + if isinstance(template, compat.string_types) and args: + return + self.template = template + self.id = template.module.__name__ + self.starttime = template.module._modified_time + self._def_regions = {} + self.impl = self._load_impl(self.template.cache_impl) + + def _load_impl(self, name): + return _cache_plugins.load(name)(self) + + def get_or_create(self, key, creation_function, **kw): + """Retrieve a value from the cache, using the given creation function + to generate a new value.""" + + return self._ctx_get_or_create(key, creation_function, None, **kw) + + def _ctx_get_or_create(self, key, creation_function, context, **kw): + """Retrieve a value from the cache, using the given creation function + to generate a new value.""" + + if not self.template.cache_enabled: + return creation_function() + + return self.impl.get_or_create( + key, + creation_function, + **self._get_cache_kw(kw, context)) + + def set(self, key, value, **kw): + """Place a value in the cache. + + :param key: the value's key. + :param value: the value. + :param \**kw: cache configuration arguments. + + """ + + self.impl.set(key, value, **self._get_cache_kw(kw, None)) + + put = set + """A synonym for :meth:`.Cache.set`. + + This is here for backwards compatibility. + + """ + + def get(self, key, **kw): + """Retrieve a value from the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. The + backend is configured using these arguments upon first request. + Subsequent requests that use the same series of configuration + values will use that same backend. + + """ + return self.impl.get(key, **self._get_cache_kw(kw, None)) + + def invalidate(self, key, **kw): + """Invalidate a value in the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. The + backend is configured using these arguments upon first request. + Subsequent requests that use the same series of configuration + values will use that same backend. + + """ + self.impl.invalidate(key, **self._get_cache_kw(kw, None)) + + def invalidate_body(self): + """Invalidate the cached content of the "body" method for this + template. + + """ + self.invalidate('render_body', __M_defname='render_body') + + def invalidate_def(self, name): + """Invalidate the cached content of a particular ``<%def>`` within this + template. + + """ + + self.invalidate('render_%s' % name, __M_defname='render_%s' % name) + + def invalidate_closure(self, name): + """Invalidate a nested ``<%def>`` within this template. + + Caching of nested defs is a blunt tool as there is no + management of scope -- nested defs that use cache tags + need to have names unique of all other nested defs in the + template, else their content will be overwritten by + each other. 
+ + """ + + self.invalidate(name, __M_defname=name) + + def _get_cache_kw(self, kw, context): + defname = kw.pop('__M_defname', None) + if not defname: + tmpl_kw = self.template.cache_args.copy() + tmpl_kw.update(kw) + elif defname in self._def_regions: + tmpl_kw = self._def_regions[defname] + else: + tmpl_kw = self.template.cache_args.copy() + tmpl_kw.update(kw) + self._def_regions[defname] = tmpl_kw + if context and self.impl.pass_context: + tmpl_kw = tmpl_kw.copy() + tmpl_kw.setdefault('context', context) + return tmpl_kw + + +class CacheImpl(object): + """Provide a cache implementation for use by :class:`.Cache`.""" + + def __init__(self, cache): + self.cache = cache + + pass_context = False + """If ``True``, the :class:`.Context` will be passed to + :meth:`get_or_create <.CacheImpl.get_or_create>` as the name ``'context'``. + """ + + def get_or_create(self, key, creation_function, **kw): + """Retrieve a value from the cache, using the given creation function + to generate a new value. + + This function *must* return a value, either from + the cache, or via the given creation function. + If the creation function is called, the newly + created value should be populated into the cache + under the given key before being returned. + + :param key: the value's key. + :param creation_function: function that when called generates + a new value. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() + + def set(self, key, value, **kw): + """Place a value in the cache. + + :param key: the value's key. + :param value: the value. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() + + def get(self, key, **kw): + """Retrieve a value from the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() + + def invalidate(self, key, **kw): + """Invalidate a value in the cache. + + :param key: the value's key. + :param \**kw: cache configuration arguments. + + """ + raise NotImplementedError() diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py new file mode 100644 index 00000000000..1a9ca56637c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/cmd.py @@ -0,0 +1,62 @@ +# mako/cmd.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php +from argparse import ArgumentParser +from os.path import isfile, dirname +import sys +from mako.template import Template +from mako.lookup import TemplateLookup +from mako import exceptions + +def varsplit(var): + if "=" not in var: + return (var, "") + return var.split("=", 1) + +def _exit(): + sys.stderr.write(exceptions.text_error_template().render()) + sys.exit(1) + +def cmdline(argv=None): + + parser = ArgumentParser("usage: %prog [FILENAME]") + parser.add_argument("--var", default=[], action="append", + help="variable (can be used multiple times, use name=value)") + parser.add_argument("--template-dir", default=[], action="append", + help="Directory to use for template lookup (multiple " + "directories may be provided). 
If not given then if the " + "template is read from stdin, the value defaults to be " + "the current directory, otherwise it defaults to be the " + "parent directory of the file provided.") + parser.add_argument('input', nargs='?', default='-') + + options = parser.parse_args(argv) + if options.input == '-': + lookup_dirs = options.template_dir or ["."] + lookup = TemplateLookup(lookup_dirs) + try: + template = Template(sys.stdin.read(), lookup=lookup) + except: + _exit() + else: + filename = options.input + if not isfile(filename): + raise SystemExit("error: can't find %s" % filename) + lookup_dirs = options.template_dir or [dirname(filename)] + lookup = TemplateLookup(lookup_dirs) + try: + template = Template(filename=filename, lookup=lookup) + except: + _exit() + + kw = dict([varsplit(var) for var in options.var]) + try: + print(template.render(**kw)) + except: + _exit() + + +if __name__ == "__main__": + cmdline() diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py new file mode 100644 index 00000000000..4b0bda86731 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/codegen.py @@ -0,0 +1,1237 @@ +# mako/codegen.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""provides functionality for rendering a parsetree constructing into module +source code.""" + +import time +import re +from mako.pygen import PythonPrinter +from mako import util, ast, parsetree, filters, exceptions +from mako import compat + + +MAGIC_NUMBER = 10 + +# names which are hardwired into the +# template and are not accessed via the +# context itself +RESERVED_NAMES = set(['context', 'loop', 'UNDEFINED']) + +def compile(node, + uri, + filename=None, + default_filters=None, + buffer_filters=None, + imports=None, + future_imports=None, + source_encoding=None, + generate_magic_comment=True, + disable_unicode=False, + strict_undefined=False, + enable_loop=True, + reserved_names=frozenset()): + + """Generate module source code given a parsetree node, + uri, and optional source filename""" + + # if on Py2K, push the "source_encoding" string to be + # a bytestring itself, as we will be embedding it into + # the generated source and we don't want to coerce the + # result into a unicode object, in "disable_unicode" mode + if not compat.py3k and isinstance(source_encoding, compat.text_type): + source_encoding = source_encoding.encode(source_encoding) + + + buf = util.FastEncodingBuffer() + + printer = PythonPrinter(buf) + _GenerateRenderMethod(printer, + _CompileContext(uri, + filename, + default_filters, + buffer_filters, + imports, + future_imports, + source_encoding, + generate_magic_comment, + disable_unicode, + strict_undefined, + enable_loop, + reserved_names), + node) + return buf.getvalue() + +class _CompileContext(object): + def __init__(self, + uri, + filename, + default_filters, + buffer_filters, + imports, + future_imports, + source_encoding, + generate_magic_comment, + disable_unicode, + strict_undefined, + enable_loop, + reserved_names): + self.uri = uri + self.filename = filename + self.default_filters = default_filters + self.buffer_filters = buffer_filters + self.imports = imports + self.future_imports = future_imports + self.source_encoding = source_encoding + self.generate_magic_comment = generate_magic_comment + self.disable_unicode = disable_unicode + 
self.strict_undefined = strict_undefined + self.enable_loop = enable_loop + self.reserved_names = reserved_names + +class _GenerateRenderMethod(object): + """A template visitor object which generates the + full module source for a template. + + """ + def __init__(self, printer, compiler, node): + self.printer = printer + self.compiler = compiler + self.node = node + self.identifier_stack = [None] + self.in_def = isinstance(node, (parsetree.DefTag, parsetree.BlockTag)) + + if self.in_def: + name = "render_%s" % node.funcname + args = node.get_argument_expressions() + filtered = len(node.filter_args.args) > 0 + buffered = eval(node.attributes.get('buffered', 'False')) + cached = eval(node.attributes.get('cached', 'False')) + defs = None + pagetag = None + if node.is_block and not node.is_anonymous: + args += ['**pageargs'] + else: + defs = self.write_toplevel() + pagetag = self.compiler.pagetag + name = "render_body" + if pagetag is not None: + args = pagetag.body_decl.get_argument_expressions() + if not pagetag.body_decl.kwargs: + args += ['**pageargs'] + cached = eval(pagetag.attributes.get('cached', 'False')) + self.compiler.enable_loop = self.compiler.enable_loop or eval( + pagetag.attributes.get( + 'enable_loop', 'False') + ) + else: + args = ['**pageargs'] + cached = False + buffered = filtered = False + if args is None: + args = ['context'] + else: + args = [a for a in ['context'] + args] + + self.write_render_callable( + pagetag or node, + name, args, + buffered, filtered, cached) + + if defs is not None: + for node in defs: + _GenerateRenderMethod(printer, compiler, node) + + if not self.in_def: + self.write_metadata_struct() + + def write_metadata_struct(self): + self.printer.source_map[self.printer.lineno] = \ + max(self.printer.source_map) + struct = { + "filename": self.compiler.filename, + "uri": self.compiler.uri, + "source_encoding": self.compiler.source_encoding, + "line_map": self.printer.source_map, + } + self.printer.writelines( + '"""', + '__M_BEGIN_METADATA', + compat.json.dumps(struct), + '__M_END_METADATA\n' + '"""' + ) + + @property + def identifiers(self): + return self.identifier_stack[-1] + + def write_toplevel(self): + """Traverse a template structure for module-level directives and + generate the start of module-level code. 
+ + """ + inherit = [] + namespaces = {} + module_code = [] + + self.compiler.pagetag = None + + class FindTopLevel(object): + def visitInheritTag(s, node): + inherit.append(node) + def visitNamespaceTag(s, node): + namespaces[node.name] = node + def visitPageTag(s, node): + self.compiler.pagetag = node + def visitCode(s, node): + if node.ismodule: + module_code.append(node) + + f = FindTopLevel() + for n in self.node.nodes: + n.accept_visitor(f) + + self.compiler.namespaces = namespaces + + module_ident = set() + for n in module_code: + module_ident = module_ident.union(n.declared_identifiers()) + + module_identifiers = _Identifiers(self.compiler) + module_identifiers.declared = module_ident + + # module-level names, python code + if self.compiler.generate_magic_comment and \ + self.compiler.source_encoding: + self.printer.writeline("# -*- coding:%s -*-" % + self.compiler.source_encoding) + + if self.compiler.future_imports: + self.printer.writeline("from __future__ import %s" % + (", ".join(self.compiler.future_imports),)) + self.printer.writeline("from mako import runtime, filters, cache") + self.printer.writeline("UNDEFINED = runtime.UNDEFINED") + self.printer.writeline("__M_dict_builtin = dict") + self.printer.writeline("__M_locals_builtin = locals") + self.printer.writeline("_magic_number = %r" % MAGIC_NUMBER) + self.printer.writeline("_modified_time = %r" % time.time()) + self.printer.writeline("_enable_loop = %r" % self.compiler.enable_loop) + self.printer.writeline( + "_template_filename = %r" % self.compiler.filename) + self.printer.writeline("_template_uri = %r" % self.compiler.uri) + self.printer.writeline( + "_source_encoding = %r" % self.compiler.source_encoding) + if self.compiler.imports: + buf = '' + for imp in self.compiler.imports: + buf += imp + "\n" + self.printer.writeline(imp) + impcode = ast.PythonCode( + buf, + source='', lineno=0, + pos=0, + filename='template defined imports') + else: + impcode = None + + main_identifiers = module_identifiers.branch(self.node) + module_identifiers.topleveldefs = \ + module_identifiers.topleveldefs.\ + union(main_identifiers.topleveldefs) + module_identifiers.declared.add("UNDEFINED") + if impcode: + module_identifiers.declared.update(impcode.declared_identifiers) + + self.compiler.identifiers = module_identifiers + self.printer.writeline("_exports = %r" % + [n.name for n in + main_identifiers.topleveldefs.values()] + ) + self.printer.write_blanks(2) + + if len(module_code): + self.write_module_code(module_code) + + if len(inherit): + self.write_namespaces(namespaces) + self.write_inherit(inherit[-1]) + elif len(namespaces): + self.write_namespaces(namespaces) + + return list(main_identifiers.topleveldefs.values()) + + def write_render_callable(self, node, name, args, buffered, filtered, + cached): + """write a top-level render callable. 
+ + this could be the main render() method or that of a top-level def.""" + + if self.in_def: + decorator = node.decorator + if decorator: + self.printer.writeline( + "@runtime._decorate_toplevel(%s)" % decorator) + + self.printer.start_source(node.lineno) + self.printer.writelines( + "def %s(%s):" % (name, ','.join(args)), + # push new frame, assign current frame to __M_caller + "__M_caller = context.caller_stack._push_frame()", + "try:" + ) + if buffered or filtered or cached: + self.printer.writeline("context._push_buffer()") + + self.identifier_stack.append( + self.compiler.identifiers.branch(self.node)) + if (not self.in_def or self.node.is_block) and '**pageargs' in args: + self.identifier_stack[-1].argument_declared.add('pageargs') + + if not self.in_def and ( + len(self.identifiers.locally_assigned) > 0 or + len(self.identifiers.argument_declared) > 0 + ): + self.printer.writeline("__M_locals = __M_dict_builtin(%s)" % + ','.join([ + "%s=%s" % (x, x) for x in + self.identifiers.argument_declared + ])) + + self.write_variable_declares(self.identifiers, toplevel=True) + + for n in self.node.nodes: + n.accept_visitor(self) + + self.write_def_finish(self.node, buffered, filtered, cached) + self.printer.writeline(None) + self.printer.write_blanks(2) + if cached: + self.write_cache_decorator( + node, name, + args, buffered, + self.identifiers, toplevel=True) + + def write_module_code(self, module_code): + """write module-level template code, i.e. that which + is enclosed in <%! %> tags in the template.""" + for n in module_code: + self.printer.start_source(n.lineno) + self.printer.write_indented_block(n.text) + + def write_inherit(self, node): + """write the module-level inheritance-determination callable.""" + + self.printer.writelines( + "def _mako_inherit(template, context):", + "_mako_generate_namespaces(context)", + "return runtime._inherit_from(context, %s, _template_uri)" % + (node.parsed_attributes['file']), + None + ) + + def write_namespaces(self, namespaces): + """write the module-level namespace-generating callable.""" + self.printer.writelines( + "def _mako_get_namespace(context, name):", + "try:", + "return context.namespaces[(__name__, name)]", + "except KeyError:", + "_mako_generate_namespaces(context)", + "return context.namespaces[(__name__, name)]", + None, None + ) + self.printer.writeline("def _mako_generate_namespaces(context):") + + + for node in namespaces.values(): + if 'import' in node.attributes: + self.compiler.has_ns_imports = True + self.printer.start_source(node.lineno) + if len(node.nodes): + self.printer.writeline("def make_namespace():") + export = [] + identifiers = self.compiler.identifiers.branch(node) + self.in_def = True + class NSDefVisitor(object): + def visitDefTag(s, node): + s.visitDefOrBase(node) + + def visitBlockTag(s, node): + s.visitDefOrBase(node) + + def visitDefOrBase(s, node): + if node.is_anonymous: + raise exceptions.CompileException( + "Can't put anonymous blocks inside " + "<%namespace>", + **node.exception_kwargs + ) + self.write_inline_def(node, identifiers, nested=False) + export.append(node.funcname) + vis = NSDefVisitor() + for n in node.nodes: + n.accept_visitor(vis) + self.printer.writeline("return [%s]" % (','.join(export))) + self.printer.writeline(None) + self.in_def = False + callable_name = "make_namespace()" + else: + callable_name = "None" + + if 'file' in node.parsed_attributes: + self.printer.writeline( + "ns = runtime.TemplateNamespace(%r," + " context._clean_inheritance_tokens()," + " templateuri=%s, callables=%s, 
" + " calling_uri=_template_uri)" % + ( + node.name, + node.parsed_attributes.get('file', 'None'), + callable_name, + ) + ) + elif 'module' in node.parsed_attributes: + self.printer.writeline( + "ns = runtime.ModuleNamespace(%r," + " context._clean_inheritance_tokens()," + " callables=%s, calling_uri=_template_uri," + " module=%s)" % + ( + node.name, + callable_name, + node.parsed_attributes.get( + 'module', 'None') + ) + ) + else: + self.printer.writeline( + "ns = runtime.Namespace(%r," + " context._clean_inheritance_tokens()," + " callables=%s, calling_uri=_template_uri)" % + ( + node.name, + callable_name, + ) + ) + if eval(node.attributes.get('inheritable', "False")): + self.printer.writeline("context['self'].%s = ns" % (node.name)) + + self.printer.writeline( + "context.namespaces[(__name__, %s)] = ns" % repr(node.name)) + self.printer.write_blanks(1) + if not len(namespaces): + self.printer.writeline("pass") + self.printer.writeline(None) + + def write_variable_declares(self, identifiers, toplevel=False, limit=None): + """write variable declarations at the top of a function. + + the variable declarations are in the form of callable + definitions for defs and/or name lookup within the + function's context argument. the names declared are based + on the names that are referenced in the function body, + which don't otherwise have any explicit assignment + operation. names that are assigned within the body are + assumed to be locally-scoped variables and are not + separately declared. + + for def callable definitions, if the def is a top-level + callable then a 'stub' callable is generated which wraps + the current Context into a closure. if the def is not + top-level, it is fully rendered as a local closure. + + """ + + # collection of all defs available to us in this scope + comp_idents = dict([(c.funcname, c) for c in identifiers.defs]) + to_write = set() + + # write "context.get()" for all variables we are going to + # need that arent in the namespace yet + to_write = to_write.union(identifiers.undeclared) + + # write closure functions for closures that we define + # right here + to_write = to_write.union( + [c.funcname for c in identifiers.closuredefs.values()]) + + # remove identifiers that are declared in the argument + # signature of the callable + to_write = to_write.difference(identifiers.argument_declared) + + # remove identifiers that we are going to assign to. + # in this way we mimic Python's behavior, + # i.e. assignment to a variable within a block + # means that variable is now a "locally declared" var, + # which cannot be referenced beforehand. + to_write = to_write.difference(identifiers.locally_declared) + + if self.compiler.enable_loop: + has_loop = "loop" in to_write + to_write.discard("loop") + else: + has_loop = False + + # if a limiting set was sent, constraint to those items in that list + # (this is used for the caching decorator) + if limit is not None: + to_write = to_write.intersection(limit) + + if toplevel and getattr(self.compiler, 'has_ns_imports', False): + self.printer.writeline("_import_ns = {}") + self.compiler.has_imports = True + for ident, ns in self.compiler.namespaces.items(): + if 'import' in ns.attributes: + self.printer.writeline( + "_mako_get_namespace(context, %r)." 
+ "_populate(_import_ns, %r)" % + ( + ident, + re.split(r'\s*,\s*', ns.attributes['import']) + )) + + if has_loop: + self.printer.writeline( + 'loop = __M_loop = runtime.LoopStack()' + ) + + for ident in to_write: + if ident in comp_idents: + comp = comp_idents[ident] + if comp.is_block: + if not comp.is_anonymous: + self.write_def_decl(comp, identifiers) + else: + self.write_inline_def(comp, identifiers, nested=True) + else: + if comp.is_root(): + self.write_def_decl(comp, identifiers) + else: + self.write_inline_def(comp, identifiers, nested=True) + + elif ident in self.compiler.namespaces: + self.printer.writeline( + "%s = _mako_get_namespace(context, %r)" % + (ident, ident) + ) + else: + if getattr(self.compiler, 'has_ns_imports', False): + if self.compiler.strict_undefined: + self.printer.writelines( + "%s = _import_ns.get(%r, UNDEFINED)" % + (ident, ident), + "if %s is UNDEFINED:" % ident, + "try:", + "%s = context[%r]" % (ident, ident), + "except KeyError:", + "raise NameError(\"'%s' is not defined\")" % + ident, + None, None + ) + else: + self.printer.writeline( + "%s = _import_ns.get(%r, context.get(%r, UNDEFINED))" % + (ident, ident, ident)) + else: + if self.compiler.strict_undefined: + self.printer.writelines( + "try:", + "%s = context[%r]" % (ident, ident), + "except KeyError:", + "raise NameError(\"'%s' is not defined\")" % + ident, + None + ) + else: + self.printer.writeline( + "%s = context.get(%r, UNDEFINED)" % (ident, ident) + ) + + self.printer.writeline("__M_writer = context.writer()") + + def write_def_decl(self, node, identifiers): + """write a locally-available callable referencing a top-level def""" + funcname = node.funcname + namedecls = node.get_argument_expressions() + nameargs = node.get_argument_expressions(as_call=True) + + if not self.in_def and ( + len(self.identifiers.locally_assigned) > 0 or + len(self.identifiers.argument_declared) > 0): + nameargs.insert(0, 'context._locals(__M_locals)') + else: + nameargs.insert(0, 'context') + self.printer.writeline("def %s(%s):" % (funcname, ",".join(namedecls))) + self.printer.writeline( + "return render_%s(%s)" % (funcname, ",".join(nameargs))) + self.printer.writeline(None) + + def write_inline_def(self, node, identifiers, nested): + """write a locally-available def callable inside an enclosing def.""" + + namedecls = node.get_argument_expressions() + + decorator = node.decorator + if decorator: + self.printer.writeline( + "@runtime._decorate_inline(context, %s)" % decorator) + self.printer.writeline( + "def %s(%s):" % (node.funcname, ",".join(namedecls))) + filtered = len(node.filter_args.args) > 0 + buffered = eval(node.attributes.get('buffered', 'False')) + cached = eval(node.attributes.get('cached', 'False')) + self.printer.writelines( + # push new frame, assign current frame to __M_caller + "__M_caller = context.caller_stack._push_frame()", + "try:" + ) + if buffered or filtered or cached: + self.printer.writelines( + "context._push_buffer()", + ) + + identifiers = identifiers.branch(node, nested=nested) + + self.write_variable_declares(identifiers) + + self.identifier_stack.append(identifiers) + for n in node.nodes: + n.accept_visitor(self) + self.identifier_stack.pop() + + self.write_def_finish(node, buffered, filtered, cached) + self.printer.writeline(None) + if cached: + self.write_cache_decorator(node, node.funcname, + namedecls, False, identifiers, + inline=True, toplevel=False) + + def write_def_finish(self, node, buffered, filtered, cached, + callstack=True): + """write the end section of a rendering 
function, either outermost or + inline. + + this takes into account if the rendering function was filtered, + buffered, etc. and closes the corresponding try: block if any, and + writes code to retrieve captured content, apply filters, send proper + return value.""" + + if not buffered and not cached and not filtered: + self.printer.writeline("return ''") + if callstack: + self.printer.writelines( + "finally:", + "context.caller_stack._pop_frame()", + None + ) + + if buffered or filtered or cached: + if buffered or cached: + # in a caching scenario, don't try to get a writer + # from the context after popping; assume the caching + # implemenation might be using a context with no + # extra buffers + self.printer.writelines( + "finally:", + "__M_buf = context._pop_buffer()" + ) + else: + self.printer.writelines( + "finally:", + "__M_buf, __M_writer = context._pop_buffer_and_writer()" + ) + + if callstack: + self.printer.writeline("context.caller_stack._pop_frame()") + + s = "__M_buf.getvalue()" + if filtered: + s = self.create_filter_callable(node.filter_args.args, s, + False) + self.printer.writeline(None) + if buffered and not cached: + s = self.create_filter_callable(self.compiler.buffer_filters, + s, False) + if buffered or cached: + self.printer.writeline("return %s" % s) + else: + self.printer.writelines( + "__M_writer(%s)" % s, + "return ''" + ) + + def write_cache_decorator(self, node_or_pagetag, name, + args, buffered, identifiers, + inline=False, toplevel=False): + """write a post-function decorator to replace a rendering + callable with a cached version of itself.""" + + self.printer.writeline("__M_%s = %s" % (name, name)) + cachekey = node_or_pagetag.parsed_attributes.get('cache_key', + repr(name)) + + cache_args = {} + if self.compiler.pagetag is not None: + cache_args.update( + ( + pa[6:], + self.compiler.pagetag.parsed_attributes[pa] + ) + for pa in self.compiler.pagetag.parsed_attributes + if pa.startswith('cache_') and pa != 'cache_key' + ) + cache_args.update( + ( + pa[6:], + node_or_pagetag.parsed_attributes[pa] + ) for pa in node_or_pagetag.parsed_attributes + if pa.startswith('cache_') and pa != 'cache_key' + ) + if 'timeout' in cache_args: + cache_args['timeout'] = int(eval(cache_args['timeout'])) + + self.printer.writeline("def %s(%s):" % (name, ','.join(args))) + + # form "arg1, arg2, arg3=arg3, arg4=arg4", etc. + pass_args = [ + "%s=%s" % ((a.split('=')[0],) * 2) if '=' in a else a + for a in args + ] + + self.write_variable_declares( + identifiers, + toplevel=toplevel, + limit=node_or_pagetag.undeclared_identifiers() + ) + if buffered: + s = "context.get('local')."\ + "cache._ctx_get_or_create("\ + "%s, lambda:__M_%s(%s), context, %s__M_defname=%r)" % ( + cachekey, name, ','.join(pass_args), + ''.join(["%s=%s, " % (k, v) + for k, v in cache_args.items()]), + name + ) + # apply buffer_filters + s = self.create_filter_callable(self.compiler.buffer_filters, s, + False) + self.printer.writelines("return " + s, None) + else: + self.printer.writelines( + "__M_writer(context.get('local')." 
+ "cache._ctx_get_or_create(" + "%s, lambda:__M_%s(%s), context, %s__M_defname=%r))" % + ( + cachekey, name, ','.join(pass_args), + ''.join(["%s=%s, " % (k, v) + for k, v in cache_args.items()]), + name, + ), + "return ''", + None + ) + + def create_filter_callable(self, args, target, is_expression): + """write a filter-applying expression based on the filters + present in the given filter names, adjusting for the global + 'default' filter aliases as needed.""" + + def locate_encode(name): + if re.match(r'decode\..+', name): + return "filters." + name + elif self.compiler.disable_unicode: + return filters.NON_UNICODE_ESCAPES.get(name, name) + else: + return filters.DEFAULT_ESCAPES.get(name, name) + + if 'n' not in args: + if is_expression: + if self.compiler.pagetag: + args = self.compiler.pagetag.filter_args.args + args + if self.compiler.default_filters: + args = self.compiler.default_filters + args + for e in args: + # if filter given as a function, get just the identifier portion + if e == 'n': + continue + m = re.match(r'(.+?)(\(.*\))', e) + if m: + ident, fargs = m.group(1, 2) + f = locate_encode(ident) + e = f + fargs + else: + e = locate_encode(e) + assert e is not None + target = "%s(%s)" % (e, target) + return target + + def visitExpression(self, node): + self.printer.start_source(node.lineno) + if len(node.escapes) or \ + ( + self.compiler.pagetag is not None and + len(self.compiler.pagetag.filter_args.args) + ) or \ + len(self.compiler.default_filters): + + s = self.create_filter_callable(node.escapes_code.args, + "%s" % node.text, True) + self.printer.writeline("__M_writer(%s)" % s) + else: + self.printer.writeline("__M_writer(%s)" % node.text) + + def visitControlLine(self, node): + if node.isend: + self.printer.writeline(None) + if node.has_loop_context: + self.printer.writeline('finally:') + self.printer.writeline("loop = __M_loop._exit()") + self.printer.writeline(None) + else: + self.printer.start_source(node.lineno) + if self.compiler.enable_loop and node.keyword == 'for': + text = mangle_mako_loop(node, self.printer) + else: + text = node.text + self.printer.writeline(text) + children = node.get_children() + # this covers the three situations where we want to insert a pass: + # 1) a ternary control line with no children, + # 2) a primary control line with nothing but its own ternary + # and end control lines, and + # 3) any control line with no content other than comments + if not children or ( + compat.all(isinstance(c, (parsetree.Comment, + parsetree.ControlLine)) + for c in children) and + compat.all((node.is_ternary(c.keyword) or c.isend) + for c in children + if isinstance(c, parsetree.ControlLine))): + self.printer.writeline("pass") + + def visitText(self, node): + self.printer.start_source(node.lineno) + self.printer.writeline("__M_writer(%s)" % repr(node.content)) + + def visitTextTag(self, node): + filtered = len(node.filter_args.args) > 0 + if filtered: + self.printer.writelines( + "__M_writer = context._push_writer()", + "try:", + ) + for n in node.nodes: + n.accept_visitor(self) + if filtered: + self.printer.writelines( + "finally:", + "__M_buf, __M_writer = context._pop_buffer_and_writer()", + "__M_writer(%s)" % + self.create_filter_callable( + node.filter_args.args, + "__M_buf.getvalue()", + False), + None + ) + + def visitCode(self, node): + if not node.ismodule: + self.printer.start_source(node.lineno) + self.printer.write_indented_block(node.text) + + if not self.in_def and len(self.identifiers.locally_assigned) > 0: + # if we are the "template" def, 
fudge locally + # declared/modified variables into the "__M_locals" dictionary, + # which is used for def calls within the same template, + # to simulate "enclosing scope" + self.printer.writeline( + '__M_locals_builtin_stored = __M_locals_builtin()') + self.printer.writeline( + '__M_locals.update(__M_dict_builtin([(__M_key,' + ' __M_locals_builtin_stored[__M_key]) for __M_key in' + ' [%s] if __M_key in __M_locals_builtin_stored]))' % + ','.join([repr(x) for x in node.declared_identifiers()])) + + def visitIncludeTag(self, node): + self.printer.start_source(node.lineno) + args = node.attributes.get('args') + if args: + self.printer.writeline( + "runtime._include_file(context, %s, _template_uri, %s)" % + (node.parsed_attributes['file'], args)) + else: + self.printer.writeline( + "runtime._include_file(context, %s, _template_uri)" % + (node.parsed_attributes['file'])) + + def visitNamespaceTag(self, node): + pass + + def visitDefTag(self, node): + pass + + def visitBlockTag(self, node): + if node.is_anonymous: + self.printer.writeline("%s()" % node.funcname) + else: + nameargs = node.get_argument_expressions(as_call=True) + nameargs += ['**pageargs'] + self.printer.writeline("if 'parent' not in context._data or " + "not hasattr(context._data['parent'], '%s'):" + % node.funcname) + self.printer.writeline( + "context['self'].%s(%s)" % (node.funcname, ",".join(nameargs))) + self.printer.writeline("\n") + + def visitCallNamespaceTag(self, node): + # TODO: we can put namespace-specific checks here, such + # as ensure the given namespace will be imported, + # pre-import the namespace, etc. + self.visitCallTag(node) + + def visitCallTag(self, node): + self.printer.writeline("def ccall(caller):") + export = ['body'] + callable_identifiers = self.identifiers.branch(node, nested=True) + body_identifiers = callable_identifiers.branch(node, nested=False) + # we want the 'caller' passed to ccall to be used + # for the body() function, but for other non-body() + # <%def>s within <%call> we want the current caller + # off the call stack (if any) + body_identifiers.add_declared('caller') + + self.identifier_stack.append(body_identifiers) + class DefVisitor(object): + def visitDefTag(s, node): + s.visitDefOrBase(node) + + def visitBlockTag(s, node): + s.visitDefOrBase(node) + + def visitDefOrBase(s, node): + self.write_inline_def(node, callable_identifiers, nested=False) + if not node.is_anonymous: + export.append(node.funcname) + # remove defs that are within the <%call> from the + # "closuredefs" defined in the body, so they dont render twice + if node.funcname in body_identifiers.closuredefs: + del body_identifiers.closuredefs[node.funcname] + + vis = DefVisitor() + for n in node.nodes: + n.accept_visitor(vis) + self.identifier_stack.pop() + + bodyargs = node.body_decl.get_argument_expressions() + self.printer.writeline("def body(%s):" % ','.join(bodyargs)) + + # TODO: figure out best way to specify + # buffering/nonbuffering (at call time would be better) + buffered = False + if buffered: + self.printer.writelines( + "context._push_buffer()", + "try:" + ) + self.write_variable_declares(body_identifiers) + self.identifier_stack.append(body_identifiers) + + for n in node.nodes: + n.accept_visitor(self) + self.identifier_stack.pop() + + self.write_def_finish(node, buffered, False, False, callstack=False) + self.printer.writelines( + None, + "return [%s]" % (','.join(export)), + None + ) + + self.printer.writelines( + # push on caller for nested call + "context.caller_stack.nextcaller = " + 
"runtime.Namespace('caller', context, " + "callables=ccall(__M_caller))", + "try:") + self.printer.start_source(node.lineno) + self.printer.writelines( + "__M_writer(%s)" % self.create_filter_callable( + [], node.expression, True), + "finally:", + "context.caller_stack.nextcaller = None", + None + ) + +class _Identifiers(object): + """tracks the status of identifier names as template code is rendered.""" + + def __init__(self, compiler, node=None, parent=None, nested=False): + if parent is not None: + # if we are the branch created in write_namespaces(), + # we don't share any context from the main body(). + if isinstance(node, parsetree.NamespaceTag): + self.declared = set() + self.topleveldefs = util.SetLikeDict() + else: + # things that have already been declared + # in an enclosing namespace (i.e. names we can just use) + self.declared = set(parent.declared).\ + union([c.name for c in parent.closuredefs.values()]).\ + union(parent.locally_declared).\ + union(parent.argument_declared) + + # if these identifiers correspond to a "nested" + # scope, it means whatever the parent identifiers + # had as undeclared will have been declared by that parent, + # and therefore we have them in our scope. + if nested: + self.declared = self.declared.union(parent.undeclared) + + # top level defs that are available + self.topleveldefs = util.SetLikeDict(**parent.topleveldefs) + else: + self.declared = set() + self.topleveldefs = util.SetLikeDict() + + self.compiler = compiler + + # things within this level that are referenced before they + # are declared (e.g. assigned to) + self.undeclared = set() + + # things that are declared locally. some of these things + # could be in the "undeclared" list as well if they are + # referenced before declared + self.locally_declared = set() + + # assignments made in explicit python blocks. + # these will be propagated to + # the context of local def calls. 
+ self.locally_assigned = set() + + # things that are declared in the argument + # signature of the def callable + self.argument_declared = set() + + # closure defs that are defined in this level + self.closuredefs = util.SetLikeDict() + + self.node = node + + if node is not None: + node.accept_visitor(self) + + illegal_names = self.compiler.reserved_names.intersection( + self.locally_declared) + if illegal_names: + raise exceptions.NameConflictError( + "Reserved words declared in template: %s" % + ", ".join(illegal_names)) + + + def branch(self, node, **kwargs): + """create a new Identifiers for a new Node, with + this Identifiers as the parent.""" + + return _Identifiers(self.compiler, node, self, **kwargs) + + @property + def defs(self): + return set(self.topleveldefs.union(self.closuredefs).values()) + + def __repr__(self): + return "Identifiers(declared=%r, locally_declared=%r, "\ + "undeclared=%r, topleveldefs=%r, closuredefs=%r, "\ + "argumentdeclared=%r)" %\ + ( + list(self.declared), + list(self.locally_declared), + list(self.undeclared), + [c.name for c in self.topleveldefs.values()], + [c.name for c in self.closuredefs.values()], + self.argument_declared) + + def check_declared(self, node): + """update the state of this Identifiers with the undeclared + and declared identifiers of the given node.""" + + for ident in node.undeclared_identifiers(): + if ident != 'context' and\ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + for ident in node.declared_identifiers(): + self.locally_declared.add(ident) + + def add_declared(self, ident): + self.declared.add(ident) + if ident in self.undeclared: + self.undeclared.remove(ident) + + def visitExpression(self, node): + self.check_declared(node) + + def visitControlLine(self, node): + self.check_declared(node) + + def visitCode(self, node): + if not node.ismodule: + self.check_declared(node) + self.locally_assigned = self.locally_assigned.union( + node.declared_identifiers()) + + def visitNamespaceTag(self, node): + # only traverse into the sub-elements of a + # <%namespace> tag if we are the branch created in + # write_namespaces() + if self.node is node: + for n in node.nodes: + n.accept_visitor(self) + + def _check_name_exists(self, collection, node): + existing = collection.get(node.funcname) + collection[node.funcname] = node + if existing is not None and \ + existing is not node and \ + (node.is_block or existing.is_block): + raise exceptions.CompileException( + "%%def or %%block named '%s' already " + "exists in this template." 
% + node.funcname, **node.exception_kwargs) + + def visitDefTag(self, node): + if node.is_root() and not node.is_anonymous: + self._check_name_exists(self.topleveldefs, node) + elif node is not self.node: + self._check_name_exists(self.closuredefs, node) + + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + + # visit defs only one level deep + if node is self.node: + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + + for n in node.nodes: + n.accept_visitor(self) + + def visitBlockTag(self, node): + if node is not self.node and not node.is_anonymous: + + if isinstance(self.node, parsetree.DefTag): + raise exceptions.CompileException( + "Named block '%s' not allowed inside of def '%s'" + % (node.name, self.node.name), **node.exception_kwargs) + elif isinstance(self.node, + (parsetree.CallTag, parsetree.CallNamespaceTag)): + raise exceptions.CompileException( + "Named block '%s' not allowed inside of <%%call> tag" + % (node.name, ), **node.exception_kwargs) + + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + + if not node.is_anonymous: + self._check_name_exists(self.topleveldefs, node) + self.undeclared.add(node.funcname) + elif node is not self.node: + self._check_name_exists(self.closuredefs, node) + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + for n in node.nodes: + n.accept_visitor(self) + + def visitTextTag(self, node): + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union(self.locally_declared): + self.undeclared.add(ident) + + def visitIncludeTag(self, node): + self.check_declared(node) + + def visitPageTag(self, node): + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + self.check_declared(node) + + def visitCallNamespaceTag(self, node): + self.visitCallTag(node) + + def visitCallTag(self, node): + if node is self.node: + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union( + self.locally_declared): + self.undeclared.add(ident) + for ident in node.declared_identifiers(): + self.argument_declared.add(ident) + for n in node.nodes: + n.accept_visitor(self) + else: + for ident in node.undeclared_identifiers(): + if ident != 'context' and \ + ident not in self.declared.union( + self.locally_declared): + self.undeclared.add(ident) + + +_FOR_LOOP = re.compile( + r'^for\s+((?:\(?)\s*[A-Za-z_][A-Za-z_0-9]*' + r'(?:\s*,\s*(?:[A-Za-z_][A-Za-z0-9_]*),??)*\s*(?:\)?))\s+in\s+(.*):' +) + +def mangle_mako_loop(node, printer): + """converts a for loop into a context manager wrapped around a for loop + when access to the `loop` variable has been detected in the for loop body + """ + loop_variable = LoopVariable() + node.accept_visitor(loop_variable) + if loop_variable.detected: + node.nodes[-1].has_loop_context = True + match = _FOR_LOOP.match(node.text) + if match: + printer.writelines( + 'loop = __M_loop._enter(%s)' % match.group(2), + 'try:' + #'with __M_loop(%s) as loop:' % match.group(2) + ) + text = 'for %s in loop:' % match.group(1) + else: + raise SyntaxError("Couldn't apply loop context: %s" % node.text) + else: + text = node.text + return text + + +class LoopVariable(object): + """A node visitor which looks for the name 'loop' within undeclared + 
identifiers.""" + + def __init__(self): + self.detected = False + + def _loop_reference_detected(self, node): + if 'loop' in node.undeclared_identifiers(): + self.detected = True + else: + for n in node.get_children(): + n.accept_visitor(self) + + def visitControlLine(self, node): + self._loop_reference_detected(node) + + def visitCode(self, node): + self._loop_reference_detected(node) + + def visitExpression(self, node): + self._loop_reference_detected(node) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py new file mode 100644 index 00000000000..fe277bbf05a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/compat.py @@ -0,0 +1,174 @@ +import sys +import time + +py3k = sys.version_info >= (3, 0) +py33 = sys.version_info >= (3, 3) +py2k = sys.version_info < (3,) +py26 = sys.version_info >= (2, 6) +jython = sys.platform.startswith('java') +win32 = sys.platform.startswith('win') +pypy = hasattr(sys, 'pypy_version_info') + +if py3k: + from io import StringIO + import builtins as compat_builtins + from urllib.parse import quote_plus, unquote_plus + from html.entities import codepoint2name, name2codepoint + string_types = str, + binary_type = bytes + text_type = str + + from io import BytesIO as byte_buffer + + def u(s): + return s + + def b(s): + return s.encode("latin-1") + + def octal(lit): + return eval("0o" + lit) + +else: + import __builtin__ as compat_builtins + try: + from cStringIO import StringIO + except: + from StringIO import StringIO + + byte_buffer = StringIO + + from urllib import quote_plus, unquote_plus + from htmlentitydefs import codepoint2name, name2codepoint + string_types = basestring, + binary_type = str + text_type = unicode + + def u(s): + return unicode(s, "utf-8") + + def b(s): + return s + + def octal(lit): + return eval("0" + lit) + + +if py33: + from importlib import machinery + def load_module(module_id, path): + return machinery.SourceFileLoader(module_id, path).load_module() +else: + import imp + def load_module(module_id, path): + fp = open(path, 'rb') + try: + return imp.load_source(module_id, path, fp) + finally: + fp.close() + + +if py3k: + def reraise(tp, value, tb=None, cause=None): + if cause is not None: + value.__cause__ = cause + if value.__traceback__ is not tb: + raise value.with_traceback(tb) + raise value +else: + exec("def reraise(tp, value, tb=None, cause=None):\n" + " raise tp, value, tb\n") + + +def exception_as(): + return sys.exc_info()[1] + +try: + import threading + if py3k: + import _thread as thread + else: + import thread +except ImportError: + import dummy_threading as threading + if py3k: + import _dummy_thread as thread + else: + import dummy_thread as thread + +if win32 or jython: + time_func = time.clock +else: + time_func = time.time + +try: + from functools import partial +except: + def partial(func, *args, **keywords): + def newfunc(*fargs, **fkeywords): + newkeywords = keywords.copy() + newkeywords.update(fkeywords) + return func(*(args + fargs), **newkeywords) + return newfunc + + +all = all +import json + +def exception_name(exc): + return exc.__class__.__name__ + +try: + from inspect import CO_VARKEYWORDS, CO_VARARGS + def inspect_func_args(fn): + if py3k: + co = fn.__code__ + else: + co = fn.func_code + + nargs = co.co_argcount + names = co.co_varnames + args = list(names[:nargs]) + + varargs = None + if co.co_flags & CO_VARARGS: + varargs = co.co_varnames[nargs] + nargs = nargs + 1 + varkw = None + if co.co_flags & 
CO_VARKEYWORDS: + varkw = co.co_varnames[nargs] + + if py3k: + return args, varargs, varkw, fn.__defaults__ + else: + return args, varargs, varkw, fn.func_defaults +except ImportError: + import inspect + def inspect_func_args(fn): + return inspect.getargspec(fn) + +if py3k: + def callable(fn): + return hasattr(fn, '__call__') +else: + callable = callable + + +################################################ +# cross-compatible metaclass implementation +# Copyright (c) 2010-2012 Benjamin Peterson +def with_metaclass(meta, base=object): + """Create a base class with a metaclass.""" + return meta("%sBase" % meta.__name__, (base,), {}) +################################################ + + +def arg_stringname(func_arg): + """Gets the string name of a kwarg or vararg + In Python3.4 a function's args are + of _ast.arg type not _ast.name + """ + if hasattr(func_arg, 'arg'): + return func_arg.arg + else: + return str(func_arg) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py new file mode 100644 index 00000000000..c531f2118d0 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/exceptions.py @@ -0,0 +1,373 @@ +# mako/exceptions.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""exception classes""" + +import traceback +import sys +from mako import util, compat + +class MakoException(Exception): + pass + +class RuntimeException(MakoException): + pass + +def _format_filepos(lineno, pos, filename): + if filename is None: + return " at line: %d char: %d" % (lineno, pos) + else: + return " in file '%s' at line: %d char: %d" % (filename, lineno, pos) + + +class CompileException(MakoException): + def __init__(self, message, source, lineno, pos, filename): + MakoException.__init__(self, + message + _format_filepos(lineno, pos, filename)) + self.lineno = lineno + self.pos = pos + self.filename = filename + self.source = source + +class SyntaxException(MakoException): + def __init__(self, message, source, lineno, pos, filename): + MakoException.__init__(self, + message + _format_filepos(lineno, pos, filename)) + self.lineno = lineno + self.pos = pos + self.filename = filename + self.source = source + +class UnsupportedError(MakoException): + """raised when a retired feature is used.""" + +class NameConflictError(MakoException): + """raised when a reserved word is used inappropriately""" + +class TemplateLookupException(MakoException): + pass + +class TopLevelLookupException(TemplateLookupException): + pass + +class RichTraceback(object): + """Pull the current exception from the ``sys`` traceback and extracts + Mako-specific template information. + + See the usage examples in :ref:`handling_exceptions`. 
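+
+    A minimal usage sketch (an editorial illustration, not part of the
+    upstream docstring; ``my_template`` is a hypothetical
+    :class:`.Template`)::
+
+        from mako import exceptions
+        try:
+            my_template.render()
+        except:
+            tb = exceptions.RichTraceback()
+            for (filename, lineno, function, line) in tb.traceback:
+                print("File %s, line %s, in %s" % (filename, lineno, function))
+                print(line)
+            print("%s: %s" % (tb.error.__class__.__name__, tb.error))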
+ + """ + def __init__(self, error=None, traceback=None): + self.source, self.lineno = "", 0 + + if error is None or traceback is None: + t, value, tback = sys.exc_info() + + if error is None: + error = value or t + + if traceback is None: + traceback = tback + + self.error = error + self.records = self._init(traceback) + + if isinstance(self.error, (CompileException, SyntaxException)): + self.source = self.error.source + self.lineno = self.error.lineno + self._has_source = True + + self._init_message() + + @property + def errorname(self): + return compat.exception_name(self.error) + + def _init_message(self): + """Find a unicode representation of self.error""" + try: + self.message = compat.text_type(self.error) + except UnicodeError: + try: + self.message = str(self.error) + except UnicodeEncodeError: + # Fallback to args as neither unicode nor + # str(Exception(u'\xe6')) work in Python < 2.6 + self.message = self.error.args[0] + if not isinstance(self.message, compat.text_type): + self.message = compat.text_type(self.message, 'ascii', 'replace') + + def _get_reformatted_records(self, records): + for rec in records: + if rec[6] is not None: + yield (rec[4], rec[5], rec[2], rec[6]) + else: + yield tuple(rec[0:4]) + + @property + def traceback(self): + """Return a list of 4-tuple traceback records (i.e. normal python + format) with template-corresponding lines remapped to the originating + template. + + """ + return list(self._get_reformatted_records(self.records)) + + @property + def reverse_records(self): + return reversed(self.records) + + @property + def reverse_traceback(self): + """Return the same data as traceback, except in reverse order. + """ + + return list(self._get_reformatted_records(self.reverse_records)) + + def _init(self, trcback): + """format a traceback from sys.exc_info() into 7-item tuples, + containing the regular four traceback tuple items, plus the original + template filename, the line number adjusted relative to the template + source, and code line from that line number of the template.""" + + import mako.template + mods = {} + rawrecords = traceback.extract_tb(trcback) + new_trcback = [] + for filename, lineno, function, line in rawrecords: + if not line: + line = '' + try: + (line_map, template_lines) = mods[filename] + except KeyError: + try: + info = mako.template._get_module_info(filename) + module_source = info.code + template_source = info.source + template_filename = info.template_filename or filename + except KeyError: + # A normal .py file (not a Template) + if not compat.py3k: + try: + fp = open(filename, 'rb') + encoding = util.parse_encoding(fp) + fp.close() + except IOError: + encoding = None + if encoding: + line = line.decode(encoding) + else: + line = line.decode('ascii', 'replace') + new_trcback.append((filename, lineno, function, line, + None, None, None, None)) + continue + + template_ln = 1 + + source_map = mako.template.ModuleInfo.\ + get_module_source_metadata( + module_source, full_line_map=True) + line_map = source_map['full_line_map'] + + template_lines = [line for line in + template_source.split("\n")] + mods[filename] = (line_map, template_lines) + + template_ln = line_map[lineno - 1] + + if template_ln <= len(template_lines): + template_line = template_lines[template_ln - 1] + else: + template_line = None + new_trcback.append((filename, lineno, function, + line, template_filename, template_ln, + template_line, template_source)) + if not self.source: + for l in range(len(new_trcback) - 1, 0, -1): + if new_trcback[l][5]: + self.source = 
new_trcback[l][7] + self.lineno = new_trcback[l][5] + break + else: + if new_trcback: + try: + # A normal .py file (not a Template) + fp = open(new_trcback[-1][0], 'rb') + encoding = util.parse_encoding(fp) + fp.seek(0) + self.source = fp.read() + fp.close() + if encoding: + self.source = self.source.decode(encoding) + except IOError: + self.source = '' + self.lineno = new_trcback[-1][1] + return new_trcback + + +def text_error_template(lookup=None): + """Provides a template that renders a stack trace in a similar format to + the Python interpreter, substituting source template filenames, line + numbers and code for that of the originating source template, as + applicable. + + """ + import mako.template + return mako.template.Template(r""" +<%page args="error=None, traceback=None"/> +<%! + from mako.exceptions import RichTraceback +%>\ +<% + tback = RichTraceback(error=error, traceback=traceback) +%>\ +Traceback (most recent call last): +% for (filename, lineno, function, line) in tback.traceback: + File "${filename}", line ${lineno}, in ${function or '?'} + ${line | trim} +% endfor +${tback.errorname}: ${tback.message} +""") + + +def _install_pygments(): + global syntax_highlight, pygments_html_formatter + from mako.ext.pygmentplugin import syntax_highlight,\ + pygments_html_formatter + +def _install_fallback(): + global syntax_highlight, pygments_html_formatter + from mako.filters import html_escape + pygments_html_formatter = None + def syntax_highlight(filename='', language=None): + return html_escape + +def _install_highlighting(): + try: + _install_pygments() + except ImportError: + _install_fallback() +_install_highlighting() + +def html_error_template(): + """Provides a template that renders a stack trace in an HTML format, + providing an excerpt of code as well as substituting source template + filenames, line numbers and code for that of the originating source + template, as applicable. + + The template's default ``encoding_errors`` value is + ``'htmlentityreplace'``. The template has two options. With the + ``full`` option disabled, only a section of an HTML document is + returned. With the ``css`` option disabled, the default stylesheet + won't be included. + + """ + import mako.template + return mako.template.Template(r""" +<%! 
+    from mako.exceptions import RichTraceback, syntax_highlight,\
+        pygments_html_formatter
+%>
+<%page args="full=True, css=True, error=None, traceback=None"/>
+% if full:
+<html>
+<head>
+    <title>Mako Runtime Error</title>
+% endif
+% if css:
+    <style>
+        body { font-family:verdana; margin:10px 30px 10px 30px;}
+        .stacktrace { margin:5px 5px 5px 5px; }
+        .highlight { padding:0px 10px 0px 10px; background-color:#9F9FDF; }
+        .nonhighlight { padding:0px; background-color:#DFDFDF; }
+        .sample { padding:10px; margin:10px 10px 10px 10px;
+                  font-family:monospace; }
+        .sampleline { padding:0px 10px 0px 10px; }
+        .sourceline { margin:5px 5px 10px 5px; font-family:monospace;}
+        .location { font-size:80%; }
+        .highlight { white-space:pre; }
+        .sampleline { white-space:pre; }
+
+    % if pygments_html_formatter:
+        ${pygments_html_formatter.get_style_defs()}
+        .linenos { min-width: 2.5em; text-align: right; }
+        pre { margin: 0; }
+        .syntax-highlighted { padding: 0 10px; }
+        .syntax-highlightedtable { border-spacing: 1px; }
+        .nonhighlight { border-top: 1px solid #DFDFDF;
+                        border-bottom: 1px solid #DFDFDF; }
+        .stacktrace .nonhighlight { margin: 5px 15px 10px; }
+        .sourceline { margin: 0 0; font-family:monospace; }
+        .code { background-color: #F8F8F8; width: 100%; }
+        .error .code { background-color: #FFBDBD; }
+        .error .syntax-highlighted { background-color: #FFBDBD; }
+    % endif
+
+    </style>
+% endif
+% if full:
+</head>
+<body>
+% endif
+
+<h2>Error !</h2>
+<%
+    tback = RichTraceback(error=error, traceback=traceback)
+    src = tback.source
+    line = tback.lineno
+    if src:
+        lines = src.split('\n')
+    else:
+        lines = None
+%>
+<h3>${tback.errorname}: ${tback.message|h}</h3>
+
+% if lines:
+    <div class="sample">
+    <div class="nonhighlight">
+% for index in range(max(0, line-4),min(len(lines), line+5)):
+    <%
+       if pygments_html_formatter:
+           pygments_html_formatter.linenostart = index + 1
+    %>
+    % if index + 1 == line:
+    <%
+       if pygments_html_formatter:
+           old_cssclass = pygments_html_formatter.cssclass
+           pygments_html_formatter.cssclass = 'error ' + old_cssclass
+    %>
+        ${lines[index] | syntax_highlight(language='mako')}
+    <%
+       if pygments_html_formatter:
+           pygments_html_formatter.cssclass = old_cssclass
+    %>
+    % else:
+        ${lines[index] | syntax_highlight(language='mako')}
+    % endif
+% endfor
+    </div>
+    </div>
+% endif
+
+<div class="stacktrace">
+% for (filename, lineno, function, line) in tback.reverse_traceback:
+    <div class="location">${filename}, line ${lineno}:</div>
+    <div class="nonhighlight">
+    <%
+        if pygments_html_formatter:
+            pygments_html_formatter.linenostart = lineno
+    %>
+      <div class="sourceline">${line | syntax_highlight(filename)}</div>
+    </div>
+% endfor
+</div>
+
+% if full:
+</body>
+</html>
+% endif
+""", output_encoding=sys.getdefaultencoding(),
+     encoding_errors='htmlentityreplace')
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py
new file mode 100644
index 00000000000..d79ce2388f6
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/filters.py
@@ -0,0 +1,201 @@
+# mako/filters.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+
+import re
+import codecs
+
+from mako.compat import quote_plus, unquote_plus, codepoint2name, \
+    name2codepoint
+
+from mako import compat
+
+xml_escapes = {
+    '&': '&amp;',
+    '>': '&gt;',
+    '<': '&lt;',
+    '"': '&#34;',   # also &quot; in html-only
+    "'": '&#39;'    # also &apos; in html-only
+}
+
+# XXX: &#34; is valid in HTML and XML
+#      &#39; is not valid HTML, but is valid XML
+
+def legacy_html_escape(s):
+    """legacy HTML escape for non-unicode mode."""
+    s = s.replace("&", "&amp;")
+    s = s.replace(">", "&gt;")
+    s = s.replace("<", "&lt;")
+    s = s.replace('"', "&quot;")
+    s = s.replace("'", "&#39;")
+    return s
+
+
+try:
+    import markupsafe
+    html_escape = markupsafe.escape
+except ImportError:
+    html_escape = legacy_html_escape
+
+def xml_escape(string):
+    return re.sub(r'([&<"\'>])', lambda m: xml_escapes[m.group()], string)
+
+def url_escape(string):
+    # convert into a list of octets
+    string = string.encode("utf8")
+    return quote_plus(string)
+
+def legacy_url_escape(string):
+    # convert into a list of octets
+    return quote_plus(string)
+
+def url_unescape(string):
+    text = unquote_plus(string)
+    if not is_ascii_str(text):
+        text = text.decode("utf8")
+    return text
+
+def trim(string):
+    return string.strip()
+
+
+class Decode(object):
+    def __getattr__(self, key):
+        def decode(x):
+            if isinstance(x, compat.text_type):
+                return x
+            elif not isinstance(x, compat.binary_type):
+                return decode(str(x))
+            else:
+                return compat.text_type(x, encoding=key)
+        return decode
+decode = Decode()
+
+
+_ASCII_re = re.compile(r'\A[\x00-\x7f]*\Z')
+
+def is_ascii_str(text):
+    return isinstance(text, str) and _ASCII_re.match(text)
+
+################################################################
+
+class XMLEntityEscaper(object):
+    def __init__(self, codepoint2name, name2codepoint):
+        self.codepoint2entity = dict([(c, compat.text_type('&%s;' % n))
+                                      for c, n in codepoint2name.items()])
+        self.name2codepoint = name2codepoint
+
+    def escape_entities(self, text):
+        """Replace characters with their character entity references.
+
+        Only characters corresponding to a named entity are replaced.
+        """
+        return compat.text_type(text).translate(self.codepoint2entity)
+
+    def __escape(self, m):
+        codepoint = ord(m.group())
+        try:
+            return self.codepoint2entity[codepoint]
+        except (KeyError, IndexError):
+            return '&#x%X;' % codepoint
+
+
+    __escapable = re.compile(r'["&<>]|[^\x00-\x7f]')
+
+    def escape(self, text):
+        """Replace characters with their character references.
+
+        Replace characters by their named entity references.
+        Non-ASCII characters, if they do not have a named entity reference,
+        are replaced by numerical character references.
+
+        The return value is guaranteed to be ASCII.
+        """
+        return self.__escapable.sub(self.__escape, compat.text_type(text)
+                                    ).encode('ascii')
+
+    # XXX: This regexp will not match all valid XML entity names__.
+    # (It punts on details involving involving CombiningChars and Extenders.)
+    #
+    # .. __: http://www.w3.org/TR/2000/REC-xml-20001006#NT-EntityRef
+    __characterrefs = re.compile(r'''& (?:
+                                       \#(\d+)
+                                       | \#x([\da-f]+)
+                                       | ( (?!\d) [:\w] [-.:\w]+ )
+                                       ) ;''',
+                                 re.X | re.UNICODE)
+
+    def __unescape(self, m):
+        dval, hval, name = m.groups()
+        if dval:
+            codepoint = int(dval)
+        elif hval:
+            codepoint = int(hval, 16)
+        else:
+            codepoint = self.name2codepoint.get(name, 0xfffd)
+            # U+FFFD = "REPLACEMENT CHARACTER"
+        if codepoint < 128:
+            return chr(codepoint)
+        return chr(codepoint)
+
+    def unescape(self, text):
+        """Unescape character references.
+
+        All character references (both entity references and numerical
+        character references) are unescaped.
+ """ + return self.__characterrefs.sub(self.__unescape, text) + + +_html_entities_escaper = XMLEntityEscaper(codepoint2name, name2codepoint) + +html_entities_escape = _html_entities_escaper.escape_entities +html_entities_unescape = _html_entities_escaper.unescape + + +def htmlentityreplace_errors(ex): + """An encoding error handler. + + This python `codecs`_ error handler replaces unencodable + characters with HTML entities, or, if no HTML entity exists for + the character, XML character references. + + >>> u'The cost was \u20ac12.'.encode('latin1', 'htmlentityreplace') + 'The cost was €12.' + """ + if isinstance(ex, UnicodeEncodeError): + # Handle encoding errors + bad_text = ex.object[ex.start:ex.end] + text = _html_entities_escaper.escape(bad_text) + return (compat.text_type(text), ex.end) + raise ex + +codecs.register_error('htmlentityreplace', htmlentityreplace_errors) + + +# TODO: options to make this dynamic per-compilation will be added in a later +# release +DEFAULT_ESCAPES = { + 'x': 'filters.xml_escape', + 'h': 'filters.html_escape', + 'u': 'filters.url_escape', + 'trim': 'filters.trim', + 'entity': 'filters.html_entities_escape', + 'unicode': 'unicode', + 'decode': 'decode', + 'str': 'str', + 'n': 'n' +} + +if compat.py3k: + DEFAULT_ESCAPES.update({ + 'unicode': 'str' + }) + +NON_UNICODE_ESCAPES = DEFAULT_ESCAPES.copy() +NON_UNICODE_ESCAPES['h'] = 'filters.legacy_html_escape' +NON_UNICODE_ESCAPES['u'] = 'filters.legacy_url_escape' + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py new file mode 100644 index 00000000000..1dda398215d --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lexer.py @@ -0,0 +1,441 @@ +# mako/lexer.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""provides the Lexer class for parsing template strings into parse trees.""" + +import re +import codecs +from mako import parsetree, exceptions, compat +from mako.pygen import adjust_whitespace + +_regexp_cache = {} + +class Lexer(object): + def __init__(self, text, filename=None, + disable_unicode=False, + input_encoding=None, preprocessor=None): + self.text = text + self.filename = filename + self.template = parsetree.TemplateNode(self.filename) + self.matched_lineno = 1 + self.matched_charpos = 0 + self.lineno = 1 + self.match_position = 0 + self.tag = [] + self.control_line = [] + self.ternary_stack = [] + self.disable_unicode = disable_unicode + self.encoding = input_encoding + + if compat.py3k and disable_unicode: + raise exceptions.UnsupportedError( + "Mako for Python 3 does not " + "support disabling Unicode") + + if preprocessor is None: + self.preprocessor = [] + elif not hasattr(preprocessor, '__iter__'): + self.preprocessor = [preprocessor] + else: + self.preprocessor = preprocessor + + @property + def exception_kwargs(self): + return {'source': self.text, + 'lineno': self.matched_lineno, + 'pos': self.matched_charpos, + 'filename': self.filename} + + def match(self, regexp, flags=None): + """compile the given regexp, cache the reg, and call match_reg().""" + + try: + reg = _regexp_cache[(regexp, flags)] + except KeyError: + if flags: + reg = re.compile(regexp, flags) + else: + reg = re.compile(regexp) + _regexp_cache[(regexp, flags)] = reg + + return self.match_reg(reg) + + def match_reg(self, reg): + """match the given regular expression 
object to the current text + position. + + if a match occurs, update the current text and line position. + + """ + + mp = self.match_position + + match = reg.match(self.text, self.match_position) + if match: + (start, end) = match.span() + if end == start: + self.match_position = end + 1 + else: + self.match_position = end + self.matched_lineno = self.lineno + lines = re.findall(r"\n", self.text[mp:self.match_position]) + cp = mp - 1 + while (cp >= 0 and cp < self.textlength and self.text[cp] != '\n'): + cp -= 1 + self.matched_charpos = mp - cp + self.lineno += len(lines) + #print "MATCHED:", match.group(0), "LINE START:", + # self.matched_lineno, "LINE END:", self.lineno + #print "MATCH:", regexp, "\n", self.text[mp : mp + 15], \ + # (match and "TRUE" or "FALSE") + return match + + def parse_until_text(self, *text): + startpos = self.match_position + text_re = r'|'.join(text) + brace_level = 0 + while True: + match = self.match(r'#.*\n') + if match: + continue + match = self.match(r'(\"\"\"|\'\'\'|\"|\')((?<!\\)\\\1|.)*?\1', + re.S) + if match: + continue + match = self.match(r'(%s)' % text_re) + if match: + if match.group(1) == '}' and brace_level > 0: + brace_level -= 1 + continue + return \ + self.text[startpos: + self.match_position - len(match.group(1))],\ + match.group(1) + match = self.match(r"(.*?)(?=\"|\'|#|%s)" % text_re, re.S) + if match: + brace_level += match.group(1).count('{') + brace_level -= match.group(1).count('}') + continue + raise exceptions.SyntaxException( + "Expected: %s" % + ','.join(text), + **self.exception_kwargs) + + def append_node(self, nodecls, *args, **kwargs): + kwargs.setdefault('source', self.text) + kwargs.setdefault('lineno', self.matched_lineno) + kwargs.setdefault('pos', self.matched_charpos) + kwargs['filename'] = self.filename + node = nodecls(*args, **kwargs) + if len(self.tag): + self.tag[-1].nodes.append(node) + else: + self.template.nodes.append(node) + # build a set of child nodes for the control line + # (used for loop variable detection) + # also build a set of child nodes on ternary control lines + # (used for determining if a pass needs to be auto-inserted + if self.control_line: + control_frame = self.control_line[-1] + control_frame.nodes.append(node) + if not (isinstance(node, parsetree.ControlLine) and + control_frame.is_ternary(node.keyword)): + if self.ternary_stack and self.ternary_stack[-1]: + self.ternary_stack[-1][-1].nodes.append(node) + if isinstance(node, parsetree.Tag): + if len(self.tag): + node.parent = self.tag[-1] + self.tag.append(node) + elif isinstance(node, parsetree.ControlLine): + if node.isend: + self.control_line.pop() + self.ternary_stack.pop() + elif node.is_primary: + self.control_line.append(node) + self.ternary_stack.append([]) + elif self.control_line and \ + self.control_line[-1].is_ternary(node.keyword): + self.ternary_stack[-1].append(node) + elif self.control_line and \ + not self.control_line[-1].is_ternary(node.keyword): + raise exceptions.SyntaxException( + "Keyword '%s' not a legal ternary for keyword '%s'" % + (node.keyword, self.control_line[-1].keyword), + **self.exception_kwargs) + + _coding_re = re.compile(r'#.*coding[:=]\s*([-\w.]+).*\r?\n') + + def decode_raw_stream(self, text, decode_raw, known_encoding, filename): + """given string/unicode or bytes/string, determine encoding + from magic encoding comment, return body as unicode + or raw if decode_raw=False + + """ + if isinstance(text, compat.text_type): + m = self._coding_re.match(text) + encoding = m and m.group(1) or known_encoding or 
'ascii' + return encoding, text + + if text.startswith(codecs.BOM_UTF8): + text = text[len(codecs.BOM_UTF8):] + parsed_encoding = 'utf-8' + m = self._coding_re.match(text.decode('utf-8', 'ignore')) + if m is not None and m.group(1) != 'utf-8': + raise exceptions.CompileException( + "Found utf-8 BOM in file, with conflicting " + "magic encoding comment of '%s'" % m.group(1), + text.decode('utf-8', 'ignore'), + 0, 0, filename) + else: + m = self._coding_re.match(text.decode('utf-8', 'ignore')) + if m: + parsed_encoding = m.group(1) + else: + parsed_encoding = known_encoding or 'ascii' + + if decode_raw: + try: + text = text.decode(parsed_encoding) + except UnicodeDecodeError: + raise exceptions.CompileException( + "Unicode decode operation of encoding '%s' failed" % + parsed_encoding, + text.decode('utf-8', 'ignore'), + 0, 0, filename) + + return parsed_encoding, text + + def parse(self): + self.encoding, self.text = self.decode_raw_stream(self.text, + not self.disable_unicode, + self.encoding, + self.filename,) + + for preproc in self.preprocessor: + self.text = preproc(self.text) + + # push the match marker past the + # encoding comment. + self.match_reg(self._coding_re) + + self.textlength = len(self.text) + + while (True): + if self.match_position > self.textlength: + break + + if self.match_end(): + break + if self.match_expression(): + continue + if self.match_control_line(): + continue + if self.match_comment(): + continue + if self.match_tag_start(): + continue + if self.match_tag_end(): + continue + if self.match_python_block(): + continue + if self.match_text(): + continue + + if self.match_position > self.textlength: + break + raise exceptions.CompileException("assertion failed") + + if len(self.tag): + raise exceptions.SyntaxException("Unclosed tag: <%%%s>" % + self.tag[-1].keyword, + **self.exception_kwargs) + if len(self.control_line): + raise exceptions.SyntaxException( + "Unterminated control keyword: '%s'" % + self.control_line[-1].keyword, + self.text, + self.control_line[-1].lineno, + self.control_line[-1].pos, self.filename) + return self.template + + def match_tag_start(self): + match = self.match(r''' + \<% # opening tag + + ([\w\.\:]+) # keyword + + ((?:\s+\w+|\s*=\s*|".*?"|'.*?')*) # attrname, = \ + # sign, string expression + + \s* # more whitespace + + (/)?> # closing + + ''', + + re.I | re.S | re.X) + + if match: + keyword, attr, isend = match.groups() + self.keyword = keyword + attributes = {} + if attr: + for att in re.findall( + r"\s*(\w+)\s*=\s*(?:'([^']*)'|\"([^\"]*)\")", attr): + key, val1, val2 = att + text = val1 or val2 + text = text.replace('\r\n', '\n') + attributes[key] = text + self.append_node(parsetree.Tag, keyword, attributes) + if isend: + self.tag.pop() + else: + if keyword == 'text': + match = self.match(r'(.*?)(?=\</%text>)', re.S) + if not match: + raise exceptions.SyntaxException( + "Unclosed tag: <%%%s>" % + self.tag[-1].keyword, + **self.exception_kwargs) + self.append_node(parsetree.Text, match.group(1)) + return self.match_tag_end() + return True + else: + return False + + def match_tag_end(self): + match = self.match(r'\</%[\t ]*(.+?)[\t ]*>') + if match: + if not len(self.tag): + raise exceptions.SyntaxException( + "Closing tag without opening tag: </%%%s>" % + match.group(1), + **self.exception_kwargs) + elif self.tag[-1].keyword != match.group(1): + raise exceptions.SyntaxException( + "Closing tag </%%%s> does not match tag: <%%%s>" % + (match.group(1), self.tag[-1].keyword), + **self.exception_kwargs) + self.tag.pop() + return True + 
else: + return False + + def match_end(self): + match = self.match(r'\Z', re.S) + if match: + string = match.group() + if string: + return string + else: + return True + else: + return False + + def match_text(self): + match = self.match(r""" + (.*?) # anything, followed by: + ( + (?<=\n)(?=[ \t]*(?=%|\#\#)) # an eval or line-based + # comment preceded by a + # consumed newline and whitespace + | + (?=\${) # an expression + | + (?=</?[%&]) # a substitution or block or call start or end + # - don't consume + | + (\\\r?\n) # an escaped newline - throw away + | + \Z # end of string + )""", re.X | re.S) + + if match: + text = match.group(1) + if text: + self.append_node(parsetree.Text, text) + return True + else: + return False + + def match_python_block(self): + match = self.match(r"<%(!)?") + if match: + line, pos = self.matched_lineno, self.matched_charpos + text, end = self.parse_until_text(r'%>') + # the trailing newline helps + # compiler.parse() not complain about indentation + text = adjust_whitespace(text) + "\n" + self.append_node( + parsetree.Code, + text, + match.group(1) == '!', lineno=line, pos=pos) + return True + else: + return False + + def match_expression(self): + match = self.match(r"\${") + if match: + line, pos = self.matched_lineno, self.matched_charpos + text, end = self.parse_until_text(r'\|', r'}') + if end == '|': + escapes, end = self.parse_until_text(r'}') + else: + escapes = "" + text = text.replace('\r\n', '\n') + self.append_node( + parsetree.Expression, + text, escapes.strip(), + lineno=line, pos=pos) + return True + else: + return False + + def match_control_line(self): + match = self.match( + r"(?<=^)[\t ]*(%(?!%)|##)[\t ]*((?:(?:\\r?\n)|[^\r\n])*)" + r"(?:\r?\n|\Z)", re.M) + if match: + operator = match.group(1) + text = match.group(2) + if operator == '%': + m2 = re.match(r'(end)?(\w+)\s*(.*)', text) + if not m2: + raise exceptions.SyntaxException( + "Invalid control line: '%s'" % + text, + **self.exception_kwargs) + isend, keyword = m2.group(1, 2) + isend = (isend is not None) + + if isend: + if not len(self.control_line): + raise exceptions.SyntaxException( + "No starting keyword '%s' for '%s'" % + (keyword, text), + **self.exception_kwargs) + elif self.control_line[-1].keyword != keyword: + raise exceptions.SyntaxException( + "Keyword '%s' doesn't match keyword '%s'" % + (text, self.control_line[-1].keyword), + **self.exception_kwargs) + self.append_node(parsetree.ControlLine, keyword, isend, text) + else: + self.append_node(parsetree.Comment, text) + return True + else: + return False + + def match_comment(self): + """matches the multiline version of a comment""" + match = self.match(r"<%doc>(.*?)</%doc>", re.S) + if match: + self.append_node(parsetree.Comment, match.group(1)) + return True + else: + return False + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py new file mode 100644 index 00000000000..2af5411907a --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/lookup.py @@ -0,0 +1,359 @@ +# mako/lookup.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +import os, stat, posixpath, re +from mako import exceptions, util +from mako.template import Template + +try: + import threading +except: + import dummy_threading as threading + +class TemplateCollection(object): + """Represent a collection of 
:class:`.Template` objects, + identifiable via URI. + + A :class:`.TemplateCollection` is linked to the usage of + all template tags that address other templates, such + as ``<%include>``, ``<%namespace>``, and ``<%inherit>``. + The ``file`` attribute of each of those tags refers + to a string URI that is passed to that :class:`.Template` + object's :class:`.TemplateCollection` for resolution. + + :class:`.TemplateCollection` is an abstract class, + with the usual default implementation being :class:`.TemplateLookup`. + + """ + + def has_template(self, uri): + """Return ``True`` if this :class:`.TemplateLookup` is + capable of returning a :class:`.Template` object for the + given ``uri``. + + :param uri: String URI of the template to be resolved. + + """ + try: + self.get_template(uri) + return True + except exceptions.TemplateLookupException: + return False + + def get_template(self, uri, relativeto=None): + """Return a :class:`.Template` object corresponding to the given + ``uri``. + + The default implementation raises + :class:`.NotImplementedError`. Implementations should + raise :class:`.TemplateLookupException` if the given ``uri`` + cannot be resolved. + + :param uri: String URI of the template to be resolved. + :param relativeto: if present, the given ``uri`` is assumed to + be relative to this URI. + + """ + raise NotImplementedError() + + def filename_to_uri(self, uri, filename): + """Convert the given ``filename`` to a URI relative to + this :class:`.TemplateCollection`.""" + + return uri + + def adjust_uri(self, uri, filename): + """Adjust the given ``uri`` based on the calling ``filename``. + + When this method is called from the runtime, the + ``filename`` parameter is taken directly to the ``filename`` + attribute of the calling template. Therefore a custom + :class:`.TemplateCollection` subclass can place any string + identifier desired in the ``filename`` parameter of the + :class:`.Template` objects it constructs and have them come back + here. + + """ + return uri + +class TemplateLookup(TemplateCollection): + """Represent a collection of templates that locates template source files + from the local filesystem. + + The primary argument is the ``directories`` argument, the list of + directories to search: + + .. sourcecode:: python + + lookup = TemplateLookup(["/path/to/templates"]) + some_template = lookup.get_template("/index.html") + + The :class:`.TemplateLookup` can also be given :class:`.Template` objects + programatically using :meth:`.put_string` or :meth:`.put_template`: + + .. sourcecode:: python + + lookup = TemplateLookup() + lookup.put_string("base.html", ''' + <html><body>${self.next()}</body></html> + ''') + lookup.put_string("hello.html", ''' + <%include file='base.html'/> + + Hello, world ! + ''') + + + :param directories: A list of directory names which will be + searched for a particular template URI. The URI is appended + to each directory and the filesystem checked. + + :param collection_size: Approximate size of the collection used + to store templates. If left at its default of ``-1``, the size + is unbounded, and a plain Python dictionary is used to + relate URI strings to :class:`.Template` instances. + Otherwise, a least-recently-used cache object is used which + will maintain the size of the collection approximately to + the number given. 
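+
+     For example (an editorial sketch, not part of the upstream
+     docstring), a lookup capped at roughly 500 compiled templates:
+
+     .. sourcecode:: python
+
+         lookup = TemplateLookup(
+             ["/path/to/templates"], collection_size=500)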
+ + :param filesystem_checks: When at its default value of ``True``, + each call to :meth:`.TemplateLookup.get_template()` will + compare the filesystem last modified time to the time in + which an existing :class:`.Template` object was created. + This allows the :class:`.TemplateLookup` to regenerate a + new :class:`.Template` whenever the original source has + been updated. Set this to ``False`` for a very minor + performance increase. + + :param modulename_callable: A callable which, when present, + is passed the path of the source file as well as the + requested URI, and then returns the full path of the + generated Python module file. This is used to inject + alternate schemes for Python module location. If left at + its default of ``None``, the built in system of generation + based on ``module_directory`` plus ``uri`` is used. + + All other keyword parameters available for + :class:`.Template` are mirrored here. When new + :class:`.Template` objects are created, the keywords + established with this :class:`.TemplateLookup` are passed on + to each new :class:`.Template`. + + """ + + def __init__(self, + directories=None, + module_directory=None, + filesystem_checks=True, + collection_size=-1, + format_exceptions=False, + error_handler=None, + disable_unicode=False, + bytestring_passthrough=False, + output_encoding=None, + encoding_errors='strict', + + cache_args=None, + cache_impl='beaker', + cache_enabled=True, + cache_type=None, + cache_dir=None, + cache_url=None, + + modulename_callable=None, + module_writer=None, + default_filters=None, + buffer_filters=(), + strict_undefined=False, + imports=None, + future_imports=None, + enable_loop=True, + input_encoding=None, + preprocessor=None, + lexer_cls=None): + + self.directories = [posixpath.normpath(d) for d in + util.to_list(directories, ()) + ] + self.module_directory = module_directory + self.modulename_callable = modulename_callable + self.filesystem_checks = filesystem_checks + self.collection_size = collection_size + + if cache_args is None: + cache_args = {} + # transfer deprecated cache_* args + if cache_dir: + cache_args.setdefault('dir', cache_dir) + if cache_url: + cache_args.setdefault('url', cache_url) + if cache_type: + cache_args.setdefault('type', cache_type) + + self.template_args = { + 'format_exceptions':format_exceptions, + 'error_handler':error_handler, + 'disable_unicode':disable_unicode, + 'bytestring_passthrough':bytestring_passthrough, + 'output_encoding':output_encoding, + 'cache_impl':cache_impl, + 'encoding_errors':encoding_errors, + 'input_encoding':input_encoding, + 'module_directory':module_directory, + 'module_writer':module_writer, + 'cache_args':cache_args, + 'cache_enabled':cache_enabled, + 'default_filters':default_filters, + 'buffer_filters':buffer_filters, + 'strict_undefined':strict_undefined, + 'imports':imports, + 'future_imports':future_imports, + 'enable_loop':enable_loop, + 'preprocessor':preprocessor, + 'lexer_cls':lexer_cls + } + + if collection_size == -1: + self._collection = {} + self._uri_cache = {} + else: + self._collection = util.LRUCache(collection_size) + self._uri_cache = util.LRUCache(collection_size) + self._mutex = threading.Lock() + + def get_template(self, uri): + """Return a :class:`.Template` object corresponding to the given + ``uri``. + + .. note:: The ``relativeto`` argument is not supported here at the moment. 
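+
+        Usage sketch (editorial, not from the upstream docstring)::
+
+            lookup = TemplateLookup(["/path/to/templates"],
+                                    filesystem_checks=True)
+            template = lookup.get_template("/index.html")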
+ + """ + + try: + if self.filesystem_checks: + return self._check(uri, self._collection[uri]) + else: + return self._collection[uri] + except KeyError: + u = re.sub(r'^\/+', '', uri) + for dir in self.directories: + srcfile = posixpath.normpath(posixpath.join(dir, u)) + if os.path.isfile(srcfile): + return self._load(srcfile, uri) + else: + raise exceptions.TopLevelLookupException( + "Cant locate template for uri %r" % uri) + + def adjust_uri(self, uri, relativeto): + """Adjust the given ``uri`` based on the given relative URI.""" + + key = (uri, relativeto) + if key in self._uri_cache: + return self._uri_cache[key] + + if uri[0] != '/': + if relativeto is not None: + v = self._uri_cache[key] = posixpath.join( + posixpath.dirname(relativeto), uri) + else: + v = self._uri_cache[key] = '/' + uri + else: + v = self._uri_cache[key] = uri + return v + + + def filename_to_uri(self, filename): + """Convert the given ``filename`` to a URI relative to + this :class:`.TemplateCollection`.""" + + try: + return self._uri_cache[filename] + except KeyError: + value = self._relativeize(filename) + self._uri_cache[filename] = value + return value + + def _relativeize(self, filename): + """Return the portion of a filename that is 'relative' + to the directories in this lookup. + + """ + + filename = posixpath.normpath(filename) + for dir in self.directories: + if filename[0:len(dir)] == dir: + return filename[len(dir):] + else: + return None + + def _load(self, filename, uri): + self._mutex.acquire() + try: + try: + # try returning from collection one + # more time in case concurrent thread already loaded + return self._collection[uri] + except KeyError: + pass + try: + if self.modulename_callable is not None: + module_filename = self.modulename_callable(filename, uri) + else: + module_filename = None + self._collection[uri] = template = Template( + uri=uri, + filename=posixpath.normpath(filename), + lookup=self, + module_filename=module_filename, + **self.template_args) + return template + except: + # if compilation fails etc, ensure + # template is removed from collection, + # re-raise + self._collection.pop(uri, None) + raise + finally: + self._mutex.release() + + def _check(self, uri, template): + if template.filename is None: + return template + + try: + template_stat = os.stat(template.filename) + if template.module._modified_time < \ + template_stat[stat.ST_MTIME]: + self._collection.pop(uri, None) + return self._load(template.filename, uri) + else: + return template + except OSError: + self._collection.pop(uri, None) + raise exceptions.TemplateLookupException( + "Cant locate template for uri %r" % uri) + + + def put_string(self, uri, text): + """Place a new :class:`.Template` object into this + :class:`.TemplateLookup`, based on the given string of + ``text``. + + """ + self._collection[uri] = Template( + text, + lookup=self, + uri=uri, + **self.template_args) + + def put_template(self, uri, template): + """Place a new :class:`.Template` object into this + :class:`.TemplateLookup`, based on the given + :class:`.Template` object. 
+ + """ + self._collection[uri] = template + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py new file mode 100644 index 00000000000..49ec4e0696c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/parsetree.py @@ -0,0 +1,594 @@ +# mako/parsetree.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""defines the parse tree components for Mako templates.""" + +from mako import exceptions, ast, util, filters, compat +import re + +class Node(object): + """base class for a Node in the parse tree.""" + + def __init__(self, source, lineno, pos, filename): + self.source = source + self.lineno = lineno + self.pos = pos + self.filename = filename + + @property + def exception_kwargs(self): + return {'source': self.source, 'lineno': self.lineno, + 'pos': self.pos, 'filename': self.filename} + + def get_children(self): + return [] + + def accept_visitor(self, visitor): + def traverse(node): + for n in node.get_children(): + n.accept_visitor(visitor) + + method = getattr(visitor, "visit" + self.__class__.__name__, traverse) + method(self) + +class TemplateNode(Node): + """a 'container' node that stores the overall collection of nodes.""" + + def __init__(self, filename): + super(TemplateNode, self).__init__('', 0, 0, filename) + self.nodes = [] + self.page_attributes = {} + + def get_children(self): + return self.nodes + + def __repr__(self): + return "TemplateNode(%s, %r)" % ( + util.sorted_dict_repr(self.page_attributes), + self.nodes) + +class ControlLine(Node): + """defines a control line, a line-oriented python line or end tag. + + e.g.:: + + % if foo: + (markup) + % endif + + """ + + has_loop_context = False + + def __init__(self, keyword, isend, text, **kwargs): + super(ControlLine, self).__init__(**kwargs) + self.text = text + self.keyword = keyword + self.isend = isend + self.is_primary = keyword in ['for', 'if', 'while', 'try', 'with'] + self.nodes = [] + if self.isend: + self._declared_identifiers = [] + self._undeclared_identifiers = [] + else: + code = ast.PythonFragment(text, **self.exception_kwargs) + self._declared_identifiers = code.declared_identifiers + self._undeclared_identifiers = code.undeclared_identifiers + + def get_children(self): + return self.nodes + + def declared_identifiers(self): + return self._declared_identifiers + + def undeclared_identifiers(self): + return self._undeclared_identifiers + + def is_ternary(self, keyword): + """return true if the given keyword is a ternary keyword + for this ControlLine""" + + return keyword in { + 'if':set(['else', 'elif']), + 'try':set(['except', 'finally']), + 'for':set(['else']) + }.get(self.keyword, []) + + def __repr__(self): + return "ControlLine(%r, %r, %r, %r)" % ( + self.keyword, + self.text, + self.isend, + (self.lineno, self.pos) + ) + +class Text(Node): + """defines plain text in the template.""" + + def __init__(self, content, **kwargs): + super(Text, self).__init__(**kwargs) + self.content = content + + def __repr__(self): + return "Text(%r, %r)" % (self.content, (self.lineno, self.pos)) + +class Code(Node): + """defines a Python code block, either inline or module level. + + e.g.:: + + inline: + <% + x = 12 + %> + + module level: + <%! 
+ import logger + %> + + """ + + def __init__(self, text, ismodule, **kwargs): + super(Code, self).__init__(**kwargs) + self.text = text + self.ismodule = ismodule + self.code = ast.PythonCode(text, **self.exception_kwargs) + + def declared_identifiers(self): + return self.code.declared_identifiers + + def undeclared_identifiers(self): + return self.code.undeclared_identifiers + + def __repr__(self): + return "Code(%r, %r, %r)" % ( + self.text, + self.ismodule, + (self.lineno, self.pos) + ) + +class Comment(Node): + """defines a comment line. + + # this is a comment + + """ + + def __init__(self, text, **kwargs): + super(Comment, self).__init__(**kwargs) + self.text = text + + def __repr__(self): + return "Comment(%r, %r)" % (self.text, (self.lineno, self.pos)) + +class Expression(Node): + """defines an inline expression. + + ${x+y} + + """ + + def __init__(self, text, escapes, **kwargs): + super(Expression, self).__init__(**kwargs) + self.text = text + self.escapes = escapes + self.escapes_code = ast.ArgumentList(escapes, **self.exception_kwargs) + self.code = ast.PythonCode(text, **self.exception_kwargs) + + def declared_identifiers(self): + return [] + + def undeclared_identifiers(self): + # TODO: make the "filter" shortcut list configurable at parse/gen time + return self.code.undeclared_identifiers.union( + self.escapes_code.undeclared_identifiers.difference( + set(filters.DEFAULT_ESCAPES.keys()) + ) + ).difference(self.code.declared_identifiers) + + def __repr__(self): + return "Expression(%r, %r, %r)" % ( + self.text, + self.escapes_code.args, + (self.lineno, self.pos) + ) + +class _TagMeta(type): + """metaclass to allow Tag to produce a subclass according to + its keyword""" + + _classmap = {} + + def __init__(cls, clsname, bases, dict): + if getattr(cls, '__keyword__', None) is not None: + cls._classmap[cls.__keyword__] = cls + super(_TagMeta, cls).__init__(clsname, bases, dict) + + def __call__(cls, keyword, attributes, **kwargs): + if ":" in keyword: + ns, defname = keyword.split(':') + return type.__call__(CallNamespaceTag, ns, defname, + attributes, **kwargs) + + try: + cls = _TagMeta._classmap[keyword] + except KeyError: + raise exceptions.CompileException( + "No such tag: '%s'" % keyword, + source=kwargs['source'], + lineno=kwargs['lineno'], + pos=kwargs['pos'], + filename=kwargs['filename'] + ) + return type.__call__(cls, keyword, attributes, **kwargs) + +class Tag(compat.with_metaclass(_TagMeta, Node)): + """abstract base class for tags. + + <%sometag/> + + <%someothertag> + stuff + </%someothertag> + + """ + __keyword__ = None + + def __init__(self, keyword, attributes, expressions, + nonexpressions, required, **kwargs): + """construct a new Tag instance. + + this constructor not called directly, and is only called + by subclasses. 
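+
+        For example (editorial note, drawn from the subclasses below),
+        ``IncludeTag`` invokes it as::
+
+            super(IncludeTag, self).__init__(
+                keyword, attributes,
+                ('file', 'import', 'args'),
+                (), ('file',), **kwargs)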
+ + :param keyword: the tag keyword + + :param attributes: raw dictionary of attribute key/value pairs + + :param expressions: a set of identifiers that are legal attributes, + which can also contain embedded expressions + + :param nonexpressions: a set of identifiers that are legal + attributes, which cannot contain embedded expressions + + :param \**kwargs: + other arguments passed to the Node superclass (lineno, pos) + + """ + super(Tag, self).__init__(**kwargs) + self.keyword = keyword + self.attributes = attributes + self._parse_attributes(expressions, nonexpressions) + missing = [r for r in required if r not in self.parsed_attributes] + if len(missing): + raise exceptions.CompileException( + "Missing attribute(s): %s" % + ",".join([repr(m) for m in missing]), + **self.exception_kwargs) + self.parent = None + self.nodes = [] + + def is_root(self): + return self.parent is None + + def get_children(self): + return self.nodes + + def _parse_attributes(self, expressions, nonexpressions): + undeclared_identifiers = set() + self.parsed_attributes = {} + for key in self.attributes: + if key in expressions: + expr = [] + for x in re.compile(r'(\${.+?})', + re.S).split(self.attributes[key]): + m = re.compile(r'^\${(.+?)}$', re.S).match(x) + if m: + code = ast.PythonCode(m.group(1).rstrip(), + **self.exception_kwargs) + # we aren't discarding "declared_identifiers" here, + # which we do so that list comprehension-declared + # variables aren't counted. As yet can't find a + # condition that requires it here. + undeclared_identifiers = \ + undeclared_identifiers.union( + code.undeclared_identifiers) + expr.append('(%s)' % m.group(1)) + else: + if x: + expr.append(repr(x)) + self.parsed_attributes[key] = " + ".join(expr) or repr('') + elif key in nonexpressions: + if re.search(r'\${.+?}', self.attributes[key]): + raise exceptions.CompileException( + "Attibute '%s' in tag '%s' does not allow embedded " + "expressions" % (key, self.keyword), + **self.exception_kwargs) + self.parsed_attributes[key] = repr(self.attributes[key]) + else: + raise exceptions.CompileException( + "Invalid attribute for tag '%s': '%s'" % + (self.keyword, key), + **self.exception_kwargs) + self.expression_undeclared_identifiers = undeclared_identifiers + + def declared_identifiers(self): + return [] + + def undeclared_identifiers(self): + return self.expression_undeclared_identifiers + + def __repr__(self): + return "%s(%r, %s, %r, %r)" % (self.__class__.__name__, + self.keyword, + util.sorted_dict_repr(self.attributes), + (self.lineno, self.pos), + self.nodes + ) + +class IncludeTag(Tag): + __keyword__ = 'include' + + def __init__(self, keyword, attributes, **kwargs): + super(IncludeTag, self).__init__( + keyword, + attributes, + ('file', 'import', 'args'), + (), ('file',), **kwargs) + self.page_args = ast.PythonCode( + "__DUMMY(%s)" % attributes.get('args', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return [] + + def undeclared_identifiers(self): + identifiers = self.page_args.undeclared_identifiers.\ + difference(set(["__DUMMY"])).\ + difference(self.page_args.declared_identifiers) + return identifiers.union(super(IncludeTag, self). 
+ undeclared_identifiers()) + +class NamespaceTag(Tag): + __keyword__ = 'namespace' + + def __init__(self, keyword, attributes, **kwargs): + super(NamespaceTag, self).__init__( + keyword, attributes, + ('file',), + ('name','inheritable', + 'import','module'), + (), **kwargs) + + self.name = attributes.get('name', '__anon_%s' % hex(abs(id(self)))) + if not 'name' in attributes and not 'import' in attributes: + raise exceptions.CompileException( + "'name' and/or 'import' attributes are required " + "for <%namespace>", + **self.exception_kwargs) + if 'file' in attributes and 'module' in attributes: + raise exceptions.CompileException( + "<%namespace> may only have one of 'file' or 'module'", + **self.exception_kwargs + ) + + def declared_identifiers(self): + return [] + +class TextTag(Tag): + __keyword__ = 'text' + + def __init__(self, keyword, attributes, **kwargs): + super(TextTag, self).__init__( + keyword, + attributes, (), + ('filter'), (), **kwargs) + self.filter_args = ast.ArgumentList( + attributes.get('filter', ''), + **self.exception_kwargs) + + def undeclared_identifiers(self): + return self.filter_args.\ + undeclared_identifiers.\ + difference(filters.DEFAULT_ESCAPES.keys()).union( + self.expression_undeclared_identifiers + ) + +class DefTag(Tag): + __keyword__ = 'def' + + def __init__(self, keyword, attributes, **kwargs): + expressions = ['buffered', 'cached'] + [ + c for c in attributes if c.startswith('cache_')] + + + super(DefTag, self).__init__( + keyword, + attributes, + expressions, + ('name', 'filter', 'decorator'), + ('name',), + **kwargs) + name = attributes['name'] + if re.match(r'^[\w_]+$', name): + raise exceptions.CompileException( + "Missing parenthesis in %def", + **self.exception_kwargs) + self.function_decl = ast.FunctionDecl("def " + name + ":pass", + **self.exception_kwargs) + self.name = self.function_decl.funcname + self.decorator = attributes.get('decorator', '') + self.filter_args = ast.ArgumentList( + attributes.get('filter', ''), + **self.exception_kwargs) + + is_anonymous = False + is_block = False + + @property + def funcname(self): + return self.function_decl.funcname + + def get_argument_expressions(self, **kw): + return self.function_decl.get_argument_expressions(**kw) + + def declared_identifiers(self): + return self.function_decl.allargnames + + def undeclared_identifiers(self): + res = [] + for c in self.function_decl.defaults: + res += list(ast.PythonCode(c, **self.exception_kwargs). 
+ undeclared_identifiers) + return set(res).union( + self.filter_args.\ + undeclared_identifiers.\ + difference(filters.DEFAULT_ESCAPES.keys()) + ).union( + self.expression_undeclared_identifiers + ).difference( + self.function_decl.allargnames + ) + +class BlockTag(Tag): + __keyword__ = 'block' + + def __init__(self, keyword, attributes, **kwargs): + expressions = ['buffered', 'cached', 'args'] + [ + c for c in attributes if c.startswith('cache_')] + + super(BlockTag, self).__init__( + keyword, + attributes, + expressions, + ('name','filter', 'decorator'), + (), + **kwargs) + name = attributes.get('name') + if name and not re.match(r'^[\w_]+$',name): + raise exceptions.CompileException( + "%block may not specify an argument signature", + **self.exception_kwargs) + if not name and attributes.get('args', None): + raise exceptions.CompileException( + "Only named %blocks may specify args", + **self.exception_kwargs + ) + self.body_decl = ast.FunctionArgs(attributes.get('args', ''), + **self.exception_kwargs) + + self.name = name + self.decorator = attributes.get('decorator', '') + self.filter_args = ast.ArgumentList( + attributes.get('filter', ''), + **self.exception_kwargs) + + + is_block = True + + @property + def is_anonymous(self): + return self.name is None + + @property + def funcname(self): + return self.name or "__M_anon_%d" % (self.lineno, ) + + def get_argument_expressions(self, **kw): + return self.body_decl.get_argument_expressions(**kw) + + def declared_identifiers(self): + return self.body_decl.allargnames + + def undeclared_identifiers(self): + return (self.filter_args.\ + undeclared_identifiers.\ + difference(filters.DEFAULT_ESCAPES.keys()) + ).union(self.expression_undeclared_identifiers) + + + +class CallTag(Tag): + __keyword__ = 'call' + + def __init__(self, keyword, attributes, **kwargs): + super(CallTag, self).__init__(keyword, attributes, + ('args'), ('expr',), ('expr',), **kwargs) + self.expression = attributes['expr'] + self.code = ast.PythonCode(self.expression, **self.exception_kwargs) + self.body_decl = ast.FunctionArgs(attributes.get('args', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return self.code.declared_identifiers.union(self.body_decl.allargnames) + + def undeclared_identifiers(self): + return self.code.undeclared_identifiers.\ + difference(self.code.declared_identifiers) + +class CallNamespaceTag(Tag): + + def __init__(self, namespace, defname, attributes, **kwargs): + super(CallNamespaceTag, self).__init__( + namespace + ":" + defname, + attributes, + tuple(attributes.keys()) + ('args', ), + (), + (), + **kwargs) + + self.expression = "%s.%s(%s)" % ( + namespace, + defname, + ",".join(["%s=%s" % (k, v) for k, v in + self.parsed_attributes.items() + if k != 'args']) + ) + self.code = ast.PythonCode(self.expression, **self.exception_kwargs) + self.body_decl = ast.FunctionArgs( + attributes.get('args', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return self.code.declared_identifiers.union(self.body_decl.allargnames) + + def undeclared_identifiers(self): + return self.code.undeclared_identifiers.\ + difference(self.code.declared_identifiers) + +class InheritTag(Tag): + __keyword__ = 'inherit' + + def __init__(self, keyword, attributes, **kwargs): + super(InheritTag, self).__init__( + keyword, attributes, + ('file',), (), ('file',), **kwargs) + +class PageTag(Tag): + __keyword__ = 'page' + + def __init__(self, keyword, attributes, **kwargs): + expressions = ['cached', 'args', 'expression_filter', 
'enable_loop'] + [ + c for c in attributes if c.startswith('cache_')] + + super(PageTag, self).__init__( + keyword, + attributes, + expressions, + (), + (), + **kwargs) + self.body_decl = ast.FunctionArgs(attributes.get('args', ''), + **self.exception_kwargs) + self.filter_args = ast.ArgumentList( + attributes.get('expression_filter', ''), + **self.exception_kwargs) + + def declared_identifiers(self): + return self.body_decl.allargnames + + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py new file mode 100644 index 00000000000..5ba5125a4c7 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pygen.py @@ -0,0 +1,299 @@ +# mako/pygen.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""utilities for generating and formatting literal Python code.""" + +import re +from mako import exceptions + +class PythonPrinter(object): + def __init__(self, stream): + # indentation counter + self.indent = 0 + + # a stack storing information about why we incremented + # the indentation counter, to help us determine if we + # should decrement it + self.indent_detail = [] + + # the string of whitespace multiplied by the indent + # counter to produce a line + self.indentstring = " " + + # the stream we are writing to + self.stream = stream + + # current line number + self.lineno = 1 + + # a list of lines that represents a buffered "block" of code, + # which can be later printed relative to an indent level + self.line_buffer = [] + + self.in_indent_lines = False + + self._reset_multi_line_flags() + + # mapping of generated python lines to template + # source lines + self.source_map = {} + + def _update_lineno(self, num): + self.lineno += num + + def start_source(self, lineno): + if self.lineno not in self.source_map: + self.source_map[self.lineno] = lineno + + def write_blanks(self, num): + self.stream.write("\n" * num) + self._update_lineno(num) + + def write_indented_block(self, block): + """print a line or lines of python which already contain indentation. + + The indentation of the total block of lines will be adjusted to that of + the current indent level.""" + self.in_indent_lines = False + for l in re.split(r'\r?\n', block): + self.line_buffer.append(l) + self._update_lineno(1) + + def writelines(self, *lines): + """print a series of lines of python.""" + for line in lines: + self.writeline(line) + + def writeline(self, line): + """print a line of python, indenting it according to the current + indent level. + + this also adjusts the indentation counter according to the + content of the line. + + """ + + if not self.in_indent_lines: + self._flush_adjusted_lines() + self.in_indent_lines = True + + if (line is None or + re.match(r"^\s*#",line) or + re.match(r"^\s*$", line) + ): + hastext = False + else: + hastext = True + + is_comment = line and len(line) and line[0] == '#' + + # see if this line should decrease the indentation level + if (not is_comment and + (not hastext or self._is_unindentor(line)) + ): + + if self.indent > 0: + self.indent -= 1 + # if the indent_detail stack is empty, the user + # probably put extra closures - the resulting + # module wont compile. 
+ if len(self.indent_detail) == 0:
+ raise exceptions.SyntaxException(
+ "Too many whitespace closures")
+ self.indent_detail.pop()
+
+ if line is None:
+ return
+
+ # write the line
+ self.stream.write(self._indent_line(line) + "\n")
+ self._update_lineno(len(line.split("\n")))
+
+ # see if this line should increase the indentation level.
+ # note that a line can both decrease (before printing) and
+ # then increase (after printing) the indentation level.
+
+ if re.search(r":[ \t]*(?:#.*)?$", line):
+ # increment indentation count, and also
+ # keep track of what the keyword was that indented us,
+ # if it is a python compound statement keyword
+ # where we might have to look for an "unindent" keyword
+ match = re.match(r"^\s*(if|try|elif|while|for|with)", line)
+ if match:
+ # its a "compound" keyword, so we will check for "unindentors"
+ indentor = match.group(1)
+ self.indent += 1
+ self.indent_detail.append(indentor)
+ else:
+ indentor = None
+ # its not a "compound" keyword. but lets also
+ # test for valid Python keywords that might be indenting us,
+ # else assume its a non-indenting line
+ m2 = re.match(r"^\s*(def|class|else|elif|except|finally)",
+ line)
+ if m2:
+ self.indent += 1
+ self.indent_detail.append(indentor)
+
+ def close(self):
+ """close this printer, flushing any remaining lines."""
+ self._flush_adjusted_lines()
+
+ def _is_unindentor(self, line):
+ """return true if the given line is an 'unindentor',
+ relative to the last 'indent' event received.
+
+ """
+
+ # no indentation detail has been pushed on; return False
+ if len(self.indent_detail) == 0:
+ return False
+
+ indentor = self.indent_detail[-1]
+
+ # the last indent keyword we grabbed is not a
+ # compound statement keyword; return False
+ if indentor is None:
+ return False
+
+ # if the current line doesnt have one of the "unindentor" keywords,
+ # return False
+ match = re.match(r"^\s*(else|elif|except|finally).*\:", line)
+ if not match:
+ return False
+
+ # whitespace matches up, we have a compound indentor,
+ # and this line has an unindentor, this
+ # is probably good enough
+ return True
+
+ # should we decide that its not good enough, heres
+ # more stuff to check.
+ #keyword = match.group(1)
+
+ # match the original indent keyword
+ #for crit in [
+ # (r'if|elif', r'else|elif'),
+ # (r'try', r'except|finally|else'),
+ # (r'while|for', r'else'),
+ #]:
+ # if re.match(crit[0], indentor) and re.match(crit[1], keyword):
+ # return True
+
+ #return False
+
+ def _indent_line(self, line, stripspace=''):
+ """indent the given line according to the current indent level.
+
+ stripspace is a string of space that will be truncated from the
+ start of the line before indenting."""
+
+ return re.sub(r"^%s" % stripspace, self.indentstring
+ * self.indent, line)
+
+ def _reset_multi_line_flags(self):
+ """reset the flags which would indicate we are in a backslashed
+ or triple-quoted section."""
+
+ self.backslashed, self.triplequoted = False, False
+
+ def _in_multi_line(self, line):
+ """return true if the given line is part of a multi-line block,
+ via backslash or triple-quote."""
+
+ # we are only looking for explicitly joined lines here, not
+ # implicit ones (i.e. brackets, braces etc.).
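How the PythonPrinter above is typically driven, as a hedged usage sketch (Python 3; writeline() receives flat strings and the printer infers indentation from trailing colons and unindent keywords such as "else"):

    from io import StringIO

    buf = StringIO()
    printer = PythonPrinter(buf)            # the class defined in this file
    for stmt in ["if x:", "y = 1", "else:", "y = 2"]:
        printer.writeline(stmt)
    printer.close()
    print(buf.getvalue())
    # if x:
    #     y = 1
    # else:
    #     y = 2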
this is just to + # guard against the possibility of modifying the space inside of + # a literal multiline string with unfortunately placed + # whitespace + + current_state = (self.backslashed or self.triplequoted) + + if re.search(r"\\$", line): + self.backslashed = True + else: + self.backslashed = False + + triples = len(re.findall(r"\"\"\"|\'\'\'", line)) + if triples == 1 or triples % 2 != 0: + self.triplequoted = not self.triplequoted + + return current_state + + def _flush_adjusted_lines(self): + stripspace = None + self._reset_multi_line_flags() + + for entry in self.line_buffer: + if self._in_multi_line(entry): + self.stream.write(entry + "\n") + else: + entry = entry.expandtabs() + if stripspace is None and re.search(r"^[ \t]*[^# \t]", entry): + stripspace = re.match(r"^([ \t]*)", entry).group(1) + self.stream.write(self._indent_line(entry, stripspace) + "\n") + + self.line_buffer = [] + self._reset_multi_line_flags() + + +def adjust_whitespace(text): + """remove the left-whitespace margin of a block of Python code.""" + + state = [False, False] + (backslashed, triplequoted) = (0, 1) + + def in_multi_line(line): + start_state = (state[backslashed] or state[triplequoted]) + + if re.search(r"\\$", line): + state[backslashed] = True + else: + state[backslashed] = False + + def match(reg, t): + m = re.match(reg, t) + if m: + return m, t[len(m.group(0)):] + else: + return None, t + + while line: + if state[triplequoted]: + m, line = match(r"%s" % state[triplequoted], line) + if m: + state[triplequoted] = False + else: + m, line = match(r".*?(?=%s|$)" % state[triplequoted], line) + else: + m, line = match(r'#', line) + if m: + return start_state + + m, line = match(r"\"\"\"|\'\'\'", line) + if m: + state[triplequoted] = m.group(0) + continue + + m, line = match(r".*?(?=\"\"\"|\'\'\'|#|$)", line) + + return start_state + + def _indent_line(line, stripspace=''): + return re.sub(r"^%s" % stripspace, '', line) + + lines = [] + stripspace = None + + for line in re.split(r'\r?\n', text): + if in_multi_line(line): + lines.append(line) + else: + line = line.expandtabs() + if stripspace is None and re.search(r"^[ \t]*[^# \t]", line): + stripspace = re.match(r"^([ \t]*)", line).group(1) + lines.append(_indent_line(line, stripspace)) + return "\n".join(lines) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py new file mode 100644 index 00000000000..bfa46a9fafd --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/pyparser.py @@ -0,0 +1,232 @@ +# mako/pyparser.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""Handles parsing of Python code. + +Parsing to AST is done via _ast on Python > 2.5, otherwise the compiler +module is used. 
+""" + +from mako import exceptions, util, compat +from mako.compat import arg_stringname +import operator + +if compat.py3k: + # words that cannot be assigned to (notably + # smaller than the total keys in __builtins__) + reserved = set(['True', 'False', 'None', 'print']) + + # the "id" attribute on a function node + arg_id = operator.attrgetter('arg') +else: + # words that cannot be assigned to (notably + # smaller than the total keys in __builtins__) + reserved = set(['True', 'False', 'None']) + + # the "id" attribute on a function node + arg_id = operator.attrgetter('id') + +import _ast +util.restore__ast(_ast) +from mako import _ast_util + + +def parse(code, mode='exec', **exception_kwargs): + """Parse an expression into AST""" + + try: + return _ast_util.parse(code, '<unknown>', mode) + except Exception: + raise exceptions.SyntaxException( + "(%s) %s (%r)" % ( + compat.exception_as().__class__.__name__, + compat.exception_as(), + code[0:50] + ), **exception_kwargs) + + +class FindIdentifiers(_ast_util.NodeVisitor): + + def __init__(self, listener, **exception_kwargs): + self.in_function = False + self.in_assign_targets = False + self.local_ident_stack = set() + self.listener = listener + self.exception_kwargs = exception_kwargs + + def _add_declared(self, name): + if not self.in_function: + self.listener.declared_identifiers.add(name) + else: + self.local_ident_stack.add(name) + + def visit_ClassDef(self, node): + self._add_declared(node.name) + + def visit_Assign(self, node): + + # flip around the visiting of Assign so the expression gets + # evaluated first, in the case of a clause like "x=x+5" (x + # is undeclared) + + self.visit(node.value) + in_a = self.in_assign_targets + self.in_assign_targets = True + for n in node.targets: + self.visit(n) + self.in_assign_targets = in_a + + if compat.py3k: + + # ExceptHandler is in Python 2, but this block only works in + # Python 3 (and is required there) + + def visit_ExceptHandler(self, node): + if node.name is not None: + self._add_declared(node.name) + if node.type is not None: + self.visit(node.type) + for statement in node.body: + self.visit(statement) + + def visit_Lambda(self, node, *args): + self._visit_function(node, True) + + def visit_FunctionDef(self, node): + self._add_declared(node.name) + self._visit_function(node, False) + + def _expand_tuples(self, args): + for arg in args: + if isinstance(arg, _ast.Tuple): + for n in arg.elts: + yield n + else: + yield arg + + def _visit_function(self, node, islambda): + + # push function state onto stack. dont log any more + # identifiers as "declared" until outside of the function, + # but keep logging identifiers as "undeclared". 
track
+ # argument names in each function header so they arent
+ # counted as "undeclared"
+
+ inf = self.in_function
+ self.in_function = True
+
+ local_ident_stack = self.local_ident_stack
+ self.local_ident_stack = local_ident_stack.union([
+ arg_id(arg) for arg in self._expand_tuples(node.args.args)
+ ])
+ if islambda:
+ self.visit(node.body)
+ else:
+ for n in node.body:
+ self.visit(n)
+ self.in_function = inf
+ self.local_ident_stack = local_ident_stack
+
+ def visit_For(self, node):
+
+ # flip around visit
+
+ self.visit(node.iter)
+ self.visit(node.target)
+ for statement in node.body:
+ self.visit(statement)
+ for statement in node.orelse:
+ self.visit(statement)
+
+ def visit_Name(self, node):
+ if isinstance(node.ctx, _ast.Store):
+ # this is equivalent to visit_AssName in
+ # compiler
+ self._add_declared(node.id)
+ elif node.id not in reserved and node.id \
+ not in self.listener.declared_identifiers and node.id \
+ not in self.local_ident_stack:
+ self.listener.undeclared_identifiers.add(node.id)
+
+ def visit_Import(self, node):
+ for name in node.names:
+ if name.asname is not None:
+ self._add_declared(name.asname)
+ else:
+ self._add_declared(name.name.split('.')[0])
+
+ def visit_ImportFrom(self, node):
+ for name in node.names:
+ if name.asname is not None:
+ self._add_declared(name.asname)
+ else:
+ if name.name == '*':
+ raise exceptions.CompileException(
+ "'import *' is not supported, since all identifier "
+ "names must be explicitly declared. Please use the "
+ "form 'from <modulename> import <name1>, <name2>, "
+ "...' instead.", **self.exception_kwargs)
+ self._add_declared(name.name)
+
+
+class FindTuple(_ast_util.NodeVisitor):
+
+ def __init__(self, listener, code_factory, **exception_kwargs):
+ self.listener = listener
+ self.exception_kwargs = exception_kwargs
+ self.code_factory = code_factory
+
+ def visit_Tuple(self, node):
+ for n in node.elts:
+ p = self.code_factory(n, **self.exception_kwargs)
+ self.listener.codeargs.append(p)
+ self.listener.args.append(ExpressionGenerator(n).value())
+ self.listener.declared_identifiers = \
+ self.listener.declared_identifiers.union(
+ p.declared_identifiers)
+ self.listener.undeclared_identifiers = \
+ self.listener.undeclared_identifiers.union(
+ p.undeclared_identifiers)
+
+
+class ParseFunc(_ast_util.NodeVisitor):
+
+ def __init__(self, listener, **exception_kwargs):
+ self.listener = listener
+ self.exception_kwargs = exception_kwargs
+
+ def visit_FunctionDef(self, node):
+ self.listener.funcname = node.name
+
+ argnames = [arg_id(arg) for arg in node.args.args]
+ if node.args.vararg:
+ argnames.append(arg_stringname(node.args.vararg))
+
+ if compat.py2k:
+ # kw-only args don't exist in Python 2
+ kwargnames = []
+ else:
+ kwargnames = [arg_id(arg) for arg in node.args.kwonlyargs]
+ if node.args.kwarg:
+ kwargnames.append(arg_stringname(node.args.kwarg))
+ self.listener.argnames = argnames
+ self.listener.defaults = node.args.defaults # ast
+ self.listener.kwargnames = kwargnames
+ if compat.py2k:
+ self.listener.kwdefaults = []
+ else:
+ self.listener.kwdefaults = node.args.kw_defaults
+ self.listener.varargs = node.args.vararg
+ self.listener.kwargs = node.args.kwarg
+
+class ExpressionGenerator(object):
+
+ def __init__(self, astnode):
+ self.generator = _ast_util.SourceGenerator(' ' * 4)
+ self.generator.visit(astnode)
+
+ def value(self):
+ return ''.join(self.generator.result)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
new file mode 100644
index 00000000000..6b6a35a9215
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/runtime.py
@@ -0,0 +1,878 @@
+# mako/runtime.py
+# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file>
+#
+# This module is part of Mako and is released under
+# the MIT License: http://www.opensource.org/licenses/mit-license.php
+
+"""provides runtime services for templates, including Context,
+Namespace, and various helper functions."""
+
+from mako import exceptions, util, compat
+from mako.compat import compat_builtins
+import sys
+
+
+class Context(object):
+ """Provides runtime namespace, output buffer, and various
+ callstacks for templates.
+
+ See :ref:`runtime_toplevel` for detail on the usage of
+ :class:`.Context`.
+
+ """
+
+ def __init__(self, buffer, **data):
+ self._buffer_stack = [buffer]
+
+ self._data = data
+
+ self._kwargs = data.copy()
+ self._with_template = None
+ self._outputting_as_unicode = None
+ self.namespaces = {}
+
+ # "capture" function which proxies to the
+ # generic "capture" function
+ self._data['capture'] = compat.partial(capture, self)
+
+ # "caller" stack used by def calls with content
+ self.caller_stack = self._data['caller'] = CallerStack()
+
+ def _set_with_template(self, t):
+ self._with_template = t
+ illegal_names = t.reserved_names.intersection(self._data)
+ if illegal_names:
+ raise exceptions.NameConflictError(
+ "Reserved words passed to render(): %s" %
+ ", ".join(illegal_names))
+
+ @property
+ def lookup(self):
+ """Return the :class:`.TemplateLookup` associated
+ with this :class:`.Context`.
+
+ """
+ return self._with_template.lookup
+
+ @property
+ def kwargs(self):
+ """Return the dictionary of top level keyword arguments associated
+ with this :class:`.Context`.
+
+ This dictionary only includes the top-level arguments passed to
+ :meth:`.Template.render`. It does not include names produced within
+ the template execution such as local variable names or special names
+ such as ``self``, ``next``, etc.
+
+ The purpose of this dictionary is primarily for the case that
+ a :class:`.Template` accepts arguments via its ``<%page>`` tag,
+ which are normally expected to be passed via :meth:`.Template.render`,
+ except the template is being called in an inheritance context,
+ using the ``body()`` method. :attr:`.Context.kwargs` can then be
+ used to propagate these arguments to the inheriting template::
+
+ ${next.body(**context.kwargs)}
+
+ """
+ return self._kwargs.copy()
+
+ def push_caller(self, caller):
+ """Push a ``caller`` callable onto the callstack for
+ this :class:`.Context`."""
+
+
+ self.caller_stack.append(caller)
+
+ def pop_caller(self):
+ """Pop a ``caller`` callable off the callstack for this
+ :class:`.Context`."""
+
+ del self.caller_stack[-1]
+
+ def keys(self):
+ """Return a list of all names established in this :class:`.Context`."""
+
+ return list(self._data.keys())
+
+ def __getitem__(self, key):
+ if key in self._data:
+ return self._data[key]
+ else:
+ return compat_builtins.__dict__[key]
+
+ def _push_writer(self):
+ """push a capturing buffer onto this Context and return
+ the new writer function."""
+
+ buf = util.FastEncodingBuffer()
+ self._buffer_stack.append(buf)
+ return buf.write
+
+ def _pop_buffer_and_writer(self):
+ """pop the most recent capturing buffer from this Context
+ and return the current writer after the pop.
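The buffer stack in action, assuming this vendored package is importable as mako; rendering through an explicit Context like this is documented Mako usage:

    from io import StringIO
    from mako.runtime import Context
    from mako.template import Template

    buf = StringIO()
    t = Template("hello, ${name}!")
    t.render_context(Context(buf, name="mako"))   # writes into buf
    print(buf.getvalue())                         # hello, mako!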
+ + """ + + buf = self._buffer_stack.pop() + return buf, self._buffer_stack[-1].write + + def _push_buffer(self): + """push a capturing buffer onto this Context.""" + + self._push_writer() + + def _pop_buffer(self): + """pop the most recent capturing buffer from this Context.""" + + return self._buffer_stack.pop() + + def get(self, key, default=None): + """Return a value from this :class:`.Context`.""" + + return self._data.get(key, compat_builtins.__dict__.get(key, default)) + + def write(self, string): + """Write a string to this :class:`.Context` object's + underlying output buffer.""" + + self._buffer_stack[-1].write(string) + + def writer(self): + """Return the current writer function.""" + + return self._buffer_stack[-1].write + + def _copy(self): + c = Context.__new__(Context) + c._buffer_stack = self._buffer_stack + c._data = self._data.copy() + c._kwargs = self._kwargs + c._with_template = self._with_template + c._outputting_as_unicode = self._outputting_as_unicode + c.namespaces = self.namespaces + c.caller_stack = self.caller_stack + return c + + def _locals(self, d): + """Create a new :class:`.Context` with a copy of this + :class:`.Context`'s current state, + updated with the given dictionary. + + The :attr:`.Context.kwargs` collection remains + unaffected. + + + """ + + if not d: + return self + c = self._copy() + c._data.update(d) + return c + + def _clean_inheritance_tokens(self): + """create a new copy of this :class:`.Context`. with + tokens related to inheritance state removed.""" + + c = self._copy() + x = c._data + x.pop('self', None) + x.pop('parent', None) + x.pop('next', None) + return c + +class CallerStack(list): + def __init__(self): + self.nextcaller = None + + def __nonzero__(self): + return self.__bool__() + + def __bool__(self): + return len(self) and self._get_caller() and True or False + + def _get_caller(self): + # this method can be removed once + # codegen MAGIC_NUMBER moves past 7 + return self[-1] + + def __getattr__(self, key): + return getattr(self._get_caller(), key) + + def _push_frame(self): + frame = self.nextcaller or None + self.append(frame) + self.nextcaller = None + return frame + + def _pop_frame(self): + self.nextcaller = self.pop() + + +class Undefined(object): + """Represents an undefined value in a template. + + All template modules have a constant value + ``UNDEFINED`` present which is an instance of this + object. + + """ + def __str__(self): + raise NameError("Undefined") + + def __nonzero__(self): + return self.__bool__() + + def __bool__(self): + return False + +UNDEFINED = Undefined() + +class LoopStack(object): + """a stack for LoopContexts that implements the context manager protocol + to automatically pop off the top of the stack on context exit + """ + + def __init__(self): + self.stack = [] + + def _enter(self, iterable): + self._push(iterable) + return self._top + + def _exit(self): + self._pop() + return self._top + + @property + def _top(self): + if self.stack: + return self.stack[-1] + else: + return self + + def _pop(self): + return self.stack.pop() + + def _push(self, iterable): + new = LoopContext(iterable) + if self.stack: + new.parent = self.stack[-1] + return self.stack.append(new) + + def __getattr__(self, key): + raise exceptions.RuntimeException("No loop context is established") + + def __iter__(self): + return iter(self._top) + + +class LoopContext(object): + """A magic loop variable. + Automatically accessible in any ``% for`` block. + + See the section :ref:`loop_context` for usage + notes. 
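The ``loop`` variable described here, exercised end to end (again assuming the vendored mako package is importable):

    from mako.template import Template

    t = Template(
        "% for c in 'abc':\n"
        "${loop.index}: ${c}\n"
        "% endfor\n")
    print(t.render())
    # 0: a
    # 1: b
    # 2: c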
+ + :attr:`parent` -> :class:`.LoopContext` or ``None`` + The parent loop, if one exists. + :attr:`index` -> `int` + The 0-based iteration count. + :attr:`reverse_index` -> `int` + The number of iterations remaining. + :attr:`first` -> `bool` + ``True`` on the first iteration, ``False`` otherwise. + :attr:`last` -> `bool` + ``True`` on the last iteration, ``False`` otherwise. + :attr:`even` -> `bool` + ``True`` when ``index`` is even. + :attr:`odd` -> `bool` + ``True`` when ``index`` is odd. + """ + + def __init__(self, iterable): + self._iterable = iterable + self.index = 0 + self.parent = None + + def __iter__(self): + for i in self._iterable: + yield i + self.index += 1 + + @util.memoized_instancemethod + def __len__(self): + return len(self._iterable) + + @property + def reverse_index(self): + return len(self) - self.index - 1 + + @property + def first(self): + return self.index == 0 + + @property + def last(self): + return self.index == len(self) - 1 + + @property + def even(self): + return not self.odd + + @property + def odd(self): + return bool(self.index % 2) + + def cycle(self, *values): + """Cycle through values as the loop progresses. + """ + if not values: + raise ValueError("You must provide values to cycle through") + return values[self.index % len(values)] + + +class _NSAttr(object): + def __init__(self, parent): + self.__parent = parent + def __getattr__(self, key): + ns = self.__parent + while ns: + if hasattr(ns.module, key): + return getattr(ns.module, key) + else: + ns = ns.inherits + raise AttributeError(key) + +class Namespace(object): + """Provides access to collections of rendering methods, which + can be local, from other templates, or from imported modules. + + To access a particular rendering method referenced by a + :class:`.Namespace`, use plain attribute access: + + .. sourcecode:: mako + + ${some_namespace.foo(x, y, z)} + + :class:`.Namespace` also contains several built-in attributes + described here. + + """ + + def __init__(self, name, context, + callables=None, inherits=None, + populate_self=True, calling_uri=None): + self.name = name + self.context = context + self.inherits = inherits + if callables is not None: + self.callables = dict([(c.__name__, c) for c in callables]) + + callables = () + + module = None + """The Python module referenced by this :class:`.Namespace`. + + If the namespace references a :class:`.Template`, then + this module is the equivalent of ``template.module``, + i.e. the generated module for the template. + + """ + + template = None + """The :class:`.Template` object referenced by this + :class:`.Namespace`, if any. + + """ + + context = None + """The :class:`.Context` object for this :class:`.Namespace`. + + Namespaces are often created with copies of contexts that + contain slightly different data, particularly in inheritance + scenarios. Using the :class:`.Context` off of a :class:`.Namespace` one + can traverse an entire chain of templates that inherit from + one-another. + + """ + + filename = None + """The path of the filesystem file used for this + :class:`.Namespace`'s module or template. + + If this is a pure module-based + :class:`.Namespace`, this evaluates to ``module.__file__``. If a + template-based namespace, it evaluates to the original + template file location. + + """ + + uri = None + """The URI for this :class:`.Namespace`'s template. + + I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. + + This is the equivalent of :attr:`.Template.uri`. 
+ + """ + + _templateuri = None + + @util.memoized_property + def attr(self): + """Access module level attributes by name. + + This accessor allows templates to supply "scalar" + attributes which are particularly handy in inheritance + relationships. + + .. seealso:: + + :ref:`inheritance_attr` + + :ref:`namespace_attr_for_includes` + + """ + return _NSAttr(self) + + def get_namespace(self, uri): + """Return a :class:`.Namespace` corresponding to the given ``uri``. + + If the given ``uri`` is a relative URI (i.e. it does not + contain a leading slash ``/``), the ``uri`` is adjusted to + be relative to the ``uri`` of the namespace itself. This + method is therefore mostly useful off of the built-in + ``local`` namespace, described in :ref:`namespace_local`. + + In + most cases, a template wouldn't need this function, and + should instead use the ``<%namespace>`` tag to load + namespaces. However, since all ``<%namespace>`` tags are + evaluated before the body of a template ever runs, + this method can be used to locate namespaces using + expressions that were generated within the body code of + the template, or to conditionally use a particular + namespace. + + """ + key = (self, uri) + if key in self.context.namespaces: + return self.context.namespaces[key] + else: + ns = TemplateNamespace(uri, self.context._copy(), + templateuri=uri, + calling_uri=self._templateuri) + self.context.namespaces[key] = ns + return ns + + def get_template(self, uri): + """Return a :class:`.Template` from the given ``uri``. + + The ``uri`` resolution is relative to the ``uri`` of this + :class:`.Namespace` object's :class:`.Template`. + + """ + return _lookup_template(self.context, uri, self._templateuri) + + def get_cached(self, key, **kwargs): + """Return a value from the :class:`.Cache` referenced by this + :class:`.Namespace` object's :class:`.Template`. + + The advantage to this method versus direct access to the + :class:`.Cache` is that the configuration parameters + declared in ``<%page>`` take effect here, thereby calling + up the same configured backend as that configured + by ``<%page>``. + + """ + + return self.cache.get(key, **kwargs) + + @property + def cache(self): + """Return the :class:`.Cache` object referenced + by this :class:`.Namespace` object's + :class:`.Template`. 
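A small sketch of namespace-style access to rendering methods, here through the built-in ``local`` namespace of the template itself (documented Mako behavior; the def name is illustrative):

    from mako.template import Template

    t = Template(
        '<%def name="greet(who)">hello ${who}</%def>'
        "${local.greet('namespace')}")
    print(t.render().strip())   # hello namespace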
+ + """ + return self.template.cache + + def include_file(self, uri, **kwargs): + """Include a file at the given ``uri``.""" + + _include_file(self.context, uri, self._templateuri, **kwargs) + + def _populate(self, d, l): + for ident in l: + if ident == '*': + for (k, v) in self._get_star(): + d[k] = v + else: + d[ident] = getattr(self, ident) + + def _get_star(self): + if self.callables: + for key in self.callables: + yield (key, self.callables[key]) + + def __getattr__(self, key): + if key in self.callables: + val = self.callables[key] + elif self.inherits: + val = getattr(self.inherits, key) + else: + raise AttributeError( + "Namespace '%s' has no member '%s'" % + (self.name, key)) + setattr(self, key, val) + return val + +class TemplateNamespace(Namespace): + """A :class:`.Namespace` specific to a :class:`.Template` instance.""" + + def __init__(self, name, context, template=None, templateuri=None, + callables=None, inherits=None, + populate_self=True, calling_uri=None): + self.name = name + self.context = context + self.inherits = inherits + if callables is not None: + self.callables = dict([(c.__name__, c) for c in callables]) + + if templateuri is not None: + self.template = _lookup_template(context, templateuri, + calling_uri) + self._templateuri = self.template.module._template_uri + elif template is not None: + self.template = template + self._templateuri = template.module._template_uri + else: + raise TypeError("'template' argument is required.") + + if populate_self: + lclcallable, lclcontext = \ + _populate_self_namespace(context, self.template, + self_ns=self) + + @property + def module(self): + """The Python module referenced by this :class:`.Namespace`. + + If the namespace references a :class:`.Template`, then + this module is the equivalent of ``template.module``, + i.e. the generated module for the template. + + """ + return self.template.module + + @property + def filename(self): + """The path of the filesystem file used for this + :class:`.Namespace`'s module or template. + """ + return self.template.filename + + @property + def uri(self): + """The URI for this :class:`.Namespace`'s template. + + I.e. whatever was sent to :meth:`.TemplateLookup.get_template()`. + + This is the equivalent of :attr:`.Template.uri`. 
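What a module-based namespace expects on the Python side: module-level callables that take the Context as their first argument and write through it, as ModuleNamespace wires up below. The module name here is hypothetical:

    # helpers.py
    def shout(context, text):
        context.write(text.upper())
        return ''

    # in a template:
    #   <%namespace name="h" module="helpers"/>
    #   ${h.shout('swr')}        renders SWR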
+ + """ + return self.template.uri + + def _get_star(self): + if self.callables: + for key in self.callables: + yield (key, self.callables[key]) + def get(key): + callable_ = self.template._get_def_callable(key) + return compat.partial(callable_, self.context) + for k in self.template.module._exports: + yield (k, get(k)) + + def __getattr__(self, key): + if key in self.callables: + val = self.callables[key] + elif self.template.has_def(key): + callable_ = self.template._get_def_callable(key) + val = compat.partial(callable_, self.context) + elif self.inherits: + val = getattr(self.inherits, key) + + else: + raise AttributeError( + "Namespace '%s' has no member '%s'" % + (self.name, key)) + setattr(self, key, val) + return val + +class ModuleNamespace(Namespace): + """A :class:`.Namespace` specific to a Python module instance.""" + + def __init__(self, name, context, module, + callables=None, inherits=None, + populate_self=True, calling_uri=None): + self.name = name + self.context = context + self.inherits = inherits + if callables is not None: + self.callables = dict([(c.__name__, c) for c in callables]) + + mod = __import__(module) + for token in module.split('.')[1:]: + mod = getattr(mod, token) + self.module = mod + + @property + def filename(self): + """The path of the filesystem file used for this + :class:`.Namespace`'s module or template. + """ + return self.module.__file__ + + def _get_star(self): + if self.callables: + for key in self.callables: + yield (key, self.callables[key]) + for key in dir(self.module): + if key[0] != '_': + callable_ = getattr(self.module, key) + if compat.callable(callable_): + yield key, compat.partial(callable_, self.context) + + + def __getattr__(self, key): + if key in self.callables: + val = self.callables[key] + elif hasattr(self.module, key): + callable_ = getattr(self.module, key) + val = compat.partial(callable_, self.context) + elif self.inherits: + val = getattr(self.inherits, key) + else: + raise AttributeError( + "Namespace '%s' has no member '%s'" % + (self.name, key)) + setattr(self, key, val) + return val + +def supports_caller(func): + """Apply a caller_stack compatibility decorator to a plain + Python function. + + See the example in :ref:`namespaces_python_modules`. + + """ + + def wrap_stackframe(context, *args, **kwargs): + context.caller_stack._push_frame() + try: + return func(context, *args, **kwargs) + finally: + context.caller_stack._pop_frame() + return wrap_stackframe + +def capture(context, callable_, *args, **kwargs): + """Execute the given template def, capturing the output into + a buffer. + + See the example in :ref:`namespaces_python_modules`. + + """ + + if not compat.callable(callable_): + raise exceptions.RuntimeException( + "capture() function expects a callable as " + "its argument (i.e. 
capture(func, *args, **kwargs))" + ) + context._push_buffer() + try: + callable_(*args, **kwargs) + finally: + buf = context._pop_buffer() + return buf.getvalue() + +def _decorate_toplevel(fn): + def decorate_render(render_fn): + def go(context, *args, **kw): + def y(*args, **kw): + return render_fn(context, *args, **kw) + try: + y.__name__ = render_fn.__name__[7:] + except TypeError: + # < Python 2.4 + pass + return fn(y)(context, *args, **kw) + return go + return decorate_render + +def _decorate_inline(context, fn): + def decorate_render(render_fn): + dec = fn(render_fn) + def go(*args, **kw): + return dec(context, *args, **kw) + return go + return decorate_render + +def _include_file(context, uri, calling_uri, **kwargs): + """locate the template from the given uri and include it in + the current output.""" + + template = _lookup_template(context, uri, calling_uri) + (callable_, ctx) = _populate_self_namespace( + context._clean_inheritance_tokens(), + template) + callable_(ctx, **_kwargs_for_include(callable_, context._data, **kwargs)) + +def _inherit_from(context, uri, calling_uri): + """called by the _inherit method in template modules to set + up the inheritance chain at the start of a template's + execution.""" + + if uri is None: + return None + template = _lookup_template(context, uri, calling_uri) + self_ns = context['self'] + ih = self_ns + while ih.inherits is not None: + ih = ih.inherits + lclcontext = context._locals({'next': ih}) + ih.inherits = TemplateNamespace("self:%s" % template.uri, + lclcontext, + template=template, + populate_self=False) + context._data['parent'] = lclcontext._data['local'] = ih.inherits + callable_ = getattr(template.module, '_mako_inherit', None) + if callable_ is not None: + ret = callable_(template, lclcontext) + if ret: + return ret + + gen_ns = getattr(template.module, '_mako_generate_namespaces', None) + if gen_ns is not None: + gen_ns(context) + return (template.callable_, lclcontext) + +def _lookup_template(context, uri, relativeto): + lookup = context._with_template.lookup + if lookup is None: + raise exceptions.TemplateLookupException( + "Template '%s' has no TemplateLookup associated" % + context._with_template.uri) + uri = lookup.adjust_uri(uri, relativeto) + try: + return lookup.get_template(uri) + except exceptions.TopLevelLookupException: + raise exceptions.TemplateLookupException(str(compat.exception_as())) + +def _populate_self_namespace(context, template, self_ns=None): + if self_ns is None: + self_ns = TemplateNamespace('self:%s' % template.uri, + context, template=template, + populate_self=False) + context._data['self'] = context._data['local'] = self_ns + if hasattr(template.module, '_mako_inherit'): + ret = template.module._mako_inherit(template, context) + if ret: + return ret + return (template.callable_, context) + +def _render(template, callable_, args, data, as_unicode=False): + """create a Context and return the string + output of the given template and template callable.""" + + if as_unicode: + buf = util.FastEncodingBuffer(as_unicode=True) + elif template.bytestring_passthrough: + buf = compat.StringIO() + else: + buf = util.FastEncodingBuffer( + as_unicode=as_unicode, + encoding=template.output_encoding, + errors=template.encoding_errors) + context = Context(buf, **data) + context._outputting_as_unicode = as_unicode + context._set_with_template(template) + + _render_context(template, callable_, context, *args, + **_kwargs_for_callable(callable_, data)) + return context._pop_buffer().getvalue() + +def 
_kwargs_for_callable(callable_, data): + argspec = compat.inspect_func_args(callable_) + # for normal pages, **pageargs is usually present + if argspec[2]: + return data + + # for rendering defs from the top level, figure out the args + namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] + kwargs = {} + for arg in namedargs: + if arg != 'context' and arg in data and arg not in kwargs: + kwargs[arg] = data[arg] + return kwargs + +def _kwargs_for_include(callable_, data, **kwargs): + argspec = compat.inspect_func_args(callable_) + namedargs = argspec[0] + [v for v in argspec[1:3] if v is not None] + for arg in namedargs: + if arg != 'context' and arg in data and arg not in kwargs: + kwargs[arg] = data[arg] + return kwargs + +def _render_context(tmpl, callable_, context, *args, **kwargs): + import mako.template as template + # create polymorphic 'self' namespace for this + # template with possibly updated context + if not isinstance(tmpl, template.DefTemplate): + # if main render method, call from the base of the inheritance stack + (inherit, lclcontext) = _populate_self_namespace(context, tmpl) + _exec_template(inherit, lclcontext, args=args, kwargs=kwargs) + else: + # otherwise, call the actual rendering method specified + (inherit, lclcontext) = _populate_self_namespace(context, tmpl.parent) + _exec_template(callable_, context, args=args, kwargs=kwargs) + +def _exec_template(callable_, context, args=None, kwargs=None): + """execute a rendering callable given the callable, a + Context, and optional explicit arguments + + the contextual Template will be located if it exists, and + the error handling options specified on that Template will + be interpreted here. + """ + template = context._with_template + if template is not None and \ + (template.format_exceptions or template.error_handler): + try: + callable_(context, *args, **kwargs) + except Exception: + _render_error(template, context, compat.exception_as()) + except: + e = sys.exc_info()[0] + _render_error(template, context, e) + else: + callable_(context, *args, **kwargs) + +def _render_error(template, context, error): + if template.error_handler: + result = template.error_handler(context, error) + if not result: + compat.reraise(*sys.exc_info()) + else: + error_template = exceptions.html_error_template() + if context._outputting_as_unicode: + context._buffer_stack[:] = [ + util.FastEncodingBuffer(as_unicode=True)] + else: + context._buffer_stack[:] = [util.FastEncodingBuffer( + error_template.output_encoding, + error_template.encoding_errors)] + + context._set_with_template(error_template) + error_template.render_context(context, error=error) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py new file mode 100644 index 00000000000..fb6106289fa --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/template.py @@ -0,0 +1,705 @@ +# mako/template.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +"""Provides the Template class, a facade for parsing, generating and executing +template strings, as well as template runtime operations.""" + +from mako.lexer import Lexer +from mako import runtime, util, exceptions, codegen, cache, compat +import os +import re +import shutil +import stat +import sys +import tempfile +import types +import weakref + + +class Template(object): 
+ """Represents a compiled template. + + :class:`.Template` includes a reference to the original + template source (via the :attr:`.source` attribute) + as well as the source code of the + generated Python module (i.e. the :attr:`.code` attribute), + as well as a reference to an actual Python module. + + :class:`.Template` is constructed using either a literal string + representing the template text, or a filename representing a filesystem + path to a source file. + + :param text: textual template source. This argument is mutually + exclusive versus the ``filename`` parameter. + + :param filename: filename of the source template. This argument is + mutually exclusive versus the ``text`` parameter. + + :param buffer_filters: string list of filters to be applied + to the output of ``%def``\ s which are buffered, cached, or otherwise + filtered, after all filters + defined with the ``%def`` itself have been applied. Allows the + creation of default expression filters that let the output + of return-valued ``%def``\ s "opt out" of that filtering via + passing special attributes or objects. + + :param bytestring_passthrough: When ``True``, and ``output_encoding`` is + set to ``None``, and :meth:`.Template.render` is used to render, + the `StringIO` or `cStringIO` buffer will be used instead of the + default "fast" buffer. This allows raw bytestrings in the + output stream, such as in expressions, to pass straight + through to the buffer. This flag is forced + to ``True`` if ``disable_unicode`` is also configured. + + .. versionadded:: 0.4 + Added to provide the same behavior as that of the previous series. + + :param cache_args: Dictionary of cache configuration arguments that + will be passed to the :class:`.CacheImpl`. See :ref:`caching_toplevel`. + + :param cache_dir: + + .. deprecated:: 0.6 + Use the ``'dir'`` argument in the ``cache_args`` dictionary. + See :ref:`caching_toplevel`. + + :param cache_enabled: Boolean flag which enables caching of this + template. See :ref:`caching_toplevel`. + + :param cache_impl: String name of a :class:`.CacheImpl` caching + implementation to use. Defaults to ``'beaker'``. + + :param cache_type: + + .. deprecated:: 0.6 + Use the ``'type'`` argument in the ``cache_args`` dictionary. + See :ref:`caching_toplevel`. + + :param cache_url: + + .. deprecated:: 0.6 + Use the ``'url'`` argument in the ``cache_args`` dictionary. + See :ref:`caching_toplevel`. + + :param default_filters: List of string filter names that will + be applied to all expressions. See :ref:`filtering_default_filters`. + + :param disable_unicode: Disables all awareness of Python Unicode + objects. See :ref:`unicode_disabled`. + + :param enable_loop: When ``True``, enable the ``loop`` context variable. + This can be set to ``False`` to support templates that may + be making usage of the name "``loop``". Individual templates can + re-enable the "loop" context by placing the directive + ``enable_loop="True"`` inside the ``<%page>`` tag -- see + :ref:`migrating_loop`. + + :param encoding_errors: Error parameter passed to ``encode()`` when + string encoding is performed. See :ref:`usage_unicode`. + + :param error_handler: Python callable which is called whenever + compile or runtime exceptions occur. The callable is passed + the current context as well as the exception. If the + callable returns ``True``, the exception is considered to + be handled, else it is re-raised after the function + completes. Is used to provide custom error-rendering + functions. 
+
+ :param format_exceptions: if ``True``, exceptions which occur during
+ the render phase of this template will be caught and
+ formatted into an HTML error page, which then becomes the
+ rendered result of the :meth:`.render` call. Otherwise,
+ runtime exceptions are propagated outwards.
+
+ :param imports: String list of Python statements, typically individual
+ "import" lines, which will be placed into the module level
+ preamble of all generated Python modules. See the example
+ in :ref:`filtering_default_filters`.
+
+ :param future_imports: String list of names to import from `__future__`.
+ These will be concatenated into a comma-separated string and inserted
+ into the beginning of the template, e.g. ``future_imports=['FOO',
+ 'BAR']`` results in ``from __future__ import FOO, BAR``. If you're
+ interested in using features like the new division operator, you must
+ use future_imports to convey that to the renderer, as otherwise the
+ import will not appear as the first executed statement in the generated
+ code and will therefore not have the desired effect.
+
+ :param input_encoding: Encoding of the template's source code. Can
+ be used in lieu of the coding comment. See
+ :ref:`usage_unicode` as well as :ref:`unicode_toplevel` for
+ details on source encoding.
+
+ :param lookup: a :class:`.TemplateLookup` instance that will be used
+ for all file lookups via the ``<%namespace>``,
+ ``<%include>``, and ``<%inherit>`` tags. See
+ :ref:`usage_templatelookup`.
+
+ :param module_directory: Filesystem location where generated
+ Python module files will be placed.
+
+ :param module_filename: Overrides the filename of the generated
+ Python module file. For advanced usage only.
+
+ :param module_writer: A callable which overrides how the Python
+ module is written entirely. The callable is passed the
+ encoded source content of the module and the destination
+ path to be written to. The default behavior of module writing
+ uses a tempfile in conjunction with a file move in order
+ to make the operation atomic. So a user-defined module
+ writing function that mimics the default behavior would be:
+
+ .. sourcecode:: python
+
+ import tempfile
+ import os
+ import shutil
+
+ def module_writer(source, outputpath):
+ (dest, name) = \\
+ tempfile.mkstemp(
+ dir=os.path.dirname(outputpath)
+ )
+
+ os.write(dest, source)
+ os.close(dest)
+ shutil.move(name, outputpath)
+
+ from mako.template import Template
+ mytemplate = Template(
+ filename="index.html",
+ module_directory="/path/to/modules",
+ module_writer=module_writer
+ )
+
+ The function is provided for unusual configurations where
+ certain platform-specific permissions or other special
+ steps are needed.
+
+ :param output_encoding: The encoding to use when :meth:`.render`
+ is called.
+ See :ref:`usage_unicode` as well as :ref:`unicode_toplevel`.
+
+ :param preprocessor: Python callable which will be passed
+ the full template source before it is parsed. The return
+ result of the callable will be used as the template source
+ code.
+
+ :param lexer_cls: A :class:`.Lexer` class used to parse
+ the template. The :class:`.Lexer` class is used by
+ default.
+
+ .. versionadded:: 0.7.4
+
+ :param strict_undefined: Replaces the automatic usage of
+ ``UNDEFINED`` for any undeclared variables not located in
+ the :class:`.Context` with an immediate raise of
+ ``NameError``. The advantage is immediate reporting of
+ missing variables which include the name.
+
+ ..
versionadded:: 0.3.6 + + :param uri: string URI or other identifier for this template. + If not provided, the ``uri`` is generated from the filesystem + path, or from the in-memory identity of a non-file-based + template. The primary usage of the ``uri`` is to provide a key + within :class:`.TemplateLookup`, as well as to generate the + file path of the generated Python module file, if + ``module_directory`` is specified. + + """ + + lexer_cls = Lexer + + def __init__(self, + text=None, + filename=None, + uri=None, + format_exceptions=False, + error_handler=None, + lookup=None, + output_encoding=None, + encoding_errors='strict', + module_directory=None, + cache_args=None, + cache_impl='beaker', + cache_enabled=True, + cache_type=None, + cache_dir=None, + cache_url=None, + module_filename=None, + input_encoding=None, + disable_unicode=False, + module_writer=None, + bytestring_passthrough=False, + default_filters=None, + buffer_filters=(), + strict_undefined=False, + imports=None, + future_imports=None, + enable_loop=True, + preprocessor=None, + lexer_cls=None): + if uri: + self.module_id = re.sub(r'\W', "_", uri) + self.uri = uri + elif filename: + self.module_id = re.sub(r'\W', "_", filename) + drive, path = os.path.splitdrive(filename) + path = os.path.normpath(path).replace(os.path.sep, "/") + self.uri = path + else: + self.module_id = "memory:" + hex(id(self)) + self.uri = self.module_id + + u_norm = self.uri + if u_norm.startswith("/"): + u_norm = u_norm[1:] + u_norm = os.path.normpath(u_norm) + if u_norm.startswith(".."): + raise exceptions.TemplateLookupException( + "Template uri \"%s\" is invalid - " + "it cannot be relative outside " + "of the root path." % self.uri) + + self.input_encoding = input_encoding + self.output_encoding = output_encoding + self.encoding_errors = encoding_errors + self.disable_unicode = disable_unicode + self.bytestring_passthrough = bytestring_passthrough or disable_unicode + self.enable_loop = enable_loop + self.strict_undefined = strict_undefined + self.module_writer = module_writer + + if compat.py3k and disable_unicode: + raise exceptions.UnsupportedError( + "Mako for Python 3 does not " + "support disabling Unicode") + elif output_encoding and disable_unicode: + raise exceptions.UnsupportedError( + "output_encoding must be set to " + "None when disable_unicode is used.") + if default_filters is None: + if compat.py3k or self.disable_unicode: + self.default_filters = ['str'] + else: + self.default_filters = ['unicode'] + else: + self.default_filters = default_filters + self.buffer_filters = buffer_filters + + self.imports = imports + self.future_imports = future_imports + self.preprocessor = preprocessor + + if lexer_cls is not None: + self.lexer_cls = lexer_cls + + # if plain text, compile code in memory only + if text is not None: + (code, module) = _compile_text(self, text, filename) + self._code = code + self._source = text + ModuleInfo(module, None, self, filename, code, text) + elif filename is not None: + # if template filename and a module directory, load + # a filesystem-based module file, generating if needed + if module_filename is not None: + path = module_filename + elif module_directory is not None: + path = os.path.abspath( + os.path.join( + os.path.normpath(module_directory), + u_norm + ".py" + ) + ) + else: + path = None + module = self._compile_from_file(path, filename) + else: + raise exceptions.RuntimeException( + "Template requires text or filename") + + self.module = module + self.filename = filename + self.callable_ = 
self.module.render_body + self.format_exceptions = format_exceptions + self.error_handler = error_handler + self.lookup = lookup + + self.module_directory = module_directory + + self._setup_cache_args( + cache_impl, cache_enabled, cache_args, + cache_type, cache_dir, cache_url + ) + + + @util.memoized_property + def reserved_names(self): + if self.enable_loop: + return codegen.RESERVED_NAMES + else: + return codegen.RESERVED_NAMES.difference(['loop']) + + def _setup_cache_args(self, + cache_impl, cache_enabled, cache_args, + cache_type, cache_dir, cache_url): + self.cache_impl = cache_impl + self.cache_enabled = cache_enabled + if cache_args: + self.cache_args = cache_args + else: + self.cache_args = {} + + # transfer deprecated cache_* args + if cache_type: + self.cache_args['type'] = cache_type + if cache_dir: + self.cache_args['dir'] = cache_dir + if cache_url: + self.cache_args['url'] = cache_url + + def _compile_from_file(self, path, filename): + if path is not None: + util.verify_directory(os.path.dirname(path)) + filemtime = os.stat(filename)[stat.ST_MTIME] + if not os.path.exists(path) or \ + os.stat(path)[stat.ST_MTIME] < filemtime: + data = util.read_file(filename) + _compile_module_file( + self, + data, + filename, + path, + self.module_writer) + module = compat.load_module(self.module_id, path) + del sys.modules[self.module_id] + if module._magic_number != codegen.MAGIC_NUMBER: + data = util.read_file(filename) + _compile_module_file( + self, + data, + filename, + path, + self.module_writer) + module = compat.load_module(self.module_id, path) + del sys.modules[self.module_id] + ModuleInfo(module, path, self, filename, None, None) + else: + # template filename and no module directory, compile code + # in memory + data = util.read_file(filename) + code, module = _compile_text( + self, + data, + filename) + self._source = None + self._code = code + ModuleInfo(module, None, self, filename, code, None) + return module + + @property + def source(self): + """Return the template source code for this :class:`.Template`.""" + + return _get_module_info_from_callable(self.callable_).source + + @property + def code(self): + """Return the module source code for this :class:`.Template`.""" + + return _get_module_info_from_callable(self.callable_).code + + @util.memoized_property + def cache(self): + return cache.Cache(self) + + @property + def cache_dir(self): + return self.cache_args['dir'] + @property + def cache_url(self): + return self.cache_args['url'] + @property + def cache_type(self): + return self.cache_args['type'] + + def render(self, *args, **data): + """Render the output of this template as a string. + + If the template specifies an output encoding, the string + will be encoded accordingly, else the output is raw (raw + output uses `cStringIO` and can't handle multibyte + characters). A :class:`.Context` object is created corresponding + to the given data. Arguments that are explicitly declared + by this template's internal rendering method are also + pulled from the given ``*args``, ``**data`` members. + + """ + return runtime._render(self, self.callable_, args, data) + + def render_unicode(self, *args, **data): + """Render the output of this template as a unicode object.""" + + return runtime._render(self, + self.callable_, + args, + data, + as_unicode=True) + + def render_context(self, context, *args, **kwargs): + """Render this :class:`.Template` with the given context. + + The data is written to the context's buffer. 
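+
+        A minimal usage sketch (``Context(buffer, **data)`` is the public
+        constructor from :mod:`mako.runtime`; the ``StringIO`` buffer is
+        only an illustration):
+
+        .. sourcecode:: python
+
+            from io import StringIO
+            from mako.runtime import Context
+            from mako.template import Template
+
+            template = Template("hello, ${name}!")
+            buf = StringIO()
+            template.render_context(Context(buf, name="world"))
+            print(buf.getvalue())   # -> hello, world!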
+ + """ + if getattr(context, '_with_template', None) is None: + context._set_with_template(self) + runtime._render_context(self, + self.callable_, + context, + *args, + **kwargs) + + def has_def(self, name): + return hasattr(self.module, "render_%s" % name) + + def get_def(self, name): + """Return a def of this template as a :class:`.DefTemplate`.""" + + return DefTemplate(self, getattr(self.module, "render_%s" % name)) + + def _get_def_callable(self, name): + return getattr(self.module, "render_%s" % name) + + @property + def last_modified(self): + return self.module._modified_time + +class ModuleTemplate(Template): + """A Template which is constructed given an existing Python module. + + e.g.:: + + t = Template("this is a template") + f = file("mymodule.py", "w") + f.write(t.code) + f.close() + + import mymodule + + t = ModuleTemplate(mymodule) + print t.render() + + """ + + def __init__(self, module, + module_filename=None, + template=None, + template_filename=None, + module_source=None, + template_source=None, + output_encoding=None, + encoding_errors='strict', + disable_unicode=False, + bytestring_passthrough=False, + format_exceptions=False, + error_handler=None, + lookup=None, + cache_args=None, + cache_impl='beaker', + cache_enabled=True, + cache_type=None, + cache_dir=None, + cache_url=None, + ): + self.module_id = re.sub(r'\W', "_", module._template_uri) + self.uri = module._template_uri + self.input_encoding = module._source_encoding + self.output_encoding = output_encoding + self.encoding_errors = encoding_errors + self.disable_unicode = disable_unicode + self.bytestring_passthrough = bytestring_passthrough or disable_unicode + self.enable_loop = module._enable_loop + + if compat.py3k and disable_unicode: + raise exceptions.UnsupportedError( + "Mako for Python 3 does not " + "support disabling Unicode") + elif output_encoding and disable_unicode: + raise exceptions.UnsupportedError( + "output_encoding must be set to " + "None when disable_unicode is used.") + + self.module = module + self.filename = template_filename + ModuleInfo(module, + module_filename, + self, + template_filename, + module_source, + template_source) + + self.callable_ = self.module.render_body + self.format_exceptions = format_exceptions + self.error_handler = error_handler + self.lookup = lookup + self._setup_cache_args( + cache_impl, cache_enabled, cache_args, + cache_type, cache_dir, cache_url + ) + +class DefTemplate(Template): + """A :class:`.Template` which represents a callable def in a parent + template.""" + + def __init__(self, parent, callable_): + self.parent = parent + self.callable_ = callable_ + self.output_encoding = parent.output_encoding + self.module = parent.module + self.encoding_errors = parent.encoding_errors + self.format_exceptions = parent.format_exceptions + self.error_handler = parent.error_handler + self.enable_loop = parent.enable_loop + self.lookup = parent.lookup + self.bytestring_passthrough = parent.bytestring_passthrough + + def get_def(self, name): + return self.parent.get_def(name) + +class ModuleInfo(object): + """Stores information about a module currently loaded into + memory, provides reverse lookups of template source, module + source code based on a module's identifier. 
+ + """ + _modules = weakref.WeakValueDictionary() + + def __init__(self, + module, + module_filename, + template, + template_filename, + module_source, + template_source): + self.module = module + self.module_filename = module_filename + self.template_filename = template_filename + self.module_source = module_source + self.template_source = template_source + self._modules[module.__name__] = template._mmarker = self + if module_filename: + self._modules[module_filename] = self + + @classmethod + def get_module_source_metadata(cls, module_source, full_line_map=False): + source_map = re.search( + r"__M_BEGIN_METADATA(.+?)__M_END_METADATA", + module_source, re.S).group(1) + source_map = compat.json.loads(source_map) + source_map['line_map'] = dict((int(k), int(v)) + for k, v in source_map['line_map'].items()) + if full_line_map: + f_line_map = source_map['full_line_map'] = [] + line_map = source_map['line_map'] + + curr_templ_line = 1 + for mod_line in range(1, max(line_map)): + if mod_line in line_map: + curr_templ_line = line_map[mod_line] + f_line_map.append(curr_templ_line) + return source_map + + @property + def code(self): + if self.module_source is not None: + return self.module_source + else: + return util.read_python_file(self.module_filename) + + @property + def source(self): + if self.template_source is not None: + if self.module._source_encoding and \ + not isinstance(self.template_source, compat.text_type): + return self.template_source.decode( + self.module._source_encoding) + else: + return self.template_source + else: + data = util.read_file(self.template_filename) + if self.module._source_encoding: + return data.decode(self.module._source_encoding) + else: + return data + +def _compile(template, text, filename, generate_magic_comment): + lexer = template.lexer_cls(text, + filename, + disable_unicode=template.disable_unicode, + input_encoding=template.input_encoding, + preprocessor=template.preprocessor) + node = lexer.parse() + source = codegen.compile(node, + template.uri, + filename, + default_filters=template.default_filters, + buffer_filters=template.buffer_filters, + imports=template.imports, + future_imports=template.future_imports, + source_encoding=lexer.encoding, + generate_magic_comment=generate_magic_comment, + disable_unicode=template.disable_unicode, + strict_undefined=template.strict_undefined, + enable_loop=template.enable_loop, + reserved_names=template.reserved_names) + return source, lexer + +def _compile_text(template, text, filename): + identifier = template.module_id + source, lexer = _compile(template, text, filename, + generate_magic_comment=template.disable_unicode) + + cid = identifier + if not compat.py3k and isinstance(cid, compat.text_type): + cid = cid.encode() + module = types.ModuleType(cid) + code = compile(source, cid, 'exec') + + # this exec() works for 2.4->3.3. + exec(code, module.__dict__, module.__dict__) + return (source, module) + +def _compile_module_file(template, text, filename, outputpath, module_writer): + source, lexer = _compile(template, text, filename, + generate_magic_comment=True) + + if isinstance(source, compat.text_type): + source = source.encode(lexer.encoding or 'ascii') + + if module_writer: + module_writer(source, outputpath) + else: + # make tempfiles in the same location as the ultimate + # location. this ensures they're on the same filesystem, + # avoiding synchronization issues. 
+ (dest, name) = tempfile.mkstemp(dir=os.path.dirname(outputpath)) + + os.write(dest, source) + os.close(dest) + shutil.move(name, outputpath) + +def _get_module_info_from_callable(callable_): + if compat.py3k: + return _get_module_info(callable_.__globals__['__name__']) + else: + return _get_module_info(callable_.func_globals['__name__']) + +def _get_module_info(filename): + return ModuleInfo._modules[filename] + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py new file mode 100644 index 00000000000..cba2ab7920c --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/mako/util.py @@ -0,0 +1,360 @@ +# mako/util.py +# Copyright (C) 2006-2015 the Mako authors and contributors <see AUTHORS file> +# +# This module is part of Mako and is released under +# the MIT License: http://www.opensource.org/licenses/mit-license.php + +import re +import collections +import codecs +import os +from mako import compat +import operator + +def update_wrapper(decorated, fn): + decorated.__wrapped__ = fn + decorated.__name__ = fn.__name__ + return decorated + + +class PluginLoader(object): + def __init__(self, group): + self.group = group + self.impls = {} + + def load(self, name): + if name in self.impls: + return self.impls[name]() + else: + import pkg_resources + for impl in pkg_resources.iter_entry_points( + self.group, + name): + self.impls[name] = impl.load + return impl.load() + else: + from mako import exceptions + raise exceptions.RuntimeException( + "Can't load plugin %s %s" % + (self.group, name)) + + def register(self, name, modulepath, objname): + def load(): + mod = __import__(modulepath) + for token in modulepath.split(".")[1:]: + mod = getattr(mod, token) + return getattr(mod, objname) + self.impls[name] = load + +def verify_directory(dir): + """create and/or verify a filesystem directory.""" + + tries = 0 + + while not os.path.exists(dir): + try: + tries += 1 + os.makedirs(dir, compat.octal("0775")) + except: + if tries > 5: + raise + +def to_list(x, default=None): + if x is None: + return default + if not isinstance(x, (list, tuple)): + return [x] + else: + return x + + +class memoized_property(object): + """A read-only @property that is only evaluated once.""" + def __init__(self, fget, doc=None): + self.fget = fget + self.__doc__ = doc or fget.__doc__ + self.__name__ = fget.__name__ + + def __get__(self, obj, cls): + if obj is None: + return self + obj.__dict__[self.__name__] = result = self.fget(obj) + return result + +class memoized_instancemethod(object): + """Decorate a method to memoize its return value. + + Best applied to no-arg methods: memoization is not sensitive to + argument values, and will always return the same value even when + called with different arguments. + + """ + def __init__(self, fget, doc=None): + self.fget = fget + self.__doc__ = doc or fget.__doc__ + self.__name__ = fget.__name__ + + def __get__(self, obj, cls): + if obj is None: + return self + def oneshot(*args, **kw): + result = self.fget(obj, *args, **kw) + memo = lambda *a, **kw: result + memo.__name__ = self.__name__ + memo.__doc__ = self.__doc__ + obj.__dict__[self.__name__] = memo + return result + oneshot.__name__ = self.__name__ + oneshot.__doc__ = self.__doc__ + return oneshot + +class SetLikeDict(dict): + """a dictionary that has some setlike methods on it""" + def union(self, other): + """produce a 'union' of this dict and another (at the key level).
+ + values in the second dict take precedence over that of the first""" + x = SetLikeDict(**self) + x.update(other) + return x + +class FastEncodingBuffer(object): + """a very rudimentary buffer that is faster than StringIO, + but doesn't crash on unicode data like cStringIO.""" + + def __init__(self, encoding=None, errors='strict', as_unicode=False): + self.data = collections.deque() + self.encoding = encoding + if as_unicode: + self.delim = compat.u('') + else: + self.delim = '' + self.as_unicode = as_unicode + self.errors = errors + self.write = self.data.append + + def truncate(self): + self.data = collections.deque() + self.write = self.data.append + + def getvalue(self): + if self.encoding: + return self.delim.join(self.data).encode(self.encoding, + self.errors) + else: + return self.delim.join(self.data) + +class LRUCache(dict): + """A dictionary-like object that stores a limited number of items, + discarding lesser used items periodically. + + this is a rewrite of LRUCache from Myghty to use a periodic timestamp-based + paradigm so that synchronization is not really needed. the size management + is inexact. + """ + + class _Item(object): + def __init__(self, key, value): + self.key = key + self.value = value + self.timestamp = compat.time_func() + def __repr__(self): + return repr(self.value) + + def __init__(self, capacity, threshold=.5): + self.capacity = capacity + self.threshold = threshold + + def __getitem__(self, key): + item = dict.__getitem__(self, key) + item.timestamp = compat.time_func() + return item.value + + def values(self): + return [i.value for i in dict.values(self)] + + def setdefault(self, key, value): + if key in self: + return self[key] + else: + self[key] = value + return value + + def __setitem__(self, key, value): + item = dict.get(self, key) + if item is None: + item = self._Item(key, value) + dict.__setitem__(self, key, item) + else: + item.value = value + self._manage_size() + + def _manage_size(self): + while len(self) > self.capacity + self.capacity * self.threshold: + bytime = sorted(dict.values(self), + key=operator.attrgetter('timestamp'), reverse=True) + for item in bytime[self.capacity:]: + try: + del self[item.key] + except KeyError: + # if we couldn't find a key, most likely some other thread + # broke in on us. loop around and try again + break + +# Regexp to match python magic encoding line +_PYTHON_MAGIC_COMMENT_re = re.compile( + r'[ \t\f]* \# .* coding[=:][ \t]*([-\w.]+)', + re.VERBOSE) + +def parse_encoding(fp): + """Deduce the encoding of a Python source file (binary mode) from magic + comment. + + It does this in the same way as the `Python interpreter`__ + + .. __: http://docs.python.org/ref/encodings.html + + The ``fp`` argument should be a seekable file object in binary mode. + """ + pos = fp.tell() + fp.seek(0) + try: + line1 = fp.readline() + has_bom = line1.startswith(codecs.BOM_UTF8) + if has_bom: + line1 = line1[len(codecs.BOM_UTF8):] + + m = _PYTHON_MAGIC_COMMENT_re.match(line1.decode('ascii', 'ignore')) + if not m: + try: + import parser + parser.suite(line1.decode('ascii', 'ignore')) + except (ImportError, SyntaxError): + # Either it's a real syntax error, in which case the source + # is not valid python source, or line2 is a continuation of + # line1, in which case we don't want to scan line2 for a magic + # comment. 
+ pass + else: + line2 = fp.readline() + m = _PYTHON_MAGIC_COMMENT_re.match( + line2.decode('ascii', 'ignore')) + + if has_bom: + if m: + raise SyntaxError("python refuses to compile code with both a UTF8" \ + " byte-order-mark and a magic encoding comment") + return 'utf_8' + elif m: + return m.group(1) + else: + return None + finally: + fp.seek(pos) + +def sorted_dict_repr(d): + """repr() a dictionary with the keys in order. + + Used by the lexer unit test to compare parse trees based on strings. + + """ + keys = list(d.keys()) + keys.sort() + return "{" + ", ".join(["%r: %r" % (k, d[k]) for k in keys]) + "}" + +def restore__ast(_ast): + """Attempt to restore the required classes to the _ast module if it + appears to be missing them + """ + if hasattr(_ast, 'AST'): + return + _ast.PyCF_ONLY_AST = 2 << 9 + m = compile("""\ +def foo(): pass +class Bar(object): pass +if False: pass +baz = 'mako' +1 + 2 - 3 * 4 / 5 +6 // 7 % 8 << 9 >> 10 +11 & 12 ^ 13 | 14 +15 and 16 or 17 +-baz + (not +18) - ~17 +baz and 'foo' or 'bar' +(mako is baz == baz) is not baz != mako +mako > baz < mako >= baz <= mako +mako in baz not in mako""", '<unknown>', 'exec', _ast.PyCF_ONLY_AST) + _ast.Module = type(m) + + for cls in _ast.Module.__mro__: + if cls.__name__ == 'mod': + _ast.mod = cls + elif cls.__name__ == 'AST': + _ast.AST = cls + + _ast.FunctionDef = type(m.body[0]) + _ast.ClassDef = type(m.body[1]) + _ast.If = type(m.body[2]) + + _ast.Name = type(m.body[3].targets[0]) + _ast.Store = type(m.body[3].targets[0].ctx) + _ast.Str = type(m.body[3].value) + + _ast.Sub = type(m.body[4].value.op) + _ast.Add = type(m.body[4].value.left.op) + _ast.Div = type(m.body[4].value.right.op) + _ast.Mult = type(m.body[4].value.right.left.op) + + _ast.RShift = type(m.body[5].value.op) + _ast.LShift = type(m.body[5].value.left.op) + _ast.Mod = type(m.body[5].value.left.left.op) + _ast.FloorDiv = type(m.body[5].value.left.left.left.op) + + _ast.BitOr = type(m.body[6].value.op) + _ast.BitXor = type(m.body[6].value.left.op) + _ast.BitAnd = type(m.body[6].value.left.left.op) + + _ast.Or = type(m.body[7].value.op) + _ast.And = type(m.body[7].value.values[0].op) + + _ast.Invert = type(m.body[8].value.right.op) + _ast.Not = type(m.body[8].value.left.right.op) + _ast.UAdd = type(m.body[8].value.left.right.operand.op) + _ast.USub = type(m.body[8].value.left.left.op) + + _ast.Or = type(m.body[9].value.op) + _ast.And = type(m.body[9].value.values[0].op) + + _ast.IsNot = type(m.body[10].value.ops[0]) + _ast.NotEq = type(m.body[10].value.ops[1]) + _ast.Is = type(m.body[10].value.left.ops[0]) + _ast.Eq = type(m.body[10].value.left.ops[1]) + + _ast.Gt = type(m.body[11].value.ops[0]) + _ast.Lt = type(m.body[11].value.ops[1]) + _ast.GtE = type(m.body[11].value.ops[2]) + _ast.LtE = type(m.body[11].value.ops[3]) + + _ast.In = type(m.body[12].value.ops[0]) + _ast.NotIn = type(m.body[12].value.ops[1]) + + + +def read_file(path, mode='rb'): + fp = open(path, mode) + try: + data = fp.read() + return data + finally: + fp.close() + +def read_python_file(path): + fp = open(path, "rb") + try: + encoding = parse_encoding(fp) + data = fp.read() + if encoding: + data = data.decode(encoding) + return data + finally: + fp.close() + diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template new file mode 100644 index 00000000000..922117e7e16 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template @@ -0,0 +1,141 @@ +<% + max_len = 0 + for 
knob in knobs: + if len(knob[0]) > max_len: max_len = len(knob[0]) + max_len += len('KNOB_ ') + if max_len % 4: max_len += 4 - (max_len % 4) + + def space_knob(knob): + knob_len = len('KNOB_' + knob) + return ' '*(max_len - knob_len) +%>/****************************************************************************** +* +* Copyright 2015 +* Intel Corporation +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http ://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +* +% if gen_header: +* @file ${filename}.h +% else: +* @file ${filename}.cpp +% endif +* +* @brief Dynamic Knobs for Core. +* +* ======================= AUTO GENERATED: DO NOT EDIT !!! ==================== +* +******************************************************************************/ +%if gen_header: +#pragma once +#include <string> + +template <typename T> +struct Knob +{ + const T& Value() const { return m_Value; } + const T& Value(const T& newValue) { m_Value = newValue; return Value(); } + +protected: + Knob(const T& defaultValue) : m_Value(defaultValue) {} + +private: + T m_Value; +}; + +#define DEFINE_KNOB(_name, _type, _default) \\ + + struct Knob_##_name : Knob<_type> \\ + + { \\ + + Knob_##_name() : Knob<_type>(_default) { } \\ + + static const char* Name() { return "KNOB_" #_name; } \\ + + } _name; + +#define GET_KNOB(_name) g_GlobalKnobs._name.Value() +#define SET_KNOB(_name, _newValue) g_GlobalKnobs._name.Value(_newValue) + +struct GlobalKnobs +{ + % for knob in knobs: + //----------------------------------------------------------- + // KNOB_${knob[0]} + // + % for line in knob[1]['desc']: + // ${line} + % endfor + DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']}); + + % endfor + GlobalKnobs(); + std::string ToString(const char* optPerLinePrefix=""); +}; +extern GlobalKnobs g_GlobalKnobs; + +% for knob in knobs: +#define KNOB_${knob[0]}${space_knob(knob[0])}GET_KNOB(${knob[0]}) +% endfor + + +% else: +% for inc in includes: +#include <${inc}> +% endfor + +//======================================================== +// Static Data Members +//======================================================== +GlobalKnobs g_GlobalKnobs; + +//======================================================== +// Knob Initialization +//======================================================== +GlobalKnobs::GlobalKnobs() +{ + % for knob in knobs: + InitKnob(${knob[0]}); + % endfor + +} + +//======================================================== +// Knob Display (Convert to String) +//======================================================== +std::string GlobalKnobs::ToString(const char* optPerLinePrefix) +{ + std::basic_stringstream<char> str; + str << std::showbase << std::setprecision(1) << std::fixed; + + if (optPerLinePrefix == nullptr) { optPerLinePrefix = ""; } + + % for knob in knobs: + str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}"; + % if knob[1]['type'] == 'bool': + str << (KNOB_${knob[0]} ? 
"+\n" : "-\n"); + % elif knob[1]['type'] != 'float': + str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]}; + str << std::dec << KNOB_${knob[0]} << "\n"; + % else: + str << KNOB_${knob[0]} << "\n"; + % endif + % endfor + str << std::ends; + + return str.str(); +} + + +% endif diff --git a/src/gallium/drivers/swr/swr_clear.cpp b/src/gallium/drivers/swr/swr_clear.cpp new file mode 100644 index 00000000000..103bca99441 --- /dev/null +++ b/src/gallium/drivers/swr/swr_clear.cpp @@ -0,0 +1,142 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "swr_context.h" +#include "swr_query.h" + +static void +swr_clear(struct pipe_context *pipe, + unsigned buffers, + const union pipe_color_union *color, + double depth, + unsigned stencil) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + + UINT clearMask = 0; + + if (!swr_check_render_cond(pipe)) + return; + + if (ctx->dirty) + swr_update_derived(pipe); + +/* Update clearMask/targetMask */ +#if 0 /* XXX SWR currently only clears SWR_ATTACHMENT_COLOR0, don't bother \ + checking others yet. */ + if (buffers & PIPE_CLEAR_COLOR && fb->nr_cbufs) { + UINT i; + for (i = 0; i < fb->nr_cbufs; ++i) + if (fb->cbufs[i]) + clearMask |= (SWR_CLEAR_COLOR0 << i); + } +#else + if (buffers & PIPE_CLEAR_COLOR && fb->cbufs[0]) + clearMask |= SWR_CLEAR_COLOR; +#endif + + if (buffers & PIPE_CLEAR_DEPTH && fb->zsbuf) + clearMask |= SWR_CLEAR_DEPTH; + + if (buffers & PIPE_CLEAR_STENCIL && fb->zsbuf) + clearMask |= SWR_CLEAR_STENCIL; + +#if 0 // XXX HACK, override clear color alpha. On ubuntu, clears are + // transparent. + ((union pipe_color_union *)color)->f[3] = 1.0; /* cast off your const'd-ness */ +#endif + + /* Reset viewport to full framebuffer width/height before clear, then + * restore it */ + /* Scissor affects clear, viewport should not */ + ctx->dirty |= SWR_NEW_VIEWPORT; + SWR_VIEWPORT vp = {0}; + vp.width = ctx->framebuffer.width; + vp.height = ctx->framebuffer.height; + SwrSetViewports(ctx->swrContext, 1, &vp, NULL); + + swr_update_draw_context(ctx); + SwrClearRenderTarget(ctx->swrContext, clearMask, color->f, depth, stencil); +} + + +#if 0 // XXX, these don't get called. how to get these called? 
Do we need + // them? Docs? +static void +swr_clear_render_target(struct pipe_context *pipe, struct pipe_surface *ps, + const union pipe_color_union *color, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct swr_context *ctx = swr_context(pipe); + fprintf(stderr, "SWR swr_clear_render_target!\n"); + + ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; +} + +static void +swr_clear_depth_stencil(struct pipe_context *pipe, struct pipe_surface *ps, + unsigned buffers, double depth, unsigned stencil, + unsigned x, unsigned y, unsigned w, unsigned h) +{ + struct swr_context *ctx = swr_context(pipe); + fprintf(stderr, "SWR swr_clear_depth_stencil!\n"); + + ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; +} + +static void +swr_clear_buffer(struct pipe_context *pipe, + struct pipe_resource *res, + unsigned offset, unsigned size, + const void *data, int data_size) +{ + fprintf(stderr, "SWR swr_clear_buffer!\n"); + struct swr_context *ctx = swr_context(pipe); + struct swr_resource *buf = swr_resource(res); + union pipe_color_union color; + enum pipe_format dst_fmt; + unsigned width, height, elements; + + assert(res->target == PIPE_BUFFER); + assert(buf); + assert(size % data_size == 0); + + SWR_SURFACE_STATE &swr_buffer = buf->swr; + + ctx->dirty |= SWR_NEW_FRAMEBUFFER | SWR_NEW_SCISSOR; +} +#endif + + +void +swr_clear_init(struct pipe_context *pipe) +{ + pipe->clear = swr_clear; +#if 0 // XXX, these don't get called. how to get these called? Do we need + // them? Docs? + pipe->clear_render_target = swr_clear_render_target; + pipe->clear_depth_stencil = swr_clear_depth_stencil; + pipe->clear_buffer = swr_clear_buffer; +#endif +} diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp new file mode 100644 index 00000000000..c8cb145d334 --- /dev/null +++ b/src/gallium/drivers/swr/swr_context.cpp @@ -0,0 +1,382 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_format.h" + +extern "C" { +#include "util/u_transfer.h" +#include "util/u_surface.h" +} + +#include "swr_context.h" +#include "swr_memory.h" +#include "swr_screen.h" +#include "swr_resource.h" +#include "swr_scratch.h" +#include "swr_query.h" +#include "swr_fence.h" + +#include "api.h" +#include "backend.h" + +static struct pipe_surface * +swr_create_surface(struct pipe_context *pipe, + struct pipe_resource *pt, + const struct pipe_surface *surf_tmpl) +{ + struct pipe_surface *ps; + + ps = CALLOC_STRUCT(pipe_surface); + if (ps) { + pipe_reference_init(&ps->reference, 1); + pipe_resource_reference(&ps->texture, pt); + ps->context = pipe; + ps->format = surf_tmpl->format; + if (pt->target != PIPE_BUFFER) { + assert(surf_tmpl->u.tex.level <= pt->last_level); + ps->width = u_minify(pt->width0, surf_tmpl->u.tex.level); + ps->height = u_minify(pt->height0, surf_tmpl->u.tex.level); + ps->u.tex.level = surf_tmpl->u.tex.level; + ps->u.tex.first_layer = surf_tmpl->u.tex.first_layer; + ps->u.tex.last_layer = surf_tmpl->u.tex.last_layer; + if (ps->u.tex.first_layer != ps->u.tex.last_layer) { + debug_printf("creating surface with multiple layers, rendering " + "to first layer only\n"); + } + } else { + /* setting width as number of elements should get us correct + * renderbuffer width */ + ps->width = surf_tmpl->u.buf.last_element + - surf_tmpl->u.buf.first_element + 1; + ps->height = pt->height0; + ps->u.buf.first_element = surf_tmpl->u.buf.first_element; + ps->u.buf.last_element = surf_tmpl->u.buf.last_element; + assert(ps->u.buf.first_element <= ps->u.buf.last_element); + assert(ps->u.buf.last_element < ps->width); + } + } + return ps; +} + +static void +swr_surface_destroy(struct pipe_context *pipe, struct pipe_surface *surf) +{ + assert(surf->texture); + struct pipe_resource *resource = surf->texture; + + /* If the resource has been drawn to, store tiles. */ + swr_store_dirty_resource(pipe, resource, SWR_TILE_RESOLVED); + + pipe_resource_reference(&resource, NULL); + FREE(surf); +} + + +static void * +swr_transfer_map(struct pipe_context *pipe, + struct pipe_resource *resource, + unsigned level, + unsigned usage, + const struct pipe_box *box, + struct pipe_transfer **transfer) +{ + struct swr_screen *screen = swr_screen(pipe->screen); + struct swr_resource *spr = swr_resource(resource); + struct pipe_transfer *pt; + enum pipe_format format = resource->format; + + assert(resource); + assert(level <= resource->last_level); + + /* If mapping an attached rendertarget, store tiles to surface and set + * postStoreTileState to SWR_TILE_INVALID so tiles get reloaded on next use + * and nothing needs to be done at unmap. */ + swr_store_dirty_resource(pipe, resource, SWR_TILE_INVALID); + + if (!(usage & PIPE_TRANSFER_UNSYNCHRONIZED)) { + /* If resource is in use, finish fence before mapping. + * Unless requested not to block, then if not done return NULL map */ + if (usage & PIPE_TRANSFER_DONTBLOCK) { + if (swr_is_fence_pending(screen->flush_fence)) + return NULL; + } else { + if (spr->status) { + /* But, if there's no fence pending, submit one. + * XXX: Remove once draw timestamps are finished. 
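+              * (The resource is flagged busy, but with no fence
+              * pending nothing would ever signal completion; the
+              * extra sync gives the finish below something to wait
+              * on.)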
*/ + if (!swr_is_fence_pending(screen->flush_fence)) + swr_fence_submit(swr_context(pipe), screen->flush_fence); + + swr_fence_finish(pipe->screen, screen->flush_fence, 0); + swr_resource_unused(pipe, spr); + } + } + } + + pt = CALLOC_STRUCT(pipe_transfer); + if (!pt) + return NULL; + pipe_resource_reference(&pt->resource, resource); + pt->level = level; + pt->box = *box; + pt->stride = spr->row_stride[level]; + pt->layer_stride = spr->img_stride[level]; + + /* if we're mapping the depth/stencil, copy in stencil */ + if (spr->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT + && spr->has_stencil) { + for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) { + spr->swr.pBaseAddress[4 * i + 3] = spr->secondary.pBaseAddress[i]; + } + } else if (spr->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT + && spr->has_stencil) { + for (unsigned i = 0; i < spr->alignedWidth * spr->alignedHeight; i++) { + spr->swr.pBaseAddress[8 * i + 4] = spr->secondary.pBaseAddress[i]; + } + } + + unsigned offset = box->z * pt->layer_stride + box->y * pt->stride + + box->x * util_format_get_blocksize(format); + + *transfer = pt; + + return spr->swr.pBaseAddress + offset + spr->mip_offsets[level]; +} + +static void +swr_transfer_unmap(struct pipe_context *pipe, struct pipe_transfer *transfer) +{ + assert(transfer->resource); + + struct swr_resource *res = swr_resource(transfer->resource); + /* if we're mapping the depth/stencil, copy out stencil */ + if (res->base.format == PIPE_FORMAT_Z24_UNORM_S8_UINT + && res->has_stencil) { + for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) { + res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[4 * i + 3]; + } + } else if (res->base.format == PIPE_FORMAT_Z32_FLOAT_S8X24_UINT + && res->has_stencil) { + for (unsigned i = 0; i < res->alignedWidth * res->alignedHeight; i++) { + res->secondary.pBaseAddress[i] = res->swr.pBaseAddress[8 * i + 4]; + } + } + + pipe_resource_reference(&transfer->resource, NULL); + FREE(transfer); +} + + +static void +swr_resource_copy(struct pipe_context *pipe, + struct pipe_resource *dst, + unsigned dst_level, + unsigned dstx, + unsigned dsty, + unsigned dstz, + struct pipe_resource *src, + unsigned src_level, + const struct pipe_box *src_box) +{ + struct swr_screen *screen = swr_screen(pipe->screen); + + /* If either the src or dst is a renderTarget, store tiles before copy */ + swr_store_dirty_resource(pipe, src, SWR_TILE_RESOLVED); + swr_store_dirty_resource(pipe, dst, SWR_TILE_RESOLVED); + + swr_fence_finish(pipe->screen, screen->flush_fence, 0); + swr_resource_unused(pipe, swr_resource(src)); + swr_resource_unused(pipe, swr_resource(dst)); + + if ((dst->target == PIPE_BUFFER && src->target == PIPE_BUFFER) + || (dst->target != PIPE_BUFFER && src->target != PIPE_BUFFER)) { + util_resource_copy_region( + pipe, dst, dst_level, dstx, dsty, dstz, src, src_level, src_box); + return; + } + + debug_printf("unhandled swr_resource_copy\n"); +} + + +static void +swr_blit(struct pipe_context *pipe, const struct pipe_blit_info *blit_info) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_blit_info info = *blit_info; + + if (blit_info->render_condition_enable && !swr_check_render_cond(pipe)) + return; + + if (info.src.resource->nr_samples > 1 && info.dst.resource->nr_samples <= 1 + && !util_format_is_depth_or_stencil(info.src.resource->format) + && !util_format_is_pure_integer(info.src.resource->format)) { + debug_printf("swr: color resolve unimplemented\n"); + return; + } + + if (util_try_blit_via_copy_region(pipe, 
&info)) { + return; /* done */ + } + + if (info.mask & PIPE_MASK_S) { + debug_printf("swr: cannot blit stencil, skipping\n"); + info.mask &= ~PIPE_MASK_S; + } + + if (!util_blitter_is_blit_supported(ctx->blitter, &info)) { + debug_printf("swr: blit unsupported %s -> %s\n", + util_format_short_name(info.src.resource->format), + util_format_short_name(info.dst.resource->format)); + return; + } + + /* XXX turn off occlusion and streamout queries */ + + util_blitter_save_vertex_buffer_slot(ctx->blitter, ctx->vertex_buffer); + util_blitter_save_vertex_elements(ctx->blitter, (void *)ctx->velems); + util_blitter_save_vertex_shader(ctx->blitter, (void *)ctx->vs); + /*util_blitter_save_geometry_shader(ctx->blitter, (void*)ctx->gs);*/ + util_blitter_save_so_targets( + ctx->blitter, + ctx->num_so_targets, + (struct pipe_stream_output_target **)ctx->so_targets); + util_blitter_save_rasterizer(ctx->blitter, (void *)ctx->rasterizer); + util_blitter_save_viewport(ctx->blitter, &ctx->viewport); + util_blitter_save_scissor(ctx->blitter, &ctx->scissor); + util_blitter_save_fragment_shader(ctx->blitter, ctx->fs); + util_blitter_save_blend(ctx->blitter, (void *)ctx->blend); + util_blitter_save_depth_stencil_alpha(ctx->blitter, + (void *)ctx->depth_stencil); + util_blitter_save_stencil_ref(ctx->blitter, &ctx->stencil_ref); + util_blitter_save_sample_mask(ctx->blitter, ctx->sample_mask); + util_blitter_save_framebuffer(ctx->blitter, &ctx->framebuffer); + util_blitter_save_fragment_sampler_states( + ctx->blitter, + ctx->num_samplers[PIPE_SHADER_FRAGMENT], + (void **)ctx->samplers[PIPE_SHADER_FRAGMENT]); + util_blitter_save_fragment_sampler_views( + ctx->blitter, + ctx->num_sampler_views[PIPE_SHADER_FRAGMENT], + ctx->sampler_views[PIPE_SHADER_FRAGMENT]); + util_blitter_save_render_condition(ctx->blitter, + ctx->render_cond_query, + ctx->render_cond_cond, + ctx->render_cond_mode); + + util_blitter_blit(ctx->blitter, &info); +} + + +static void +swr_destroy(struct pipe_context *pipe) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->blitter) + util_blitter_destroy(ctx->blitter); + + /* Idle core before deleting context */ + SwrWaitForIdle(ctx->swrContext); + if (ctx->swrContext) + SwrDestroyContext(ctx->swrContext); + + delete ctx->blendJIT; + + swr_destroy_scratch_buffers(ctx); + + FREE(ctx); +} + + +static void +swr_render_condition(struct pipe_context *pipe, + struct pipe_query *query, + boolean condition, + uint mode) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->render_cond_query = query; + ctx->render_cond_mode = mode; + ctx->render_cond_cond = condition; +} + +struct pipe_context * +swr_create_context(struct pipe_screen *screen, void *priv, unsigned flags) +{ + struct swr_context *ctx = CALLOC_STRUCT(swr_context); + ctx->blendJIT = + new std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC>; + + SWR_CREATECONTEXT_INFO createInfo; + createInfo.driver = GL; + createInfo.privateStateSize = sizeof(swr_draw_context); + createInfo.maxSubContexts = 0; + createInfo.pfnLoadTile = swr_LoadHotTile; + createInfo.pfnStoreTile = swr_StoreHotTile; + createInfo.pfnClearTile = swr_StoreHotTileClear; + ctx->swrContext = SwrCreateContext(&createInfo); + + /* Init Load/Store/ClearTiles Tables */ + swr_InitMemoryModule(); + + InitBackendFuncTables(); + + if (ctx->swrContext == NULL) + goto fail; + + ctx->pipe.screen = screen; + ctx->pipe.destroy = swr_destroy; + ctx->pipe.priv = priv; + ctx->pipe.create_surface = swr_create_surface; + ctx->pipe.surface_destroy = swr_surface_destroy; + 
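+   /* The transfer hooks below map SWR's linear backing store directly;
+    * swr_transfer_map handles the fence wait and the depth/stencil
+    * interleave copies. */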
ctx->pipe.transfer_map = swr_transfer_map; + ctx->pipe.transfer_unmap = swr_transfer_unmap; + + ctx->pipe.transfer_flush_region = u_default_transfer_flush_region; + ctx->pipe.transfer_inline_write = u_default_transfer_inline_write; + + ctx->pipe.resource_copy_region = swr_resource_copy; + ctx->pipe.render_condition = swr_render_condition; + + swr_state_init(&ctx->pipe); + swr_clear_init(&ctx->pipe); + swr_draw_init(&ctx->pipe); + swr_query_init(&ctx->pipe); + + ctx->pipe.blit = swr_blit; + ctx->blitter = util_blitter_create(&ctx->pipe); + if (!ctx->blitter) + goto fail; + + swr_init_scratch_buffers(ctx); + + return &ctx->pipe; + +fail: + /* Should really validate the init steps and fail gracefully */ + swr_destroy(&ctx->pipe); + return NULL; +} diff --git a/src/gallium/drivers/swr/swr_context.h b/src/gallium/drivers/swr/swr_context.h new file mode 100644 index 00000000000..73a8e8ddda1 --- /dev/null +++ b/src/gallium/drivers/swr/swr_context.h @@ -0,0 +1,182 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#ifndef SWR_CONTEXT_H +#define SWR_CONTEXT_H + +#include "pipe/p_context.h" +#include "pipe/p_state.h" +#include "util/u_blitter.h" +#include "jit_api.h" +#include "swr_state.h" +#include <unordered_map> + +#define SWR_NEW_BLEND (1 << 0) +#define SWR_NEW_RASTERIZER (1 << 1) +#define SWR_NEW_DEPTH_STENCIL_ALPHA (1 << 2) +#define SWR_NEW_SAMPLER (1 << 3) +#define SWR_NEW_SAMPLER_VIEW (1 << 4) +#define SWR_NEW_VS (1 << 5) +#define SWR_NEW_FS (1 << 6) +#define SWR_NEW_VSCONSTANTS (1 << 7) +#define SWR_NEW_FSCONSTANTS (1 << 8) +#define SWR_NEW_VERTEX (1 << 9) +#define SWR_NEW_STIPPLE (1 << 10) +#define SWR_NEW_SCISSOR (1 << 11) +#define SWR_NEW_VIEWPORT (1 << 12) +#define SWR_NEW_FRAMEBUFFER (1 << 13) +#define SWR_NEW_CLIP (1 << 14) +#define SWR_NEW_SO (1 << 15) +#define SWR_NEW_ALL 0x0000ffff + +namespace std +{ +template <> struct hash<BLEND_COMPILE_STATE> { + std::size_t operator()(const BLEND_COMPILE_STATE &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; +}; + +struct swr_jit_texture { + uint32_t width; // same as number of elements + uint32_t height; + uint32_t depth; // doubles as array size + uint32_t first_level; + uint32_t last_level; + const void *base_ptr; + uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS]; + uint32_t mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; +}; + +struct swr_jit_sampler { + float min_lod; + float max_lod; + float lod_bias; + float border_color[4]; +}; + +struct swr_draw_context { + const float *constantVS[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned num_constantsVS[PIPE_MAX_CONSTANT_BUFFERS]; + const float *constantFS[PIPE_MAX_CONSTANT_BUFFERS]; + unsigned num_constantsFS[PIPE_MAX_CONSTANT_BUFFERS]; + + swr_jit_texture texturesVS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + swr_jit_sampler samplersVS[PIPE_MAX_SAMPLERS]; + swr_jit_texture texturesFS[PIPE_MAX_SHADER_SAMPLER_VIEWS]; + swr_jit_sampler samplersFS[PIPE_MAX_SAMPLERS]; + + SWR_SURFACE_STATE renderTargets[SWR_NUM_ATTACHMENTS]; +}; + +struct swr_context { + struct pipe_context pipe; /**< base class */ + + HANDLE swrContext; + + /** Constant state objects */ + struct swr_blend_state *blend; + struct pipe_sampler_state *samplers[PIPE_SHADER_TYPES][PIPE_MAX_SAMPLERS]; + struct pipe_depth_stencil_alpha_state *depth_stencil; + struct pipe_rasterizer_state *rasterizer; + + struct swr_vertex_shader *vs; + struct swr_fragment_shader *fs; + struct swr_vertex_element_state *velems; + + /** Other rendering state */ + struct pipe_blend_color blend_color; + struct pipe_stencil_ref stencil_ref; + struct pipe_clip_state clip; + struct pipe_constant_buffer + constants[PIPE_SHADER_TYPES][PIPE_MAX_CONSTANT_BUFFERS]; + struct pipe_framebuffer_state framebuffer; + struct pipe_poly_stipple poly_stipple; + struct pipe_scissor_state scissor; + struct pipe_sampler_view * + sampler_views[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS]; + + struct pipe_viewport_state viewport; + struct pipe_vertex_buffer vertex_buffer[PIPE_MAX_ATTRIBS]; + struct pipe_index_buffer index_buffer; + + struct blitter_context *blitter; + + /** Conditional query object and mode */ + struct pipe_query *render_cond_query; + uint render_cond_mode; + boolean render_cond_cond; + unsigned active_queries; + + unsigned num_vertex_buffers; + unsigned num_samplers[PIPE_SHADER_TYPES]; + unsigned num_sampler_views[PIPE_SHADER_TYPES]; + + unsigned sample_mask; + + // streamout + pipe_stream_output_target *so_targets[MAX_SO_STREAMS]; + uint32_t 
num_so_targets; + + /* Temp storage for user_buffer constants */ + struct swr_scratch_buffers *scratch; + + // blend jit functions + std::unordered_map<BLEND_COMPILE_STATE, PFN_BLEND_JIT_FUNC> *blendJIT; + + /* Derived SWR API DrawState */ + struct swr_derived_state derived; + + /* SWR private state - draw context */ + struct swr_draw_context swrDC; + + unsigned dirty; /**< Mask of SWR_NEW_x flags */ +}; + +static INLINE struct swr_context * +swr_context(struct pipe_context *pipe) +{ + return (struct swr_context *)pipe; +} + +static INLINE void +swr_update_draw_context(struct swr_context *ctx) +{ + swr_draw_context *pDC = + (swr_draw_context *)SwrGetPrivateContextState(ctx->swrContext); + memcpy(pDC, &ctx->swrDC, sizeof(swr_draw_context)); +} + +struct pipe_context *swr_create_context(struct pipe_screen *, void *priv, unsigned flags); + +void swr_state_init(struct pipe_context *pipe); + +void swr_clear_init(struct pipe_context *pipe); + +void swr_draw_init(struct pipe_context *pipe); + +void swr_finish(struct pipe_context *pipe); +#endif diff --git a/src/gallium/drivers/swr/swr_context_llvm.h b/src/gallium/drivers/swr/swr_context_llvm.h new file mode 100644 index 00000000000..58da813123f --- /dev/null +++ b/src/gallium/drivers/swr/swr_context_llvm.h @@ -0,0 +1,124 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#pragma once + +////////////////////////////////////////////////////////////////////////// +/// Generate LLVM type information for swr_jit_texture +INLINE static StructType * +Gen_swr_jit_texture(JitManager *pShG) +{ + LLVMContext &ctx = pShG->mContext; + std::vector<Type *> members; + + members.push_back(Type::getInt32Ty(ctx)); // width + members.push_back(Type::getInt32Ty(ctx)); // height + members.push_back(Type::getInt32Ty(ctx)); // depth + members.push_back(Type::getInt32Ty(ctx)); // first_level + members.push_back(Type::getInt32Ty(ctx)); // last_level + members.push_back(PointerType::get(Type::getInt8Ty(ctx), 0)); // base_ptr + members.push_back(ArrayType::get(Type::getInt32Ty(ctx), + PIPE_MAX_TEXTURE_LEVELS)); // row_stride + members.push_back(ArrayType::get(Type::getInt32Ty(ctx), + PIPE_MAX_TEXTURE_LEVELS)); // img_stride + members.push_back(ArrayType::get(Type::getInt32Ty(ctx), + PIPE_MAX_TEXTURE_LEVELS)); // mip_offsets + + return StructType::get(ctx, members, false); +} + +static const UINT swr_jit_texture_width = 0; +static const UINT swr_jit_texture_height = 1; +static const UINT swr_jit_texture_depth = 2; +static const UINT swr_jit_texture_first_level = 3; +static const UINT swr_jit_texture_last_level = 4; +static const UINT swr_jit_texture_base_ptr = 5; +static const UINT swr_jit_texture_row_stride = 6; +static const UINT swr_jit_texture_img_stride = 7; +static const UINT swr_jit_texture_mip_offsets = 8; + +////////////////////////////////////////////////////////////////////////// +/// Generate LLVM type information for swr_jit_sampler +INLINE static StructType * +Gen_swr_jit_sampler(JitManager *pShG) +{ + LLVMContext &ctx = pShG->mContext; + std::vector<Type *> members; + + members.push_back(Type::getFloatTy(ctx)); // min_lod + members.push_back(Type::getFloatTy(ctx)); // max_lod + members.push_back(Type::getFloatTy(ctx)); // lod_bias + members.push_back( + ArrayType::get(Type::getFloatTy(ctx), 4)); // border_color + + return StructType::get(ctx, members, false); +} + +static const UINT swr_jit_sampler_min_lod = 0; +static const UINT swr_jit_sampler_max_lod = 1; +static const UINT swr_jit_sampler_lod_bias = 2; +static const UINT swr_jit_sampler_border_color = 3; + +////////////////////////////////////////////////////////////////////////// +/// Generate LLVM type information for swr_draw_context +INLINE static StructType * +Gen_swr_draw_context(JitManager *pShG) +{ + LLVMContext &ctx = pShG->mContext; + std::vector<Type *> members; + + members.push_back( + ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), + PIPE_MAX_CONSTANT_BUFFERS)); // constantVS + members.push_back(ArrayType::get( + Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsVS + members.push_back( + ArrayType::get(PointerType::get(Type::getFloatTy(ctx), 0), + PIPE_MAX_CONSTANT_BUFFERS)); // constantFS + members.push_back(ArrayType::get( + Type::getInt32Ty(ctx), PIPE_MAX_CONSTANT_BUFFERS)); // num_constantsFS + members.push_back( + ArrayType::get(Gen_swr_jit_texture(pShG), + PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesVS + members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), + PIPE_MAX_SAMPLERS)); // samplersVS + members.push_back( + ArrayType::get(Gen_swr_jit_texture(pShG), + PIPE_MAX_SHADER_SAMPLER_VIEWS)); // texturesFS + members.push_back(ArrayType::get(Gen_swr_jit_sampler(pShG), + PIPE_MAX_SAMPLERS)); // samplersFS + members.push_back(ArrayType::get(Gen_SWR_SURFACE_STATE(pShG), + 
SWR_NUM_ATTACHMENTS)); // renderTargets + + return StructType::get(ctx, members, false); +} + +static const UINT swr_draw_context_constantVS = 0; +static const UINT swr_draw_context_num_constantsVS = 1; +static const UINT swr_draw_context_constantFS = 2; +static const UINT swr_draw_context_num_constantsFS = 3; +static const UINT swr_draw_context_texturesVS = 4; +static const UINT swr_draw_context_samplersVS = 5; +static const UINT swr_draw_context_texturesFS = 6; +static const UINT swr_draw_context_samplersFS = 7; +static const UINT swr_draw_context_renderTargets = 8; diff --git a/src/gallium/drivers/swr/swr_draw.cpp b/src/gallium/drivers/swr/swr_draw.cpp new file mode 100644 index 00000000000..428bf78cb55 --- /dev/null +++ b/src/gallium/drivers/swr/swr_draw.cpp @@ -0,0 +1,297 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "swr_screen.h" +#include "swr_context.h" +#include "swr_resource.h" +#include "swr_fence.h" +#include "swr_query.h" +#include "jit_api.h" + +#include "util/u_draw.h" +#include "util/u_prim.h" + +/* + * Convert mesa PIPE_PRIM_X to SWR enum PRIMITIVE_TOPOLOGY + */ +static INLINE enum PRIMITIVE_TOPOLOGY +swr_convert_prim_topology(const unsigned mode) +{ + switch (mode) { + case PIPE_PRIM_POINTS: + return TOP_POINT_LIST; + case PIPE_PRIM_LINES: + return TOP_LINE_LIST; + case PIPE_PRIM_LINE_LOOP: + return TOP_LINE_LOOP; + case PIPE_PRIM_LINE_STRIP: + return TOP_LINE_STRIP; + case PIPE_PRIM_TRIANGLES: + return TOP_TRIANGLE_LIST; + case PIPE_PRIM_TRIANGLE_STRIP: + return TOP_TRIANGLE_STRIP; + case PIPE_PRIM_TRIANGLE_FAN: + return TOP_TRIANGLE_FAN; + case PIPE_PRIM_QUADS: + return TOP_QUAD_LIST; + case PIPE_PRIM_QUAD_STRIP: + return TOP_QUAD_STRIP; + case PIPE_PRIM_POLYGON: + return TOP_TRIANGLE_FAN; /* XXX TOP_POLYGON; */ + case PIPE_PRIM_LINES_ADJACENCY: + return TOP_LINE_LIST_ADJ; + case PIPE_PRIM_LINE_STRIP_ADJACENCY: + return TOP_LISTSTRIP_ADJ; + case PIPE_PRIM_TRIANGLES_ADJACENCY: + return TOP_TRI_LIST_ADJ; + case PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY: + return TOP_TRI_STRIP_ADJ; + default: + assert(0 && "Unknown topology"); + return TOP_UNKNOWN; + } +}; + + +/* + * Draw vertex arrays, with optional indexing, optional instancing. 
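+ * Flow: update derived state, lazily JIT-compile a streamout shader per
+ * topology and a fetch shader keyed on the primitive-restart state, then
+ * hand off to SwrDrawInstanced / SwrDrawIndexedInstanced.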
+ */ +static void +swr_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info) +{ + struct swr_context *ctx = swr_context(pipe); + + if (!swr_check_render_cond(pipe)) + return; + + if (info->indirect) { + util_draw_indirect(pipe, info); + return; + } + + /* Update derived state, pass draw info to update function */ + if (ctx->dirty) + swr_update_derived(pipe, info); + + swr_update_draw_context(ctx); + + if (ctx->vs->pipe.stream_output.num_outputs) { + if (!ctx->vs->soFunc[info->mode]) { + STREAMOUT_COMPILE_STATE state = {0}; + struct pipe_stream_output_info *so = &ctx->vs->pipe.stream_output; + + state.numVertsPerPrim = u_vertices_per_prim(info->mode); + + uint32_t offsets[MAX_SO_STREAMS] = {0}; + uint32_t num = 0; + + for (uint32_t i = 0; i < so->num_outputs; i++) { + assert(so->output[i].stream == 0); // @todo + uint32_t output_buffer = so->output[i].output_buffer; + if (so->output[i].dst_offset != offsets[output_buffer]) { + // hole - need to fill + state.stream.decl[num].bufferIndex = output_buffer; + state.stream.decl[num].hole = true; + state.stream.decl[num].componentMask = + (1 << (so->output[i].dst_offset - offsets[output_buffer])) + - 1; + num++; + offsets[output_buffer] = so->output[i].dst_offset; + } + + state.stream.decl[num].bufferIndex = output_buffer; + state.stream.decl[num].attribSlot = so->output[i].register_index - 1; + state.stream.decl[num].componentMask = + ((1 << so->output[i].num_components) - 1) + << so->output[i].start_component; + state.stream.decl[num].hole = false; + num++; + + offsets[output_buffer] += so->output[i].num_components; + } + + state.stream.numDecls = num; + + HANDLE hJitMgr = swr_screen(pipe->screen)->hJitMgr; + ctx->vs->soFunc[info->mode] = JitCompileStreamout(hJitMgr, state); + debug_printf("so shader %p\n", ctx->vs->soFunc[info->mode]); + assert(ctx->vs->soFunc[info->mode] && "Error: SoShader = NULL"); + } + + SwrSetSoFunc(ctx->swrContext, ctx->vs->soFunc[info->mode], 0); + } + + struct swr_vertex_element_state *velems = ctx->velems; + if (!velems->fsFunc + || (velems->fsState.cutIndex != info->restart_index) + || (velems->fsState.bEnableCutIndex != info->primitive_restart)) { + + velems->fsState.cutIndex = info->restart_index; + velems->fsState.bEnableCutIndex = info->primitive_restart; + + /* Create Fetch Shader */ + HANDLE hJitMgr = swr_screen(ctx->pipe.screen)->hJitMgr; + velems->fsFunc = JitCompileFetch(hJitMgr, velems->fsState); + + debug_printf("fetch shader %p\n", velems->fsFunc); + assert(velems->fsFunc && "Error: FetchShader = NULL"); + } + + SwrSetFetchFunc(ctx->swrContext, velems->fsFunc); + + if (info->indexed) + SwrDrawIndexedInstanced(ctx->swrContext, + swr_convert_prim_topology(info->mode), + info->count, + info->instance_count, + info->start, + info->index_bias, + info->start_instance); + else + SwrDrawInstanced(ctx->swrContext, + swr_convert_prim_topology(info->mode), + info->count, + info->instance_count, + info->start, + info->start_instance); +} + + +static void +swr_flush(struct pipe_context *pipe, + struct pipe_fence_handle **fence, + unsigned flags) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(pipe->screen); + struct pipe_surface *cb = ctx->framebuffer.cbufs[0]; + + /* If the current renderTarget is the display surface, store tiles back to + * the surface, in preparation for present (swr_flush_frontbuffer). 
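+ * (swr_flush_frontbuffer presents from the linear display target, so its
+ * contents must be resolved before the winsys reads them.)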
+ * Other renderTargets get stored back when attachment changes or + * swr_surface_destroy */ + if (cb && swr_resource(cb->texture)->display_target) + swr_store_dirty_resource(pipe, cb->texture, SWR_TILE_RESOLVED); + + if (fence) + swr_fence_reference(pipe->screen, fence, screen->flush_fence); +} + +void +swr_finish(struct pipe_context *pipe) +{ + struct pipe_fence_handle *fence = nullptr; + + swr_flush(pipe, &fence, 0); + swr_fence_finish(pipe->screen, fence, 0); + swr_fence_reference(pipe->screen, &fence, NULL); +} + + +/* + * Store SWR HotTiles back to renderTarget surface. + */ +void +swr_store_render_target(struct pipe_context *pipe, + uint32_t attachment, + enum SWR_TILE_STATE post_tile_state) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_draw_context *pDC = &ctx->swrDC; + struct SWR_SURFACE_STATE *renderTarget = &pDC->renderTargets[attachment]; + + /* Only proceed if there's a valid surface to store to */ + if (renderTarget->pBaseAddress) { + /* Set viewport to full renderTarget width/height and disable scissor + * before StoreTiles */ + boolean change_viewport = + (ctx->derived.vp.x != 0.0f || ctx->derived.vp.y != 0.0f + || ctx->derived.vp.width != renderTarget->width + || ctx->derived.vp.height != renderTarget->height); + if (change_viewport) { + SWR_VIEWPORT vp = {0}; + vp.width = renderTarget->width; + vp.height = renderTarget->height; + SwrSetViewports(ctx->swrContext, 1, &vp, NULL); + } + + boolean scissor_enable = ctx->derived.rastState.scissorEnable; + if (scissor_enable) { + ctx->derived.rastState.scissorEnable = FALSE; + SwrSetRastState(ctx->swrContext, &ctx->derived.rastState); + } + + swr_update_draw_context(ctx); + SwrStoreTiles(ctx->swrContext, + (enum SWR_RENDERTARGET_ATTACHMENT)attachment, + post_tile_state); + + /* Restore viewport and scissor enable */ + if (change_viewport) + SwrSetViewports(ctx->swrContext, 1, &ctx->derived.vp, &ctx->derived.vpm); + if (scissor_enable) { + ctx->derived.rastState.scissorEnable = scissor_enable; + SwrSetRastState(ctx->swrContext, &ctx->derived.rastState); + } + } +} + +void +swr_store_dirty_resource(struct pipe_context *pipe, + struct pipe_resource *resource, + enum SWR_TILE_STATE post_tile_state) +{ + /* Only store resource if it has been written to */ + if (swr_resource(resource)->status & SWR_RESOURCE_WRITE) { + struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(pipe->screen); + struct swr_resource *spr = swr_resource(resource); + + swr_draw_context *pDC = &ctx->swrDC; + SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; i++) + if (renderTargets[i].pBaseAddress == spr->swr.pBaseAddress) { + swr_store_render_target(pipe, i, post_tile_state); + + /* Mesa thinks depth/stencil are fused, so we'll never get an + * explicit resource for stencil. So, if checking depth, then + * also check for stencil. 
*/ + if (spr->has_stencil && (i == SWR_ATTACHMENT_DEPTH)) { + swr_store_render_target( + pipe, SWR_ATTACHMENT_STENCIL, post_tile_state); + } + + /* This fence signals StoreTiles completion */ + swr_fence_submit(ctx, screen->flush_fence); + + break; + } + } +} + +void +swr_draw_init(struct pipe_context *pipe) +{ + pipe->draw_vbo = swr_draw_vbo; + pipe->flush = swr_flush; +} diff --git a/src/gallium/drivers/swr/swr_fence.cpp b/src/gallium/drivers/swr/swr_fence.cpp new file mode 100644 index 00000000000..2e95b3936a6 --- /dev/null +++ b/src/gallium/drivers/swr/swr_fence.cpp @@ -0,0 +1,150 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "pipe/p_screen.h" +#include "util/u_memory.h" +#include "os/os_time.h" + +#include "swr_context.h" +#include "swr_screen.h" +#include "swr_fence.h" + +#if defined(PIPE_CC_MSVC) // portable thread yield + #define sched_yield SwitchToThread +#endif + +/* + * Fence callback, called by back-end thread on completion of all rendering up + * to SwrSync call. + */ +static void +swr_sync_cb(uint64_t userData, uint64_t userData2, uint64_t userData3) +{ + struct swr_fence *fence = (struct swr_fence *)userData; + + /* Correct value is in SwrSync data, and not the fence write field. */ + fence->read = userData2; +} + +/* + * Submit an existing fence. + */ +void +swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fh) +{ + struct swr_fence *fence = swr_fence(fh); + + fence->write++; + fence->pending = TRUE; + SwrSync(ctx->swrContext, swr_sync_cb, (uint64_t)fence, fence->write, 0); +} + +/* + * Create a new fence object. + */ +struct pipe_fence_handle * +swr_fence_create() +{ + static int fence_id = 0; + struct swr_fence *fence = CALLOC_STRUCT(swr_fence); + if (!fence) + return NULL; + + pipe_reference_init(&fence->reference, 1); + fence->id = fence_id++; + + return (struct pipe_fence_handle *)fence; +} + +/** Destroy a fence. Called when refcount hits zero. 
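+ *
+ * Editor's sketch (illustrative, not part of the original patch):
+ * destruction is reached implicitly by dropping the last reference,
+ * never by calling this directly:
+ *
+ *    struct pipe_fence_handle *f = swr_fence_create();
+ *    swr_fence_submit(ctx, f);              // read != write, now pending
+ *    swr_fence_finish(screen, f, 0);        // spins until signaled
+ *    swr_fence_reference(screen, &f, NULL); // refcount 0 -> destroyed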
*/
+static void
+swr_fence_destroy(struct swr_fence *fence)
+{
+ FREE(fence);
+}
+
+/**
+ * Set ptr = fence, with reference counting
+ */
+void
+swr_fence_reference(struct pipe_screen *screen,
+ struct pipe_fence_handle **ptr,
+ struct pipe_fence_handle *f)
+{
+ struct swr_fence *fence = swr_fence(f);
+ struct swr_fence *old;
+
+ if (likely(ptr)) {
+ old = swr_fence(*ptr);
+ *ptr = f;
+ } else {
+ old = NULL;
+ }
+
+ if (pipe_reference(&old->reference, &fence->reference))
+ swr_fence_destroy(old);
+}
+
+static INLINE boolean
+swr_is_fence_done(struct pipe_fence_handle *fence_handle)
+{
+ struct swr_fence *fence = swr_fence(fence_handle);
+ return (fence->read == fence->write);
+}
+
+/*
+ * Wait for the fence to finish.
+ */
+boolean
+swr_fence_finish(struct pipe_screen *screen,
+ struct pipe_fence_handle *fence_handle,
+ uint64_t timeout)
+{
+ while (!swr_is_fence_done(fence_handle))
+ sched_yield();
+
+ swr_fence(fence_handle)->pending = FALSE;
+
+ return TRUE;
+}
+
+
+uint64_t
+swr_get_timestamp(struct pipe_screen *screen)
+{
+ return os_time_get_nano();
+}
+
+
+void
+swr_fence_init(struct pipe_screen *p_screen)
+{
+ p_screen->fence_reference = swr_fence_reference;
+ p_screen->fence_finish = swr_fence_finish;
+ p_screen->get_timestamp = swr_get_timestamp;
+
+ /* Create persistent StoreTiles "flush" fence, used to signal completion
+ * of flushing tile state back to the resource texture, via StoreTiles. */
+ struct swr_screen *screen = swr_screen(p_screen);
+ screen->flush_fence = swr_fence_create();
+}
diff --git a/src/gallium/drivers/swr/swr_fence.h b/src/gallium/drivers/swr/swr_fence.h new file mode 100644 index 00000000000..df3776e8989 --- /dev/null +++ b/src/gallium/drivers/swr/swr_fence.h @@ -0,0 +1,72 @@
+/****************************************************************************
+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ ***************************************************************************/ + +#ifndef SWR_FENCE_H +#define SWR_FENCE_H + +#include "pipe/p_state.h" +#include "util/u_inlines.h" + +struct pipe_screen; + +struct swr_fence { + struct pipe_reference reference; + + uint64_t read; + uint64_t write; + + unsigned pending; + + unsigned id; /* Just for reference */ +}; + + +static inline struct swr_fence * +swr_fence(struct pipe_fence_handle *fence) +{ + return (struct swr_fence *)fence; +} + +static INLINE boolean +swr_is_fence_pending(struct pipe_fence_handle *fence_handle) +{ + return swr_fence(fence_handle)->pending; +} + + +void swr_fence_init(struct pipe_screen *screen); + +struct pipe_fence_handle *swr_fence_create(); + +void swr_fence_reference(struct pipe_screen *screen, + struct pipe_fence_handle **ptr, + struct pipe_fence_handle *f); + +boolean swr_fence_finish(struct pipe_screen *screen, + struct pipe_fence_handle *fence_handle, + uint64_t timeout); + +void +swr_fence_submit(struct swr_context *ctx, struct pipe_fence_handle *fence); + +uint64_t swr_get_timestamp(struct pipe_screen *screen); + +#endif diff --git a/src/gallium/drivers/swr/swr_loader.cpp b/src/gallium/drivers/swr/swr_loader.cpp new file mode 100644 index 00000000000..2113c371c5f --- /dev/null +++ b/src/gallium/drivers/swr/swr_loader.cpp @@ -0,0 +1,67 @@ +/**************************************************************************** + * Copyright (C) 2016 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "util/u_cpu_detect.h" +#include "util/u_dl.h" +#include "swr_public.h" + +#include <stdio.h> +#include <dlfcn.h> + +typedef pipe_screen *(*screen_create_proc)(struct sw_winsys *winsys); + +struct pipe_screen * +swr_create_screen(struct sw_winsys *winsys) +{ + fprintf(stderr, "SWR detected "); + + util_dl_library *pLibrary = nullptr; + + util_cpu_detect(); + if (util_cpu_caps.has_avx2) { + fprintf(stderr, "AVX2\n"); + pLibrary = util_dl_open("libswrAVX2.so"); + } else if (util_cpu_caps.has_avx) { + fprintf(stderr, "AVX\n"); + pLibrary = util_dl_open("libswrAVX.so"); + } else { + fprintf(stderr, "no AVX/AVX2 support. 
Aborting!\n"); + exit(-1); + } + + if (!pLibrary) { + fprintf(stderr, "SWR library load failure: %s\n", util_dl_error()); + exit(-1); + } + + util_dl_proc pScreenProc = util_dl_get_proc_address(pLibrary, "swr_create_screen"); + + if (!pScreenProc) { + fprintf(stderr, "SWR library search failure: %s\n", util_dl_error()); + exit(-1); + } + + screen_create_proc pScreenCreate = (screen_create_proc)pScreenProc; + + return pScreenCreate(winsys); +} diff --git a/src/gallium/drivers/swr/swr_memory.h b/src/gallium/drivers/swr/swr_memory.h new file mode 100644 index 00000000000..65fc169c85f --- /dev/null +++ b/src/gallium/drivers/swr/swr_memory.h @@ -0,0 +1,99 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#pragma once + +void LoadHotTile( + SWR_SURFACE_STATE *pSrcSurface, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, uint32_t renderTargetArrayIndex, + uint8_t *pDstHotTile); + +void StoreHotTile( + SWR_SURFACE_STATE *pDstSurface, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, uint32_t renderTargetArrayIndex, + uint8_t *pSrcHotTile); + +void StoreHotTileClear( + SWR_SURFACE_STATE *pDstSurface, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, + UINT y, + const float* pClearColor); + +INLINE void +swr_LoadHotTile(HANDLE hPrivateContext, + SWR_FORMAT dstFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, + uint32_t renderTargetArrayIndex, uint8_t* pDstHotTile) +{ + // Grab source surface state from private context + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + SWR_SURFACE_STATE *pSrcSurface = &pDC->renderTargets[renderTargetIndex]; + + LoadHotTile(pSrcSurface, dstFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pDstHotTile); +} + +INLINE void +swr_StoreHotTile(HANDLE hPrivateContext, + SWR_FORMAT srcFormat, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, UINT y, + uint32_t renderTargetArrayIndex, uint8_t* pSrcHotTile) +{ + // Grab destination surface state from private context + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; + + StoreHotTile(pDstSurface, srcFormat, renderTargetIndex, x, y, renderTargetArrayIndex, pSrcHotTile); +} + +INLINE void +swr_StoreHotTileClear(HANDLE hPrivateContext, + SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, + UINT x, + UINT y, + const float* pClearColor) +{ + // Grab destination surface state from private context + swr_draw_context *pDC = (swr_draw_context*)hPrivateContext; + SWR_SURFACE_STATE *pDstSurface = &pDC->renderTargets[renderTargetIndex]; + + StoreHotTileClear(pDstSurface, renderTargetIndex, x, y, pClearColor); +} + +void InitSimLoadTilesTable(); +void InitSimStoreTilesTable(); +void InitSimClearTilesTable(); + +/* Init Load/Store/ClearTiles Tables */ +INLINE void swr_InitMemoryModule() +{ + InitSimLoadTilesTable(); + InitSimStoreTilesTable(); + InitSimClearTilesTable(); +} diff --git a/src/gallium/drivers/swr/swr_public.h b/src/gallium/drivers/swr/swr_public.h new file mode 100644 index 00000000000..0814c3b85d6 --- /dev/null +++ b/src/gallium/drivers/swr/swr_public.h @@ -0,0 +1,46 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_PUBLIC_H +#define SWR_PUBLIC_H + +struct pipe_screen; +struct sw_winsys; +struct sw_displaytarget; + +#ifdef __cplusplus +extern "C" { +#endif + +struct pipe_screen *swr_create_screen(struct sw_winsys *winsys); + +struct sw_winsys *swr_get_winsys(struct pipe_screen *pipe); + +struct sw_displaytarget *swr_get_displaytarget(struct pipe_resource *resource); + + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/src/gallium/drivers/swr/swr_query.cpp b/src/gallium/drivers/swr/swr_query.cpp new file mode 100644 index 00000000000..810c50b2f8f --- /dev/null +++ b/src/gallium/drivers/swr/swr_query.cpp @@ -0,0 +1,334 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#include "pipe/p_defines.h" +#include "util/u_memory.h" +#include "os/os_time.h" +#include "swr_context.h" +#include "swr_fence.h" +#include "swr_query.h" +#include "swr_screen.h" +#include "swr_state.h" + + +static struct swr_query * +swr_query(struct pipe_query *p) +{ + return (struct swr_query *)p; +} + +static struct pipe_query * +swr_create_query(struct pipe_context *pipe, unsigned type, unsigned index) +{ + struct swr_query *pq; + + assert(type < PIPE_QUERY_TYPES); + assert(index < MAX_SO_STREAMS); + + pq = CALLOC_STRUCT(swr_query); + + if (pq) { + pq->type = type; + pq->index = index; + } + + return (struct pipe_query *)pq; +} + + +static void +swr_destroy_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct swr_query *pq = swr_query(q); + + if (pq->fence) { + if (!swr_is_fence_pending(pq->fence)) { + swr_fence_submit(swr_context(pipe), pq->fence); + swr_fence_finish(pipe->screen, pq->fence, 0); + } + swr_fence_reference(pipe->screen, &pq->fence, NULL); + } + + FREE(pq); +} + + +// XXX Create a fence callback, rather than stalling SwrWaitForIdle +static void +swr_gather_stats(struct pipe_context *pipe, struct swr_query *pq) +{ + struct swr_context *ctx = swr_context(pipe); + + assert(pq->result); + union pipe_query_result *result = pq->result; + boolean enable_stats = pq->enable_stats; + SWR_STATS swr_stats = {0}; + + if (pq->fence) { + if (!swr_is_fence_pending(pq->fence)) { + swr_fence_submit(ctx, pq->fence); + swr_fence_finish(pipe->screen, pq->fence, 0); + } + swr_fence_reference(pipe->screen, &pq->fence, NULL); + } + + /* + * These queries don't need SWR Stats enabled in the core + * Set and return. + */ + switch (pq->type) { + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIME_ELAPSED: + result->u64 = swr_get_timestamp(pipe->screen); + return; + break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: + /* nothing to do here */ + return; + break; + case PIPE_QUERY_GPU_FINISHED: + result->b = TRUE; /* XXX TODO Add an api func to SWR to compare drawId + vs LastRetiredId? 
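+ Editor's note (not part of the original patch): until such an API
+ exists, GPU_FINISHED optimistically reports TRUE here; the fence
+ submit/finish at the top of this function is what actually waits
+ for the core to go idle.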
*/ + return; + break; + default: + /* Any query that needs SwrCore stats */ + break; + } + + /* + * All other results are collected from SwrCore counters + */ + + /* XXX, Should turn this into a fence callback and skip the stall */ + SwrGetStats(ctx->swrContext, &swr_stats); + /* SwrGetStats returns immediately, wait for collection */ + SwrWaitForIdle(ctx->swrContext); + + switch (pq->type) { + case PIPE_QUERY_OCCLUSION_PREDICATE: + case PIPE_QUERY_OCCLUSION_COUNTER: + result->u64 = swr_stats.DepthPassCount; + break; + case PIPE_QUERY_PRIMITIVES_GENERATED: + result->u64 = swr_stats.IaPrimitives; + break; + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 = swr_stats.SoNumPrimsWritten[pq->index]; + break; + case PIPE_QUERY_SO_STATISTICS: + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { + struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; + so_stats->num_primitives_written = + swr_stats.SoNumPrimsWritten[pq->index]; + so_stats->primitives_storage_needed = + swr_stats.SoPrimStorageNeeded[pq->index]; + } break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + struct pipe_query_data_pipeline_statistics *p_stats = + &result->pipeline_statistics; + p_stats->ia_vertices = swr_stats.IaVertices; + p_stats->ia_primitives = swr_stats.IaPrimitives; + p_stats->vs_invocations = swr_stats.VsInvocations; + p_stats->gs_invocations = swr_stats.GsInvocations; + p_stats->gs_primitives = swr_stats.GsPrimitives; + p_stats->c_invocations = swr_stats.CPrimitives; + p_stats->c_primitives = swr_stats.CPrimitives; + p_stats->ps_invocations = swr_stats.PsInvocations; + p_stats->hs_invocations = swr_stats.HsInvocations; + p_stats->ds_invocations = swr_stats.DsInvocations; + p_stats->cs_invocations = swr_stats.CsInvocations; + } break; + default: + assert(0 && "Unsupported query"); + break; + } + + /* Only change stat collection if there are no active queries */ + if (ctx->active_queries == 0) + SwrEnableStats(ctx->swrContext, enable_stats); +} + + +static boolean +swr_get_query_result(struct pipe_context *pipe, + struct pipe_query *q, + boolean wait, + union pipe_query_result *result) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_query *pq = swr_query(q); + + if (pq->fence) { + if (!swr_is_fence_pending(pq->fence)) { + swr_fence_submit(ctx, pq->fence); + if (!wait) + return FALSE; + swr_fence_finish(pipe->screen, pq->fence, 0); + } + swr_fence_reference(pipe->screen, &pq->fence, NULL); + } + + /* XXX: Need to handle counter rollover */ + + switch (pq->type) { + /* Booleans */ + case PIPE_QUERY_OCCLUSION_PREDICATE: + result->b = pq->end.u64 != pq->start.u64 ? 
TRUE : FALSE; + break; + case PIPE_QUERY_GPU_FINISHED: + result->b = pq->end.b; + break; + /* Counters */ + case PIPE_QUERY_OCCLUSION_COUNTER: + case PIPE_QUERY_TIMESTAMP: + case PIPE_QUERY_TIME_ELAPSED: + case PIPE_QUERY_PRIMITIVES_GENERATED: + case PIPE_QUERY_PRIMITIVES_EMITTED: + result->u64 = pq->end.u64 - pq->start.u64; + break; + /* Structures */ + case PIPE_QUERY_SO_STATISTICS: { + struct pipe_query_data_so_statistics *so_stats = &result->so_statistics; + struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; + struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; + so_stats->num_primitives_written = + end->num_primitives_written - start->num_primitives_written; + so_stats->primitives_storage_needed = + end->primitives_storage_needed - start->primitives_storage_needed; + } break; + case PIPE_QUERY_TIMESTAMP_DISJOINT: { + /* os_get_time_nano returns nanoseconds */ + result->timestamp_disjoint.frequency = UINT64_C(1000000000); + result->timestamp_disjoint.disjoint = FALSE; + } break; + case PIPE_QUERY_PIPELINE_STATISTICS: { + struct pipe_query_data_pipeline_statistics *p_stats = + &result->pipeline_statistics; + struct pipe_query_data_pipeline_statistics *start = + &pq->start.pipeline_statistics; + struct pipe_query_data_pipeline_statistics *end = + &pq->end.pipeline_statistics; + p_stats->ia_vertices = end->ia_vertices - start->ia_vertices; + p_stats->ia_primitives = end->ia_primitives - start->ia_primitives; + p_stats->vs_invocations = end->vs_invocations - start->vs_invocations; + p_stats->gs_invocations = end->gs_invocations - start->gs_invocations; + p_stats->gs_primitives = end->gs_primitives - start->gs_primitives; + p_stats->c_invocations = end->c_invocations - start->c_invocations; + p_stats->c_primitives = end->c_primitives - start->c_primitives; + p_stats->ps_invocations = end->ps_invocations - start->ps_invocations; + p_stats->hs_invocations = end->hs_invocations - start->hs_invocations; + p_stats->ds_invocations = end->ds_invocations - start->ds_invocations; + p_stats->cs_invocations = end->cs_invocations - start->cs_invocations; + } break; + case PIPE_QUERY_SO_OVERFLOW_PREDICATE: { + struct pipe_query_data_so_statistics *start = &pq->start.so_statistics; + struct pipe_query_data_so_statistics *end = &pq->end.so_statistics; + uint64_t num_primitives_written = + end->num_primitives_written - start->num_primitives_written; + uint64_t primitives_storage_needed = + end->primitives_storage_needed - start->primitives_storage_needed; + result->b = num_primitives_written > primitives_storage_needed; + } break; + default: + assert(0 && "Unsupported query"); + break; + } + + return TRUE; +} + +static boolean +swr_begin_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_query *pq = swr_query(q); + + /* Initialize Results */ + memset(&pq->start, 0, sizeof(pq->start)); + memset(&pq->end, 0, sizeof(pq->end)); + + /* Gather start stats and enable SwrCore counters */ + pq->result = &pq->start; + pq->enable_stats = TRUE; + swr_gather_stats(pipe, pq); + ctx->active_queries++; + + /* override start timestamp to 0 for TIMESTAMP query */ + if (pq->type == PIPE_QUERY_TIMESTAMP) + pq->start.u64 = 0; + + return true; +} + +static void +swr_end_query(struct pipe_context *pipe, struct pipe_query *q) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_query *pq = swr_query(q); + + assert(ctx->active_queries + && "swr_end_query, there are no active queries!"); + ctx->active_queries--; + + /* 
Gather end stats and disable SwrCore counters */ + pq->result = &pq->end; + pq->enable_stats = FALSE; + swr_gather_stats(pipe, pq); +} + + +boolean +swr_check_render_cond(struct pipe_context *pipe) +{ + struct swr_context *ctx = swr_context(pipe); + boolean b, wait; + uint64_t result; + + if (!ctx->render_cond_query) + return TRUE; /* no query predicate, draw normally */ + + wait = (ctx->render_cond_mode == PIPE_RENDER_COND_WAIT + || ctx->render_cond_mode == PIPE_RENDER_COND_BY_REGION_WAIT); + + b = pipe->get_query_result( + pipe, ctx->render_cond_query, wait, (union pipe_query_result *)&result); + if (b) + return (!result == ctx->render_cond_cond); + else + return TRUE; +} + +void +swr_query_init(struct pipe_context *pipe) +{ + struct swr_context *ctx = swr_context(pipe); + + pipe->create_query = swr_create_query; + pipe->destroy_query = swr_destroy_query; + pipe->begin_query = swr_begin_query; + pipe->end_query = swr_end_query; + pipe->get_query_result = swr_get_query_result; + + ctx->active_queries = 0; +} diff --git a/src/gallium/drivers/swr/swr_query.h b/src/gallium/drivers/swr/swr_query.h new file mode 100644 index 00000000000..836d07b68ae --- /dev/null +++ b/src/gallium/drivers/swr/swr_query.h @@ -0,0 +1,46 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_QUERY_H +#define SWR_QUERY_H + + +#include <limits.h> + +struct swr_query { + unsigned type; /* PIPE_QUERY_* */ + unsigned index; + + union pipe_query_result *result; + union pipe_query_result start; + union pipe_query_result end; + + struct pipe_fence_handle *fence; + + boolean enable_stats; +}; + +extern void swr_query_init(struct pipe_context *pipe); + +extern boolean swr_check_render_cond(struct pipe_context *pipe); +#endif diff --git a/src/gallium/drivers/swr/swr_resource.h b/src/gallium/drivers/swr/swr_resource.h new file mode 100644 index 00000000000..2fdc7683cb8 --- /dev/null +++ b/src/gallium/drivers/swr/swr_resource.h @@ -0,0 +1,143 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_RESOURCE_H +#define SWR_RESOURCE_H + +#include "pipe/p_state.h" +#include "api.h" + +struct sw_displaytarget; + +enum swr_resource_status { + SWR_RESOURCE_UNUSED = 0x0, + SWR_RESOURCE_READ = 0x1, + SWR_RESOURCE_WRITE = 0x2, +}; + +struct swr_resource { + struct pipe_resource base; + + bool has_depth; + bool has_stencil; + + UINT alignedWidth; + UINT alignedHeight; + + SWR_SURFACE_STATE swr; + SWR_SURFACE_STATE secondary; /* for faking depth/stencil merged formats */ + + struct sw_displaytarget *display_target; + + unsigned row_stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned img_stride[PIPE_MAX_TEXTURE_LEVELS]; + unsigned mip_offsets[PIPE_MAX_TEXTURE_LEVELS]; + + enum swr_resource_status status; + + /* pipe_context to which resource is currently bound. */ + struct pipe_context *bound_to_context; +}; + + +static INLINE struct swr_resource * +swr_resource(struct pipe_resource *resource) +{ + return (struct swr_resource *)resource; +} + +static INLINE boolean +swr_resource_is_texture(const struct pipe_resource *resource) +{ + switch (resource->target) { + case PIPE_BUFFER: + return FALSE; + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + case PIPE_TEXTURE_3D: + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return TRUE; + default: + assert(0); + return FALSE; + } +} + + +static INLINE void * +swr_resource_data(struct pipe_resource *resource) +{ + struct swr_resource *swr_r = swr_resource(resource); + + assert(!swr_resource_is_texture(resource)); + + return swr_r->swr.pBaseAddress; +} + + +void swr_store_render_target(struct pipe_context *pipe, + uint32_t attachment, + enum SWR_TILE_STATE post_tile_state); + +void swr_store_dirty_resource(struct pipe_context *pipe, + struct pipe_resource *resource, + enum SWR_TILE_STATE post_tile_state); + +void swr_update_resource_status(struct pipe_context *, + const struct pipe_draw_info *); + +/* + * Functions to indicate a resource's in-use status. 
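+ *
+ * Editor's sketch (illustrative, not part of the original patch): state
+ * binding code is expected to tag each referenced resource, e.g.
+ *
+ *    swr_resource_write(pipe, swr_resource(fb->cbufs[0]->texture));
+ *    swr_resource_read(pipe, swr_resource(view->texture));
+ *
+ * so that swr_store_dirty_resource() later knows which surfaces hold
+ * unflushed hot-tile data.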
+ */ +static INLINE enum +swr_resource_status & operator|=(enum swr_resource_status & a, + enum swr_resource_status b) { + return (enum swr_resource_status &)((int&)a |= (int)b); +} + +static INLINE void +swr_resource_read(struct pipe_context *pipe, struct swr_resource *resource) +{ + resource->status |= SWR_RESOURCE_READ; + resource->bound_to_context = pipe; +} + +static INLINE void +swr_resource_write(struct pipe_context *pipe, struct swr_resource *resource) +{ + resource->status |= SWR_RESOURCE_WRITE; + resource->bound_to_context = pipe; +} + +static INLINE void +swr_resource_unused(struct pipe_context *pipe, struct swr_resource *resource) +{ + resource->status = SWR_RESOURCE_UNUSED; + resource->bound_to_context = nullptr; +} + +#endif diff --git a/src/gallium/drivers/swr/swr_scratch.cpp b/src/gallium/drivers/swr/swr_scratch.cpp new file mode 100644 index 00000000000..28eb2acb910 --- /dev/null +++ b/src/gallium/drivers/swr/swr_scratch.cpp @@ -0,0 +1,116 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "util/u_memory.h" +#include "swr_context.h" +#include "swr_scratch.h" +#include "api.h" + + +void * +swr_copy_to_scratch_space(struct swr_context *ctx, + struct swr_scratch_space *space, + const void *user_buffer, + unsigned int size) +{ + void *ptr; + assert(space); + assert(user_buffer); + assert(size); + + if (size >= 2048) { /* XXX TODO create KNOB_ for this */ + /* Use per draw SwrAllocDrawContextMemory for larger copies */ + ptr = SwrAllocDrawContextMemory(ctx->swrContext, size, 4); + } else { + /* Allocate enough so that MAX_DRAWS_IN_FLIGHT sets fit. */ + unsigned int max_size_in_flight = size * KNOB_MAX_DRAWS_IN_FLIGHT; + + /* Need to grow space */ + if (max_size_in_flight > space->current_size) { + /* Must idle the pipeline, this is infrequent */ + SwrWaitForIdle(ctx->swrContext); + + space->current_size = max_size_in_flight; + + if (space->base) { + align_free(space->base); + space->base = NULL; + } + + if (!space->base) { + space->base = (uint8_t *)align_malloc(space->current_size, 4); + space->head = (void *)space->base; + } + } + + /* Wrap */ + if (((uint8_t *)space->head + size) + >= ((uint8_t *)space->base + space->current_size)) { + /* + * TODO XXX: Should add a fence on wrap. 
Assumption is that + * current_space >> size, and there are at least MAX_DRAWS_IN_FLIGHT + * draws in scratch. So fence would always be met on wrap. A fence + * would ensure that first frame in buffer is done before wrapping. + * If fence ever needs to be waited on, can increase buffer size. + * So far in testing, this hasn't been necessary. + */ + space->head = space->base; + } + + ptr = space->head; + space->head = (uint8_t *)space->head + size; + } + + /* Copy user_buffer to scratch */ + memcpy(ptr, user_buffer, size); + + return ptr; +} + + +void +swr_init_scratch_buffers(struct swr_context *ctx) +{ + struct swr_scratch_buffers *scratch; + + scratch = CALLOC_STRUCT(swr_scratch_buffers); + ctx->scratch = scratch; +} + +void +swr_destroy_scratch_buffers(struct swr_context *ctx) +{ + struct swr_scratch_buffers *scratch = ctx->scratch; + + if (scratch) { + if (scratch->vs_constants.base) + align_free(scratch->vs_constants.base); + if (scratch->fs_constants.base) + align_free(scratch->fs_constants.base); + if (scratch->vertex_buffer.base) + align_free(scratch->vertex_buffer.base); + if (scratch->index_buffer.base) + align_free(scratch->index_buffer.base); + FREE(scratch); + } +} diff --git a/src/gallium/drivers/swr/swr_scratch.h b/src/gallium/drivers/swr/swr_scratch.h new file mode 100644 index 00000000000..74218d63644 --- /dev/null +++ b/src/gallium/drivers/swr/swr_scratch.h @@ -0,0 +1,63 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_SCRATCH_H +#define SWR_SCRATCH_H + +struct swr_scratch_space { + void *head; + unsigned int current_size; + /* TODO XXX: Add a fence for wrap condition. */ + + void *base; +}; + +struct swr_scratch_buffers { + struct swr_scratch_space vs_constants; + struct swr_scratch_space fs_constants; + struct swr_scratch_space vertex_buffer; + struct swr_scratch_space index_buffer; +}; + + +/* + * swr_copy_to_scratch_space + * Copies size bytes of user_buffer into the scratch ring buffer. + * Used to store temporary data such as client arrays and constants. + * + * Inputs: + * space ptr to scratch pool (vs_constants, fs_constants) + * user_buffer, data to copy into scratch space + * size to be copied + * Returns: + * pointer to data copied to scratch space. 
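+ *
+ * Editor's sketch (illustrative, not part of the original patch):
+ *
+ *    void *copy = swr_copy_to_scratch_space(
+ *       ctx, &ctx->scratch->vs_constants, user_ptr, num_bytes);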
+ */
+void *swr_copy_to_scratch_space(struct swr_context *ctx,
+ struct swr_scratch_space *space,
+ const void *user_buffer,
+ unsigned int size);
+
+void swr_init_scratch_buffers(struct swr_context *ctx);
+void swr_destroy_scratch_buffers(struct swr_context *ctx);
+
+#endif
diff --git a/src/gallium/drivers/swr/swr_screen.cpp b/src/gallium/drivers/swr/swr_screen.cpp new file mode 100644 index 00000000000..e46df47570f --- /dev/null +++ b/src/gallium/drivers/swr/swr_screen.cpp @@ -0,0 +1,745 @@
+/****************************************************************************
+ * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ ***************************************************************************/
+
+#include "pipe/p_screen.h"
+#include "pipe/p_defines.h"
+#include "util/u_memory.h"
+#include "util/u_format.h"
+#include "util/u_inlines.h"
+#include "util/u_cpu_detect.h"
+
+#include "state_tracker/sw_winsys.h"
+
+extern "C" {
+#include "gallivm/lp_bld_limits.h"
+}
+
+#include "swr_public.h"
+#include "swr_screen.h"
+#include "swr_context.h"
+#include "swr_resource.h"
+#include "swr_fence.h"
+#include "gen_knobs.h"
+
+#include "jit_api.h"
+
+#include <stdio.h>
+
+/* MSVC case insensitive compare */
+#if defined(PIPE_CC_MSVC)
+ #define strcasecmp lstrcmpiA
+#endif
+
+/*
+ * Max texture sizes
+ * XXX Check max texture size values against core and sampler.
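+ *
+ * Editor's note (derivation, not part of the original patch): a level
+ * count of N gives a base extent of 2^(N-1) texels per dimension, so the
+ * 14 2D/cube levels below correspond to 8192x8192 and the 12 3D levels
+ * to 2048x2048x2048, matching the "8K"/"2K" comments.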
+ */
+#define SWR_MAX_TEXTURE_SIZE (4 * 1024 * 1024 * 1024ULL) /* 4GB */
+#define SWR_MAX_TEXTURE_2D_LEVELS 14 /* 8K x 8K for now */
+#define SWR_MAX_TEXTURE_3D_LEVELS 12 /* 2K x 2K x 2K for now */
+#define SWR_MAX_TEXTURE_CUBE_LEVELS 14 /* 8K x 8K for now */
+#define SWR_MAX_TEXTURE_ARRAY_LAYERS 512 /* 8K x 512 / 8K x 8K x 512 */
+
+static const char *
+swr_get_name(struct pipe_screen *screen)
+{
+ return "SWR";
+}
+
+static const char *
+swr_get_vendor(struct pipe_screen *screen)
+{
+ return "Intel Corporation";
+}
+
+static boolean
+swr_is_format_supported(struct pipe_screen *screen,
+ enum pipe_format format,
+ enum pipe_texture_target target,
+ unsigned sample_count,
+ unsigned bind)
+{
+ struct sw_winsys *winsys = swr_screen(screen)->winsys;
+ const struct util_format_description *format_desc;
+
+ assert(target == PIPE_BUFFER || target == PIPE_TEXTURE_1D
+ || target == PIPE_TEXTURE_1D_ARRAY
+ || target == PIPE_TEXTURE_2D
+ || target == PIPE_TEXTURE_2D_ARRAY
+ || target == PIPE_TEXTURE_RECT
+ || target == PIPE_TEXTURE_3D
+ || target == PIPE_TEXTURE_CUBE
+ || target == PIPE_TEXTURE_CUBE_ARRAY);
+
+ format_desc = util_format_description(format);
+ if (!format_desc)
+ return FALSE;
+
+ if (sample_count > 1)
+ return FALSE;
+
+ if (bind
+ & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT | PIPE_BIND_SHARED)) {
+ if (!winsys->is_displaytarget_format_supported(winsys, bind, format))
+ return FALSE;
+ }
+
+ if (bind & PIPE_BIND_RENDER_TARGET) {
+ if (format_desc->colorspace == UTIL_FORMAT_COLORSPACE_ZS)
+ return FALSE;
+
+ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1)
+ return FALSE;
+
+ /*
+ * Although possible, it is unnatural to render into compressed or YUV
+ * surfaces. So disable these here to avoid going into weird paths
+ * inside the state trackers.
+ */
+ if (format_desc->block.width != 1 || format_desc->block.height != 1)
+ return FALSE;
+ }
+
+ if (bind & PIPE_BIND_DEPTH_STENCIL) {
+ if (format_desc->colorspace != UTIL_FORMAT_COLORSPACE_ZS)
+ return FALSE;
+
+ if (mesa_to_swr_format(format) == (SWR_FORMAT)-1)
+ return FALSE;
+ }
+
+ return TRUE;
+}
+
+static int
+swr_get_param(struct pipe_screen *screen, enum pipe_cap param)
+{
+ switch (param) {
+ case PIPE_CAP_NPOT_TEXTURES:
+ case PIPE_CAP_MIXED_FRAMEBUFFER_SIZES:
+ return 1;
+ case PIPE_CAP_TWO_SIDED_STENCIL:
+ return 1;
+ case PIPE_CAP_SM3:
+ return 1;
+ case PIPE_CAP_ANISOTROPIC_FILTER:
+ return 0;
+ case PIPE_CAP_POINT_SPRITE:
+ return 1;
+ case PIPE_CAP_MAX_RENDER_TARGETS:
+ return PIPE_MAX_COLOR_BUFS;
+ case PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS:
+ return 1;
+ case PIPE_CAP_OCCLUSION_QUERY:
+ case PIPE_CAP_QUERY_TIME_ELAPSED:
+ case PIPE_CAP_QUERY_PIPELINE_STATISTICS:
+ return 1;
+ case PIPE_CAP_TEXTURE_MIRROR_CLAMP:
+ return 1;
+ case PIPE_CAP_TEXTURE_SHADOW_MAP:
+ return 1;
+ case PIPE_CAP_TEXTURE_SWIZZLE:
+ return 1;
+ case PIPE_CAP_TEXTURE_BORDER_COLOR_QUIRK:
+ return 0;
+ case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
+ return SWR_MAX_TEXTURE_2D_LEVELS;
+ case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
+ return SWR_MAX_TEXTURE_3D_LEVELS;
+ case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
+ return SWR_MAX_TEXTURE_CUBE_LEVELS;
+ case PIPE_CAP_BLEND_EQUATION_SEPARATE:
+ return 1;
+ case PIPE_CAP_INDEP_BLEND_ENABLE:
+ return 1;
+ case PIPE_CAP_INDEP_BLEND_FUNC:
+ return 1;
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
+ return 0; // Don't support lower left frag coord.
+ case PIPE_CAP_TGSI_FS_COORD_ORIGIN_UPPER_LEFT: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER: + case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_INTEGER: + return 1; + case PIPE_CAP_DEPTH_CLIP_DISABLE: + return 1; + case PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS: + return MAX_SO_STREAMS; + case PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS: + case PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS: + return MAX_ATTRIBUTES; + case PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES: + case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS: + return 1024; + case PIPE_CAP_MAX_VERTEX_STREAMS: + return 1; + case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE: + return 2048; + case PIPE_CAP_PRIMITIVE_RESTART: + return 1; + case PIPE_CAP_SHADER_STENCIL_EXPORT: + return 1; + case PIPE_CAP_TGSI_INSTANCEID: + case PIPE_CAP_VERTEX_ELEMENT_INSTANCE_DIVISOR: + case PIPE_CAP_START_INSTANCE: + return 1; + case PIPE_CAP_SEAMLESS_CUBE_MAP: + case PIPE_CAP_SEAMLESS_CUBE_MAP_PER_TEXTURE: + return 1; + case PIPE_CAP_MAX_TEXTURE_ARRAY_LAYERS: + return SWR_MAX_TEXTURE_ARRAY_LAYERS; + case PIPE_CAP_MIN_TEXEL_OFFSET: + return -8; + case PIPE_CAP_MAX_TEXEL_OFFSET: + return 7; + case PIPE_CAP_CONDITIONAL_RENDER: + return 1; + case PIPE_CAP_TEXTURE_BARRIER: + return 0; + case PIPE_CAP_FRAGMENT_COLOR_CLAMPED: + case PIPE_CAP_VERTEX_COLOR_UNCLAMPED: /* draw module */ + case PIPE_CAP_VERTEX_COLOR_CLAMPED: /* draw module */ + return 1; + case PIPE_CAP_MIXED_COLORBUFFER_FORMATS: + return 1; + case PIPE_CAP_GLSL_FEATURE_LEVEL: + return 330; + case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION: + return 0; + case PIPE_CAP_COMPUTE: + return 0; + case PIPE_CAP_USER_VERTEX_BUFFERS: + case PIPE_CAP_USER_INDEX_BUFFERS: + case PIPE_CAP_USER_CONSTANT_BUFFERS: + case PIPE_CAP_STREAM_OUTPUT_PAUSE_RESUME: + case PIPE_CAP_TGSI_VS_LAYER_VIEWPORT: + return 1; + case PIPE_CAP_CONSTANT_BUFFER_OFFSET_ALIGNMENT: + return 16; + case PIPE_CAP_TGSI_CAN_COMPACT_CONSTANTS: + case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_BUFFER_STRIDE_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_VERTEX_ELEMENT_SRC_OFFSET_4BYTE_ALIGNED_ONLY: + case PIPE_CAP_TEXTURE_MULTISAMPLE: + return 0; + case PIPE_CAP_MIN_MAP_BUFFER_ALIGNMENT: + return 64; + case PIPE_CAP_QUERY_TIMESTAMP: + return 1; + case PIPE_CAP_CUBE_MAP_ARRAY: + return 0; + case PIPE_CAP_TEXTURE_BUFFER_OBJECTS: + return 1; + case PIPE_CAP_MAX_TEXTURE_BUFFER_SIZE: + return 65536; + case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT: + return 0; + case PIPE_CAP_TGSI_TEXCOORD: + case PIPE_CAP_PREFER_BLIT_BASED_TEXTURE_TRANSFER: + return 0; + case PIPE_CAP_MAX_VIEWPORTS: + return 1; + case PIPE_CAP_ENDIANNESS: + return PIPE_ENDIAN_NATIVE; + case PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS: + case PIPE_CAP_TEXTURE_GATHER_SM5: + return 0; + case PIPE_CAP_BUFFER_MAP_PERSISTENT_COHERENT: + return 1; + case PIPE_CAP_TEXTURE_QUERY_LOD: + case PIPE_CAP_SAMPLE_SHADING: + case PIPE_CAP_TEXTURE_GATHER_OFFSETS: + case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION: + case PIPE_CAP_TGSI_FS_FINE_DERIVATIVE: + case PIPE_CAP_SAMPLER_VIEW_TARGET: + return 0; + case PIPE_CAP_FAKE_SW_MSAA: + return 1; + case PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET: + case PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET: + return 0; + case PIPE_CAP_DRAW_INDIRECT: + return 1; + + case PIPE_CAP_VENDOR_ID: + return 0xFFFFFFFF; + case PIPE_CAP_DEVICE_ID: + return 0xFFFFFFFF; + case PIPE_CAP_ACCELERATED: + return 0; + case PIPE_CAP_VIDEO_MEMORY: { + /* XXX: Do we want to return the full amount of system memory ? 
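+ Editor's note (not part of the original patch): PIPE_CAP_VIDEO_MEMORY
+ is reported in megabytes, hence the >> 20 below converting bytes to
+ MiB.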
*/
+ uint64_t system_memory;
+
+ if (!os_get_total_physical_memory(&system_memory))
+ return 0;
+
+ return (int)(system_memory >> 20);
+ }
+ case PIPE_CAP_UMA:
+ return 1;
+ case PIPE_CAP_CONDITIONAL_RENDER_INVERTED:
+ return 1;
+ case PIPE_CAP_CLIP_HALFZ:
+ return 1;
+ case PIPE_CAP_VERTEXID_NOBASE:
+ return 0;
+ case PIPE_CAP_POLYGON_OFFSET_CLAMP:
+ return 1;
+ case PIPE_CAP_MULTISAMPLE_Z_RESOLVE:
+ return 0;
+ case PIPE_CAP_RESOURCE_FROM_USER_MEMORY:
+ return 0; // xxx
+ case PIPE_CAP_DEVICE_RESET_STATUS_QUERY:
+ return 0;
+ case PIPE_CAP_MAX_SHADER_PATCH_VARYINGS:
+ return 0;
+ case PIPE_CAP_DEPTH_BOUNDS_TEST:
+ return 0; // xxx
+ case PIPE_CAP_TEXTURE_FLOAT_LINEAR:
+ case PIPE_CAP_TEXTURE_HALF_FLOAT_LINEAR:
+ return 1;
+ case PIPE_CAP_TGSI_TXQS:
+ case PIPE_CAP_FORCE_PERSAMPLE_INTERP:
+ case PIPE_CAP_SHAREABLE_SHADERS:
+ case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
+ case PIPE_CAP_CLEAR_TEXTURE:
+ case PIPE_CAP_DRAW_PARAMETERS:
+ case PIPE_CAP_TGSI_PACK_HALF_FLOAT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT:
+ case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS:
+ case PIPE_CAP_TGSI_FS_POSITION_IS_SYSVAL:
+ case PIPE_CAP_TGSI_FS_FACE_IS_INTEGER_SYSVAL:
+ case PIPE_CAP_SHADER_BUFFER_OFFSET_ALIGNMENT:
+ case PIPE_CAP_INVALIDATE_BUFFER:
+ case PIPE_CAP_GENERATE_MIPMAP:
+ case PIPE_CAP_STRING_MARKER:
+ case PIPE_CAP_BUFFER_SAMPLER_VIEW_RGBA_ONLY:
+ case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS:
+ case PIPE_CAP_QUERY_BUFFER_OBJECT:
+ case PIPE_CAP_QUERY_MEMORY_INFO:
+ return 0;
+ }
+
+ /* should only get here on unhandled cases */
+ debug_printf("Unexpected PIPE_CAP %d query\n", param);
+ return 0;
+}
+
+static int
+swr_get_shader_param(struct pipe_screen *screen,
+ unsigned shader,
+ enum pipe_shader_cap param)
+{
+ if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_FRAGMENT)
+ return gallivm_get_shader_param(param);
+
+ // Todo: geometry, tessellation, compute
+ return 0;
+}
+
+
+static float
+swr_get_paramf(struct pipe_screen *screen, enum pipe_capf param)
+{
+ switch (param) {
+ case PIPE_CAPF_MAX_LINE_WIDTH:
+ case PIPE_CAPF_MAX_LINE_WIDTH_AA:
+ case PIPE_CAPF_MAX_POINT_WIDTH:
+ return 255.0; /* arbitrary */
+ case PIPE_CAPF_MAX_POINT_WIDTH_AA:
+ return 0.0;
+ case PIPE_CAPF_MAX_TEXTURE_ANISOTROPY:
+ return 0.0;
+ case PIPE_CAPF_MAX_TEXTURE_LOD_BIAS:
+ return 0.0;
+ case PIPE_CAPF_GUARD_BAND_LEFT:
+ case PIPE_CAPF_GUARD_BAND_TOP:
+ case PIPE_CAPF_GUARD_BAND_RIGHT:
+ case PIPE_CAPF_GUARD_BAND_BOTTOM:
+ return 0.0;
+ }
+ /* should only get here on unhandled cases */
+ debug_printf("Unexpected PIPE_CAPF %d query\n", param);
+ return 0.0;
+}
+
+SWR_FORMAT
+mesa_to_swr_format(enum pipe_format format)
+{
+ const struct util_format_description *format_desc =
+ util_format_description(format);
+ if (!format_desc)
+ return (SWR_FORMAT)-1;
+
+ // A more robust check would compare all attributes of the formats;
+ // luckily, format names are mostly standardized.
+ for (int i = 0; i < NUM_SWR_FORMATS; i++) {
+ const SWR_FORMAT_INFO &swr_desc = GetFormatInfo((SWR_FORMAT)i);
+
+ if (!strcasecmp(format_desc->short_name, swr_desc.name))
+ return (SWR_FORMAT)i;
+ }
+
+ // ...
with some exceptions + switch (format) { + case PIPE_FORMAT_R8G8B8A8_SRGB: + return R8G8B8A8_UNORM_SRGB; + case PIPE_FORMAT_B8G8R8A8_SRGB: + return B8G8R8A8_UNORM_SRGB; + case PIPE_FORMAT_I8_UNORM: + return R8_UNORM; + case PIPE_FORMAT_Z16_UNORM: + return R16_UNORM; + case PIPE_FORMAT_Z24X8_UNORM: + case PIPE_FORMAT_Z24_UNORM_S8_UINT: + return R24_UNORM_X8_TYPELESS; + case PIPE_FORMAT_Z32_FLOAT: + return R32_FLOAT; + case PIPE_FORMAT_Z32_FLOAT_S8X24_UINT: + return R32_FLOAT_X8X24_TYPELESS; + case PIPE_FORMAT_L8A8_UNORM: + return R8G8_UNORM; + default: + break; + } + + debug_printf("asked to convert unsupported format %s\n", + format_desc->name); + return (SWR_FORMAT)-1; +} + +static boolean +swr_displaytarget_layout(struct swr_screen *screen, struct swr_resource *res) +{ + struct sw_winsys *winsys = screen->winsys; + struct sw_displaytarget *dt; + + UINT stride; + dt = winsys->displaytarget_create(winsys, + res->base.bind, + res->base.format, + res->alignedWidth, + res->alignedHeight, + 64, NULL, + &stride); + + if (dt == NULL) + return FALSE; + + void *map = winsys->displaytarget_map(winsys, dt, 0); + + res->display_target = dt; + res->swr.pBaseAddress = (uint8_t*) map; + + /* Clear the display target surface */ + if (map) + memset(map, 0, res->alignedHeight * stride); + + winsys->displaytarget_unmap(winsys, dt); + + return TRUE; +} + +static boolean +swr_texture_layout(struct swr_screen *screen, + struct swr_resource *res, + boolean allocate) +{ + struct pipe_resource *pt = &res->base; + + pipe_format fmt = pt->format; + const struct util_format_description *desc = util_format_description(fmt); + + res->has_depth = util_format_has_depth(desc); + res->has_stencil = util_format_has_stencil(desc); + + if (res->has_stencil && !res->has_depth) + fmt = PIPE_FORMAT_R8_UINT; + + res->swr.width = pt->width0; + res->swr.height = pt->height0; + res->swr.depth = pt->depth0; + res->swr.type = swr_convert_target_type(pt->target); + res->swr.tileMode = SWR_TILE_NONE; + res->swr.format = mesa_to_swr_format(fmt); + res->swr.numSamples = (1 << pt->nr_samples); + + SWR_FORMAT_INFO finfo = GetFormatInfo(res->swr.format); + + unsigned total_size = 0; + unsigned width = pt->width0; + unsigned height = pt->height0; + unsigned depth = pt->depth0; + unsigned layers = pt->array_size; + + for (int level = 0; level <= pt->last_level; level++) { + unsigned alignedWidth, alignedHeight; + unsigned num_slices; + + if (pt->bind & (PIPE_BIND_RENDER_TARGET | PIPE_BIND_DEPTH_STENCIL)) { + alignedWidth = align(width, KNOB_MACROTILE_X_DIM); + alignedHeight = align(height, KNOB_MACROTILE_Y_DIM); + } else { + alignedWidth = width; + alignedHeight = height; + } + + if (level == 0) { + res->alignedWidth = alignedWidth; + res->alignedHeight = alignedHeight; + } + + res->row_stride[level] = alignedWidth * finfo.Bpp; + res->img_stride[level] = res->row_stride[level] * alignedHeight; + res->mip_offsets[level] = total_size; + + if (pt->target == PIPE_TEXTURE_3D) + num_slices = depth; + else if (pt->target == PIPE_TEXTURE_1D_ARRAY + || pt->target == PIPE_TEXTURE_2D_ARRAY + || pt->target == PIPE_TEXTURE_CUBE + || pt->target == PIPE_TEXTURE_CUBE_ARRAY) + num_slices = layers; + else + num_slices = 1; + + total_size += res->img_stride[level] * num_slices; + if (total_size > SWR_MAX_TEXTURE_SIZE) + return FALSE; + + width = u_minify(width, 1); + height = u_minify(height, 1); + depth = u_minify(depth, 1); + } + + res->swr.halign = res->alignedWidth; + res->swr.valign = res->alignedHeight; + res->swr.pitch = res->row_stride[0]; + + if 
+ + if (allocate) { + res->swr.pBaseAddress = (uint8_t *)_aligned_malloc(total_size, 64); + + if (res->has_depth && res->has_stencil) { + res->secondary.width = pt->width0; + res->secondary.height = pt->height0; + res->secondary.depth = pt->depth0; + res->secondary.type = SURFACE_2D; + res->secondary.tileMode = SWR_TILE_NONE; + res->secondary.format = R8_UINT; + res->secondary.numSamples = (1 << pt->nr_samples); + + /* Query format info only after the secondary (stencil) format is + * set, so the pitch is derived from R8_UINT rather than a + * zero-initialized format. */ + SWR_FORMAT_INFO finfo = GetFormatInfo(res->secondary.format); + res->secondary.pitch = res->alignedWidth * finfo.Bpp; + + res->secondary.pBaseAddress = (uint8_t *)_aligned_malloc( + res->alignedHeight * res->secondary.pitch, 64); + } + } + + return TRUE; +} + +static boolean +swr_can_create_resource(struct pipe_screen *screen, + const struct pipe_resource *templat) +{ + struct swr_resource res; + memset(&res, 0, sizeof(res)); + res.base = *templat; + return swr_texture_layout(swr_screen(screen), &res, false); +} + +static struct pipe_resource * +swr_resource_create(struct pipe_screen *_screen, + const struct pipe_resource *templat) +{ + struct swr_screen *screen = swr_screen(_screen); + struct swr_resource *res = CALLOC_STRUCT(swr_resource); + if (!res) + return NULL; + + res->base = *templat; + pipe_reference_init(&res->base.reference, 1); + res->base.screen = &screen->base; + + if (swr_resource_is_texture(&res->base)) { + if (res->base.bind & (PIPE_BIND_DISPLAY_TARGET | PIPE_BIND_SCANOUT + | PIPE_BIND_SHARED)) { + /* displayable surface + * first call swr_texture_layout without allocating to finish + * filling out the SWR_SURFACE_STATE in res */ + swr_texture_layout(screen, res, false); + if (!swr_displaytarget_layout(screen, res)) + goto fail; + } else { + /* texture map */ + if (!swr_texture_layout(screen, res, true)) + goto fail; + } + } else { + /* other data (vertex buffer, const buffer, etc) */ + assert(util_format_get_blocksize(templat->format) == 1); + assert(templat->height0 == 1); + assert(templat->depth0 == 1); + assert(templat->last_level == 0); + + /* Easiest to just call swr_texture_layout, as it sets up + * SWR_SURFACE_STATE in res */ + if (!swr_texture_layout(screen, res, true)) + goto fail; + } + + return &res->base; + +fail: + FREE(res); + return NULL; +}
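The non-texture branch above only handles linear, byte-addressed buffers, as its asserts spell out. A minimal sketch of a template that satisfies them (hypothetical values; standard gallium API):

   struct pipe_resource templ = {};
   templ.target = PIPE_BUFFER;
   templ.format = PIPE_FORMAT_R8_UNORM;  /* blocksize 1, per the assert */
   templ.width0 = 65536;                 /* buffer size in bytes */
   templ.height0 = templ.depth0 = templ.array_size = 1;
   templ.bind = PIPE_BIND_VERTEX_BUFFER;
   struct pipe_resource *buf = screen->resource_create(screen, &templ);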
+ +static void +swr_resource_destroy(struct pipe_screen *p_screen, struct pipe_resource *pt) +{ + struct swr_screen *screen = swr_screen(p_screen); + struct swr_resource *spr = swr_resource(pt); + struct pipe_context *pipe = spr->bound_to_context; + + /* Only wait on fence if the resource is being used */ + if (pipe && spr->status) { + /* But, if there's no fence pending, submit one. + * XXX: Remove once draw timestamps are implemented. */ + if (!swr_is_fence_pending(screen->flush_fence)) + swr_fence_submit(swr_context(pipe), screen->flush_fence); + + swr_fence_finish(p_screen, screen->flush_fence, 0); + swr_resource_unused(pipe, spr); + } + + /* + * Free resource primary surface. If resource is display target, winsys + * manages the buffer and will free it on displaytarget_destroy. + */ + if (spr->display_target) { + /* display target */ + struct sw_winsys *winsys = screen->winsys; + winsys->displaytarget_destroy(winsys, spr->display_target); + } else + _aligned_free(spr->swr.pBaseAddress); + + _aligned_free(spr->secondary.pBaseAddress); + + FREE(spr); +} + + +static void +swr_flush_frontbuffer(struct pipe_screen *p_screen, + struct pipe_resource *resource, + unsigned level, + unsigned layer, + void *context_private, + struct pipe_box *sub_box) +{ + struct swr_screen *screen = swr_screen(p_screen); + struct sw_winsys *winsys = screen->winsys; + struct swr_resource *spr = swr_resource(resource); + struct pipe_context *pipe = spr->bound_to_context; + + if (pipe) { + swr_fence_finish(p_screen, screen->flush_fence, 0); + swr_resource_unused(pipe, spr); + SwrEndFrame(swr_context(pipe)->swrContext); + } + + debug_assert(spr->display_target); + if (spr->display_target) + winsys->displaytarget_display( + winsys, spr->display_target, context_private, sub_box); +} + + +static void +swr_destroy_screen(struct pipe_screen *p_screen) +{ + struct swr_screen *screen = swr_screen(p_screen); + struct sw_winsys *winsys = screen->winsys; + + fprintf(stderr, "SWR destroy screen!\n"); + + swr_fence_finish(p_screen, screen->flush_fence, 0); + swr_fence_reference(p_screen, &screen->flush_fence, NULL); + + JitDestroyContext(screen->hJitMgr); + + if (winsys->destroy) + winsys->destroy(winsys); + + FREE(screen); +} + +PUBLIC +struct pipe_screen * +swr_create_screen(struct sw_winsys *winsys) +{ + struct swr_screen *screen = CALLOC_STRUCT(swr_screen); + + if (!screen) + return NULL; + + if (!getenv("KNOB_MAX_PRIMS_PER_DRAW")) { + g_GlobalKnobs.MAX_PRIMS_PER_DRAW.Value(49152); + } + + screen->winsys = winsys; + screen->base.get_name = swr_get_name; + screen->base.get_vendor = swr_get_vendor; + screen->base.is_format_supported = swr_is_format_supported; + screen->base.context_create = swr_create_context; + screen->base.can_create_resource = swr_can_create_resource; + + screen->base.destroy = swr_destroy_screen; + screen->base.get_param = swr_get_param; + screen->base.get_shader_param = swr_get_shader_param; + screen->base.get_paramf = swr_get_paramf; + + screen->base.resource_create = swr_resource_create; + screen->base.resource_destroy = swr_resource_destroy; + + screen->base.flush_frontbuffer = swr_flush_frontbuffer; + + screen->hJitMgr = JitCreateContext(KNOB_SIMD_WIDTH, KNOB_ARCH_STR); + + swr_fence_init(&screen->base); + + return &screen->base; +} + +struct sw_winsys * +swr_get_winsys(struct pipe_screen *pipe) +{ + return ((struct swr_screen *)pipe)->winsys; +} + +struct sw_displaytarget * +swr_get_displaytarget(struct pipe_resource *resource) +{ + return ((struct swr_resource *)resource)->display_target; +} diff --git a/src/gallium/drivers/swr/swr_screen.h b/src/gallium/drivers/swr/swr_screen.h new file mode 100644 index 00000000000..a96dc44cf66 --- /dev/null +++ b/src/gallium/drivers/swr/swr_screen.h @@ -0,0 +1,52 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved.
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_SCREEN_H +#define SWR_SCREEN_H + +#include "pipe/p_screen.h" +#include "pipe/p_defines.h" +#include "api.h" + +struct sw_winsys; + +struct swr_screen { + struct pipe_screen base; + + struct pipe_fence_handle *flush_fence; + + struct sw_winsys *winsys; + + HANDLE hJitMgr; +}; + +static INLINE struct swr_screen * +swr_screen(struct pipe_screen *pipe) +{ + return (struct swr_screen *)pipe; +} + +SWR_FORMAT +mesa_to_swr_format(enum pipe_format format); + +#endif diff --git a/src/gallium/drivers/swr/swr_shader.cpp b/src/gallium/drivers/swr/swr_shader.cpp new file mode 100644 index 00000000000..ff16d0f2f11 --- /dev/null +++ b/src/gallium/drivers/swr/swr_shader.cpp @@ -0,0 +1,591 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ ***************************************************************************/ + +#include "JitManager.h" +#include "state.h" +#include "state_llvm.h" +#include "builder.h" + +#include "llvm-c/Core.h" +#include "llvm/Support/CBindingWrapping.h" + +#include "tgsi/tgsi_strings.h" +#include "gallivm/lp_bld_init.h" +#include "gallivm/lp_bld_flow.h" +#include "gallivm/lp_bld_struct.h" +#include "gallivm/lp_bld_tgsi.h" + +#include "swr_context.h" +#include "swr_context_llvm.h" +#include "swr_state.h" +#include "swr_screen.h" + +bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs) +{ + return !memcmp(&lhs, &rhs, sizeof(lhs)); +} + +void +swr_generate_fs_key(struct swr_jit_key &key, + struct swr_context *ctx, + swr_fragment_shader *swr_fs) +{ + key.nr_cbufs = ctx->framebuffer.nr_cbufs; + key.light_twoside = ctx->rasterizer->light_twoside; + memcpy(&key.vs_output_semantic_name, + &ctx->vs->info.base.output_semantic_name, + sizeof(key.vs_output_semantic_name)); + memcpy(&key.vs_output_semantic_idx, + &ctx->vs->info.base.output_semantic_index, + sizeof(key.vs_output_semantic_idx)); + + key.nr_samplers = swr_fs->info.base.file_max[TGSI_FILE_SAMPLER] + 1; + + for (unsigned i = 0; i < key.nr_samplers; i++) { + if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_sampler_state( + &key.sampler[i].sampler_state, + ctx->samplers[PIPE_SHADER_FRAGMENT][i]); + } + } + + /* + * XXX If TGSI_FILE_SAMPLER_VIEW exists assume all texture opcodes + * are dx10-style? Can't really have mixed opcodes, at least not + * if we want to skip the holes here (without rescanning tgsi). + */ + if (swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] != -1) { + key.nr_sampler_views = + swr_fs->info.base.file_max[TGSI_FILE_SAMPLER_VIEW] + 1; + for (unsigned i = 0; i < key.nr_sampler_views; i++) { + if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER_VIEW] & (1 << i)) { + lp_sampler_static_texture_state( + &key.sampler[i].texture_state, + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); + } + } + } else { + key.nr_sampler_views = key.nr_samplers; + for (unsigned i = 0; i < key.nr_sampler_views; i++) { + if (swr_fs->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) { + lp_sampler_static_texture_state( + &key.sampler[i].texture_state, + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]); + } + } + } +} + +struct BuilderSWR : public Builder { + BuilderSWR(JitManager *pJitMgr) + : Builder(pJitMgr) + { + pJitMgr->SetupNewModule(); + } + + PFN_VERTEX_FUNC + CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs); + PFN_PIXEL_KERNEL CompileFS(struct swr_context *ctx, swr_jit_key &key); +}; + +PFN_VERTEX_FUNC +BuilderSWR::CompileVS(struct pipe_context *ctx, swr_vertex_shader *swr_vs) +{ + swr_vs->linkageMask = 0; + + for (unsigned i = 0; i < swr_vs->info.base.num_outputs; i++) { + switch (swr_vs->info.base.output_semantic_name[i]) { + case TGSI_SEMANTIC_POSITION: + break; + default: + swr_vs->linkageMask |= (1 << i); + break; + } + } + + // tgsi_dump(swr_vs->pipe.tokens, 0); + + struct gallivm_state *gallivm = + gallivm_create("VS", wrap(&JM()->mContext)); + gallivm->module = wrap(JM()->mpCurrentModule); + + LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + + memset(outputs, 0, sizeof(outputs)); + + AttrBuilder attrBuilder; + attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + + std::vector<Type *> 
vsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(Gen_SWR_VS_CONTEXT(JM()), 0)}; + FunctionType *vsFuncType = + FunctionType::get(Type::getVoidTy(JM()->mContext), vsArgs, false); + + // create new vertex shader function + auto pFunction = Function::Create(vsFuncType, + GlobalValue::ExternalLinkage, + "VS", + JM()->mpCurrentModule); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); + + BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); + IRB()->SetInsertPoint(block); + LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); + + auto argitr = pFunction->arg_begin(); + Value *hPrivateData = &*argitr++; + hPrivateData->setName("hPrivateData"); + Value *pVsCtx = &*argitr++; + pVsCtx->setName("vsCtx"); + + Value *consts_ptr = GEP(hPrivateData, {C(0), C(swr_draw_context_constantVS)}); + + consts_ptr->setName("vs_constants"); + Value *const_sizes_ptr = + GEP(hPrivateData, {0, swr_draw_context_num_constantsVS}); + const_sizes_ptr->setName("num_vs_constants"); + + Value *vtxInput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVin}); + + for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { + const unsigned mask = swr_vs->info.base.input_usage_mask[attrib]; + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (mask & (1 << channel)) { + inputs[attrib][channel] = + wrap(LOAD(vtxInput, {0, 0, attrib, channel})); + } + } + } + + struct lp_bld_tgsi_system_values system_values; + memset(&system_values, 0, sizeof(system_values)); + system_values.instance_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_InstanceID})); + system_values.vertex_id = wrap(LOAD(pVsCtx, {0, SWR_VS_CONTEXT_VertexID})); + + lp_build_tgsi_soa(gallivm, + swr_vs->pipe.tokens, + lp_type_float_vec(32, 32 * 8), + NULL, // mask + wrap(consts_ptr), + wrap(const_sizes_ptr), + &system_values, + inputs, + outputs, + NULL, // wrap(hPrivateData), (sampler context) + NULL, // thread data + NULL, // sampler + &swr_vs->info.base, + NULL); // geometry shader face + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + Value *vtxOutput = LOAD(pVsCtx, {0, SWR_VS_CONTEXT_pVout}); + + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + for (uint32_t attrib = 0; attrib < PIPE_MAX_SHADER_OUTPUTS; attrib++) { + if (!outputs[attrib][channel]) + continue; + + Value *val = LOAD(unwrap(outputs[attrib][channel])); + + uint32_t outSlot = attrib; + if (swr_vs->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_PSIZE) + outSlot = VERTEX_POINT_SIZE_SLOT; + STORE(val, vtxOutput, {0, 0, outSlot, channel}); + } + } + + RET_VOID(); + + gallivm_verify_function(gallivm, wrap(pFunction)); + gallivm_compile_module(gallivm); + + // lp_debug_dump_value(func); + + PFN_VERTEX_FUNC pFunc = + (PFN_VERTEX_FUNC)gallivm_jit_function(gallivm, wrap(pFunction)); + + debug_printf("vert shader %p\n", pFunc); + assert(pFunc && "Error: VertShader = NULL"); + +#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) + JM()->mIsModuleFinalized = true; +#endif + + return pFunc; +} + +PFN_VERTEX_FUNC +swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs) +{ + BuilderSWR builder( + reinterpret_cast<JitManager *>(swr_screen(ctx->screen)->hJitMgr)); + return builder.CompileVS(ctx, swr_vs); +} + +static unsigned +locate_linkage(ubyte name, ubyte index, struct tgsi_shader_info *info) +{ + for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { + if ((info->output_semantic_name[i] == name) + && (info->output_semantic_index[i] == index)) { + return i - 1; 
// position is not part of the linkage + } + } + + if (name == TGSI_SEMANTIC_COLOR) { // BCOLOR fallback + for (int i = 0; i < PIPE_MAX_SHADER_OUTPUTS; i++) { + if ((info->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) + && (info->output_semantic_index[i] == index)) { + return i - 1; // position is not part of the linkage + } + } + } + + return 0xFFFFFFFF; +} + +PFN_PIXEL_KERNEL +BuilderSWR::CompileFS(struct swr_context *ctx, swr_jit_key &key) +{ + struct swr_fragment_shader *swr_fs = ctx->fs; + + // tgsi_dump(swr_fs->pipe.tokens, 0); + + struct gallivm_state *gallivm = + gallivm_create("FS", wrap(&JM()->mContext)); + gallivm->module = wrap(JM()->mpCurrentModule); + + LLVMValueRef inputs[PIPE_MAX_SHADER_INPUTS][TGSI_NUM_CHANNELS]; + LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][TGSI_NUM_CHANNELS]; + + memset(inputs, 0, sizeof(inputs)); + memset(outputs, 0, sizeof(outputs)); + + struct lp_build_sampler_soa *sampler = NULL; + + AttrBuilder attrBuilder; + attrBuilder.addStackAlignmentAttr(JM()->mVWidth * sizeof(float)); + AttributeSet attrSet = AttributeSet::get( + JM()->mContext, AttributeSet::FunctionIndex, attrBuilder); + + std::vector<Type *> fsArgs{PointerType::get(Gen_swr_draw_context(JM()), 0), + PointerType::get(Gen_SWR_PS_CONTEXT(JM()), 0)}; + FunctionType *funcType = + FunctionType::get(Type::getVoidTy(JM()->mContext), fsArgs, false); + + auto pFunction = Function::Create(funcType, + GlobalValue::ExternalLinkage, + "FS", + JM()->mpCurrentModule); + pFunction->addAttributes(AttributeSet::FunctionIndex, attrSet); + + BasicBlock *block = BasicBlock::Create(JM()->mContext, "entry", pFunction); + IRB()->SetInsertPoint(block); + LLVMPositionBuilderAtEnd(gallivm->builder, wrap(block)); + + auto args = pFunction->arg_begin(); + Value *hPrivateData = &*args++; + hPrivateData->setName("hPrivateData"); + Value *pPS = &*args++; + pPS->setName("psCtx"); + + Value *consts_ptr = GEP(hPrivateData, {0, swr_draw_context_constantFS}); + consts_ptr->setName("fs_constants"); + Value *const_sizes_ptr = + GEP(hPrivateData, {0, swr_draw_context_num_constantsFS}); + const_sizes_ptr->setName("num_fs_constants"); + + // xxx should check for flat shading versus interpolation + + + // load *pAttribs, *pPerspAttribs + Value *pRawAttribs = LOAD(pPS, {0, SWR_PS_CONTEXT_pAttribs}, "pRawAttribs"); + Value *pPerspAttribs = + LOAD(pPS, {0, SWR_PS_CONTEXT_pPerspAttribs}, "pPerspAttribs"); + + swr_fs->constantMask = 0; + swr_fs->pointSpriteMask = 0; + + for (int attrib = 0; attrib < PIPE_MAX_SHADER_INPUTS; attrib++) { + const unsigned mask = swr_fs->info.base.input_usage_mask[attrib]; + const unsigned interpMode = swr_fs->info.base.input_interpolate[attrib]; + const unsigned interpLoc = swr_fs->info.base.input_interpolate_loc[attrib]; + + if (!mask) + continue; + + // load i,j + Value *vi = nullptr, *vj = nullptr; + switch (interpLoc) { + case TGSI_INTERPOLATE_LOC_CENTER: + vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_center}, "i"); + vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_center}, "j"); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_centroid}, "i"); + vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_centroid}, "j"); + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + vi = LOAD(pPS, {0, SWR_PS_CONTEXT_vI, PixelPositions_sample}, "i"); + vj = LOAD(pPS, {0, SWR_PS_CONTEXT_vJ, PixelPositions_sample}, "j"); + break; + } + + // load/compute w + Value *vw = nullptr, *pAttribs; + if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) { + pAttribs = 
pPerspAttribs; + switch (interpLoc) { + case TGSI_INTERPOLATE_LOC_CENTER: + vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center})); + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_centroid})); + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + vw = VRCP(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_sample})); + break; + } + } else { + pAttribs = pRawAttribs; + vw = VIMMED1(1.f); + } + + vw->setName("w"); + + ubyte semantic_name = swr_fs->info.base.input_semantic_name[attrib]; + ubyte semantic_idx = swr_fs->info.base.input_semantic_index[attrib]; + + if (semantic_name == TGSI_SEMANTIC_FACE) { + Value *ff = + UI_TO_FP(LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), mFP32Ty); + ff = FSUB(FMUL(ff, C(2.0f)), C(1.0f)); + ff = VECTOR_SPLAT(JM()->mVWidth, ff, "vFrontFace"); + + inputs[attrib][0] = wrap(ff); + inputs[attrib][1] = wrap(VIMMED1(0.0f)); + inputs[attrib][2] = wrap(VIMMED1(0.0f)); + inputs[attrib][3] = wrap(VIMMED1(1.0f)); + continue; + } else if (semantic_name == TGSI_SEMANTIC_POSITION) { // gl_FragCoord + inputs[attrib][0] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vX, PixelPositions_center}, "vX")); + inputs[attrib][1] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vY, PixelPositions_center}, "vY")); + inputs[attrib][2] = wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vZ}, "vZ")); + inputs[attrib][3] = + wrap(LOAD(pPS, {0, SWR_PS_CONTEXT_vOneOverW, PixelPositions_center}, "vOneOverW")); + continue; + } else if (semantic_name == TGSI_SEMANTIC_PRIMID) { + Value *primID = LOAD(pPS, {0, SWR_PS_CONTEXT_primID}, "primID"); + inputs[attrib][0] = wrap(VECTOR_SPLAT(JM()->mVWidth, primID)); + inputs[attrib][1] = wrap(VIMMED1(0)); + inputs[attrib][2] = wrap(VIMMED1(0)); + inputs[attrib][3] = wrap(VIMMED1(0)); + continue; + } + + unsigned linkedAttrib = + locate_linkage(semantic_name, semantic_idx, &ctx->vs->info.base); + if (linkedAttrib == 0xFFFFFFFF) { + // not found - check for point sprite + if (ctx->rasterizer->sprite_coord_enable) { + linkedAttrib = ctx->vs->info.base.num_outputs - 1; + swr_fs->pointSpriteMask |= (1 << linkedAttrib); + } else { + fprintf(stderr, + "Missing %s[%d]\n", + tgsi_semantic_names[semantic_name], + semantic_idx); + assert(0 && "attribute linkage not found"); + } + } + + if (interpMode == TGSI_INTERPOLATE_CONSTANT) { + swr_fs->constantMask |= 1 << linkedAttrib; + } + + for (int channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (mask & (1 << channel)) { + Value *indexA = C(linkedAttrib * 12 + channel); + Value *indexB = C(linkedAttrib * 12 + channel + 4); + Value *indexC = C(linkedAttrib * 12 + channel + 8); + + if ((semantic_name == TGSI_SEMANTIC_COLOR) + && ctx->rasterizer->light_twoside) { + unsigned bcolorAttrib = locate_linkage( + TGSI_SEMANTIC_BCOLOR, semantic_idx, &ctx->vs->info.base); + + unsigned diff = 12 * (bcolorAttrib - linkedAttrib); + + Value *back = + XOR(C(1), LOAD(pPS, {0, SWR_PS_CONTEXT_frontFace}), "backFace"); + + Value *offset = MUL(back, C(diff)); + offset->setName("offset"); + + indexA = ADD(indexA, offset); + indexB = ADD(indexB, offset); + indexC = ADD(indexC, offset); + + if (interpMode == TGSI_INTERPOLATE_CONSTANT) { + swr_fs->constantMask |= 1 << bcolorAttrib; + } + } + + Value *va = VBROADCAST(LOAD(GEP(pAttribs, indexA))); + Value *vb = VBROADCAST(LOAD(GEP(pAttribs, indexB))); + Value *vc = VBROADCAST(LOAD(GEP(pAttribs, indexC))); + + if (interpMode == TGSI_INTERPOLATE_CONSTANT) { + inputs[attrib][channel] = wrap(va); + } else { + Value *vk = FSUB(FSUB(VIMMED1(1.0f), vi), 
vj); + + vc = FMUL(vk, vc); + + Value *interp = FMUL(va, vi); + Value *interp1 = FMUL(vb, vj); + interp = FADD(interp, interp1); + interp = FADD(interp, vc); + if (interpMode == TGSI_INTERPOLATE_PERSPECTIVE) + interp = FMUL(interp, vw); + inputs[attrib][channel] = wrap(interp); + } + } + } + } + + sampler = swr_sampler_soa_create(key.sampler); + + struct lp_bld_tgsi_system_values system_values; + memset(&system_values, 0, sizeof(system_values)); + + struct lp_build_mask_context mask; + + if (swr_fs->info.base.uses_kill) { + Value *mask_val = LOAD(pPS, {0, SWR_PS_CONTEXT_activeMask}, "activeMask"); + lp_build_mask_begin( + &mask, gallivm, lp_type_float_vec(32, 32 * 8), wrap(mask_val)); + } + + lp_build_tgsi_soa(gallivm, + swr_fs->pipe.tokens, + lp_type_float_vec(32, 32 * 8), + swr_fs->info.base.uses_kill ? &mask : NULL, // mask + wrap(consts_ptr), + wrap(const_sizes_ptr), + &system_values, + inputs, + outputs, + wrap(hPrivateData), + NULL, // thread data + sampler, // sampler + &swr_fs->info.base, + NULL); // geometry shader face + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + for (uint32_t attrib = 0; attrib < swr_fs->info.base.num_outputs; + attrib++) { + switch (swr_fs->info.base.output_semantic_name[attrib]) { + case TGSI_SEMANTIC_POSITION: { + // write z + LLVMValueRef outZ = + LLVMBuildLoad(gallivm->builder, outputs[attrib][2], ""); + STORE(unwrap(outZ), pPS, {0, SWR_PS_CONTEXT_vZ}); + break; + } + case TGSI_SEMANTIC_COLOR: { + for (uint32_t channel = 0; channel < TGSI_NUM_CHANNELS; channel++) { + if (!outputs[attrib][channel]) + continue; + + LLVMValueRef out = + LLVMBuildLoad(gallivm->builder, outputs[attrib][channel], ""); + if (swr_fs->info.base.properties[TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS]) { + for (uint32_t rt = 0; rt < key.nr_cbufs; rt++) { + STORE(unwrap(out), + pPS, + {0, SWR_PS_CONTEXT_shaded, rt, channel}); + } + } else { + STORE(unwrap(out), + pPS, + {0, + SWR_PS_CONTEXT_shaded, + swr_fs->info.base.output_semantic_index[attrib], + channel}); + } + } + break; + } + default: { + fprintf(stderr, + "unknown output from FS %s[%d]\n", + tgsi_semantic_names[swr_fs->info.base + .output_semantic_name[attrib]], + swr_fs->info.base.output_semantic_index[attrib]); + break; + } + } + } + + LLVMValueRef mask_result = 0; + if (swr_fs->info.base.uses_kill) { + mask_result = lp_build_mask_end(&mask); + } + + IRB()->SetInsertPoint(unwrap(LLVMGetInsertBlock(gallivm->builder))); + + if (swr_fs->info.base.uses_kill) { + STORE(unwrap(mask_result), pPS, {0, SWR_PS_CONTEXT_activeMask}); + } + + RET_VOID(); + + gallivm_verify_function(gallivm, wrap(pFunction)); + + gallivm_compile_module(gallivm); + + PFN_PIXEL_KERNEL kernel = + (PFN_PIXEL_KERNEL)gallivm_jit_function(gallivm, wrap(pFunction)); + debug_printf("frag shader %p\n", kernel); + assert(kernel && "Error: FragShader = NULL"); + +#if (LLVM_VERSION_MAJOR == 3) && (LLVM_VERSION_MINOR >= 5) + JM()->mIsModuleFinalized = true; +#endif + + return kernel; +} + +PFN_PIXEL_KERNEL +swr_compile_fs(struct swr_context *ctx, swr_jit_key &key) +{ + BuilderSWR builder( + reinterpret_cast<JitManager *>(swr_screen(ctx->pipe.screen)->hJitMgr)); + return builder.CompileFS(ctx, key); +} diff --git a/src/gallium/drivers/swr/swr_shader.h b/src/gallium/drivers/swr/swr_shader.h new file mode 100644 index 00000000000..e22a7c48c2a --- /dev/null +++ b/src/gallium/drivers/swr/swr_shader.h @@ -0,0 +1,60 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. 
All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#pragma once + +/* forward-declared as structs to match their definitions */ +struct swr_vertex_shader; +struct swr_fragment_shader; +struct swr_jit_key; + +PFN_VERTEX_FUNC +swr_compile_vs(struct pipe_context *ctx, swr_vertex_shader *swr_vs); + +PFN_PIXEL_KERNEL +swr_compile_fs(struct swr_context *ctx, swr_jit_key &key); + +void swr_generate_fs_key(struct swr_jit_key &key, + struct swr_context *ctx, + swr_fragment_shader *swr_fs); + +struct swr_jit_key { + unsigned nr_cbufs; + unsigned light_twoside; + ubyte vs_output_semantic_name[PIPE_MAX_SHADER_OUTPUTS]; + ubyte vs_output_semantic_idx[PIPE_MAX_SHADER_OUTPUTS]; + unsigned nr_samplers; + unsigned nr_sampler_views; + struct swr_sampler_static_state sampler[PIPE_MAX_SHADER_SAMPLER_VIEWS]; +}; + +namespace std +{ +template <> struct hash<swr_jit_key> { + std::size_t operator()(const swr_jit_key &k) const + { + return util_hash_crc32(&k, sizeof(k)); + } +}; +} + +bool operator==(const swr_jit_key &lhs, const swr_jit_key &rhs);
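Because operator== is a raw memcmp and the hash is a CRC over the key's raw bytes, any padding in swr_jit_key has to be zeroed before use. A minimal sketch of the caching pattern this header enables (illustrative; swr_update_derived below does exactly this with ctx->fs->map):

   #include <unordered_map>

   std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> cache;

   swr_jit_key key;
   memset(&key, 0, sizeof(key));      /* zero padding for memcmp/crc32 */
   swr_generate_fs_key(key, ctx, ctx->fs);

   PFN_PIXEL_KERNEL func;
   auto it = cache.find(key);         /* std::hash<swr_jit_key> + operator== */
   if (it != cache.end()) {
      func = it->second;              /* cache hit: reuse the JITted kernel */
   } else {
      func = swr_compile_fs(ctx, key);
      cache.insert(std::make_pair(key, func));
   }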
diff --git a/src/gallium/drivers/swr/swr_state.cpp b/src/gallium/drivers/swr/swr_state.cpp new file mode 100644 index 00000000000..47ee3cb2664 --- /dev/null +++ b/src/gallium/drivers/swr/swr_state.cpp @@ -0,0 +1,1437 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#include "common/os.h" +#include "jit_api.h" +#include "JitManager.h" +#include "state_llvm.h" + +#include "gallivm/lp_bld_tgsi.h" +#include "util/u_format.h" + +#include "util/u_memory.h" +#include "util/u_inlines.h" +#include "util/u_helpers.h" +#include "util/u_framebuffer.h" + +#include "swr_state.h" +#include "swr_context.h" +#include "swr_context_llvm.h" +#include "swr_screen.h" +#include "swr_resource.h" +#include "swr_tex_sample.h" +#include "swr_scratch.h" +#include "swr_shader.h" +#include "swr_fence.h" + +/* These should be pulled out into separate files as necessary. + * Just initializing everything here to get going. */ + +static void * +swr_create_blend_state(struct pipe_context *pipe, + const struct pipe_blend_state *blend) +{ + struct swr_blend_state *state = CALLOC_STRUCT(swr_blend_state); + + memcpy(&state->pipe, blend, sizeof(*blend)); + + struct pipe_blend_state *pipe_blend = &state->pipe; + + for (int target = 0; + target < std::min(SWR_NUM_RENDERTARGETS, PIPE_MAX_COLOR_BUFS); + target++) { + + struct pipe_rt_blend_state *rt_blend = &pipe_blend->rt[target]; + SWR_RENDER_TARGET_BLEND_STATE &blendState = + state->blendState.renderTarget[target]; + RENDER_TARGET_BLEND_COMPILE_STATE &compileState = + state->compileState[target]; + + if (target != 0 && !pipe_blend->independent_blend_enable) { + memcpy(&compileState, + &state->compileState[0], + sizeof(RENDER_TARGET_BLEND_COMPILE_STATE)); + continue; + } + + compileState.blendEnable = rt_blend->blend_enable; + if (compileState.blendEnable) { + compileState.sourceAlphaBlendFactor = + swr_convert_blend_factor(rt_blend->alpha_src_factor); + compileState.destAlphaBlendFactor = + swr_convert_blend_factor(rt_blend->alpha_dst_factor); + compileState.sourceBlendFactor = + swr_convert_blend_factor(rt_blend->rgb_src_factor); + compileState.destBlendFactor = + swr_convert_blend_factor(rt_blend->rgb_dst_factor); + + compileState.colorBlendFunc = + swr_convert_blend_func(rt_blend->rgb_func); + compileState.alphaBlendFunc = + swr_convert_blend_func(rt_blend->alpha_func); + } + compileState.logicOpEnable = state->pipe.logicop_enable; + if (compileState.logicOpEnable) { + compileState.logicOpFunc = + swr_convert_logic_op(state->pipe.logicop_func); + } + + blendState.writeDisableRed = + (rt_blend->colormask & PIPE_MASK_R) ? 0 : 1; + blendState.writeDisableGreen = + (rt_blend->colormask & PIPE_MASK_G) ? 0 : 1; + blendState.writeDisableBlue = + (rt_blend->colormask & PIPE_MASK_B) ? 0 : 1; + blendState.writeDisableAlpha = + (rt_blend->colormask & PIPE_MASK_A) ? 0 : 1; + + if (rt_blend->colormask == 0) + compileState.blendEnable = false; + } + + return state; +}
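A quick illustration of the colormask-to-write-disable inversion above (hypothetical mask value):

   /* colormask = PIPE_MASK_R | PIPE_MASK_G
    *   -> writeDisableRed = 0, writeDisableGreen = 0,
    *      writeDisableBlue = 1, writeDisableAlpha = 1
    * and with independent_blend_enable == 0, target 0's compile state
    * is replicated verbatim to every other render target. */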
+ +static void +swr_bind_blend_state(struct pipe_context *pipe, void *blend) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->blend == blend) + return; + + ctx->blend = (swr_blend_state *)blend; + + ctx->dirty |= SWR_NEW_BLEND; +} + +static void +swr_delete_blend_state(struct pipe_context *pipe, void *blend) +{ + FREE(blend); +} + +static void +swr_set_blend_color(struct pipe_context *pipe, + const struct pipe_blend_color *color) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->blend_color = *color; + + ctx->dirty |= SWR_NEW_BLEND; +} + +static void +swr_set_stencil_ref(struct pipe_context *pipe, + const struct pipe_stencil_ref *ref) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->stencil_ref = *ref; + + ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; +} + +static void * +swr_create_depth_stencil_state( + struct pipe_context *pipe, + const struct pipe_depth_stencil_alpha_state *depth_stencil) +{ + struct pipe_depth_stencil_alpha_state *state; + + state = (pipe_depth_stencil_alpha_state *)mem_dup(depth_stencil, + sizeof *depth_stencil); + + return state; +} + +static void +swr_bind_depth_stencil_state(struct pipe_context *pipe, void *depth_stencil) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->depth_stencil == (pipe_depth_stencil_alpha_state *)depth_stencil) + return; + + ctx->depth_stencil = (pipe_depth_stencil_alpha_state *)depth_stencil; + + ctx->dirty |= SWR_NEW_DEPTH_STENCIL_ALPHA; +} + +static void +swr_delete_depth_stencil_state(struct pipe_context *pipe, void *depth) +{ + FREE(depth); +} + + +static void * +swr_create_rasterizer_state(struct pipe_context *pipe, + const struct pipe_rasterizer_state *rast) +{ + struct pipe_rasterizer_state *state; + state = (pipe_rasterizer_state *)mem_dup(rast, sizeof *rast); + + return state; +} + +static void +swr_bind_rasterizer_state(struct pipe_context *pipe, void *handle) +{ + struct swr_context *ctx = swr_context(pipe); + const struct pipe_rasterizer_state *rasterizer = + (const struct pipe_rasterizer_state *)handle; + + if (ctx->rasterizer == (pipe_rasterizer_state *)rasterizer) + return; + + ctx->rasterizer = (pipe_rasterizer_state *)rasterizer; + + ctx->dirty |= SWR_NEW_RASTERIZER; +} + +static void +swr_delete_rasterizer_state(struct pipe_context *pipe, void *rasterizer) +{ + FREE(rasterizer); +} + + +static void * +swr_create_sampler_state(struct pipe_context *pipe, + const struct pipe_sampler_state *sampler) +{ + struct pipe_sampler_state *state = + (pipe_sampler_state *)mem_dup(sampler, sizeof *sampler); + + return state; +} + +static void +swr_bind_sampler_states(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + void **samplers) +{ + struct swr_context *ctx = swr_context(pipe); + unsigned i; + + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(ctx->samplers[shader])); + + /* set the new samplers */ + ctx->num_samplers[shader] = num; + for (i = 0; i < num; i++) { + ctx->samplers[shader][start + i] = (pipe_sampler_state *)samplers[i]; + } + + ctx->dirty |= SWR_NEW_SAMPLER; +} + +static void +swr_delete_sampler_state(struct pipe_context *pipe, void *sampler) +{ + FREE(sampler); +} + + +static struct pipe_sampler_view * +swr_create_sampler_view(struct pipe_context *pipe, + struct pipe_resource *texture, + const struct pipe_sampler_view *templ) +{ + struct pipe_sampler_view *view = CALLOC_STRUCT(pipe_sampler_view); + + if (view) { + *view =
*templ; + view->reference.count = 1; + view->texture = NULL; + pipe_resource_reference(&view->texture, texture); + view->context = pipe; + } + + return view; +} + +static void +swr_set_sampler_views(struct pipe_context *pipe, + unsigned shader, + unsigned start, + unsigned num, + struct pipe_sampler_view **views) +{ + struct swr_context *ctx = swr_context(pipe); + uint i; + + assert(num <= PIPE_MAX_SHADER_SAMPLER_VIEWS); + + assert(shader < PIPE_SHADER_TYPES); + assert(start + num <= Elements(ctx->sampler_views[shader])); + + /* set the new sampler views */ + ctx->num_sampler_views[shader] = num; + for (i = 0; i < num; i++) { + /* Note: we're using pipe_sampler_view_release() here to work around + * a possible crash when the old view belongs to another context that + * was already destroyed. + */ + pipe_sampler_view_release(pipe, &ctx->sampler_views[shader][start + i]); + pipe_sampler_view_reference(&ctx->sampler_views[shader][start + i], + views[i]); + } + + ctx->dirty |= SWR_NEW_SAMPLER_VIEW; +} + +static void +swr_sampler_view_destroy(struct pipe_context *pipe, + struct pipe_sampler_view *view) +{ + pipe_resource_reference(&view->texture, NULL); + FREE(view); +} + +static void * +swr_create_vs_state(struct pipe_context *pipe, + const struct pipe_shader_state *vs) +{ + struct swr_vertex_shader *swr_vs = + (swr_vertex_shader *)CALLOC_STRUCT(swr_vertex_shader); + if (!swr_vs) + return NULL; + + swr_vs->pipe.tokens = tgsi_dup_tokens(vs->tokens); + swr_vs->pipe.stream_output = vs->stream_output; + + lp_build_tgsi_info(vs->tokens, &swr_vs->info); + + swr_vs->func = swr_compile_vs(pipe, swr_vs); + + swr_vs->soState = {0}; + + if (swr_vs->pipe.stream_output.num_outputs) { + pipe_stream_output_info *stream_output = &swr_vs->pipe.stream_output; + + swr_vs->soState.soEnable = true; + // soState.rasterizerDisable set on state dirty + // soState.streamToRasterizer not used + + for (uint32_t i = 0; i < stream_output->num_outputs; i++) { + swr_vs->soState.streamMasks[stream_output->output[i].stream] |= + 1 << (stream_output->output[i].register_index - 1); + } + for (uint32_t i = 0; i < MAX_SO_STREAMS; i++) { + swr_vs->soState.streamNumEntries[i] = + _mm_popcnt_u32(swr_vs->soState.streamMasks[i]); + } + } + + return swr_vs; +} + +static void +swr_bind_vs_state(struct pipe_context *pipe, void *vs) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->vs == vs) + return; + + ctx->vs = (swr_vertex_shader *)vs; + ctx->dirty |= SWR_NEW_VS; +} + +static void +swr_delete_vs_state(struct pipe_context *pipe, void *vs) +{ + struct swr_vertex_shader *swr_vs = (swr_vertex_shader *)vs; + FREE((void *)swr_vs->pipe.tokens); + FREE(vs); +} + +static void * +swr_create_fs_state(struct pipe_context *pipe, + const struct pipe_shader_state *fs) +{ + struct swr_fragment_shader *swr_fs = new swr_fragment_shader; + if (!swr_fs) + return NULL; + + swr_fs->pipe.tokens = tgsi_dup_tokens(fs->tokens); + + lp_build_tgsi_info(fs->tokens, &swr_fs->info); + + return swr_fs; +} + + +static void +swr_bind_fs_state(struct pipe_context *pipe, void *fs) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ctx->fs == fs) + return; + + ctx->fs = (swr_fragment_shader *)fs; + ctx->dirty |= SWR_NEW_FS; +} + +static void +swr_delete_fs_state(struct pipe_context *pipe, void *fs) +{ + struct swr_fragment_shader *swr_fs = (swr_fragment_shader *)fs; + FREE((void *)swr_fs->pipe.tokens); + delete swr_fs; +} + + +static void +swr_set_constant_buffer(struct pipe_context *pipe, + uint shader, + uint index, + struct pipe_constant_buffer 
*cb) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_resource *constants = cb ? cb->buffer : NULL; + + assert(shader < PIPE_SHADER_TYPES); + assert(index < Elements(ctx->constants[shader])); + + /* note: reference counting */ + util_copy_constant_buffer(&ctx->constants[shader][index], cb); + + if (shader == PIPE_SHADER_VERTEX || shader == PIPE_SHADER_GEOMETRY) { + ctx->dirty |= SWR_NEW_VSCONSTANTS; + } else if (shader == PIPE_SHADER_FRAGMENT) { + ctx->dirty |= SWR_NEW_FSCONSTANTS; + } + + if (cb && cb->user_buffer) { + pipe_resource_reference(&constants, NULL); + } +} + + +static void * +swr_create_vertex_elements_state(struct pipe_context *pipe, + unsigned num_elements, + const struct pipe_vertex_element *attribs) +{ + struct swr_vertex_element_state *velems; + assert(num_elements <= PIPE_MAX_ATTRIBS); + velems = CALLOC_STRUCT(swr_vertex_element_state); + if (velems) { + velems->fsState.numAttribs = num_elements; + for (unsigned i = 0; i < num_elements; i++) { + // XXX: we should do this keyed on the VS usage info + + const struct util_format_description *desc = + util_format_description(attribs[i].src_format); + + velems->fsState.layout[i].AlignedByteOffset = attribs[i].src_offset; + velems->fsState.layout[i].Format = + mesa_to_swr_format(attribs[i].src_format); + velems->fsState.layout[i].StreamIndex = + attribs[i].vertex_buffer_index; + velems->fsState.layout[i].InstanceEnable = + attribs[i].instance_divisor != 0; + velems->fsState.layout[i].ComponentControl0 = + desc->channel[0].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store0; + velems->fsState.layout[i].ComponentControl1 = + desc->channel[1].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store0; + velems->fsState.layout[i].ComponentControl2 = + desc->channel[2].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store0; + velems->fsState.layout[i].ComponentControl3 = + desc->channel[3].type != UTIL_FORMAT_TYPE_VOID + ? ComponentControl::StoreSrc + : ComponentControl::Store1Fp; + velems->fsState.layout[i].ComponentPacking = ComponentEnable::XYZW; + velems->fsState.layout[i].InstanceDataStepRate = + attribs[i].instance_divisor; + + /* Calculate the pitch of each stream */ + const SWR_FORMAT_INFO &swr_desc = GetFormatInfo( + mesa_to_swr_format(attribs[i].src_format)); + velems->stream_pitch[attribs[i].vertex_buffer_index] += swr_desc.Bpp; + } + } + + return velems; +} + +static void +swr_bind_vertex_elements_state(struct pipe_context *pipe, void *velems) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_vertex_element_state *swr_velems = + (struct swr_vertex_element_state *)velems; + + ctx->velems = swr_velems; + ctx->dirty |= SWR_NEW_VERTEX; +} + +static void +swr_delete_vertex_elements_state(struct pipe_context *pipe, void *velems) +{ + /* XXX Need to destroy fetch shader? 
*/ + FREE(velems); +} + + +static void +swr_set_vertex_buffers(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_elements, + const struct pipe_vertex_buffer *buffers) +{ + struct swr_context *ctx = swr_context(pipe); + + assert(num_elements <= PIPE_MAX_ATTRIBS); + + util_set_vertex_buffers_count(ctx->vertex_buffer, + &ctx->num_vertex_buffers, + buffers, + start_slot, + num_elements); + + ctx->dirty |= SWR_NEW_VERTEX; +} + + +static void +swr_set_index_buffer(struct pipe_context *pipe, + const struct pipe_index_buffer *ib) +{ + struct swr_context *ctx = swr_context(pipe); + + if (ib) + memcpy(&ctx->index_buffer, ib, sizeof(ctx->index_buffer)); + else + memset(&ctx->index_buffer, 0, sizeof(ctx->index_buffer)); + + ctx->dirty |= SWR_NEW_VERTEX; +} + +static void +swr_set_polygon_stipple(struct pipe_context *pipe, + const struct pipe_poly_stipple *stipple) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->poly_stipple = *stipple; /* struct copy */ + ctx->dirty |= SWR_NEW_STIPPLE; +} + +static void +swr_set_clip_state(struct pipe_context *pipe, + const struct pipe_clip_state *clip) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->clip = *clip; + /* XXX Unimplemented, but prevents crash */ + + ctx->dirty |= SWR_NEW_CLIP; +} + + +static void +swr_set_scissor_states(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_scissor_state *scissor) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->scissor = *scissor; + ctx->dirty |= SWR_NEW_SCISSOR; +} + +static void +swr_set_viewport_states(struct pipe_context *pipe, + unsigned start_slot, + unsigned num_viewports, + const struct pipe_viewport_state *vpt) +{ + struct swr_context *ctx = swr_context(pipe); + + ctx->viewport = *vpt; + ctx->dirty |= SWR_NEW_VIEWPORT; +} + + +static void +swr_set_framebuffer_state(struct pipe_context *pipe, + const struct pipe_framebuffer_state *fb) +{ + struct swr_context *ctx = swr_context(pipe); + + boolean changed = !util_framebuffer_state_equal(&ctx->framebuffer, fb); + + assert(fb->width <= KNOB_GUARDBAND_WIDTH); + assert(fb->height <= KNOB_GUARDBAND_HEIGHT); + + if (changed) { + unsigned i; + for (i = 0; i < fb->nr_cbufs; ++i) + pipe_surface_reference(&ctx->framebuffer.cbufs[i], fb->cbufs[i]); + for (; i < ctx->framebuffer.nr_cbufs; ++i) + pipe_surface_reference(&ctx->framebuffer.cbufs[i], NULL); + + ctx->framebuffer.nr_cbufs = fb->nr_cbufs; + + ctx->framebuffer.width = fb->width; + ctx->framebuffer.height = fb->height; + + pipe_surface_reference(&ctx->framebuffer.zsbuf, fb->zsbuf); + + ctx->dirty |= SWR_NEW_FRAMEBUFFER; + } +} + + +static void +swr_set_sample_mask(struct pipe_context *pipe, unsigned sample_mask) +{ + struct swr_context *ctx = swr_context(pipe); + + if (sample_mask != ctx->sample_mask) { + ctx->sample_mask = sample_mask; + ctx->dirty |= SWR_NEW_RASTERIZER; + } +} + +/* + * Update resource in-use status + * All resources bound to color or depth targets marked as WRITE resources. + * VBO Vertex/index buffers and texture views marked as READ resources. 
+ */ +void +swr_update_resource_status(struct pipe_context *pipe, + const struct pipe_draw_info *p_draw_info) +{ + struct swr_context *ctx = swr_context(pipe); + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + + /* colorbuffer targets */ + if (fb->nr_cbufs) + for (uint32_t i = 0; i < fb->nr_cbufs; ++i) + if (fb->cbufs[i]) + swr_resource_write(pipe, swr_resource(fb->cbufs[i]->texture)); + + /* depth/stencil target */ + if (fb->zsbuf) + swr_resource_write(pipe, swr_resource(fb->zsbuf->texture)); + + /* VBO vertex buffers */ + for (uint32_t i = 0; i < ctx->num_vertex_buffers; i++) { + struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; + if (!vb->user_buffer) + swr_resource_read(pipe, swr_resource(vb->buffer)); + } + + /* VBO index buffer */ + if (p_draw_info && p_draw_info->indexed) { + struct pipe_index_buffer *ib = &ctx->index_buffer; + if (!ib->user_buffer) + swr_resource_read(pipe, swr_resource(ib->buffer)); + } + + /* texture sampler views */ + for (uint32_t i = 0; i < PIPE_MAX_SHADER_SAMPLER_VIEWS; i++) { + struct pipe_sampler_view *view = + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; + if (view) + swr_resource_read(pipe, swr_resource(view->texture)); + } +} + +void +swr_update_derived(struct pipe_context *pipe, + const struct pipe_draw_info *p_draw_info) +{ + struct swr_context *ctx = swr_context(pipe); + struct swr_screen *screen = swr_screen(ctx->pipe.screen); + + /* Any state that requires dirty flags to be re-triggered sets this mask */ + /* For example, user_buffer vertex and index buffers. */ + unsigned post_update_dirty_flags = 0; + + /* Render Targets */ + if (ctx->dirty & SWR_NEW_FRAMEBUFFER) { + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + SWR_SURFACE_STATE *new_attachment[SWR_NUM_ATTACHMENTS] = {0}; + UINT i; + + /* colorbuffer targets */ + if (fb->nr_cbufs) + for (i = 0; i < fb->nr_cbufs; ++i) + if (fb->cbufs[i]) { + struct swr_resource *colorBuffer = + swr_resource(fb->cbufs[i]->texture); + new_attachment[SWR_ATTACHMENT_COLOR0 + i] = &colorBuffer->swr; + } + + /* depth/stencil target */ + if (fb->zsbuf) { + struct swr_resource *depthStencilBuffer = + swr_resource(fb->zsbuf->texture); + if (depthStencilBuffer->has_depth) { + new_attachment[SWR_ATTACHMENT_DEPTH] = &depthStencilBuffer->swr; + + if (depthStencilBuffer->has_stencil) + new_attachment[SWR_ATTACHMENT_STENCIL] = + &depthStencilBuffer->secondary; + + } else if (depthStencilBuffer->has_stencil) + new_attachment[SWR_ATTACHMENT_STENCIL] = &depthStencilBuffer->swr; + } + + /* Make the attachment updates */ + swr_draw_context *pDC = &ctx->swrDC; + SWR_SURFACE_STATE *renderTargets = pDC->renderTargets; + unsigned need_fence = FALSE; + for (i = 0; i < SWR_NUM_ATTACHMENTS; i++) { + void *new_base = nullptr; + if (new_attachment[i]) + new_base = new_attachment[i]->pBaseAddress; + + /* StoreTile for changed target */ + if (renderTargets[i].pBaseAddress != new_base) { + if (renderTargets[i].pBaseAddress) { + /* If changing attachment to a new target, mark tiles as + * INVALID so they are reloaded from surface. + * If detaching attachment, mark tiles as RESOLVED so core + * won't try to load from non-existent target. */ + enum SWR_TILE_STATE post_state = (new_attachment[i] + ? 
SWR_TILE_INVALID : SWR_TILE_RESOLVED); + swr_store_render_target(pipe, i, post_state); + + need_fence |= TRUE; + } + + /* Make new attachment */ + if (new_attachment[i]) + renderTargets[i] = *new_attachment[i]; + else + if (renderTargets[i].pBaseAddress) + renderTargets[i] = {0}; + } + } + + /* This fence ensures any attachment changes are resolved before the + * next draw */ + if (need_fence) + swr_fence_submit(ctx, screen->flush_fence); + } + + /* Raster state */ + if (ctx->dirty & (SWR_NEW_RASTERIZER | SWR_NEW_FRAMEBUFFER)) { + pipe_rasterizer_state *rasterizer = ctx->rasterizer; + pipe_framebuffer_state *fb = &ctx->framebuffer; + + SWR_RASTSTATE *rastState = &ctx->derived.rastState; + rastState->cullMode = swr_convert_cull_mode(rasterizer->cull_face); + rastState->frontWinding = rasterizer->front_ccw + ? SWR_FRONTWINDING_CCW + : SWR_FRONTWINDING_CW; + rastState->scissorEnable = rasterizer->scissor; + rastState->pointSize = rasterizer->point_size > 0.0f + ? rasterizer->point_size + : 1.0f; + rastState->lineWidth = rasterizer->line_width > 0.0f + ? rasterizer->line_width + : 1.0f; + + rastState->pointParam = rasterizer->point_size_per_vertex; + + rastState->pointSpriteEnable = rasterizer->sprite_coord_enable; + rastState->pointSpriteTopOrigin = + rasterizer->sprite_coord_mode == PIPE_SPRITE_COORD_UPPER_LEFT; + + /* XXX TODO: Add multisample */ + rastState->msaaRastEnable = false; + rastState->rastMode = SWR_MSAA_RASTMODE_OFF_PIXEL; + rastState->sampleCount = SWR_MULTISAMPLE_1X; + rastState->bForcedSampleCount = false; + + bool do_offset = false; + switch (rasterizer->fill_front) { + case PIPE_POLYGON_MODE_FILL: + do_offset = rasterizer->offset_tri; + break; + case PIPE_POLYGON_MODE_LINE: + do_offset = rasterizer->offset_line; + break; + case PIPE_POLYGON_MODE_POINT: + do_offset = rasterizer->offset_point; + break; + } + + if (do_offset) { + rastState->depthBias = rasterizer->offset_units; + rastState->slopeScaledDepthBias = rasterizer->offset_scale; + rastState->depthBiasClamp = rasterizer->offset_clamp; + } else { + rastState->depthBias = 0; + rastState->slopeScaledDepthBias = 0; + rastState->depthBiasClamp = 0; + } + struct pipe_surface *zb = fb->zsbuf; + if (zb && swr_resource(zb->texture)->has_depth) + rastState->depthFormat = swr_resource(zb->texture)->swr.format; + + rastState->depthClipEnable = rasterizer->depth_clip; + + SwrSetRastState(ctx->swrContext, rastState); + } + + /* Scissor */ + if (ctx->dirty & SWR_NEW_SCISSOR) { + pipe_scissor_state *scissor = &ctx->scissor; + BBOX bbox(scissor->miny, scissor->maxy, + scissor->minx, scissor->maxx); + SwrSetScissorRects(ctx->swrContext, 1, &bbox); + } + + /* Viewport */ + if (ctx->dirty & (SWR_NEW_VIEWPORT | SWR_NEW_FRAMEBUFFER + | SWR_NEW_RASTERIZER)) { + pipe_viewport_state *state = &ctx->viewport; + pipe_framebuffer_state *fb = &ctx->framebuffer; + pipe_rasterizer_state *rasterizer = ctx->rasterizer; + + SWR_VIEWPORT *vp = &ctx->derived.vp; + SWR_VIEWPORT_MATRIX *vpm = &ctx->derived.vpm; + + vp->x = state->translate[0] - state->scale[0]; + vp->width = state->translate[0] + state->scale[0]; + vp->y = state->translate[1] - fabs(state->scale[1]); + vp->height = state->translate[1] + fabs(state->scale[1]); + if (rasterizer->clip_halfz == 0) { + vp->minZ = state->translate[2] - state->scale[2]; + vp->maxZ = state->translate[2] + state->scale[2]; + } else { + vp->minZ = state->translate[2]; + vp->maxZ = state->translate[2] + state->scale[2]; + } + + vpm->m00 = state->scale[0]; + vpm->m11 = state->scale[1]; + vpm->m22 = state->scale[2]; + 
vpm->m30 = state->translate[0]; + vpm->m31 = state->translate[1]; + vpm->m32 = state->translate[2]; + + /* Now that the matrix is calculated, clip the view coords to screen + * size. OpenGL allows for -ve x,y in the viewport. */ + vp->x = std::max(vp->x, 0.0f); + vp->y = std::max(vp->y, 0.0f); + vp->width = std::min(vp->width, (float)fb->width); + vp->height = std::min(vp->height, (float)fb->height); + + SwrSetViewports(ctx->swrContext, 1, vp, vpm); + } + + /* Set vertex & index buffers */ + /* (using draw info if called by swr_draw_vbo) */ + if (ctx->dirty & SWR_NEW_VERTEX) { + uint32_t size, pitch, max_vertex, partial_inbounds; + const uint8_t *p_data; + + /* If being called by swr_draw_vbo, copy draw details */ + struct pipe_draw_info info = {0}; + if (p_draw_info) + info = *p_draw_info; + + /* vertex buffers */ + SWR_VERTEX_BUFFER_STATE swrVertexBuffers[PIPE_MAX_ATTRIBS]; + for (UINT i = 0; i < ctx->num_vertex_buffers; i++) { + struct pipe_vertex_buffer *vb = &ctx->vertex_buffer[i]; + + pitch = vb->stride; + if (!vb->user_buffer) { + /* VBO + * size is based on buffer->width0 rather than info.max_index + * to prevent having to validate VBO on each draw */ + size = vb->buffer->width0; + max_vertex = size / pitch; + partial_inbounds = size % pitch; + + p_data = (const uint8_t *)swr_resource_data(vb->buffer) + + vb->buffer_offset; + } else { + /* Client buffer + * client memory is one-time use, re-trigger SWR_NEW_VERTEX to + * revalidate on each draw */ + post_update_dirty_flags |= SWR_NEW_VERTEX; + + if (pitch) { + size = (info.max_index - info.min_index + 1) * pitch; + } else { + /* pitch = 0, means constant value + * set size to 1 vertex */ + size = ctx->velems->stream_pitch[i]; + } + + max_vertex = info.max_index + 1; + partial_inbounds = 0; + + /* Copy only needed vertices to scratch space */ + size = AlignUp(size, 4); + const void *ptr = (const uint8_t *) vb->user_buffer + + info.min_index * pitch; + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->vertex_buffer, ptr, size); + p_data = (const uint8_t *)ptr - info.min_index * pitch; + } + + swrVertexBuffers[i] = {0}; + swrVertexBuffers[i].index = i; + swrVertexBuffers[i].pitch = pitch; + swrVertexBuffers[i].pData = p_data; + swrVertexBuffers[i].size = size; + swrVertexBuffers[i].maxVertex = max_vertex; + swrVertexBuffers[i].partialInboundsSize = partial_inbounds; + } + + SwrSetVertexBuffers( + ctx->swrContext, ctx->num_vertex_buffers, swrVertexBuffers); + + /* index buffer, if required (info passed in by swr_draw_vbo) */ + SWR_FORMAT index_type = R32_UINT; /* Default for non-indexed draws */ + if (info.indexed) { + struct pipe_index_buffer *ib = &ctx->index_buffer; + + pitch = ib->index_size ? 
ib->index_size : sizeof(uint32_t); + index_type = swr_convert_index_type(pitch); + + if (!ib->user_buffer) { + /* VBO + * size is based on buffer->width0 rather than info.count + * to prevent having to validate VBO on each draw */ + size = ib->buffer->width0; + p_data = + (const uint8_t *)swr_resource_data(ib->buffer) + ib->offset; + } else { + /* Client buffer + * client memory is one-time use, re-trigger SWR_NEW_VERTEX to + * revalidate on each draw */ + post_update_dirty_flags |= SWR_NEW_VERTEX; + + size = info.count * pitch; + size = AlignUp(size, 4); + + /* Copy indices to scratch space */ + const void *ptr = ib->user_buffer; + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->index_buffer, ptr, size); + p_data = (const uint8_t *)ptr; + } + + SWR_INDEX_BUFFER_STATE swrIndexBuffer; + swrIndexBuffer.format = swr_convert_index_type(ib->index_size); + swrIndexBuffer.pIndices = p_data; + swrIndexBuffer.size = size; + + SwrSetIndexBuffer(ctx->swrContext, &swrIndexBuffer); + } + + struct swr_vertex_element_state *velems = ctx->velems; + if (velems && velems->fsState.indexType != index_type) { + velems->fsFunc = NULL; + velems->fsState.indexType = index_type; + } + } + + /* VertexShader */ + if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_FRAMEBUFFER)) { + SwrSetVertexFunc(ctx->swrContext, ctx->vs->func); + } + + swr_jit_key key; + if (ctx->dirty & (SWR_NEW_FS | SWR_NEW_SAMPLER | SWR_NEW_SAMPLER_VIEW + | SWR_NEW_RASTERIZER | SWR_NEW_FRAMEBUFFER)) { + memset(&key, 0, sizeof(key)); + swr_generate_fs_key(key, ctx, ctx->fs); + auto search = ctx->fs->map.find(key); + PFN_PIXEL_KERNEL func; + if (search != ctx->fs->map.end()) { + func = search->second; + } else { + func = swr_compile_fs(ctx, key); + ctx->fs->map.insert(std::make_pair(key, func)); + } + SWR_PS_STATE psState = {0}; + psState.pfnPixelShader = func; + psState.killsPixel = ctx->fs->info.base.uses_kill; + psState.inputCoverage = SWR_INPUT_COVERAGE_NORMAL; + psState.writesODepth = ctx->fs->info.base.writes_z; + psState.usesSourceDepth = ctx->fs->info.base.reads_z; + psState.shadingRate = SWR_SHADING_RATE_PIXEL; // XXX + psState.numRenderTargets = ctx->framebuffer.nr_cbufs; + psState.posOffset = SWR_PS_POSITION_SAMPLE_NONE; // XXX msaa + uint32_t barycentricsMask = 0; +#if 0 + // when we switch to mesa-master + if (ctx->fs->info.base.uses_persp_center || + ctx->fs->info.base.uses_linear_center) + barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK; + if (ctx->fs->info.base.uses_persp_centroid || + ctx->fs->info.base.uses_linear_centroid) + barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK; + if (ctx->fs->info.base.uses_persp_sample || + ctx->fs->info.base.uses_linear_sample) + barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK; +#else + for (unsigned i = 0; i < ctx->fs->info.base.num_inputs; i++) { + switch (ctx->fs->info.base.input_interpolate_loc[i]) { + case TGSI_INTERPOLATE_LOC_CENTER: + barycentricsMask |= SWR_BARYCENTRIC_PER_PIXEL_MASK; + break; + case TGSI_INTERPOLATE_LOC_CENTROID: + barycentricsMask |= SWR_BARYCENTRIC_CENTROID_MASK; + break; + case TGSI_INTERPOLATE_LOC_SAMPLE: + barycentricsMask |= SWR_BARYCENTRIC_PER_SAMPLE_MASK; + break; + } + } +#endif + psState.barycentricsMask = barycentricsMask; + psState.usesUAV = false; // XXX + psState.forceEarlyZ = false; + SwrSetPixelShaderState(ctx->swrContext, &psState); + } + + /* JIT sampler state */ + if (ctx->dirty & SWR_NEW_SAMPLER) { + swr_draw_context *pDC = &ctx->swrDC; + + for (unsigned i = 0; i < key.nr_samplers; i++) { + const struct pipe_sampler_state *sampler = + 
ctx->samplers[PIPE_SHADER_FRAGMENT][i]; + + if (sampler) { + pDC->samplersFS[i].min_lod = sampler->min_lod; + pDC->samplersFS[i].max_lod = sampler->max_lod; + pDC->samplersFS[i].lod_bias = sampler->lod_bias; + COPY_4V(pDC->samplersFS[i].border_color, sampler->border_color.f); + } + } + } + + /* JIT sampler view state */ + if (ctx->dirty & (SWR_NEW_SAMPLER_VIEW | SWR_NEW_FRAMEBUFFER)) { + swr_draw_context *pDC = &ctx->swrDC; + + for (unsigned i = 0; i < key.nr_sampler_views; i++) { + struct pipe_sampler_view *view = + ctx->sampler_views[PIPE_SHADER_FRAGMENT][i]; + + if (view) { + struct pipe_resource *res = view->texture; + struct swr_resource *swr_res = swr_resource(res); + struct swr_jit_texture *jit_tex = &pDC->texturesFS[i]; + memset(jit_tex, 0, sizeof(*jit_tex)); + jit_tex->width = res->width0; + jit_tex->height = res->height0; + jit_tex->depth = res->depth0; + jit_tex->first_level = view->u.tex.first_level; + jit_tex->last_level = view->u.tex.last_level; + jit_tex->base_ptr = swr_res->swr.pBaseAddress; + + for (unsigned level = jit_tex->first_level; + level <= jit_tex->last_level; + level++) { + jit_tex->row_stride[level] = swr_res->row_stride[level]; + jit_tex->img_stride[level] = swr_res->img_stride[level]; + jit_tex->mip_offsets[level] = swr_res->mip_offsets[level]; + } + } + } + } + + /* VertexShader Constants */ + if (ctx->dirty & SWR_NEW_VSCONSTANTS) { + swr_draw_context *pDC = &ctx->swrDC; + + for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { + const pipe_constant_buffer *cb = + &ctx->constants[PIPE_SHADER_VERTEX][i]; + pDC->num_constantsVS[i] = cb->buffer_size; + if (cb->buffer) + pDC->constantVS[i] = + (const float *)((const uint8_t *)cb->buffer + cb->buffer_offset); + else { + /* Need to copy these constants to scratch space */ + if (cb->user_buffer && cb->buffer_size) { + const void *ptr = + ((const uint8_t *)cb->user_buffer + cb->buffer_offset); + uint32_t size = AlignUp(cb->buffer_size, 4); + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->vs_constants, ptr, size); + pDC->constantVS[i] = (const float *)ptr; + } + } + } + } + + /* FragmentShader Constants */ + if (ctx->dirty & SWR_NEW_FSCONSTANTS) { + swr_draw_context *pDC = &ctx->swrDC; + + for (UINT i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) { + const pipe_constant_buffer *cb = + &ctx->constants[PIPE_SHADER_FRAGMENT][i]; + pDC->num_constantsFS[i] = cb->buffer_size; + if (cb->buffer) + pDC->constantFS[i] = + (const float *)((const uint8_t *)cb->buffer + cb->buffer_offset); + else { + /* Need to copy these constants to scratch space */ + if (cb->user_buffer && cb->buffer_size) { + const void *ptr = + ((const uint8_t *)cb->user_buffer + cb->buffer_offset); + uint32_t size = AlignUp(cb->buffer_size, 4); + ptr = swr_copy_to_scratch_space( + ctx, &ctx->scratch->fs_constants, ptr, size); + pDC->constantFS[i] = (const float *)ptr; + } + } + } + } + + /* Depth/stencil state */ + if (ctx->dirty & (SWR_NEW_DEPTH_STENCIL_ALPHA | SWR_NEW_FRAMEBUFFER)) { + struct pipe_depth_state *depth = &(ctx->depth_stencil->depth); + struct pipe_stencil_state *stencil = ctx->depth_stencil->stencil; + SWR_DEPTH_STENCIL_STATE depthStencilState = {{0}}; + + /* XXX, incomplete. 
Need to flesh out stencil & alpha test state + struct pipe_stencil_state *front_stencil = + ctx->depth_stencil.stencil[0]; + struct pipe_stencil_state *back_stencil = ctx->depth_stencil.stencil[1]; + struct pipe_alpha_state alpha; + */ + if (stencil[0].enabled) { + depthStencilState.stencilWriteEnable = 1; + depthStencilState.stencilTestEnable = 1; + depthStencilState.stencilTestFunc = + swr_convert_depth_func(stencil[0].func); + + depthStencilState.stencilPassDepthPassOp = + swr_convert_stencil_op(stencil[0].zpass_op); + depthStencilState.stencilPassDepthFailOp = + swr_convert_stencil_op(stencil[0].zfail_op); + depthStencilState.stencilFailOp = + swr_convert_stencil_op(stencil[0].fail_op); + depthStencilState.stencilWriteMask = stencil[0].writemask; + depthStencilState.stencilTestMask = stencil[0].valuemask; + depthStencilState.stencilRefValue = ctx->stencil_ref.ref_value[0]; + } + if (stencil[1].enabled) { + depthStencilState.doubleSidedStencilTestEnable = 1; + + depthStencilState.backfaceStencilTestFunc = + swr_convert_depth_func(stencil[1].func); + + depthStencilState.backfaceStencilPassDepthPassOp = + swr_convert_stencil_op(stencil[1].zpass_op); + depthStencilState.backfaceStencilPassDepthFailOp = + swr_convert_stencil_op(stencil[1].zfail_op); + depthStencilState.backfaceStencilFailOp = + swr_convert_stencil_op(stencil[1].fail_op); + depthStencilState.backfaceStencilWriteMask = stencil[1].writemask; + depthStencilState.backfaceStencilTestMask = stencil[1].valuemask; + + depthStencilState.backfaceStencilRefValue = + ctx->stencil_ref.ref_value[1]; + } + + depthStencilState.depthTestEnable = depth->enabled; + depthStencilState.depthTestFunc = swr_convert_depth_func(depth->func); + depthStencilState.depthWriteEnable = depth->writemask; + SwrSetDepthStencilState(ctx->swrContext, &depthStencilState); + } + + /* Blend State */ + if (ctx->dirty & (SWR_NEW_BLEND | + SWR_NEW_FRAMEBUFFER | + SWR_NEW_DEPTH_STENCIL_ALPHA)) { + struct pipe_framebuffer_state *fb = &ctx->framebuffer; + + SWR_BLEND_STATE blendState; + memcpy(&blendState, &ctx->blend->blendState, sizeof(blendState)); + blendState.constantColor[0] = ctx->blend_color.color[0]; + blendState.constantColor[1] = ctx->blend_color.color[1]; + blendState.constantColor[2] = ctx->blend_color.color[2]; + blendState.constantColor[3] = ctx->blend_color.color[3]; + blendState.alphaTestReference = + *((uint32_t*)&ctx->depth_stencil->alpha.ref_value); + + // XXX MSAA + blendState.sampleMask = 0; + blendState.sampleCount = SWR_MULTISAMPLE_1X; + + /* If there are no color buffers bound, disable writes on RT0 + * and skip loop */ + if (fb->nr_cbufs == 0) { + blendState.renderTarget[0].writeDisableRed = 1; + blendState.renderTarget[0].writeDisableGreen = 1; + blendState.renderTarget[0].writeDisableBlue = 1; + blendState.renderTarget[0].writeDisableAlpha = 1; + SwrSetBlendFunc(ctx->swrContext, 0, NULL); + } + else + for (int target = 0; + target < std::min(SWR_NUM_RENDERTARGETS, + PIPE_MAX_COLOR_BUFS); + target++) { + if (!fb->cbufs[target]) + continue; + + struct swr_resource *colorBuffer = + swr_resource(fb->cbufs[target]->texture); + + BLEND_COMPILE_STATE compileState; + memset(&compileState, 0, sizeof(compileState)); + compileState.format = colorBuffer->swr.format; + memcpy(&compileState.blendState, + &ctx->blend->compileState[target], + sizeof(compileState.blendState)); + + if (compileState.blendState.blendEnable == false && + compileState.blendState.logicOpEnable == false) { + SwrSetBlendFunc(ctx->swrContext, target, NULL); + continue; + } + + 
compileState.desc.alphaTestEnable = + ctx->depth_stencil->alpha.enabled; + compileState.desc.independentAlphaBlendEnable = + ctx->blend->pipe.independent_blend_enable; + compileState.desc.alphaToCoverageEnable = + ctx->blend->pipe.alpha_to_coverage; + compileState.desc.sampleMaskEnable = 0; // XXX + compileState.desc.numSamples = 1; // XXX + + compileState.alphaTestFunction = + swr_convert_depth_func(ctx->depth_stencil->alpha.func); + compileState.alphaTestFormat = ALPHA_TEST_FLOAT32; // xxx + + PFN_BLEND_JIT_FUNC func = NULL; + auto search = ctx->blendJIT->find(compileState); + if (search != ctx->blendJIT->end()) { + func = search->second; + } else { + HANDLE hJitMgr = screen->hJitMgr; + func = JitCompileBlend(hJitMgr, compileState); + debug_printf("BLEND shader %p\n", func); + assert(func && "Error: BlendShader = NULL"); + + ctx->blendJIT->insert(std::make_pair(compileState, func)); + } + SwrSetBlendFunc(ctx->swrContext, target, func); + } + + SwrSetBlendState(ctx->swrContext, &blendState); + } + + if (ctx->dirty & SWR_NEW_STIPPLE) { + /* XXX What to do with this one??? SWR doesn't stipple */ + } + + if (ctx->dirty & (SWR_NEW_VS | SWR_NEW_SO | SWR_NEW_RASTERIZER)) { + ctx->vs->soState.rasterizerDisable = + ctx->rasterizer->rasterizer_discard; + SwrSetSoState(ctx->swrContext, &ctx->vs->soState); + + pipe_stream_output_info *stream_output = &ctx->vs->pipe.stream_output; + + for (uint32_t i = 0; i < ctx->num_so_targets; i++) { + SWR_STREAMOUT_BUFFER buffer = {0}; + if (!ctx->so_targets[i]) + continue; + buffer.enable = true; + buffer.pBuffer = + (uint32_t *)swr_resource_data(ctx->so_targets[i]->buffer); + buffer.bufferSize = ctx->so_targets[i]->buffer_size >> 2; + buffer.pitch = stream_output->stride[i]; + buffer.streamOffset = ctx->so_targets[i]->buffer_offset >> 2; + + SwrSetSoBuffers(ctx->swrContext, &buffer, i); + } + } + + uint32_t linkage = ctx->vs->linkageMask; + if (ctx->rasterizer->sprite_coord_enable) + linkage |= (1 << ctx->vs->info.base.num_outputs); + + SwrSetLinkage(ctx->swrContext, linkage, NULL); + + // set up frontend state + SWR_FRONTEND_STATE feState = {0}; + SwrSetFrontendState(ctx->swrContext, &feState); + + // set up backend state + SWR_BACKEND_STATE backendState = {0}; + backendState.numAttributes = 1; + backendState.numComponents[0] = 4; + backendState.constantInterpolationMask = ctx->fs->constantMask; + backendState.pointSpriteTexCoordMask = ctx->fs->pointSpriteMask; + + SwrSetBackendState(ctx->swrContext, &backendState); + + /* Ensure that any in-progress attachment change StoreTiles finish */ + if (swr_is_fence_pending(screen->flush_fence)) + swr_fence_finish(pipe->screen, screen->flush_fence, 0); + + /* Finally, update the in-use status of all resources involved in draw */ + swr_update_resource_status(pipe, p_draw_info); + + ctx->dirty = post_update_dirty_flags; +} + + +static struct pipe_stream_output_target * +swr_create_so_target(struct pipe_context *pipe, + struct pipe_resource *buffer, + unsigned buffer_offset, + unsigned buffer_size) +{ + struct pipe_stream_output_target *target; + + target = CALLOC_STRUCT(pipe_stream_output_target); + if (!target) + return NULL; + + target->context = pipe; + target->reference.count = 1; + pipe_resource_reference(&target->buffer, buffer); + target->buffer_offset = buffer_offset; + target->buffer_size = buffer_size; + return target; +} + +static void +swr_destroy_so_target(struct pipe_context *pipe, + struct pipe_stream_output_target *target) +{ + pipe_resource_reference(&target->buffer, NULL); + FREE(target); +} + +static 
void +swr_set_so_targets(struct pipe_context *pipe, + unsigned num_targets, + struct pipe_stream_output_target **targets, + const unsigned *offsets) +{ + struct swr_context *swr = swr_context(pipe); + uint32_t i; + + assert(num_targets < MAX_SO_STREAMS); + + for (i = 0; i < num_targets; i++) { + pipe_so_target_reference( + (struct pipe_stream_output_target **)&swr->so_targets[i], + targets[i]); + } + + for (/* fall-through */; i < swr->num_so_targets; i++) { + pipe_so_target_reference( + (struct pipe_stream_output_target **)&swr->so_targets[i], NULL); + } + + swr->num_so_targets = num_targets; + + swr->dirty = SWR_NEW_SO; +} + + +void +swr_state_init(struct pipe_context *pipe) +{ + pipe->create_blend_state = swr_create_blend_state; + pipe->bind_blend_state = swr_bind_blend_state; + pipe->delete_blend_state = swr_delete_blend_state; + + pipe->create_depth_stencil_alpha_state = swr_create_depth_stencil_state; + pipe->bind_depth_stencil_alpha_state = swr_bind_depth_stencil_state; + pipe->delete_depth_stencil_alpha_state = swr_delete_depth_stencil_state; + + pipe->create_rasterizer_state = swr_create_rasterizer_state; + pipe->bind_rasterizer_state = swr_bind_rasterizer_state; + pipe->delete_rasterizer_state = swr_delete_rasterizer_state; + + pipe->create_sampler_state = swr_create_sampler_state; + pipe->bind_sampler_states = swr_bind_sampler_states; + pipe->delete_sampler_state = swr_delete_sampler_state; + + pipe->create_sampler_view = swr_create_sampler_view; + pipe->set_sampler_views = swr_set_sampler_views; + pipe->sampler_view_destroy = swr_sampler_view_destroy; + + pipe->create_vs_state = swr_create_vs_state; + pipe->bind_vs_state = swr_bind_vs_state; + pipe->delete_vs_state = swr_delete_vs_state; + + pipe->create_fs_state = swr_create_fs_state; + pipe->bind_fs_state = swr_bind_fs_state; + pipe->delete_fs_state = swr_delete_fs_state; + + pipe->set_constant_buffer = swr_set_constant_buffer; + + pipe->create_vertex_elements_state = swr_create_vertex_elements_state; + pipe->bind_vertex_elements_state = swr_bind_vertex_elements_state; + pipe->delete_vertex_elements_state = swr_delete_vertex_elements_state; + + pipe->set_vertex_buffers = swr_set_vertex_buffers; + pipe->set_index_buffer = swr_set_index_buffer; + + pipe->set_polygon_stipple = swr_set_polygon_stipple; + pipe->set_clip_state = swr_set_clip_state; + pipe->set_scissor_states = swr_set_scissor_states; + pipe->set_viewport_states = swr_set_viewport_states; + + pipe->set_framebuffer_state = swr_set_framebuffer_state; + + pipe->set_blend_color = swr_set_blend_color; + pipe->set_stencil_ref = swr_set_stencil_ref; + + pipe->set_sample_mask = swr_set_sample_mask; + + pipe->create_stream_output_target = swr_create_so_target; + pipe->stream_output_target_destroy = swr_destroy_so_target; + pipe->set_stream_output_targets = swr_set_so_targets; +} diff --git a/src/gallium/drivers/swr/swr_state.h b/src/gallium/drivers/swr/swr_state.h new file mode 100644 index 00000000000..f0a7ff3b185 --- /dev/null +++ b/src/gallium/drivers/swr/swr_state.h @@ -0,0 +1,307 @@ +/**************************************************************************** + * Copyright (C) 2015 Intel Corporation. All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + ***************************************************************************/ + +#ifndef SWR_STATE_H +#define SWR_STATE_H + +#include "pipe/p_defines.h" +#include "tgsi/tgsi_scan.h" +#include "tgsi/tgsi_parse.h" +#include "tgsi/tgsi_dump.h" +#include "gallivm/lp_bld_tgsi.h" +#include "util/u_hash.h" +#include "api.h" +#include "swr_tex_sample.h" +#include "swr_shader.h" +#include <unordered_map> + +/* skeleton */ +struct swr_vertex_shader { + struct pipe_shader_state pipe; + struct lp_tgsi_info info; + unsigned linkageMask; + PFN_VERTEX_FUNC func; + SWR_STREAMOUT_STATE soState; + PFN_SO_FUNC soFunc[PIPE_PRIM_MAX]; +}; + +struct swr_fragment_shader { + struct pipe_shader_state pipe; + struct lp_tgsi_info info; + uint32_t constantMask; + uint32_t pointSpriteMask; + std::unordered_map<swr_jit_key, PFN_PIXEL_KERNEL> map; +}; + +/* Vertex element state */ +struct swr_vertex_element_state { + FETCH_COMPILE_STATE fsState; + PFN_FETCH_FUNC fsFunc; + uint32_t stream_pitch[PIPE_MAX_ATTRIBS]; +}; + +struct swr_blend_state { + struct pipe_blend_state pipe; + SWR_BLEND_STATE blendState; + RENDER_TARGET_BLEND_COMPILE_STATE compileState[PIPE_MAX_COLOR_BUFS]; +}; + +/* + * Derived SWR API DrawState + * For convenience of making simple changes without re-deriving state. + */ +struct swr_derived_state { + SWR_RASTSTATE rastState; + SWR_VIEWPORT vp; + SWR_VIEWPORT_MATRIX vpm; +}; + +void swr_update_derived(struct pipe_context *, + const struct pipe_draw_info * = nullptr); + +/* + * Conversion functions: Convert mesa state defines to SWR. 
+ */
+
+static INLINE SWR_LOGIC_OP
+swr_convert_logic_op(const UINT op)
+{
+   switch (op) {
+   case PIPE_LOGICOP_CLEAR:
+      return LOGICOP_CLEAR;
+   case PIPE_LOGICOP_NOR:
+      return LOGICOP_NOR;
+   case PIPE_LOGICOP_AND_INVERTED:
+      return LOGICOP_AND_INVERTED;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      return LOGICOP_COPY_INVERTED;
+   case PIPE_LOGICOP_AND_REVERSE:
+      return LOGICOP_AND_REVERSE;
+   case PIPE_LOGICOP_INVERT:
+      return LOGICOP_INVERT;
+   case PIPE_LOGICOP_XOR:
+      return LOGICOP_XOR;
+   case PIPE_LOGICOP_NAND:
+      return LOGICOP_NAND;
+   case PIPE_LOGICOP_AND:
+      return LOGICOP_AND;
+   case PIPE_LOGICOP_EQUIV:
+      return LOGICOP_EQUIV;
+   case PIPE_LOGICOP_NOOP:
+      return LOGICOP_NOOP;
+   case PIPE_LOGICOP_OR_INVERTED:
+      return LOGICOP_OR_INVERTED;
+   case PIPE_LOGICOP_COPY:
+      return LOGICOP_COPY;
+   case PIPE_LOGICOP_OR_REVERSE:
+      return LOGICOP_OR_REVERSE;
+   case PIPE_LOGICOP_OR:
+      return LOGICOP_OR;
+   case PIPE_LOGICOP_SET:
+      return LOGICOP_SET;
+   default:
+      assert(0 && "Unsupported logic op");
+      return LOGICOP_NOOP;
+   }
+}
+
+static INLINE SWR_STENCILOP
+swr_convert_stencil_op(const UINT op)
+{
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      return STENCILOP_KEEP;
+   case PIPE_STENCIL_OP_ZERO:
+      return STENCILOP_ZERO;
+   case PIPE_STENCIL_OP_REPLACE:
+      return STENCILOP_REPLACE;
+   case PIPE_STENCIL_OP_INCR:
+      return STENCILOP_INCRSAT;
+   case PIPE_STENCIL_OP_DECR:
+      return STENCILOP_DECRSAT;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      return STENCILOP_INCR;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      return STENCILOP_DECR;
+   case PIPE_STENCIL_OP_INVERT:
+      return STENCILOP_INVERT;
+   default:
+      assert(0 && "Unsupported stencil op");
+      return STENCILOP_KEEP;
+   }
+}
+
+static INLINE SWR_FORMAT
+swr_convert_index_type(const UINT index_size)
+{
+   switch (index_size) {
+   case sizeof(unsigned char):
+      return R8_UINT;
+   case sizeof(unsigned short):
+      return R16_UINT;
+   case sizeof(unsigned int):
+      return R32_UINT;
+   default:
+      assert(0 && "Unsupported index type");
+      return R32_UINT;
+   }
+}
+
+
+static INLINE SWR_ZFUNCTION
+swr_convert_depth_func(const UINT pipe_func)
+{
+   switch (pipe_func) {
+   case PIPE_FUNC_NEVER:
+      return ZFUNC_NEVER;
+   case PIPE_FUNC_LESS:
+      return ZFUNC_LT;
+   case PIPE_FUNC_EQUAL:
+      return ZFUNC_EQ;
+   case PIPE_FUNC_LEQUAL:
+      return ZFUNC_LE;
+   case PIPE_FUNC_GREATER:
+      return ZFUNC_GT;
+   case PIPE_FUNC_NOTEQUAL:
+      return ZFUNC_NE;
+   case PIPE_FUNC_GEQUAL:
+      return ZFUNC_GE;
+   case PIPE_FUNC_ALWAYS:
+      return ZFUNC_ALWAYS;
+   default:
+      assert(0 && "Unsupported depth func");
+      return ZFUNC_ALWAYS;
+   }
+}
+
+
+static INLINE SWR_CULLMODE
+swr_convert_cull_mode(const UINT cull_face)
+{
+   switch (cull_face) {
+   case PIPE_FACE_NONE:
+      return SWR_CULLMODE_NONE;
+   case PIPE_FACE_FRONT:
+      return SWR_CULLMODE_FRONT;
+   case PIPE_FACE_BACK:
+      return SWR_CULLMODE_BACK;
+   case PIPE_FACE_FRONT_AND_BACK:
+      return SWR_CULLMODE_BOTH;
+   default:
+      assert(0 && "Invalid cull mode");
+      return SWR_CULLMODE_NONE;
+   }
+}
+
+static INLINE SWR_BLEND_OP
+swr_convert_blend_func(const UINT blend_func)
+{
+   switch (blend_func) {
+   case PIPE_BLEND_ADD:
+      return BLENDOP_ADD;
+   case PIPE_BLEND_SUBTRACT:
+      return BLENDOP_SUBTRACT;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      return BLENDOP_REVSUBTRACT;
+   case PIPE_BLEND_MIN:
+      return BLENDOP_MIN;
+   case PIPE_BLEND_MAX:
+      return BLENDOP_MAX;
+   default:
+      assert(0 && "Invalid blend func");
+      return BLENDOP_ADD;
+   }
+}
+
+static INLINE SWR_BLEND_FACTOR
+swr_convert_blend_factor(const UINT blend_factor)
+{
+   switch (blend_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      return BLENDFACTOR_ONE;
+   case
PIPE_BLENDFACTOR_SRC_COLOR: + return BLENDFACTOR_SRC_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA: + return BLENDFACTOR_SRC_ALPHA; + case PIPE_BLENDFACTOR_DST_ALPHA: + return BLENDFACTOR_DST_ALPHA; + case PIPE_BLENDFACTOR_DST_COLOR: + return BLENDFACTOR_DST_COLOR; + case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE: + return BLENDFACTOR_SRC_ALPHA_SATURATE; + case PIPE_BLENDFACTOR_CONST_COLOR: + return BLENDFACTOR_CONST_COLOR; + case PIPE_BLENDFACTOR_CONST_ALPHA: + return BLENDFACTOR_CONST_ALPHA; + case PIPE_BLENDFACTOR_SRC1_COLOR: + return BLENDFACTOR_SRC1_COLOR; + case PIPE_BLENDFACTOR_SRC1_ALPHA: + return BLENDFACTOR_SRC1_ALPHA; + case PIPE_BLENDFACTOR_ZERO: + return BLENDFACTOR_ZERO; + case PIPE_BLENDFACTOR_INV_SRC_COLOR: + return BLENDFACTOR_INV_SRC_COLOR; + case PIPE_BLENDFACTOR_INV_SRC_ALPHA: + return BLENDFACTOR_INV_SRC_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_ALPHA: + return BLENDFACTOR_INV_DST_ALPHA; + case PIPE_BLENDFACTOR_INV_DST_COLOR: + return BLENDFACTOR_INV_DST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_COLOR: + return BLENDFACTOR_INV_CONST_COLOR; + case PIPE_BLENDFACTOR_INV_CONST_ALPHA: + return BLENDFACTOR_INV_CONST_ALPHA; + case PIPE_BLENDFACTOR_INV_SRC1_COLOR: + return BLENDFACTOR_INV_SRC1_COLOR; + case PIPE_BLENDFACTOR_INV_SRC1_ALPHA: + return BLENDFACTOR_INV_SRC1_ALPHA; + default: + assert(0 && "Invalid blend factor"); + return BLENDFACTOR_ONE; + } +} + +static INLINE enum SWR_SURFACE_TYPE +swr_convert_target_type(const enum pipe_texture_target target) +{ + switch (target) { + case PIPE_BUFFER: + return SURFACE_BUFFER; + case PIPE_TEXTURE_1D: + case PIPE_TEXTURE_1D_ARRAY: + return SURFACE_1D; + case PIPE_TEXTURE_2D: + case PIPE_TEXTURE_2D_ARRAY: + case PIPE_TEXTURE_RECT: + return SURFACE_2D; + case PIPE_TEXTURE_3D: + return SURFACE_3D; + case PIPE_TEXTURE_CUBE: + case PIPE_TEXTURE_CUBE_ARRAY: + return SURFACE_CUBE; + default: + assert(0); + return SURFACE_NULL; + } +} +#endif diff --git a/src/gallium/drivers/swr/swr_tex_sample.cpp b/src/gallium/drivers/swr/swr_tex_sample.cpp new file mode 100644 index 00000000000..8e01e32e280 --- /dev/null +++ b/src/gallium/drivers/swr/swr_tex_sample.cpp @@ -0,0 +1,338 @@ +/************************************************************************** + * + * Copyright 2009 VMware, Inc. + * All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ * + **************************************************************************/ + +/** + * Largely a copy of llvmpipe's lp_tex_sample.c + */ + +/** + * Texture sampling code generation + * + * This file is nothing more than ugly glue between three largely independent + * entities: + * - TGSI -> LLVM translation (i.e., lp_build_tgsi_soa) + * - texture sampling code generation (i.e., lp_build_sample_soa) + * - SWR driver + * + * All interesting code is in the functions mentioned above. There is really + * nothing to see here. + * + * @author Jose Fonseca <[email protected]> + */ + +#include "state.h" +#include "JitManager.h" +#include "state_llvm.h" + +#include "pipe/p_defines.h" +#include "pipe/p_shader_tokens.h" +#include "gallivm/lp_bld_debug.h" +#include "gallivm/lp_bld_const.h" +#include "gallivm/lp_bld_type.h" +#include "gallivm/lp_bld_sample.h" +#include "gallivm/lp_bld_tgsi.h" +#include "util/u_memory.h" + +#include "swr_tex_sample.h" +#include "swr_context_llvm.h" + + +/** + * This provides the bridge between the sampler state store in + * lp_jit_context and lp_jit_texture and the sampler code + * generator. It provides the texture layout information required by + * the texture sampler code generator in terms of the state stored in + * lp_jit_context and lp_jit_texture in runtime. + */ +struct swr_sampler_dynamic_state { + struct lp_sampler_dynamic_state base; + + const struct swr_sampler_static_state *static_state; +}; + + +/** + * This is the bridge between our sampler and the TGSI translator. + */ +struct swr_sampler_soa { + struct lp_build_sampler_soa base; + + struct swr_sampler_dynamic_state dynamic_state; +}; + + +/** + * Fetch the specified member of the lp_jit_texture structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +swr_texture_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned texture_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].textures */ + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_texturesFS); + /* context[0].textures[unit] */ + indices[2] = lp_build_const_int32(gallivm, texture_unit); + /* context[0].textures[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.texture%u.%s", texture_unit, member_name); + + return res; +} + + +/** + * Helper macro to instantiate the functions that generate the code to + * fetch the members of lp_jit_texture to fulfill the sampler code + * generator requests. + * + * This complexity is the price we have to pay to keep the texture + * sampler code generator a reusable module without dependencies to + * swr internals. 
+ */ +#define SWR_TEXTURE_MEMBER(_name, _emit_load) \ + static LLVMValueRef swr_texture_##_name( \ + const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned texture_unit) \ + { \ + return swr_texture_member(base, \ + gallivm, \ + context_ptr, \ + texture_unit, \ + swr_jit_texture_##_name, \ + #_name, \ + _emit_load); \ + } + + +SWR_TEXTURE_MEMBER(width, TRUE) +SWR_TEXTURE_MEMBER(height, TRUE) +SWR_TEXTURE_MEMBER(depth, TRUE) +SWR_TEXTURE_MEMBER(first_level, TRUE) +SWR_TEXTURE_MEMBER(last_level, TRUE) +SWR_TEXTURE_MEMBER(base_ptr, TRUE) +SWR_TEXTURE_MEMBER(row_stride, FALSE) +SWR_TEXTURE_MEMBER(img_stride, FALSE) +SWR_TEXTURE_MEMBER(mip_offsets, FALSE) + + +/** + * Fetch the specified member of the lp_jit_sampler structure. + * \param emit_load if TRUE, emit the LLVM load instruction to actually + * fetch the field's value. Otherwise, just emit the + * GEP code to address the field. + * + * @sa http://llvm.org/docs/GetElementPtr.html + */ +static LLVMValueRef +swr_sampler_member(const struct lp_sampler_dynamic_state *base, + struct gallivm_state *gallivm, + LLVMValueRef context_ptr, + unsigned sampler_unit, + unsigned member_index, + const char *member_name, + boolean emit_load) +{ + LLVMBuilderRef builder = gallivm->builder; + LLVMValueRef indices[4]; + LLVMValueRef ptr; + LLVMValueRef res; + + assert(sampler_unit < PIPE_MAX_SAMPLERS); + + /* context[0] */ + indices[0] = lp_build_const_int32(gallivm, 0); + /* context[0].samplers */ + indices[1] = lp_build_const_int32(gallivm, swr_draw_context_samplersFS); + /* context[0].samplers[unit] */ + indices[2] = lp_build_const_int32(gallivm, sampler_unit); + /* context[0].samplers[unit].member */ + indices[3] = lp_build_const_int32(gallivm, member_index); + + ptr = LLVMBuildGEP(builder, context_ptr, indices, Elements(indices), ""); + + if (emit_load) + res = LLVMBuildLoad(builder, ptr, ""); + else + res = ptr; + + lp_build_name(res, "context.sampler%u.%s", sampler_unit, member_name); + + return res; +} + + +#define SWR_SAMPLER_MEMBER(_name, _emit_load) \ + static LLVMValueRef swr_sampler_##_name( \ + const struct lp_sampler_dynamic_state *base, \ + struct gallivm_state *gallivm, \ + LLVMValueRef context_ptr, \ + unsigned sampler_unit) \ + { \ + return swr_sampler_member(base, \ + gallivm, \ + context_ptr, \ + sampler_unit, \ + swr_jit_sampler_##_name, \ + #_name, \ + _emit_load); \ + } + + +SWR_SAMPLER_MEMBER(min_lod, TRUE) +SWR_SAMPLER_MEMBER(max_lod, TRUE) +SWR_SAMPLER_MEMBER(lod_bias, TRUE) +SWR_SAMPLER_MEMBER(border_color, FALSE) + + +static void +swr_sampler_soa_destroy(struct lp_build_sampler_soa *sampler) +{ + FREE(sampler); +} + + +/** + * Fetch filtered values from texture. + * The 'texel' parameter returns four vectors corresponding to R, G, B, A. 
+ */ +static void +swr_sampler_soa_emit_fetch_texel(const struct lp_build_sampler_soa *base, + struct gallivm_state *gallivm, + const struct lp_sampler_params *params) +{ + struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; + unsigned texture_index = params->texture_index; + unsigned sampler_index = params->sampler_index; + + assert(sampler_index < PIPE_MAX_SAMPLERS); + assert(texture_index < PIPE_MAX_SHADER_SAMPLER_VIEWS); + +#if 0 + lp_build_sample_nop(gallivm, params->type, params->coords, params->texel); +#else + lp_build_sample_soa( + &sampler->dynamic_state.static_state[texture_index].texture_state, + &sampler->dynamic_state.static_state[sampler_index].sampler_state, + &sampler->dynamic_state.base, + gallivm, + params); +#endif +} + +/** + * Fetch the texture size. + */ +static void +swr_sampler_soa_emit_size_query(const struct lp_build_sampler_soa *base, + struct gallivm_state *gallivm, + struct lp_type type, + unsigned texture_unit, + unsigned target, + LLVMValueRef context_ptr, + boolean is_sviewinfo, + enum lp_sampler_lod_property lod_property, + LLVMValueRef explicit_lod, /* optional */ + LLVMValueRef *sizes_out) +{ + struct swr_sampler_soa *sampler = (struct swr_sampler_soa *)base; + + assert(texture_unit < PIPE_MAX_SHADER_SAMPLER_VIEWS); + + lp_build_size_query_soa( + gallivm, + &sampler->dynamic_state.static_state[texture_unit].texture_state, + &sampler->dynamic_state.base, + type, + texture_unit, + target, + context_ptr, + is_sviewinfo, + lod_property, + explicit_lod, + sizes_out); +} + + +struct lp_build_sampler_soa * +swr_sampler_soa_create(const struct swr_sampler_static_state *static_state) +{ + struct swr_sampler_soa *sampler; + + sampler = CALLOC_STRUCT(swr_sampler_soa); + if (!sampler) + return NULL; + + sampler->base.destroy = swr_sampler_soa_destroy; + sampler->base.emit_tex_sample = swr_sampler_soa_emit_fetch_texel; + sampler->base.emit_size_query = swr_sampler_soa_emit_size_query; + sampler->dynamic_state.base.width = swr_texture_width; + sampler->dynamic_state.base.height = swr_texture_height; + sampler->dynamic_state.base.depth = swr_texture_depth; + sampler->dynamic_state.base.first_level = swr_texture_first_level; + sampler->dynamic_state.base.last_level = swr_texture_last_level; + sampler->dynamic_state.base.base_ptr = swr_texture_base_ptr; + sampler->dynamic_state.base.row_stride = swr_texture_row_stride; + sampler->dynamic_state.base.img_stride = swr_texture_img_stride; + sampler->dynamic_state.base.mip_offsets = swr_texture_mip_offsets; + sampler->dynamic_state.base.min_lod = swr_sampler_min_lod; + sampler->dynamic_state.base.max_lod = swr_sampler_max_lod; + sampler->dynamic_state.base.lod_bias = swr_sampler_lod_bias; + sampler->dynamic_state.base.border_color = swr_sampler_border_color; + + sampler->dynamic_state.static_state = static_state; + + return &sampler->base; +} diff --git a/src/gallium/drivers/swr/swr_tex_sample.h b/src/gallium/drivers/swr/swr_tex_sample.h new file mode 100644 index 00000000000..f5c368c108d --- /dev/null +++ b/src/gallium/drivers/swr/swr_tex_sample.h @@ -0,0 +1,47 @@ +/************************************************************************** + * + * Copyright 2007 VMware, Inc. + * All Rights Reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sub license, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice (including the + * next paragraph) shall be included in all copies or substantial portions + * of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS + * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. + * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR + * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + **************************************************************************/ + +#pragma once + +#include "gallivm/lp_bld.h" + +struct swr_sampler_static_state { + /* + * These attributes are effectively interleaved for more sane key handling. + * However, there might be lots of null space if the amount of samplers and + * textures isn't the same. + */ + struct lp_static_sampler_state sampler_state; + struct lp_static_texture_state texture_state; +}; + +/** + * Pure-LLVM texture sampling code generator. + * + */ +struct lp_build_sampler_soa * +swr_sampler_soa_create(const struct swr_sampler_static_state *key); diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c index 0612109c800..b24e1856aca 100644 --- a/src/gallium/drivers/trace/tr_screen.c +++ b/src/gallium/drivers/trace/tr_screen.c @@ -313,7 +313,8 @@ trace_screen_resource_create(struct pipe_screen *_screen, static struct pipe_resource * trace_screen_resource_from_handle(struct pipe_screen *_screen, const struct pipe_resource *templ, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct trace_screen *tr_screen = trace_screen(_screen); struct pipe_screen *screen = tr_screen->screen; @@ -321,7 +322,7 @@ trace_screen_resource_from_handle(struct pipe_screen *_screen, /* TODO trace call */ - result = screen->resource_from_handle(screen, templ, handle); + result = screen->resource_from_handle(screen, templ, handle, usage); result = trace_resource_create(trace_screen(_screen), result); @@ -331,7 +332,8 @@ trace_screen_resource_from_handle(struct pipe_screen *_screen, static boolean trace_screen_resource_get_handle(struct pipe_screen *_screen, struct pipe_resource *_resource, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct trace_screen *tr_screen = trace_screen(_screen); struct trace_resource *tr_resource = trace_resource(_resource); @@ -340,7 +342,7 @@ trace_screen_resource_get_handle(struct pipe_screen *_screen, /* TODO trace call */ - return screen->resource_get_handle(screen, resource, handle); + return screen->resource_get_handle(screen, resource, handle, usage); } diff --git a/src/gallium/drivers/vc4/Makefile.sources b/src/gallium/drivers/vc4/Makefile.sources index a9a2742ec66..c5df0f17986 100644 --- a/src/gallium/drivers/vc4/Makefile.sources +++ b/src/gallium/drivers/vc4/Makefile.sources @@ -28,7 +28,7 @@ 
C_SOURCES := \ vc4_opt_cse.c \ vc4_opt_dead_code.c \ vc4_opt_small_immediates.c \ - vc4_opt_vpm_writes.c \ + vc4_opt_vpm.c \ vc4_program.c \ vc4_qir.c \ vc4_qir_lower_uniforms.c \ diff --git a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c b/src/gallium/drivers/vc4/vc4_opt_vpm.c index 73ded766db9..d15b0c1a39f 100644 --- a/src/gallium/drivers/vc4/vc4_opt_vpm_writes.c +++ b/src/gallium/drivers/vc4/vc4_opt_vpm.c @@ -22,16 +22,18 @@ */ /** - * @file vc4_opt_vpm_writes.c + * @file vc4_opt_vpm.c * - * This modifies instructions that generate the value consumed by a VPM write - * to write directly into the VPM. + * This modifies instructions that: + * 1. exclusively consume a value read from the VPM to directly read the VPM if + * other operands allow it. + * 2. generate the value consumed by a VPM write to write directly into the VPM. */ #include "vc4_qir.h" bool -qir_opt_vpm_writes(struct vc4_compile *c) +qir_opt_vpm(struct vc4_compile *c) { if (c->stage == QSTAGE_FRAG) return false; @@ -52,8 +54,70 @@ qir_opt_vpm_writes(struct vc4_compile *c) } for (int i = 0; i < qir_get_op_nsrc(inst->op); i++) { - if (inst->src[i].file == QFILE_TEMP) - use_count[inst->src[i].index]++; + if (inst->src[i].file == QFILE_TEMP) { + uint32_t temp = inst->src[i].index; + use_count[temp]++; + } + } + } + + /* For instructions reading from a temporary that contains a VPM read + * result, try to move the instruction up in place of the VPM read. + */ + list_for_each_entry(struct qinst, inst, &c->instructions, link) { + if (!inst || qir_is_multi_instruction(inst)) + continue; + + if (qir_depends_on_flags(inst) || inst->sf) + continue; + + if (qir_has_side_effects(c, inst) || + qir_has_side_effect_reads(c, inst) || + qir_is_tex(inst)) + continue; + + for (int j = 0; j < qir_get_op_nsrc(inst->op); j++) { + if (inst->src[j].file != QFILE_TEMP || + inst->src[j].pack) + continue; + + uint32_t temp = inst->src[j].index; + + /* Since VPM reads pull from a FIFO, we only get to + * read each VPM entry once (unless we reset the read + * pointer). That means we can't copy-propagate a VPM + * read to multiple locations. 
+ */ + if (use_count[temp] != 1) + continue; + + struct qinst *mov = c->defs[temp]; + if (!mov || + (mov->op != QOP_MOV && + mov->op != QOP_FMOV && + mov->op != QOP_MMOV) || + mov->src[0].file != QFILE_VPM) { + continue; + } + + uint32_t temps = 0; + for (int k = 0; k < qir_get_op_nsrc(inst->op); k++) { + if (inst->src[k].file == QFILE_TEMP) + temps++; + } + + /* The instruction is safe to reorder if its other + * sources are independent of previous instructions + */ + if (temps == 1) { + list_del(&inst->link); + inst->src[j] = mov->src[0]; + list_replace(&mov->link, &inst->link); + c->defs[temp] = NULL; + free(mov); + progress = true; + break; + } } } diff --git a/src/gallium/drivers/vc4/vc4_program.c b/src/gallium/drivers/vc4/vc4_program.c index 5c91c02b539..81e8e9150d6 100644 --- a/src/gallium/drivers/vc4/vc4_program.c +++ b/src/gallium/drivers/vc4/vc4_program.c @@ -1729,6 +1729,8 @@ nir_to_qir(struct vc4_compile *c) } static const nir_shader_compiler_options nir_options = { + .lower_extract_byte = true, + .lower_extract_word = true, .lower_ffma = true, .lower_flrp = true, .lower_fpow = true, diff --git a/src/gallium/drivers/vc4/vc4_qir.c b/src/gallium/drivers/vc4/vc4_qir.c index f9eb0e151c5..65f0067c61e 100644 --- a/src/gallium/drivers/vc4/vc4_qir.c +++ b/src/gallium/drivers/vc4/vc4_qir.c @@ -526,7 +526,7 @@ qir_optimize(struct vc4_compile *c) OPTPASS(qir_opt_copy_propagation); OPTPASS(qir_opt_dead_code); OPTPASS(qir_opt_small_immediates); - OPTPASS(qir_opt_vpm_writes); + OPTPASS(qir_opt_vpm); if (!progress) break; diff --git a/src/gallium/drivers/vc4/vc4_qir.h b/src/gallium/drivers/vc4/vc4_qir.h index bae31768bd8..4f39d72f552 100644 --- a/src/gallium/drivers/vc4/vc4_qir.h +++ b/src/gallium/drivers/vc4/vc4_qir.h @@ -484,7 +484,7 @@ bool qir_opt_copy_propagation(struct vc4_compile *c); bool qir_opt_cse(struct vc4_compile *c); bool qir_opt_dead_code(struct vc4_compile *c); bool qir_opt_small_immediates(struct vc4_compile *c); -bool qir_opt_vpm_writes(struct vc4_compile *c); +bool qir_opt_vpm(struct vc4_compile *c); void vc4_nir_lower_blend(struct vc4_compile *c); void vc4_nir_lower_io(struct vc4_compile *c); nir_ssa_def *vc4_nir_get_state_uniform(struct nir_builder *b, diff --git a/src/gallium/drivers/vc4/vc4_resource.c b/src/gallium/drivers/vc4/vc4_resource.c index 036da329987..ea212af0512 100644 --- a/src/gallium/drivers/vc4/vc4_resource.c +++ b/src/gallium/drivers/vc4/vc4_resource.c @@ -523,7 +523,8 @@ fail: static struct pipe_resource * vc4_resource_from_handle(struct pipe_screen *pscreen, const struct pipe_resource *tmpl, - struct winsys_handle *handle) + struct winsys_handle *handle, + unsigned usage) { struct vc4_resource *rsc = vc4_resource_setup(pscreen, tmpl); struct pipe_resource *prsc = &rsc->base.b; diff --git a/src/gallium/drivers/vc4/vc4_screen.c b/src/gallium/drivers/vc4/vc4_screen.c index a4b3efcfda3..92d910ba6a5 100644 --- a/src/gallium/drivers/vc4/vc4_screen.c +++ b/src/gallium/drivers/vc4/vc4_screen.c @@ -203,6 +203,10 @@ vc4_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SURFACE_REINTERPRET_BLOCKS: case PIPE_CAP_QUERY_BUFFER_OBJECT: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; /* Stream output. 
*/ diff --git a/src/gallium/drivers/virgl/virgl_resource.c b/src/gallium/drivers/virgl/virgl_resource.c index 0b2fc4ec497..2b3794765e2 100644 --- a/src/gallium/drivers/virgl/virgl_resource.c +++ b/src/gallium/drivers/virgl/virgl_resource.c @@ -64,7 +64,8 @@ static struct pipe_resource *virgl_resource_create(struct pipe_screen *screen, static struct pipe_resource *virgl_resource_from_handle(struct pipe_screen *screen, const struct pipe_resource *templ, - struct winsys_handle *whandle) + struct winsys_handle *whandle, + unsigned usage) { struct virgl_screen *vs = virgl_screen(screen); if (templ->target == PIPE_BUFFER) diff --git a/src/gallium/drivers/virgl/virgl_screen.c b/src/gallium/drivers/virgl/virgl_screen.c index d3f4e259cad..8126bdec40c 100644 --- a/src/gallium/drivers/virgl/virgl_screen.c +++ b/src/gallium/drivers/virgl/virgl_screen.c @@ -235,6 +235,10 @@ virgl_get_param(struct pipe_screen *screen, enum pipe_cap param) case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS: case PIPE_CAP_STRING_MARKER: case PIPE_CAP_QUERY_MEMORY_INFO: + case PIPE_CAP_PCI_GROUP: + case PIPE_CAP_PCI_BUS: + case PIPE_CAP_PCI_DEVICE: + case PIPE_CAP_PCI_FUNCTION: return 0; case PIPE_CAP_VENDOR_ID: return 0x1af4; diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h index 010be62e638..bdd76ab1f81 100644 --- a/src/gallium/include/pipe/p_defines.h +++ b/src/gallium/include/pipe/p_defines.h @@ -404,9 +404,9 @@ enum pipe_flush_flags * The third flag has been added to be able to force textures to be created * in linear mode (no tiling). */ -#define PIPE_BIND_SCANOUT (1 << 18) /* */ -#define PIPE_BIND_SHARED (1 << 19) /* get_texture_handle ??? */ -#define PIPE_BIND_LINEAR (1 << 20) +#define PIPE_BIND_SCANOUT (1 << 19) /* */ +#define PIPE_BIND_SHARED (1 << 20) /* get_texture_handle ??? */ +#define PIPE_BIND_LINEAR (1 << 21) /** @@ -530,6 +530,25 @@ enum pipe_reset_status /** + * resource_get_handle flags. + */ +/* Requires pipe_context::flush_resource before external use. */ +#define PIPE_HANDLE_USAGE_EXPLICIT_FLUSH (1 << 0) +/* Expected external use of the resource: */ +#define PIPE_HANDLE_USAGE_READ (1 << 1) +#define PIPE_HANDLE_USAGE_WRITE (1 << 2) +#define PIPE_HANDLE_USAGE_READ_WRITE (PIPE_HANDLE_USAGE_READ | \ + PIPE_HANDLE_USAGE_WRITE) + +/** + * pipe_image_view access flags. 
+ */ +#define PIPE_IMAGE_ACCESS_READ (1 << 0) +#define PIPE_IMAGE_ACCESS_WRITE (1 << 1) +#define PIPE_IMAGE_ACCESS_READ_WRITE (PIPE_IMAGE_ACCESS_READ | \ + PIPE_IMAGE_ACCESS_WRITE) + +/** * Implementation capabilities/limits which are queried through * pipe_screen::get_param() */ @@ -658,6 +677,10 @@ enum pipe_cap PIPE_CAP_SURFACE_REINTERPRET_BLOCKS, PIPE_CAP_QUERY_BUFFER_OBJECT, PIPE_CAP_QUERY_MEMORY_INFO, + PIPE_CAP_PCI_GROUP, + PIPE_CAP_PCI_BUS, + PIPE_CAP_PCI_DEVICE, + PIPE_CAP_PCI_FUNCTION, }; #define PIPE_QUIRK_TEXTURE_BORDER_COLOR_SWIZZLE_NV50 (1 << 0) diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h index ab18523a80c..b22baa9c650 100644 --- a/src/gallium/include/pipe/p_format.h +++ b/src/gallium/include/pipe/p_format.h @@ -29,12 +29,12 @@ #ifndef PIPE_FORMAT_H #define PIPE_FORMAT_H +#include "p_config.h" + #ifdef __cplusplus extern "C" { #endif -#include "p_config.h" - /** * Formats for textures, surfaces and vertex data */ diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h index 211bc2440f9..4f30e75ab49 100644 --- a/src/gallium/include/pipe/p_screen.h +++ b/src/gallium/include/pipe/p_screen.h @@ -182,10 +182,13 @@ struct pipe_screen { * NOTE: in the case of DRM_API_HANDLE_TYPE_FD handles, the caller * retains ownership of the FD. (This is consistent with * EGL_EXT_image_dma_buf_import) + * + * \param usage A combination of PIPE_HANDLE_USAGE_* flags. */ struct pipe_resource * (*resource_from_handle)(struct pipe_screen *, const struct pipe_resource *templat, - struct winsys_handle *handle); + struct winsys_handle *handle, + unsigned usage); /** * Create a resource from user memory. This maps the user memory into @@ -203,10 +206,13 @@ struct pipe_screen { * NOTE: in the case of DRM_API_HANDLE_TYPE_FD handles, the caller * takes ownership of the FD. (This is consistent with * EGL_MESA_image_dma_buf_export) + * + * \param usage A combination of PIPE_HANDLE_USAGE_* flags. 
*/
   boolean (*resource_get_handle)(struct pipe_screen *,
                                  struct pipe_resource *tex,
-                                 struct winsys_handle *handle);
+                                 struct winsys_handle *handle,
+                                 unsigned usage);

   void (*resource_destroy)(struct pipe_screen *,
diff --git a/src/gallium/include/pipe/p_shader_tokens.h b/src/gallium/include/pipe/p_shader_tokens.h
index 9d4a96a5a7e..7a34841088a 100644
--- a/src/gallium/include/pipe/p_shader_tokens.h
+++ b/src/gallium/include/pipe/p_shader_tokens.h
@@ -277,7 +277,8 @@ union tgsi_immediate_data
#define TGSI_PROPERTY_TES_POINT_MODE 14
#define TGSI_PROPERTY_NUM_CLIPDIST_ENABLED 15
#define TGSI_PROPERTY_NUM_CULLDIST_ENABLED 16
-#define TGSI_PROPERTY_COUNT 17
+#define TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL 17
+#define TGSI_PROPERTY_COUNT 18

struct tgsi_property {
   unsigned Type : 4;  /**< TGSI_TOKEN_TYPE_PROPERTY */
@@ -743,7 +744,9 @@ struct tgsi_dst_register
struct tgsi_instruction_memory
{
   unsigned Qualifier : 3;  /* TGSI_MEMORY_ */
-   unsigned Padding : 29;
+   unsigned Texture : 8;   /* only for images: TGSI_TEXTURE_ */
+   unsigned Format : 10;   /* only for images: PIPE_FORMAT_ */
+   unsigned Padding : 11;
};

#define TGSI_MEMBAR_SHADER_BUFFER (1 << 0)
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index c568c483940..2e720ce25f3 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -393,13 +393,14 @@ struct pipe_sampler_view

/**
- * A description of a writable buffer or texture that can be bound to a shader
+ * A description of a buffer or texture image that can be bound to a shader
 * stage.
 */
struct pipe_image_view
{
   struct pipe_resource *resource; /**< resource into which this is a view */

   enum pipe_format format;      /**< typed PIPE_FORMAT_x */
+   unsigned access;              /**< PIPE_IMAGE_ACCESS_x */

   union {
      struct {
diff --git a/src/gallium/include/pipe/p_video_codec.h b/src/gallium/include/pipe/p_video_codec.h
index 196d00bc546..b5575ab9afa 100644
--- a/src/gallium/include/pipe/p_video_codec.h
+++ b/src/gallium/include/pipe/p_video_codec.h
@@ -28,12 +28,12 @@
#ifndef PIPE_VIDEO_CONTEXT_H
#define PIPE_VIDEO_CONTEXT_H

+#include "pipe/p_video_state.h"
+
#ifdef __cplusplus
extern "C" {
#endif

-#include "pipe/p_video_state.h"
-
struct pipe_screen;
struct pipe_surface;
struct pipe_macroblock;
diff --git a/src/gallium/include/pipe/p_video_enums.h b/src/gallium/include/pipe/p_video_enums.h
index 9a20146f43e..aff7842a888 100644
--- a/src/gallium/include/pipe/p_video_enums.h
+++ b/src/gallium/include/pipe/p_video_enums.h
@@ -28,6 +28,10 @@
#ifndef PIPE_VIDEO_ENUMS_H
#define PIPE_VIDEO_ENUMS_H

+#ifdef __cplusplus
+extern "C" {
+#endif
+
enum pipe_video_format
{
   PIPE_VIDEO_FORMAT_UNKNOWN = 0,
@@ -87,4 +91,8 @@ enum pipe_video_entrypoint
   PIPE_VIDEO_ENTRYPOINT_ENCODE
};

+#if defined(__cplusplus)
+}
+#endif
+
#endif /* PIPE_VIDEO_ENUMS_H */
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 8396be91553..c12755b0420 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -55,7 +55,7 @@ kernel::launch(command_queue &q,
   const auto reduced_grid_size =
      map(divides(), grid_size, block_size);
   void *st = exec.bind(&q, grid_offset);
-   struct pipe_grid_info info;
+   struct pipe_grid_info info = {};

   // The handles are created during exec_context::bind(), so we need to make
   // sure to call exec_context::bind() before retrieving them.
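The p_screen.h hunk above threads a new usage argument through resource_from_handle and resource_get_handle, and the state-tracker hunks that follow update every call site to pass one of the new PIPE_HANDLE_USAGE_* flags. A minimal sketch of the resulting export pattern, assuming the post-patch interface; the helper name, header list, and bool return type are illustrative and not part of the patch:

#include <stdbool.h>
#include <string.h>
#include "pipe/p_defines.h"
#include "pipe/p_screen.h"
#include "state_tracker/drm_driver.h" /* struct winsys_handle */

/* Hypothetical helper: export a resource as a DMA-BUF fd under the
 * new interface.  Mirrors the call sites in the hunks below. */
static bool
export_dmabuf(struct pipe_screen *screen, struct pipe_resource *resource,
              int *fd_out)
{
   struct winsys_handle whandle;

   memset(&whandle, 0, sizeof(whandle));
   whandle.type = DRM_API_HANDLE_TYPE_FD;

   /* Declare how the importer is expected to touch the buffer. */
   if (!screen->resource_get_handle(screen, resource, &whandle,
                                    PIPE_HANDLE_USAGE_READ_WRITE))
      return false;

   *fd_out = whandle.handle; /* for TYPE_FD the handle is the fd */
   return true;
}

The flags are a promise about external access rather than a behavioral switch: a driver may ignore them, or use PIPE_HANDLE_USAGE_EXPLICIT_FLUSH to require pipe_context::flush_resource before the handle is consumed, which is the combination the dri2 and nine call sites below pass for scanout-style sharing.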
diff --git a/src/gallium/state_trackers/dri/dri2.c b/src/gallium/state_trackers/dri/dri2.c index a11a6cbbb0c..7f7fbc47e6d 100644 --- a/src/gallium/state_trackers/dri/dri2.c +++ b/src/gallium/state_trackers/dri/dri2.c @@ -354,7 +354,8 @@ dri2_allocate_buffer(__DRIscreen *sPriv, whandle.type = DRM_API_HANDLE_TYPE_KMS; screen->base.screen->resource_get_handle(screen->base.screen, - buffer->resource, &whandle); + buffer->resource, &whandle, + PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ); buffer->base.attachment = attachment; buffer->base.name = whandle.handle; @@ -539,7 +540,8 @@ dri2_allocate_textures(struct dri_context *ctx, whandle.type = DRM_API_HANDLE_TYPE_KMS; drawable->textures[statt] = screen->base.screen->resource_from_handle(screen->base.screen, - &templ, &whandle); + &templ, &whandle, + PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ); assert(drawable->textures[statt]); } } @@ -756,7 +758,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, whandle->stride = pitch * util_format_get_blocksize(pf); img->texture = screen->base.screen->resource_from_handle(screen->base.screen, - &templ, whandle); + &templ, whandle, PIPE_HANDLE_USAGE_READ_WRITE); if (!img->texture) { FREE(img); return NULL; @@ -765,6 +767,7 @@ dri2_create_image_from_winsys(__DRIscreen *_screen, img->level = 0; img->layer = 0; img->dri_format = format; + img->use = 0; img->loader_private = loaderPrivate; return img; @@ -884,6 +887,7 @@ dri2_create_image(__DRIscreen *_screen, img->layer = 0; img->dri_format = format; img->dri_components = 0; + img->use = use; img->loader_private = loaderPrivate; return img; @@ -893,31 +897,38 @@ static GLboolean dri2_query_image(__DRIimage *image, int attrib, int *value) { struct winsys_handle whandle; + unsigned usage; + + if (image->use & __DRI_IMAGE_USE_BACKBUFFER) + usage = PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | PIPE_HANDLE_USAGE_READ; + else + usage = PIPE_HANDLE_USAGE_READ_WRITE; + memset(&whandle, 0, sizeof(whandle)); switch (attrib) { case __DRI_IMAGE_ATTRIB_STRIDE: whandle.type = DRM_API_HANDLE_TYPE_KMS; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.stride; return GL_TRUE; case __DRI_IMAGE_ATTRIB_HANDLE: whandle.type = DRM_API_HANDLE_TYPE_KMS; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.handle; return GL_TRUE; case __DRI_IMAGE_ATTRIB_NAME: whandle.type = DRM_API_HANDLE_TYPE_SHARED; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.handle; return GL_TRUE; case __DRI_IMAGE_ATTRIB_FD: whandle.type= DRM_API_HANDLE_TYPE_FD; image->texture->screen->resource_get_handle(image->texture->screen, - image->texture, &whandle); + image->texture, &whandle, usage); *value = whandle.handle; return GL_TRUE; case __DRI_IMAGE_ATTRIB_FORMAT: diff --git a/src/gallium/state_trackers/dri/dri_screen.h b/src/gallium/state_trackers/dri/dri_screen.h index 45459906588..dc4692a1c6b 100644 --- a/src/gallium/state_trackers/dri/dri_screen.h +++ b/src/gallium/state_trackers/dri/dri_screen.h @@ -109,6 +109,7 @@ struct __DRIimageRec { unsigned layer; uint32_t dri_format; uint32_t dri_components; + unsigned use; void *loader_private; diff --git a/src/gallium/state_trackers/glx/xlib/glx_api.c b/src/gallium/state_trackers/glx/xlib/glx_api.c index 0456d44104e..1c541b76db5 100644 --- 
a/src/gallium/state_trackers/glx/xlib/glx_api.c
+++ b/src/gallium/state_trackers/glx/xlib/glx_api.c
@@ -615,6 +615,7 @@ close_display_callback(Display *dpy, XExtCodes *codes)
{
   xmesa_destroy_buffers_on_display(dpy);
   destroy_visuals_on_display(dpy);
+   xmesa_close_display(dpy);
   return 0;
}
diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.c b/src/gallium/state_trackers/glx/xlib/xm_api.c
index 2f5e1f5f1a8..5799cce033c 100644
--- a/src/gallium/state_trackers/glx/xlib/xm_api.c
+++ b/src/gallium/state_trackers/glx/xlib/xm_api.c
@@ -110,14 +110,6 @@ void xmesa_set_driver( const struct xm_driver *templ )
}

-/*
- * XXX replace this with a linked list, or better yet, try to attach the
- * gallium/mesa extra bits to the X Display object with XAddExtension().
- */
-#define MAX_DISPLAYS 10
-static struct xmesa_display Displays[MAX_DISPLAYS];
-static int NumDisplays = 0;
-
static int
xmesa_get_param(struct st_manager *smapi,
                enum st_manager_param param)
@@ -130,61 +122,145 @@ xmesa_get_param(struct st_manager *smapi,
   }
}

+/* linked list of XMesaDisplay hooks per display */
+typedef struct _XMesaExtDisplayInfo {
+   struct _XMesaExtDisplayInfo *next;
+   Display *display;
+   struct xmesa_display mesaDisplay;
+} XMesaExtDisplayInfo;
+
+typedef struct _XMesaExtInfo {
+   XMesaExtDisplayInfo *head;
+   int ndisplays;
+} XMesaExtInfo;
+
+static XMesaExtInfo MesaExtInfo;
+
+/* hook to delete XMesaDisplay on XDestroyDisplay */
+extern void
+xmesa_close_display(Display *display)
+{
+   XMesaExtDisplayInfo *info, *prev;
+
+   assert(MesaExtInfo.ndisplays > 0);
+   assert(MesaExtInfo.head);
+
+   _XLockMutex(_Xglobal_lock);
+   /* first find the display, remembering its predecessor */
+   prev = NULL;
+   for (info = MesaExtInfo.head; info; info = info->next) {
+      if (info->display == display)
+         break;
+      prev = info;
+   }
+
+   if (info == NULL) {
+      /* no display found */
+      _XUnlockMutex(_Xglobal_lock);
+      return;
+   }
+
+   /* remove display entry from list; prev is NULL for the list head */
+   if (prev) {
+      prev->next = info->next;
+   } else {
+      MesaExtInfo.head = info->next;
+   }
+   MesaExtInfo.ndisplays--;
+
+   _XUnlockMutex(_Xglobal_lock);
+
+   /* don't forget to clean up mesaDisplay */
+   XMesaDisplay xmdpy = &info->mesaDisplay;
+
+   /**
+    * XXX: Don't destroy the screens here, since there may still
+    * be some dangling screen pointers that are used after this point
+    * if (xmdpy->screen) {
+    *    xmdpy->screen->destroy(xmdpy->screen);
+    * }
+    */
+   free(xmdpy->smapi);
+
+   XFree((char *) info);
+}
+
static XMesaDisplay
xmesa_init_display( Display *display )
{
   pipe_static_mutex(init_mutex);
   XMesaDisplay xmdpy;
-   int i;
+   XMesaExtDisplayInfo *info;
+
+   if (display == NULL) {
+      return NULL;
+   }

   pipe_mutex_lock(init_mutex);

-   /* Look for XMesaDisplay which corresponds to 'display' */
-   for (i = 0; i < NumDisplays; i++) {
-      if (Displays[i].display == display) {
+   /* Look for XMesaDisplay which corresponds to this display */
+   info = MesaExtInfo.head;
+   while (info) {
+      if (info->display == display) {
         /* Found it */
         pipe_mutex_unlock(init_mutex);
-         return &Displays[i];
+         return &info->mesaDisplay;
      }
+      info = info->next;
   }

-   /* Create new XMesaDisplay */
+   /* Not found.
Create new XMesaDisplay */ + /* first allocate X-related resources and hook destroy callback */ - assert(NumDisplays < MAX_DISPLAYS); - xmdpy = &Displays[NumDisplays]; - NumDisplays++; - - if (!xmdpy->display && display) { - xmdpy->display = display; - xmdpy->screen = driver.create_pipe_screen(display); - xmdpy->smapi = CALLOC_STRUCT(st_manager); - if (xmdpy->smapi) { - xmdpy->smapi->screen = xmdpy->screen; - xmdpy->smapi->get_param = xmesa_get_param; - } + /* allocate mesa display info */ + info = (XMesaExtDisplayInfo *) Xmalloc(sizeof(XMesaExtDisplayInfo)); + if (info == NULL) { + pipe_mutex_unlock(init_mutex); + return NULL; + } + info->display = display; + xmdpy = &info->mesaDisplay; /* to be filled out below */ + + /* chain to the list of displays */ + _XLockMutex(_Xglobal_lock); + info->next = MesaExtInfo.head; + MesaExtInfo.head = info; + MesaExtInfo.ndisplays++; + _XUnlockMutex(_Xglobal_lock); + + /* now create the new XMesaDisplay info */ + assert(display); + + xmdpy->display = display; + xmdpy->screen = driver.create_pipe_screen(display); + xmdpy->smapi = CALLOC_STRUCT(st_manager); + xmdpy->pipe = NULL; + if (xmdpy->smapi) { + xmdpy->smapi->screen = xmdpy->screen; + xmdpy->smapi->get_param = xmesa_get_param; + } - if (xmdpy->screen && xmdpy->smapi) { - pipe_mutex_init(xmdpy->mutex); + if (xmdpy->screen && xmdpy->smapi) { + pipe_mutex_init(xmdpy->mutex); + } + else { + if (xmdpy->screen) { + xmdpy->screen->destroy(xmdpy->screen); + xmdpy->screen = NULL; } - else { - if (xmdpy->screen) { - xmdpy->screen->destroy(xmdpy->screen); - xmdpy->screen = NULL; - } - free(xmdpy->smapi); - xmdpy->smapi = NULL; + free(xmdpy->smapi); + xmdpy->smapi = NULL; - xmdpy->display = NULL; - } + xmdpy->display = NULL; } - if (!xmdpy->display || xmdpy->display != display) - xmdpy = NULL; pipe_mutex_unlock(init_mutex); return xmdpy; } + /**********************************************************************/ /***** X Utility Functions *****/ /**********************************************************************/ diff --git a/src/gallium/state_trackers/glx/xlib/xm_api.h b/src/gallium/state_trackers/glx/xlib/xm_api.h index ffdffc0940f..ccf35a5eb5a 100644 --- a/src/gallium/state_trackers/glx/xlib/xm_api.h +++ b/src/gallium/state_trackers/glx/xlib/xm_api.h @@ -378,6 +378,9 @@ xmesa_check_buffer_size(XMesaBuffer b); extern void xmesa_destroy_buffers_on_display(Display *dpy); +extern void +xmesa_close_display(Display *dpy); + static inline GLuint xmesa_buffer_width(XMesaBuffer b) { diff --git a/src/gallium/state_trackers/nine/swapchain9.c b/src/gallium/state_trackers/nine/swapchain9.c index 1ab339c459c..e2fb4d4a1e3 100644 --- a/src/gallium/state_trackers/nine/swapchain9.c +++ b/src/gallium/state_trackers/nine/swapchain9.c @@ -87,7 +87,9 @@ D3DWindowBuffer_create(struct NineSwapChain9 *This, memset(&whandle, 0, sizeof(whandle)); whandle.type = DRM_API_HANDLE_TYPE_FD; - This->screen->resource_get_handle(This->screen, resource, &whandle); + This->screen->resource_get_handle(This->screen, resource, &whandle, + PIPE_HANDLE_USAGE_EXPLICIT_FLUSH | + PIPE_HANDLE_USAGE_READ); stride = whandle.stride; dmaBufFd = whandle.handle; ID3DPresent_NewD3DWindowBufferFromDmaBuf(This->present, diff --git a/src/gallium/state_trackers/va/buffer.c b/src/gallium/state_trackers/va/buffer.c index c2c24d693f2..2fd86612e9a 100644 --- a/src/gallium/state_trackers/va/buffer.c +++ b/src/gallium/state_trackers/va/buffer.c @@ -302,7 +302,8 @@ vlVaAcquireBufferHandle(VADriverContextP ctx, VABufferID buf_id, memset(&whandle, 0, sizeof(whandle)); 
whandle.type = DRM_API_HANDLE_TYPE_FD; - if (!screen->resource_get_handle(screen, buf->derived_surface.resource, &whandle)) + if (!screen->resource_get_handle(screen, buf->derived_surface.resource, + &whandle, PIPE_HANDLE_USAGE_READ_WRITE)) return VA_STATUS_ERROR_INVALID_BUFFER; buf_info->handle = (intptr_t)whandle.handle; diff --git a/src/gallium/state_trackers/va/surface.c b/src/gallium/state_trackers/va/surface.c index 84a94949c47..861dac260a9 100644 --- a/src/gallium/state_trackers/va/surface.c +++ b/src/gallium/state_trackers/va/surface.c @@ -470,7 +470,8 @@ suface_from_external_memory(VADriverContextP ctx, vlVaSurface *surface, whandle.handle = memory_attibute->buffers[index]; whandle.stride = memory_attibute->pitches[index]; - resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle); + resource = pscreen->resource_from_handle(pscreen, &res_templ, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE); if (!resource) return VA_STATUS_ERROR_ALLOCATION_FAILED; diff --git a/src/gallium/state_trackers/va/va_private.h b/src/gallium/state_trackers/va/va_private.h index 614fa98fef7..d91de442fa7 100644 --- a/src/gallium/state_trackers/va/va_private.h +++ b/src/gallium/state_trackers/va/va_private.h @@ -148,11 +148,12 @@ PipeToProfile(enum pipe_video_profile profile) return VAProfileH264High; case PIPE_VIDEO_PROFILE_HEVC_MAIN: return VAProfileHEVCMain; + case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: + return VAProfileHEVCMain10; case PIPE_VIDEO_PROFILE_MPEG4_AVC_EXTENDED: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH10: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH422: case PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH444: - case PIPE_VIDEO_PROFILE_HEVC_MAIN_10: case PIPE_VIDEO_PROFILE_HEVC_MAIN_12: case PIPE_VIDEO_PROFILE_HEVC_MAIN_STILL: case PIPE_VIDEO_PROFILE_HEVC_MAIN_444: @@ -190,6 +191,8 @@ ProfileToPipe(VAProfile profile) return PIPE_VIDEO_PROFILE_MPEG4_AVC_HIGH; case VAProfileHEVCMain: return PIPE_VIDEO_PROFILE_HEVC_MAIN; + case VAProfileHEVCMain10: + return PIPE_VIDEO_PROFILE_HEVC_MAIN_10; case VAProfileNone: return PIPE_VIDEO_PROFILE_UNKNOWN; default: diff --git a/src/gallium/state_trackers/xa/xa_tracker.c b/src/gallium/state_trackers/xa/xa_tracker.c index d57464b7d60..f09baed1d84 100644 --- a/src/gallium/state_trackers/xa/xa_tracker.c +++ b/src/gallium/state_trackers/xa/xa_tracker.c @@ -362,7 +362,8 @@ surface_create(struct xa_tracker *xa, template->bind |= PIPE_BIND_SCANOUT; if (whandle) - srf->tex = xa->screen->resource_from_handle(xa->screen, template, whandle); + srf->tex = xa->screen->resource_from_handle(xa->screen, template, whandle, + PIPE_HANDLE_USAGE_READ_WRITE); else srf->tex = xa->screen->resource_create(xa->screen, template); if (!srf->tex) @@ -548,7 +549,8 @@ xa_surface_handle(struct xa_surface *srf, memset(&whandle, 0, sizeof(whandle)); whandle.type = handle_type(type); - res = screen->resource_get_handle(screen, srf->tex, &whandle); + res = screen->resource_get_handle(screen, srf->tex, &whandle, + PIPE_HANDLE_USAGE_READ_WRITE); if (!res) return -XA_ERR_INVAL; diff --git a/src/gallium/targets/libgl-xlib/Makefile.am b/src/gallium/targets/libgl-xlib/Makefile.am index d99caae3cb0..3f1382e2848 100644 --- a/src/gallium/targets/libgl-xlib/Makefile.am +++ b/src/gallium/targets/libgl-xlib/Makefile.am @@ -81,6 +81,11 @@ AM_CPPFLAGS += -DGALLIUM_LLVMPIPE lib@GL_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) endif +if HAVE_GALLIUM_SWR +lib@GL_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) +AM_CPPFLAGS += -DGALLIUM_SWR +endif + EXTRA_lib@GL_LIB@_la_DEPENDENCIES = libgl-xlib.sym 
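The HAVE_GALLIUM_SWR conditional above links libmesaswr into the xlib libGL and defines GALLIUM_SWR, and the osmesa hunk that follows wires the same driver into OSMesa. As a rough sketch of how a target typically consumes such a define at screen-creation time -- the entry-point names swr_create_screen(), llvmpipe_create_screen() and softpipe_create_screen() are assumptions for illustration, not code from this commit:

#include "pipe/p_screen.h"

struct sw_winsys;

/* Assumed rasterizer entry points, declared here only to keep the
 * sketch self-contained. */
struct pipe_screen *swr_create_screen(struct sw_winsys *ws);
struct pipe_screen *llvmpipe_create_screen(struct sw_winsys *ws);
struct pipe_screen *softpipe_create_screen(struct sw_winsys *ws);

static struct pipe_screen *
create_sw_screen(struct sw_winsys *ws)
{
#if defined(GALLIUM_SWR)
   return swr_create_screen(ws);      /* new AVX/AVX2 software rasterizer */
#elif defined(GALLIUM_LLVMPIPE)
   return llvmpipe_create_screen(ws); /* LLVM-JIT software rasterizer */
#else
   return softpipe_create_screen(ws); /* reference software rasterizer */
#endif
}

Since -DGALLIUM_SWR is only emitted when the conditional fires, builds configured without the swr driver compile exactly as before.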
EXTRA_DIST = SConscript libgl-xlib.sym diff --git a/src/gallium/targets/osmesa/Makefile.am b/src/gallium/targets/osmesa/Makefile.am index 38e515f8252..5d394866946 100644 --- a/src/gallium/targets/osmesa/Makefile.am +++ b/src/gallium/targets/osmesa/Makefile.am @@ -74,6 +74,12 @@ lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/llvmpipe/libllvmpipe.la $(LLVM_LIBS) endif +if HAVE_GALLIUM_SWR +AM_CPPFLAGS += -DGALLIUM_SWR +lib@OSMESA_LIB@_la_LDFLAGS += $(LLVM_LDFLAGS) +lib@OSMESA_LIB@_la_LIBADD += $(top_builddir)/src/gallium/drivers/swr/libmesaswr.la $(LLVM_LIBS) +endif + EXTRA_lib@OSMESA_LIB@_la_DEPENDENCIES = osmesa.sym EXTRA_DIST = \ osmesa.sym \ diff --git a/src/gallium/tests/trivial/compute.c b/src/gallium/tests/trivial/compute.c index 288cf2ad629..5d5e0b0b8c3 100644 --- a/src/gallium/tests/trivial/compute.c +++ b/src/gallium/tests/trivial/compute.c @@ -421,7 +421,7 @@ static void destroy_globals(struct context *ctx) static void launch_grid(struct context *ctx, const uint *block_layout, const uint *grid_layout, uint32_t pc, - const void *input) + void *input) { struct pipe_context *pipe = ctx->pipe; struct pipe_grid_info info; diff --git a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h index 03fbf2bd0ee..ead603378cf 100644 --- a/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h +++ b/src/gallium/winsys/amdgpu/drm/addrlib/addrinterface.h @@ -33,13 +33,13 @@ #ifndef __ADDR_INTERFACE_H__ #define __ADDR_INTERFACE_H__ +#include "addrtypes.h" + #if defined(__cplusplus) extern "C" { #endif -#include "addrtypes.h" - #define ADDRLIB_VERSION_MAJOR 5 #define ADDRLIB_VERSION_MINOR 25 #define ADDRLIB_VERSION ((ADDRLIB_VERSION_MAJOR << 16) | ADDRLIB_VERSION_MINOR) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 59a801b1426..b670f263329 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -390,14 +390,8 @@ static unsigned eg_tile_split_rev(unsigned eg_tile_split) } } -static void amdgpu_bo_get_tiling(struct pb_buffer *_buf, - enum radeon_bo_layout *microtiled, - enum radeon_bo_layout *macrotiled, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout) +static void amdgpu_buffer_get_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_info info = {0}; @@ -410,61 +404,54 @@ static void amdgpu_bo_get_tiling(struct pb_buffer *_buf, tiling_flags = info.metadata.tiling_info; - *microtiled = RADEON_LAYOUT_LINEAR; - *macrotiled = RADEON_LAYOUT_LINEAR; + md->microtile = RADEON_LAYOUT_LINEAR; + md->macrotile = RADEON_LAYOUT_LINEAR; if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 4) /* 2D_TILED_THIN1 */ - *macrotiled = RADEON_LAYOUT_TILED; + md->macrotile = RADEON_LAYOUT_TILED; else if (AMDGPU_TILING_GET(tiling_flags, ARRAY_MODE) == 2) /* 1D_TILED_THIN1 */ - *microtiled = RADEON_LAYOUT_TILED; + md->microtile = RADEON_LAYOUT_TILED; - if (bankw && tile_split && mtilea && tile_split) { - *bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); - *bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); - *tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); - *mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); - } - if (scanout) - *scanout = AMDGPU_TILING_GET(tiling_flags, 
MICRO_TILE_MODE) == 0; /* DISPLAY */ + md->bankw = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_WIDTH); + md->bankh = 1 << AMDGPU_TILING_GET(tiling_flags, BANK_HEIGHT); + md->tile_split = eg_tile_split(AMDGPU_TILING_GET(tiling_flags, TILE_SPLIT)); + md->mtilea = 1 << AMDGPU_TILING_GET(tiling_flags, MACRO_TILE_ASPECT); + md->scanout = AMDGPU_TILING_GET(tiling_flags, MICRO_TILE_MODE) == 0; /* DISPLAY */ + + md->size_metadata = info.metadata.size_metadata; + memcpy(md->metadata, info.metadata.umd_metadata, sizeof(md->metadata)); } -static void amdgpu_bo_set_tiling(struct pb_buffer *_buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtiled, - enum radeon_bo_layout macrotiled, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - uint32_t pitch, - bool scanout) +static void amdgpu_buffer_set_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct amdgpu_winsys_bo *bo = amdgpu_winsys_bo(_buf); struct amdgpu_bo_metadata metadata = {0}; uint32_t tiling_flags = 0; - if (macrotiled == RADEON_LAYOUT_TILED) + if (md->macrotile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 4); /* 2D_TILED_THIN1 */ - else if (microtiled == RADEON_LAYOUT_TILED) + else if (md->microtile == RADEON_LAYOUT_TILED) tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 2); /* 1D_TILED_THIN1 */ else tiling_flags |= AMDGPU_TILING_SET(ARRAY_MODE, 1); /* LINEAR_ALIGNED */ - tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, pipe_config); - tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(bankw)); - tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(bankh)); - if (tile_split) - tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(tile_split)); - tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(mtilea)); - tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(num_banks)-1); + tiling_flags |= AMDGPU_TILING_SET(PIPE_CONFIG, md->pipe_config); + tiling_flags |= AMDGPU_TILING_SET(BANK_WIDTH, util_logbase2(md->bankw)); + tiling_flags |= AMDGPU_TILING_SET(BANK_HEIGHT, util_logbase2(md->bankh)); + if (md->tile_split) + tiling_flags |= AMDGPU_TILING_SET(TILE_SPLIT, eg_tile_split_rev(md->tile_split)); + tiling_flags |= AMDGPU_TILING_SET(MACRO_TILE_ASPECT, util_logbase2(md->mtilea)); + tiling_flags |= AMDGPU_TILING_SET(NUM_BANKS, util_logbase2(md->num_banks)-1); - if (scanout) + if (md->scanout) tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 0); /* DISPLAY_MICRO_TILING */ else tiling_flags |= AMDGPU_TILING_SET(MICRO_TILE_MODE, 1); /* THIN_MICRO_TILING */ metadata.tiling_info = tiling_flags; + metadata.size_metadata = md->size_metadata; + memcpy(metadata.umd_metadata, md->metadata, sizeof(md->metadata)); amdgpu_bo_set_metadata(bo->bo, &metadata); } @@ -720,8 +707,8 @@ static uint64_t amdgpu_bo_get_va(struct pb_buffer *buf) void amdgpu_bo_init_functions(struct amdgpu_winsys *ws) { - ws->base.buffer_set_tiling = amdgpu_bo_set_tiling; - ws->base.buffer_get_tiling = amdgpu_bo_get_tiling; + ws->base.buffer_set_metadata = amdgpu_buffer_set_metadata; + ws->base.buffer_get_metadata = amdgpu_buffer_get_metadata; ws->base.buffer_map = amdgpu_bo_map; ws->base.buffer_unmap = amdgpu_bo_unmap; ws->base.buffer_wait = amdgpu_bo_wait; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h index 6ad3cddf7cb..a2fb44a4b0e 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_cs.h 
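The amdgpu hunks above (and the matching radeon ones below) collapse the nine-argument buffer_get/set_tiling hooks into a single radeon_bo_metadata in/out struct, which is also what makes room for the new size_metadata/umd metadata fields. A caller-side sketch under an assumed header path and the vtable layout shown in the diff; demo_mark_scanout() is a made-up helper:

#include <string.h>
#include "radeon_winsys.h"   /* struct radeon_winsys, radeon_bo_metadata (assumed path) */

static void
demo_mark_scanout(struct radeon_winsys *ws, struct pb_buffer *buf)
{
   struct radeon_bo_metadata md;
   memset(&md, 0, sizeof(md));

   /* One call fills every field that previously needed its own out-pointer. */
   ws->buffer_get_metadata(buf, &md);

   if (md.macrotile == RADEON_LAYOUT_LINEAR)
      md.scanout = true;               /* touch only the field of interest */

   /* Write the complete tiling state back in one call. */
   ws->buffer_set_metadata(buf, &md);
}

Growing the struct later -- as the size_metadata/metadata fields already do here -- no longer means touching every implementation's signature.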
@@ -76,7 +76,7 @@ struct amdgpu_cs { uint8_t *flags; struct amdgpu_cs_buffer *buffers; - int buffer_indices_hashlist[512]; + int buffer_indices_hashlist[4096]; uint64_t used_vram; uint64_t used_gart; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index fc7562d8f57..938b9c244b2 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -93,13 +93,26 @@ static unsigned cik_get_num_tile_pipes(struct amdgpu_gpu_info *info) } /* Helper function to do the ioctls needed for setup and init. */ -static boolean do_winsys_init(struct amdgpu_winsys *ws) +static boolean do_winsys_init(struct amdgpu_winsys *ws, int fd) { struct amdgpu_buffer_size_alignments alignment_info = {}; struct amdgpu_heap_info vram, gtt; struct drm_amdgpu_info_hw_ip dma = {}, uvd = {}, vce = {}; uint32_t vce_version = 0, vce_feature = 0; int r, i, j; + drmDevicePtr devinfo; + + /* Get PCI info. */ + r = drmGetDevice(fd, &devinfo); + if (r) { + fprintf(stderr, "amdgpu: drmGetDevice failed.\n"); + goto fail; + } + ws->info.pci_domain = devinfo->businfo.pci->domain; + ws->info.pci_bus = devinfo->businfo.pci->bus; + ws->info.pci_dev = devinfo->businfo.pci->dev; + ws->info.pci_func = devinfo->businfo.pci->func; + drmFreeDevice(&devinfo); /* Query hardware and driver information. */ r = amdgpu_query_gpu_info(ws->dev, &ws->amdinfo); @@ -437,7 +450,7 @@ amdgpu_winsys_create(int fd, radeon_screen_create_t screen_create) ws->info.drm_major = drm_major; ws->info.drm_minor = drm_minor; - if (!do_winsys_init(ws)) + if (!do_winsys_init(ws, fd)) goto fail; /* Create managers. */ diff --git a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c index 7e9ed0ca0fe..978df52447e 100644 --- a/src/gallium/winsys/radeon/drm/radeon_drm_bo.c +++ b/src/gallium/winsys/radeon/drm/radeon_drm_bo.c @@ -636,14 +636,8 @@ static unsigned eg_tile_split_rev(unsigned eg_tile_split) } } -static void radeon_bo_get_tiling(struct pb_buffer *_buf, - enum radeon_bo_layout *microtiled, - enum radeon_bo_layout *macrotiled, - unsigned *bankw, unsigned *bankh, - unsigned *tile_split, - unsigned *stencil_tile_split, - unsigned *mtilea, - bool *scanout) +static void radeon_bo_get_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct radeon_bo *bo = radeon_bo(_buf); struct drm_radeon_gem_set_tiling args; @@ -657,81 +651,63 @@ static void radeon_bo_get_tiling(struct pb_buffer *_buf, &args, sizeof(args)); - *microtiled = RADEON_LAYOUT_LINEAR; - *macrotiled = RADEON_LAYOUT_LINEAR; + md->microtile = RADEON_LAYOUT_LINEAR; + md->macrotile = RADEON_LAYOUT_LINEAR; if (args.tiling_flags & RADEON_TILING_MICRO) - *microtiled = RADEON_LAYOUT_TILED; + md->microtile = RADEON_LAYOUT_TILED; else if (args.tiling_flags & RADEON_TILING_MICRO_SQUARE) - *microtiled = RADEON_LAYOUT_SQUARETILED; + md->microtile = RADEON_LAYOUT_SQUARETILED; if (args.tiling_flags & RADEON_TILING_MACRO) - *macrotiled = RADEON_LAYOUT_TILED; - if (bankw && tile_split && stencil_tile_split && mtilea && tile_split) { - *bankw = (args.tiling_flags >> RADEON_TILING_EG_BANKW_SHIFT) & RADEON_TILING_EG_BANKW_MASK; - *bankh = (args.tiling_flags >> RADEON_TILING_EG_BANKH_SHIFT) & RADEON_TILING_EG_BANKH_MASK; - *tile_split = (args.tiling_flags >> RADEON_TILING_EG_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_TILE_SPLIT_MASK; - *stencil_tile_split = (args.tiling_flags >> RADEON_TILING_EG_STENCIL_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_STENCIL_TILE_SPLIT_MASK; - *mtilea = 
(args.tiling_flags >> RADEON_TILING_EG_MACRO_TILE_ASPECT_SHIFT) & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK; - *tile_split = eg_tile_split(*tile_split); - } - if (scanout) - *scanout = bo->rws->gen >= DRV_SI && !(args.tiling_flags & RADEON_TILING_R600_NO_SCANOUT); + md->macrotile = RADEON_LAYOUT_TILED; + + md->bankw = (args.tiling_flags >> RADEON_TILING_EG_BANKW_SHIFT) & RADEON_TILING_EG_BANKW_MASK; + md->bankh = (args.tiling_flags >> RADEON_TILING_EG_BANKH_SHIFT) & RADEON_TILING_EG_BANKH_MASK; + md->tile_split = (args.tiling_flags >> RADEON_TILING_EG_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_TILE_SPLIT_MASK; + md->stencil_tile_split = (args.tiling_flags >> RADEON_TILING_EG_STENCIL_TILE_SPLIT_SHIFT) & RADEON_TILING_EG_STENCIL_TILE_SPLIT_MASK; + md->mtilea = (args.tiling_flags >> RADEON_TILING_EG_MACRO_TILE_ASPECT_SHIFT) & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK; + md->tile_split = eg_tile_split(md->tile_split); + md->scanout = bo->rws->gen >= DRV_SI && !(args.tiling_flags & RADEON_TILING_R600_NO_SCANOUT); } -static void radeon_bo_set_tiling(struct pb_buffer *_buf, - struct radeon_winsys_cs *rcs, - enum radeon_bo_layout microtiled, - enum radeon_bo_layout macrotiled, - unsigned pipe_config, - unsigned bankw, unsigned bankh, - unsigned tile_split, - unsigned stencil_tile_split, - unsigned mtilea, unsigned num_banks, - uint32_t pitch, - bool scanout) +static void radeon_bo_set_metadata(struct pb_buffer *_buf, + struct radeon_bo_metadata *md) { struct radeon_bo *bo = radeon_bo(_buf); - struct radeon_drm_cs *cs = radeon_drm_cs(rcs); struct drm_radeon_gem_set_tiling args; memset(&args, 0, sizeof(args)); - /* Tiling determines how DRM treats the buffer data. - * We must flush CS when changing it if the buffer is referenced. */ - if (cs && radeon_bo_is_referenced_by_cs(cs, bo)) { - cs->flush_cs(cs->flush_data, 0, NULL); - } - os_wait_until_zero(&bo->num_active_ioctls, PIPE_TIMEOUT_INFINITE); - if (microtiled == RADEON_LAYOUT_TILED) + if (md->microtile == RADEON_LAYOUT_TILED) args.tiling_flags |= RADEON_TILING_MICRO; - else if (microtiled == RADEON_LAYOUT_SQUARETILED) + else if (md->microtile == RADEON_LAYOUT_SQUARETILED) args.tiling_flags |= RADEON_TILING_MICRO_SQUARE; - if (macrotiled == RADEON_LAYOUT_TILED) + if (md->macrotile == RADEON_LAYOUT_TILED) args.tiling_flags |= RADEON_TILING_MACRO; - args.tiling_flags |= (bankw & RADEON_TILING_EG_BANKW_MASK) << + args.tiling_flags |= (md->bankw & RADEON_TILING_EG_BANKW_MASK) << RADEON_TILING_EG_BANKW_SHIFT; - args.tiling_flags |= (bankh & RADEON_TILING_EG_BANKH_MASK) << + args.tiling_flags |= (md->bankh & RADEON_TILING_EG_BANKH_MASK) << RADEON_TILING_EG_BANKH_SHIFT; - if (tile_split) { - args.tiling_flags |= (eg_tile_split_rev(tile_split) & + if (md->tile_split) { + args.tiling_flags |= (eg_tile_split_rev(md->tile_split) & RADEON_TILING_EG_TILE_SPLIT_MASK) << RADEON_TILING_EG_TILE_SPLIT_SHIFT; } - args.tiling_flags |= (stencil_tile_split & + args.tiling_flags |= (md->stencil_tile_split & RADEON_TILING_EG_STENCIL_TILE_SPLIT_MASK) << RADEON_TILING_EG_STENCIL_TILE_SPLIT_SHIFT; - args.tiling_flags |= (mtilea & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK) << + args.tiling_flags |= (md->mtilea & RADEON_TILING_EG_MACRO_TILE_ASPECT_MASK) << RADEON_TILING_EG_MACRO_TILE_ASPECT_SHIFT; - if (bo->rws->gen >= DRV_SI && !scanout) + if (bo->rws->gen >= DRV_SI && !md->scanout) args.tiling_flags |= RADEON_TILING_R600_NO_SCANOUT; args.handle = bo->handle; - args.pitch = pitch; + args.pitch = md->stride; drmCommandWriteRead(bo->rws->fd, DRM_RADEON_GEM_SET_TILING, @@ -1064,8 +1040,8 
@@ static uint64_t radeon_winsys_bo_va(struct pb_buffer *buf) void radeon_drm_bo_init_functions(struct radeon_drm_winsys *ws) { - ws->base.buffer_set_tiling = radeon_bo_set_tiling; - ws->base.buffer_get_tiling = radeon_bo_get_tiling; + ws->base.buffer_set_metadata = radeon_bo_set_metadata; + ws->base.buffer_get_metadata = radeon_bo_get_metadata; ws->base.buffer_map = radeon_bo_map; ws->base.buffer_unmap = radeon_bo_unmap; ws->base.buffer_wait = radeon_bo_wait; diff --git a/src/gallium/winsys/svga/drm/vmw_context.c b/src/gallium/winsys/svga/drm/vmw_context.c index dae121e4053..8d23bff5d74 100644 --- a/src/gallium/winsys/svga/drm/vmw_context.c +++ b/src/gallium/winsys/svga/drm/vmw_context.c @@ -315,6 +315,13 @@ vmw_swc_reserve(struct svga_winsys_context *swc, return vswc->command.buffer + vswc->command.used; } +static unsigned +vmw_swc_get_command_buffer_size(struct svga_winsys_context *swc) +{ + const struct vmw_svga_winsys_context *vswc = vmw_svga_winsys_context(swc); + return vswc->command.used; +} + static void vmw_swc_context_relocation(struct svga_winsys_context *swc, uint32 *cid) @@ -761,6 +768,7 @@ vmw_svga_winsys_context_create(struct svga_winsys_screen *sws) vswc->base.destroy = vmw_swc_destroy; vswc->base.reserve = vmw_swc_reserve; + vswc->base.get_command_buffer_size = vmw_swc_get_command_buffer_size; vswc->base.surface_relocation = vmw_swc_surface_relocation; vswc->base.region_relocation = vmw_swc_region_relocation; vswc->base.mob_relocation = vmw_swc_mob_relocation; diff --git a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c index c86d95a14fe..7fc93e74812 100644 --- a/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c +++ b/src/gallium/winsys/svga/drm/vmw_screen_ioctl.c @@ -52,6 +52,7 @@ #include <unistd.h> #define VMW_MAX_DEFAULT_TEXTURE_SIZE (128 * 1024 * 1024) +#define VMW_FENCE_TIMEOUT_SECONDS 60 struct vmw_region { @@ -721,7 +722,7 @@ vmw_ioctl_fence_finish(struct vmw_winsys_screen *vws, memset(&arg, 0, sizeof(arg)); arg.handle = handle; - arg.timeout_us = 10*1000000; + arg.timeout_us = VMW_FENCE_TIMEOUT_SECONDS*1000000; arg.lazy = 0; arg.flags = vflags; diff --git a/src/gallium/winsys/svga/drm/vmw_surface.c b/src/gallium/winsys/svga/drm/vmw_surface.c index 6c0ad3bbf19..a438b1a7c5b 100644 --- a/src/gallium/winsys/svga/drm/vmw_surface.c +++ b/src/gallium/winsys/svga/drm/vmw_surface.c @@ -170,6 +170,8 @@ vmw_svga_winsys_surface_unmap(struct svga_winsys_context *swc, *rebind = vsrf->rebind; vsrf->rebind = FALSE; vmw_svga_winsys_buffer_unmap(&vsrf->screen->base, vsrf->buf); + } else { + *rebind = FALSE; } pipe_mutex_unlock(vsrf->mutex); } diff --git a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c index 4d87a580cb1..e130cd256e9 100644 --- a/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c +++ b/src/gallium/winsys/sw/wrapper/wrapper_sw_winsys.c @@ -185,7 +185,8 @@ wsw_dt_from_handle(struct sw_winsys *ws, struct wrapper_sw_winsys *wsw = wrapper_sw_winsys(ws); struct pipe_resource *tex; - tex = wsw->screen->resource_from_handle(wsw->screen, templ, whandle); + tex = wsw->screen->resource_from_handle(wsw->screen, templ, whandle, + PIPE_HANDLE_USAGE_READ_WRITE); if (!tex) return NULL; @@ -201,7 +202,8 @@ wsw_dt_get_handle(struct sw_winsys *ws, struct wrapper_sw_displaytarget *wdt = wrapper_sw_displaytarget(dt); struct pipe_resource *tex = wdt->tex; - return wsw->screen->resource_get_handle(wsw->screen, tex, whandle); + return wsw->screen->resource_get_handle(wsw->screen, tex, 
whandle, + PIPE_HANDLE_USAGE_READ_WRITE); } static void * diff --git a/src/gbm/main/gbm.h b/src/gbm/main/gbm.h index 8db2153e84b..63d9a9edfd6 100644 --- a/src/gbm/main/gbm.h +++ b/src/gbm/main/gbm.h @@ -28,16 +28,16 @@ #ifndef _GBM_H_ #define _GBM_H_ -#ifdef __cplusplus -extern "C" { -#endif - - #define __GBM__ 1 #include <stddef.h> #include <stdint.h> +#ifdef __cplusplus +extern "C" { +#endif + + /** * \file gbm.h * \brief Generic Buffer Manager diff --git a/src/loader/loader_dri3_helper.c b/src/loader/loader_dri3_helper.c index 62bfe845c08..896f2252f36 100644 --- a/src/loader/loader_dri3_helper.c +++ b/src/loader/loader_dri3_helper.c @@ -858,7 +858,8 @@ dri3_alloc_render_buffer(struct loader_dri3_drawable *draw, unsigned int format, width, height, format, __DRI_IMAGE_USE_SHARE | - __DRI_IMAGE_USE_SCANOUT, + __DRI_IMAGE_USE_SCANOUT | + __DRI_IMAGE_USE_BACKBUFFER, buffer); pixmap_buffer = buffer->image; @@ -878,7 +879,8 @@ dri3_alloc_render_buffer(struct loader_dri3_drawable *draw, unsigned int format, (draw->ext->image->createImage)(draw->dri_screen, width, height, format, __DRI_IMAGE_USE_SHARE | - __DRI_IMAGE_USE_LINEAR, + __DRI_IMAGE_USE_LINEAR | + __DRI_IMAGE_USE_BACKBUFFER, buffer); pixmap_buffer = buffer->linear_buffer; diff --git a/src/mapi/glapi/gen/ARB_direct_state_access.xml b/src/mapi/glapi/gen/ARB_direct_state_access.xml index 293d7164680..155b6f8d528 100644 --- a/src/mapi/glapi/gen/ARB_direct_state_access.xml +++ b/src/mapi/glapi/gen/ARB_direct_state_access.xml @@ -153,32 +153,32 @@ <!-- Framebuffer object functions --> - <function name="CreateFramebuffers" offset="assign"> + <function name="CreateFramebuffers"> <param name="n" type="GLsizei" /> <param name="framebuffers" type="GLuint *" /> </function> - <function name="NamedFramebufferRenderbuffer" offset="assign"> + <function name="NamedFramebufferRenderbuffer"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="renderbuffertarget" type="GLenum" /> <param name="renderbuffer" type="GLuint" /> </function> - <function name="NamedFramebufferParameteri" offset="assign"> + <function name="NamedFramebufferParameteri"> <param name="framebuffer" type="GLuint" /> <param name="pname" type="GLenum" /> <param name="param" type="GLint" /> </function> - <function name="NamedFramebufferTexture" offset="assign"> + <function name="NamedFramebufferTexture"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="texture" type="GLuint" /> <param name="level" type="GLint" /> </function> - <function name="NamedFramebufferTextureLayer" offset="assign"> + <function name="NamedFramebufferTextureLayer"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="texture" type="GLuint" /> @@ -186,29 +186,29 @@ <param name="layer" type="GLint" /> </function> - <function name="NamedFramebufferDrawBuffer" offset="assign"> + <function name="NamedFramebufferDrawBuffer"> <param name="framebuffer" type="GLuint" /> <param name="buf" type="GLenum" /> </function> - <function name="NamedFramebufferDrawBuffers" offset="assign"> + <function name="NamedFramebufferDrawBuffers"> <param name="framebuffer" type="GLuint" /> <param name="n" type="GLsizei" /> <param name="bufs" type="const GLenum *" /> </function> - <function name="NamedFramebufferReadBuffer" offset="assign"> + <function name="NamedFramebufferReadBuffer"> <param name="framebuffer" type="GLuint" /> <param name="buf" type="GLenum" /> </function> - <function 
name="InvalidateNamedFramebufferData" offset="assign"> + <function name="InvalidateNamedFramebufferData"> <param name="framebuffer" type="GLuint" /> <param name="numAttachments" type="GLsizei" /> <param name="attachments" type="const GLenum *" /> </function> - <function name="InvalidateNamedFramebufferSubData" offset="assign"> + <function name="InvalidateNamedFramebufferSubData"> <param name="framebuffer" type="GLuint" /> <param name="numAttachments" type="GLsizei" /> <param name="attachments" type="const GLenum *" /> @@ -218,35 +218,35 @@ <param name="height" type="GLsizei" /> </function> - <function name="ClearNamedFramebufferiv" offset="assign"> + <function name="ClearNamedFramebufferiv"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="drawbuffer" type="GLint" /> <param name="value" type="const GLint *" /> </function> - <function name="ClearNamedFramebufferuiv" offset="assign"> + <function name="ClearNamedFramebufferuiv"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="drawbuffer" type="GLint" /> <param name="value" type="const GLuint *" /> </function> - <function name="ClearNamedFramebufferfv" offset="assign"> + <function name="ClearNamedFramebufferfv"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="drawbuffer" type="GLint" /> <param name="value" type="const GLfloat *" /> </function> - <function name="ClearNamedFramebufferfi" offset="assign"> + <function name="ClearNamedFramebufferfi"> <param name="framebuffer" type="GLuint" /> <param name="buffer" type="GLenum" /> <param name="depth" type="GLfloat" /> <param name="stencil" type="GLint" /> </function> - <function name="BlitNamedFramebuffer" offset="assign"> + <function name="BlitNamedFramebuffer"> <param name="readFramebuffer" type="GLuint" /> <param name="drawFramebuffer" type="GLuint" /> <param name="srcX0" type="GLint" /> @@ -261,19 +261,19 @@ <param name="filter" type="GLenum" /> </function> - <function name="CheckNamedFramebufferStatus" offset="assign"> + <function name="CheckNamedFramebufferStatus"> <return type="GLenum" /> <param name="framebuffer" type="GLuint" /> <param name="target" type="GLenum" /> </function> - <function name="GetNamedFramebufferParameteriv" offset="assign"> + <function name="GetNamedFramebufferParameteriv"> <param name="framebuffer" type="GLuint" /> <param name="pname" type="GLenum" /> <param name="param" type="GLint *" /> </function> - <function name="GetNamedFramebufferAttachmentParameteriv" offset="assign"> + <function name="GetNamedFramebufferAttachmentParameteriv"> <param name="framebuffer" type="GLuint" /> <param name="attachment" type="GLenum" /> <param name="pname" type="GLenum" /> diff --git a/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml index 14e1c20b9d5..47e26abb28e 100644 --- a/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml +++ b/src/mapi/glapi/gen/ARB_get_texture_sub_image.xml @@ -7,7 +7,7 @@ <category name="GL_ARB_get_texture_sub_image" number="165"> - <function name="GetTextureSubImage" offset="assign"> + <function name="GetTextureSubImage"> <param name="texture" type="GLuint"/> <param name="level" type="GLint"/> <param name="xoffset" type="GLint"/> @@ -22,7 +22,7 @@ <param name="pixels" type="GLvoid *"/> </function> - <function name="GetCompressedTextureSubImage" offset="assign"> + <function name="GetCompressedTextureSubImage"> <param name="texture" type="GLuint"/> <param name="level" 
type="GLint"/> <param name="xoffset" type="GLint"/> diff --git a/src/mapi/glapi/gen/ARB_internalformat_query2.xml b/src/mapi/glapi/gen/ARB_internalformat_query2.xml new file mode 100644 index 00000000000..9b0f320fba7 --- /dev/null +++ b/src/mapi/glapi/gen/ARB_internalformat_query2.xml @@ -0,0 +1,119 @@ +<?xml version="1.0"?> +<!DOCTYPE OpenGLAPI SYSTEM "gl_API.dtd"> + +<OpenGLAPI> + +<category name="GL_ARB_internalformat_query2" number="131"> + <!-- Other existing enums are reused for this extension. --> + + <enum name="INTERNALFORMAT_SUPPORTED" value="0x826F"/> + <enum name="INTERNALFORMAT_PREFERRED" value="0x8270"/> + <enum name="INTERNALFORMAT_RED_SIZE" value="0x8271"/> + <enum name="INTERNALFORMAT_GREEN_SIZE" value="0x8272"/> + <enum name="INTERNALFORMAT_BLUE_SIZE" value="0x8273"/> + <enum name="INTERNALFORMAT_ALPHA_SIZE" value="0x8274"/> + <enum name="INTERNALFORMAT_DEPTH_SIZE" value="0x8275"/> + <enum name="INTERNALFORMAT_STENCIL_SIZE" value="0x8276"/> + <enum name="INTERNALFORMAT_SHARED_SIZE" value="0x8277"/> + <enum name="INTERNALFORMAT_RED_TYPE" value="0x8278"/> + <enum name="INTERNALFORMAT_GREEN_TYPE" value="0x8279"/> + <enum name="INTERNALFORMAT_BLUE_TYPE" value="0x827A"/> + <enum name="INTERNALFORMAT_ALPHA_TYPE" value="0x827B"/> + <enum name="INTERNALFORMAT_DEPTH_TYPE" value="0x827C"/> + <enum name="INTERNALFORMAT_STENCIL_TYPE" value="0x827D"/> + <enum name="MAX_WIDTH" value="0x827E"/> + <enum name="MAX_HEIGHT" value="0x827F"/> + <enum name="MAX_DEPTH" value="0x8280"/> + <enum name="MAX_LAYERS" value="0x8281"/> + <enum name="MAX_COMBINED_DIMENSIONS" value="0x8282"/> + <enum name="COLOR_COMPONENTS" value="0x8283"/> + <enum name="DEPTH_COMPONENTS" value="0x8284"/> + <enum name="STENCIL_COMPONENTS" value="0x8285"/> + <enum name="COLOR_RENDERABLE" value="0x8286"/> + <enum name="DEPTH_RENDERABLE" value="0x8287"/> + <enum name="STENCIL_RENDERABLE" value="0x8288"/> + <enum name="FRAMEBUFFER_RENDERABLE" value="0x8289"/> + <enum name="FRAMEBUFFER_RENDERABLE_LAYERED" value="0x828A"/> + <enum name="FRAMEBUFFER_BLEND" value="0x828B"/> + <enum name="READ_PIXELS" value="0x828C"/> + <enum name="READ_PIXELS_FORMAT" value="0x828D"/> + <enum name="READ_PIXELS_TYPE" value="0x828E"/> + <enum name="TEXTURE_IMAGE_FORMAT" value="0x828F"/> + <enum name="TEXTURE_IMAGE_TYPE" value="0x8290"/> + <enum name="GET_TEXTURE_IMAGE_FORMAT" value="0x8291"/> + <enum name="GET_TEXTURE_IMAGE_TYPE" value="0x8292"/> + <enum name="MIPMAP" value="0x8293"/> + <enum name="MANUAL_GENERATE_MIPMAP" value="0x8294"/> + <enum name="AUTO_GENERATE_MIPMAP" value="0x8295"/> + <enum name="COLOR_ENCODING" value="0x8296"/> + <enum name="SRGB_READ" value="0x8297"/> + <enum name="SRGB_WRITE" value="0x8298"/> + <enum name="SRGB_DECODE_ARB" value="0x8299"/> + <enum name="FILTER" value="0x829A"/> + <enum name="VERTEX_TEXTURE" value="0x829B"/> + <enum name="TESS_CONTROL_TEXTURE" value="0x829C"/> + <enum name="TESS_EVALUATION_TEXTURE" value="0x829D"/> + <enum name="GEOMETRY_TEXTURE" value="0x829E"/> + <enum name="FRAGMENT_TEXTURE" value="0x829F"/> + <enum name="COMPUTE_TEXTURE" value="0x82A0"/> + <enum name="TEXTURE_SHADOW" value="0x82A1"/> + <enum name="TEXTURE_GATHER" value="0x82A2"/> + <enum name="TEXTURE_GATHER_SHADOW" value="0x82A3"/> + <enum name="SHADER_IMAGE_LOAD" value="0x82A4"/> + <enum name="SHADER_IMAGE_STORE" value="0x82A5"/> + <enum name="SHADER_IMAGE_ATOMIC" value="0x82A6"/> + <enum name="IMAGE_TEXEL_SIZE" value="0x82A7"/> + <enum name="IMAGE_COMPATIBILITY_CLASS" value="0x82A8"/> + <enum name="IMAGE_PIXEL_FORMAT" 
value="0x82A9"/> + <enum name="IMAGE_PIXEL_TYPE" value="0x82AA"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST" value="0x82AC"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST" value="0x82AD"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE" value="0x82AE"/> + <enum name="SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE" value="0x82AF"/> + <enum name="TEXTURE_COMPRESSED" value="0x86A1"/> + <enum name="TEXTURE_COMPRESSED_BLOCK_WIDTH" value="0x82B1"/> + <enum name="TEXTURE_COMPRESSED_BLOCK_HEIGHT" value="0x82B2"/> + <enum name="TEXTURE_COMPRESSED_BLOCK_SIZE" value="0x82B3"/> + <enum name="CLEAR_BUFFER" value="0x82B4"/> + <enum name="TEXTURE_VIEW" value="0x82B5"/> + <enum name="VIEW_COMPATIBILITY_CLASS" value="0x82B6"/> + <enum name="FULL_SUPPORT" value="0x82B7"/> + <enum name="CAVEAT_SUPPORT" value="0x82B8"/> + <enum name="IMAGE_CLASS_4_X_32" value="0x82B9"/> + <enum name="IMAGE_CLASS_2_X_32" value="0x82BA"/> + <enum name="IMAGE_CLASS_1_X_32" value="0x82BB"/> + <enum name="IMAGE_CLASS_4_X_16" value="0x82BC"/> + <enum name="IMAGE_CLASS_2_X_16" value="0x82BD"/> + <enum name="IMAGE_CLASS_1_X_16" value="0x82BE"/> + <enum name="IMAGE_CLASS_4_X_8" value="0x82BF"/> + <enum name="IMAGE_CLASS_2_X_8" value="0x82C0"/> + <enum name="IMAGE_CLASS_1_X_8" value="0x82C1"/> + <enum name="IMAGE_CLASS_11_11_10" value="0x82C2"/> + <enum name="IMAGE_CLASS_10_10_10_2" value="0x82C3"/> + <enum name="VIEW_CLASS_128_BITS" value="0x82C4"/> + <enum name="VIEW_CLASS_96_BITS" value="0x82C5"/> + <enum name="VIEW_CLASS_64_BITS" value="0x82C6"/> + <enum name="VIEW_CLASS_48_BITS" value="0x82C7"/> + <enum name="VIEW_CLASS_32_BITS" value="0x82C8"/> + <enum name="VIEW_CLASS_24_BITS" value="0x82C9"/> + <enum name="VIEW_CLASS_16_BITS" value="0x82CA"/> + <enum name="VIEW_CLASS_8_BITS" value="0x82CB"/> + <enum name="VIEW_CLASS_S3TC_DXT1_RGB" value="0x82CC"/> + <enum name="VIEW_CLASS_S3TC_DXT1_RGBA" value="0x82CD"/> + <enum name="VIEW_CLASS_S3TC_DXT3_RGBA" value="0x82CE"/> + <enum name="VIEW_CLASS_S3TC_DXT5_RGBA" value="0x82CF"/> + <enum name="VIEW_CLASS_RGTC1_RED" value="0x82D0"/> + <enum name="VIEW_CLASS_RGTC2_RG" value="0x82D1"/> + <enum name="VIEW_CLASS_BPTC_UNORM" value="0x82D2"/> + <enum name="VIEW_CLASS_BPTC_FLOAT" value="0x82D3"/> + + <function name="GetInternalformati64v" es2="3.0"> + <param name="target" type="GLenum"/> + <param name="internalformat" type="GLenum"/> + <param name="pname" type="GLenum"/> + <param name="bufSize" type="GLsizei"/> + <param name="params" type="GLint64 *"/> + </function> +</category> + +</OpenGLAPI> diff --git a/src/mapi/glapi/gen/ARB_shader_subroutine.xml b/src/mapi/glapi/gen/ARB_shader_subroutine.xml index 04b75cb8f59..8a7d08c7f71 100644 --- a/src/mapi/glapi/gen/ARB_shader_subroutine.xml +++ b/src/mapi/glapi/gen/ARB_shader_subroutine.xml @@ -7,21 +7,21 @@ <category name="GL_ARB_shader_subroutine" number="90"> - <function name="GetSubroutineUniformLocation" offset="assign"> + <function name="GetSubroutineUniformLocation"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="name" type="const GLchar *"/> <return type="GLint"/> </function> - <function name="GetSubroutineIndex" offset="assign"> + <function name="GetSubroutineIndex"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="name" type="const GLchar *"/> <return type="GLuint"/> </function> - <function name="GetActiveSubroutineUniformiv" offset="assign"> + <function name="GetActiveSubroutineUniformiv"> <param name="program" type="GLuint"/> <param 
name="shadertype" type="GLenum"/> <param name="index" type="GLuint"/> @@ -29,7 +29,7 @@ <param name="values" type="GLint *" output="true"/> </function> - <function name="GetActiveSubroutineUniformName" offset="assign"> + <function name="GetActiveSubroutineUniformName"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="index" type="GLuint"/> @@ -38,7 +38,7 @@ <param name="name" type="GLchar *" output="true"/> </function> - <function name="GetActiveSubroutineName" offset="assign"> + <function name="GetActiveSubroutineName"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="index" type="GLuint"/> @@ -47,19 +47,19 @@ <param name="name" type="GLchar *" output="true"/> </function> - <function name="UniformSubroutinesuiv" offset="assign"> + <function name="UniformSubroutinesuiv"> <param name="shadertype" type="GLenum"/> <param name="count" type="GLsizei"/> <param name="indices" type="const GLuint *"/> </function> - <function name="GetUniformSubroutineuiv" offset="assign"> + <function name="GetUniformSubroutineuiv"> <param name="shadertype" type="GLenum"/> <param name="location" type="GLint"/> <param name="params" type="GLuint *" output="true"/> </function> - <function name="GetProgramStageiv" offset="assign"> + <function name="GetProgramStageiv"> <param name="program" type="GLuint"/> <param name="shadertype" type="GLenum"/> <param name="pname" type="GLenum"/> diff --git a/src/mapi/glapi/gen/ARB_tessellation_shader.xml b/src/mapi/glapi/gen/ARB_tessellation_shader.xml index 16a213933ef..77f2228ae3b 100644 --- a/src/mapi/glapi/gen/ARB_tessellation_shader.xml +++ b/src/mapi/glapi/gen/ARB_tessellation_shader.xml @@ -49,11 +49,11 @@ <enum value="0x8E89" name="MAX_TESS_CONTROL_UNIFORM_BLOCKS"/> <enum value="0x8E8A" name="MAX_TESS_EVALUATION_UNIFORM_BLOCKS"/> - <function name="PatchParameteri" offset="assign"> + <function name="PatchParameteri"> <param name="pname" type="GLenum"/> <param name="value" type="GLint"/> </function> - <function name="PatchParameterfv" offset="assign"> + <function name="PatchParameterfv"> <param name="pname" type="GLenum"/> <param name="values" type="const GLfloat *"/> </function> diff --git a/src/mapi/glapi/gen/Makefile.am b/src/mapi/glapi/gen/Makefile.am index cd7feabba24..8421af48854 100644 --- a/src/mapi/glapi/gen/Makefile.am +++ b/src/mapi/glapi/gen/Makefile.am @@ -148,6 +148,7 @@ API_XML = \ ARB_indirect_parameters.xml \ ARB_instanced_arrays.xml \ ARB_internalformat_query.xml \ + ARB_internalformat_query2.xml \ ARB_invalidate_subdata.xml \ ARB_map_buffer_range.xml \ ARB_multi_bind.xml \ diff --git a/src/mapi/glapi/gen/gl_API.xml b/src/mapi/glapi/gen/gl_API.xml index db98ac05fd9..8b49f915169 100644 --- a/src/mapi/glapi/gen/gl_API.xml +++ b/src/mapi/glapi/gen/gl_API.xml @@ -8195,7 +8195,7 @@ <xi:include href="ARB_framebuffer_no_attachments.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/> -<!-- ARB extensions #131 --> +<xi:include href="ARB_internalformat_query2.xml" xmlns:xi="http://www.w3.org/2001/XInclude"/> <category name="GL_ARB_explicit_uniform_location" number="128"> <enum name="MAX_UNIFORM_LOCATIONS" count="1" value="0x826E" > diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c index 752aaf6c006..e96f92af5bb 100644 --- a/src/mesa/drivers/common/driverfuncs.c +++ b/src/mesa/drivers/common/driverfuncs.c @@ -90,7 +90,7 @@ _mesa_init_driver_functions(struct dd_function_table *driver) /* Texture functions */ driver->ChooseTextureFormat = 
_mesa_choose_tex_format; - driver->QuerySamplesForFormat = _mesa_query_samples_for_format; + driver->QueryInternalFormat = _mesa_query_internal_format_default; driver->TexImage = _mesa_store_teximage; driver->TexSubImage = _mesa_store_texsubimage; driver->GetTexSubImage = _mesa_meta_GetTexSubImage; diff --git a/src/mesa/drivers/common/meta.h b/src/mesa/drivers/common/meta.h index 60ae5f7577f..c2efa50a33d 100644 --- a/src/mesa/drivers/common/meta.h +++ b/src/mesa/drivers/common/meta.h @@ -536,7 +536,7 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, int xoffset, int yoffset, int zoffset, int width, int height, int depth, GLenum format, GLenum type, const void *pixels, - bool allocate_storage, bool create_pbo, + bool create_pbo, const struct gl_pixelstore_attrib *packing); extern bool diff --git a/src/mesa/drivers/common/meta_tex_subimage.c b/src/mesa/drivers/common/meta_tex_subimage.c index 639d3236359..dfd3327dd55 100644 --- a/src/mesa/drivers/common/meta_tex_subimage.c +++ b/src/mesa/drivers/common/meta_tex_subimage.c @@ -175,7 +175,7 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, int xoffset, int yoffset, int zoffset, int width, int height, int depth, GLenum format, GLenum type, const void *pixels, - bool allocate_storage, bool create_pbo, + bool create_pbo, const struct gl_pixelstore_attrib *packing) { struct gl_buffer_object *pbo = NULL; @@ -214,19 +214,18 @@ _mesa_meta_pbo_TexSubImage(struct gl_context *ctx, GLuint dims, */ image_height = packing->ImageHeight == 0 ? height : packing->ImageHeight; + _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | + MESA_META_PIXEL_STORE)); + pbo_tex_image = create_texture_for_pbo(ctx, create_pbo, GL_PIXEL_UNPACK_BUFFER, dims, width, height, depth, format, type, pixels, packing, &pbo, &pbo_tex); - if (!pbo_tex_image) + if (!pbo_tex_image) { + _mesa_meta_end(ctx); return false; - - if (allocate_storage) - ctx->Driver.AllocTextureImageBuffer(ctx, tex_image); - - _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | - MESA_META_PIXEL_STORE)); + } readFb = ctx->Driver.NewFramebuffer(ctx, 0xDEADBEEF); if (readFb == NULL) @@ -361,15 +360,18 @@ _mesa_meta_pbo_GetTexSubImage(struct gl_context *ctx, GLuint dims, */ image_height = packing->ImageHeight == 0 ? 
height : packing->ImageHeight; + _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | + MESA_META_PIXEL_STORE)); + pbo_tex_image = create_texture_for_pbo(ctx, false, GL_PIXEL_PACK_BUFFER, dims, width, height, depth, format, type, pixels, packing, &pbo, &pbo_tex); - if (!pbo_tex_image) - return false; - _mesa_meta_begin(ctx, ~(MESA_META_PIXEL_TRANSFER | - MESA_META_PIXEL_STORE)); + if (!pbo_tex_image) { + _mesa_meta_end(ctx); + return false; + } /* GL_CLAMP_FRAGMENT_COLOR doesn't affect ReadPixels and GettexImage */ if (ctx->Extensions.ARB_color_buffer_float) diff --git a/src/mesa/drivers/dri/i915/intel_context.h b/src/mesa/drivers/dri/i915/intel_context.h index aecd7c23f45..39b328a3f3e 100644 --- a/src/mesa/drivers/dri/i915/intel_context.h +++ b/src/mesa/drivers/dri/i915/intel_context.h @@ -40,17 +40,16 @@ extern "C" { #define virtual virt #endif -#include "drm.h" -#include "intel_bufmgr.h" - -#include "intel_screen.h" -#include "intel_tex_obj.h" -#include "i915_drm.h" - +#include <drm.h> +#include <intel_bufmgr.h> +#include <i915_drm.h> #ifdef __cplusplus #undef virtual #endif +#include "intel_screen.h" +#include "intel_tex_obj.h" + #include "tnl/t_vertex.h" #define TAG(x) intel##x diff --git a/src/mesa/drivers/dri/i965/Makefile.sources b/src/mesa/drivers/dri/i965/Makefile.sources index 8f92fd7cfd2..2802ec9887c 100644 --- a/src/mesa/drivers/dri/i965/Makefile.sources +++ b/src/mesa/drivers/dri/i965/Makefile.sources @@ -124,6 +124,7 @@ i965_FILES = \ brw_ff_gs.h \ brw_fs_channel_expressions.cpp \ brw_fs_vector_splitting.cpp \ + brw_formatquery.c \ brw_gs.c \ brw_gs.h \ brw_gs_state.c \ diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c index 3c18858abf1..d333d10d299 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c +++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c @@ -85,8 +85,10 @@ static void compute_tri_direction( struct brw_clip_compile *c ) /* Take their crossproduct: */ brw_set_default_access_mode(p, BRW_ALIGN_16); - brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, 1,2,0,3), brw_swizzle(f,2,0,1,3)); - brw_MAC(p, vec4(e), negate(brw_swizzle(e, 2,0,1,3)), brw_swizzle(f,1,2,0,3)); + brw_MUL(p, vec4(brw_null_reg()), brw_swizzle(e, BRW_SWIZZLE_YZXW), + brw_swizzle(f, BRW_SWIZZLE_ZXYW)); + brw_MAC(p, vec4(e), negate(brw_swizzle(e, BRW_SWIZZLE_ZXYW)), + brw_swizzle(f, BRW_SWIZZLE_YZXW)); brw_set_default_access_mode(p, BRW_ALIGN_1); brw_MUL(p, c->reg.dir, c->reg.dir, vec4(e)); diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c index 7ef3305a25f..3e6664e4a82 100644 --- a/src/mesa/drivers/dri/i965/brw_clip_util.c +++ b/src/mesa/drivers/dri/i965/brw_clip_util.c @@ -98,7 +98,8 @@ void brw_clip_project_position(struct brw_clip_compile *c, struct brw_reg pos ) /* value.xyz *= value.rhw */ brw_set_default_access_mode(p, BRW_ALIGN_16); - brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, brw_swizzle1(pos, W)); + brw_MUL(p, brw_writemask(pos, WRITEMASK_XYZ), pos, + brw_swizzle(pos, BRW_SWIZZLE_WWWW)); brw_set_default_access_mode(p, BRW_ALIGN_1); } @@ -194,11 +195,11 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_set_default_access_mode(p, BRW_ALIGN_16); brw_MOV(p, brw_writemask(t_nopersp, WRITEMASK_ZW), - brw_swizzle(tmp, 0, 1, 0, 1)); + brw_swizzle(tmp, BRW_SWIZZLE_XYXY)); /* t_nopersp = vec4(v1.xy, dest.xy) - v0.xyxy */ brw_ADD(p, t_nopersp, t_nopersp, - negate(brw_swizzle(v0_ndc_copy, 0, 1, 0, 1))); + negate(brw_swizzle(v0_ndc_copy, BRW_SWIZZLE_XYXY))); /* Add the 
absolute values of the X and Y deltas so that if * the points aren't in the same place on the screen we get @@ -212,8 +213,8 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, */ brw_ADD(p, brw_writemask(t_nopersp, WRITEMASK_XY), - brw_abs(brw_swizzle(t_nopersp, 0, 2, 0, 0)), - brw_abs(brw_swizzle(t_nopersp, 1, 3, 0, 0))); + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_XZXZ)), + brw_abs(brw_swizzle(t_nopersp, BRW_SWIZZLE_YWYW))); brw_set_default_access_mode(p, BRW_ALIGN_1); /* If the points are in the same place, just substitute a @@ -234,7 +235,7 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c, brw_MUL(p, vec1(t_nopersp), vec1(t_nopersp), vec1(suboffset(t_nopersp, 1))); brw_set_default_access_mode(p, BRW_ALIGN_16); - brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, 0, 0, 0, 0)); + brw_MOV(p, t_nopersp, brw_swizzle(t_nopersp, BRW_SWIZZLE_XXXX)); brw_set_default_access_mode(p, BRW_ALIGN_1); release_tmp(c, tmp); diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c index 31b6b2a3641..2d480d02366 100644 --- a/src/mesa/drivers/dri/i965/brw_context.c +++ b/src/mesa/drivers/dri/i965/brw_context.c @@ -75,53 +75,29 @@ * Mesa's Driver Functions ***************************************/ -static size_t -brw_query_samples_for_format(struct gl_context *ctx, GLenum target, - GLenum internalFormat, int samples[16]) -{ - struct brw_context *brw = brw_context(ctx); - - (void) target; - - switch (brw->gen) { - case 9: - samples[0] = 16; - samples[1] = 8; - samples[2] = 4; - samples[3] = 2; - return 4; - - case 8: - samples[0] = 8; - samples[1] = 4; - samples[2] = 2; - return 3; - - case 7: - samples[0] = 8; - samples[1] = 4; - return 2; - - case 6: - samples[0] = 4; - return 1; +const char *const brw_vendor_string = "Intel Open Source Technology Center"; +static const char * +get_bsw_model(const struct intel_screen *intelScreen) +{ + switch (intelScreen->eu_total) { + case 16: + return "405"; + case 12: + return "400"; default: - assert(brw->gen < 6); - samples[0] = 1; - return 1; + return " "; } } -const char *const brw_vendor_string = "Intel Open Source Technology Center"; - const char * -brw_get_renderer_string(unsigned deviceID) +brw_get_renderer_string(const struct intel_screen *intelScreen) { const char *chipset; static char buffer[128]; + char *bsw = NULL; - switch (deviceID) { + switch (intelScreen->deviceID) { #undef CHIPSET #define CHIPSET(id, symbol, str) case id: chipset = str; break; #include "pci_ids/i965_pci_ids.h" @@ -130,7 +106,18 @@ brw_get_renderer_string(unsigned deviceID) break; } + /* Braswell branding is funny, so we have to fix it up here */ + if (intelScreen->deviceID == 0x22B1) { + bsw = strdup(chipset); + char *needle = strstr(bsw, "XXX"); + if (needle) { + memcpy(needle, get_bsw_model(intelScreen), 3); + chipset = bsw; + } + } + (void) driGetRendererString(buffer, chipset, 0); + free(bsw); return buffer; } @@ -145,7 +132,7 @@ intel_get_string(struct gl_context * ctx, GLenum name) case GL_RENDERER: return - (GLubyte *) brw_get_renderer_string(brw->intelScreen->deviceID); + (GLubyte *) brw_get_renderer_string(brw->intelScreen); default: return NULL; @@ -379,7 +366,7 @@ brw_init_driver_functions(struct brw_context *brw, if (brw->gen >= 7) brw_init_conditional_render_functions(functions); - functions->QuerySamplesForFormat = brw_query_samples_for_format; + functions->QueryInternalFormat = brw_query_internal_format; functions->NewTransformFeedback = brw_new_transform_feedback; functions->DeleteTransformFeedback = 
brw_delete_transform_feedback; @@ -682,6 +669,11 @@ brw_initialize_context_constants(struct brw_context *brw) brw->intelScreen->compiler->glsl_compiler_options[i]; } + if (brw->gen >= 7) { + ctx->Const.MaxViewportWidth = 32768; + ctx->Const.MaxViewportHeight = 32768; + } + /* ARB_viewport_array */ if (brw->gen >= 6 && ctx->API == API_OPENGL_CORE) { ctx->Const.MaxViewports = GEN6_NUM_VIEWPORTS; @@ -698,8 +690,8 @@ brw_initialize_context_constants(struct brw_context *brw) ctx->Const.MaxVertexStreams = MIN2(4, MAX_VERTEX_STREAMS); /* ARB_framebuffer_no_attachments */ - ctx->Const.MaxFramebufferWidth = ctx->Const.MaxViewportWidth; - ctx->Const.MaxFramebufferHeight = ctx->Const.MaxViewportHeight; + ctx->Const.MaxFramebufferWidth = 16384; + ctx->Const.MaxFramebufferHeight = 16384; ctx->Const.MaxFramebufferLayers = ctx->Const.MaxArrayTextureLayers; ctx->Const.MaxFramebufferSamples = max_samples; } @@ -962,7 +954,18 @@ brwCreateContext(gl_api api, brw->max_ds_threads = devinfo->max_ds_threads; brw->max_gs_threads = devinfo->max_gs_threads; brw->max_wm_threads = devinfo->max_wm_threads; - brw->max_cs_threads = devinfo->max_cs_threads; + /* FINISHME: Do this for all platforms that the kernel supports */ + if (brw->is_cherryview && + screen->subslice_total > 0 && screen->eu_total > 0) { + /* Logical CS threads = EUs per subslice * 7 threads per EU */ + brw->max_cs_threads = screen->eu_total / screen->subslice_total * 7; + + /* Fuse configurations may give more threads than expected, never less. */ + if (brw->max_cs_threads < devinfo->max_cs_threads) + brw->max_cs_threads = devinfo->max_cs_threads; + } else { + brw->max_cs_threads = devinfo->max_cs_threads; + } brw->urb.size = devinfo->urb.size; brw->urb.min_vs_entries = devinfo->urb.min_vs_entries; brw->urb.max_vs_entries = devinfo->urb.max_vs_entries; diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h index 6b82bea52c0..a953745b114 100644 --- a/src/mesa/drivers/dri/i965/brw_context.h +++ b/src/mesa/drivers/dri/i965/brw_context.h @@ -1341,7 +1341,8 @@ extern void intelInitClearFuncs(struct dd_function_table *functions); */ extern const char *const brw_vendor_string; -extern const char *brw_get_renderer_string(unsigned deviceID); +extern const char * +brw_get_renderer_string(const struct intel_screen *intelScreen); enum { DRI_CONF_BO_REUSE_DISABLED, @@ -1875,6 +1876,11 @@ void brw_emit_depth_stall_flushes(struct brw_context *brw); void gen7_emit_vs_workaround_flush(struct brw_context *brw); void gen7_emit_cs_stall_flush(struct brw_context *brw); +/* brw_queryformat.c */ +void brw_query_internal_format(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, + GLint *params); + #ifdef __cplusplus } #endif diff --git a/src/mesa/drivers/dri/i965/brw_device_info.c b/src/mesa/drivers/dri/i965/brw_device_info.c index 38a27da898c..3666190fc36 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.c +++ b/src/mesa/drivers/dri/i965/brw_device_info.c @@ -312,7 +312,7 @@ static const struct brw_device_info brw_device_info_chv = { .max_ds_threads = 80, .max_gs_threads = 80, .max_wm_threads = 128, - .max_cs_threads = 28, + .max_cs_threads = 6 * 7, .urb = { .size = 192, .min_vs_entries = 34, diff --git a/src/mesa/drivers/dri/i965/brw_device_info.h b/src/mesa/drivers/dri/i965/brw_device_info.h index 48e0dee9084..4e7f3135960 100644 --- a/src/mesa/drivers/dri/i965/brw_device_info.h +++ b/src/mesa/drivers/dri/i965/brw_device_info.h @@ -25,6 +25,9 @@ #pragma once #include <stdbool.h> +/** + * Intel 
hardware information and quirks + */ struct brw_device_info { int gen; /**< Generation number: 4, 5, 6, 7, ... */ @@ -49,7 +52,7 @@ struct brw_device_info bool has_resource_streamer; /** - * Quirks: + * \name Intel hardware quirks * @{ */ bool has_negative_rhw_bug; @@ -65,26 +68,69 @@ struct brw_device_info /** @} */ /** - * GPU Limits: + * \name GPU hardware limits + * + * In general, you can find shader thread maximums by looking at the "Maximum + * Number of Threads" field in the Intel PRM description of the 3DSTATE_VS, + * 3DSTATE_GS, 3DSTATE_HS, 3DSTATE_DS, and 3DSTATE_PS commands. URB entry + * limits come from the "Number of URB Entries" field in the + * 3DSTATE_URB_VS command and friends. + * + * These fields are used to calculate the scratch space to allocate. The + * amount of scratch space can be larger without being harmful on modern + * GPUs; however, prior to Haswell, programming the maximum number of threads + * to greater than the hardware maximum would cause GPU performance to tank. + * * @{ */ /** * Total number of slices present on the device whether or not they've been * fused off. + * + * XXX: CS thread counts are limited by the inability to do cross-subslice + * communication. It is effectively the number of logical threads which + * can be executed in a subslice. Fuse configurations may cause this number + * to change, so we program @max_cs_threads as the lower maximum. */ unsigned num_slices; - unsigned max_vs_threads; - unsigned max_hs_threads; - unsigned max_ds_threads; - unsigned max_gs_threads; + unsigned max_vs_threads; /**< Maximum Vertex Shader threads */ + unsigned max_hs_threads; /**< Maximum Hull Shader threads */ + unsigned max_ds_threads; /**< Maximum Domain Shader threads */ + unsigned max_gs_threads; /**< Maximum Geometry Shader threads. */ + /** + * Theoretical maximum number of Pixel Shader threads. + * + * PSD means Pixel Shader Dispatcher. On modern Intel GPUs, hardware will + * automatically scale pixel shader thread count, based on a single value + * programmed into 3DSTATE_PS. + * + * To calculate the maximum number of threads for Gen8 and beyond (which + * have multiple Pixel Shader Dispatchers): + * + * - Look up 3DSTATE_PS and find "Maximum Number of Threads Per PSD" + * - Usually there's only one PSD per subslice, so use the number of + * subslices for the number of PSDs. + * - For max_wm_threads, the total should be PSD threads * #PSDs. + */ unsigned max_wm_threads; + + /** + * Maximum Compute Shader threads. + * + * Thread count * number of EUs per subslice + */ unsigned max_cs_threads; struct { /** - * Hardware default URB size. The units this is expressed in are - * somewhat inconsistent: 512b units on Gen4-5, KB on Gen6-7, and KB - * times the slice count on Gen8+. + * Hardware default URB size. + * + * The units this is expressed in are somewhat inconsistent: 512b units + * on Gen4-5, KB on Gen6-7, and KB times the slice count on Gen8+. + * + * Look up "URB Size" in the "Device Attributes" page, and take the + * maximum. Look up the slice count for each GT SKU on the same page. + * urb.size = URB Size (kbytes) / slice count */ unsigned size; unsigned min_vs_entries; diff --git a/src/mesa/drivers/dri/i965/brw_eu.c b/src/mesa/drivers/dri/i965/brw_eu.c index 40ec87d38f0..6961a88c6a8 100644 --- a/src/mesa/drivers/dri/i965/brw_eu.c +++ b/src/mesa/drivers/dri/i965/brw_eu.c @@ -110,6 +110,50 @@ brw_swap_cmod(uint32_t cmod) } } +/** + * Get the least significant bit offset of the i+1-th component of immediate
For \p i equal to the two's complement of j, return the + * offset of the j-th component starting from the end of the vector. For + * scalar register types return zero. + */ +static unsigned +imm_shift(enum brw_reg_type type, unsigned i) +{ + assert(type != BRW_REGISTER_TYPE_UV && type != BRW_REGISTER_TYPE_V && + "Not implemented."); + + if (type == BRW_REGISTER_TYPE_VF) + return 8 * (i & 3); + else + return 0; +} + +/** + * Swizzle an arbitrary immediate \p x of the given type according to the + * permutation specified as \p swz. + */ +uint32_t +brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz) +{ + if (imm_shift(type, 1)) { + const unsigned n = 32 / imm_shift(type, 1); + uint32_t y = 0; + + for (unsigned i = 0; i < n; i++) { + /* Shift the specified component all the way to the right and left to + * discard any undesired L/MSBs, then shift it right into component i. + */ + y |= x >> imm_shift(type, (i & ~3) + BRW_GET_SWZ(swz, i & 3)) + << imm_shift(type, ~0u) + >> imm_shift(type, ~0u - i); + } + + return y; + } else { + return x; + } +} + void brw_set_default_exec_size(struct brw_codegen *p, unsigned value) { diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c index 2ef1d7bb825..6f11f597492 100644 --- a/src/mesa/drivers/dri/i965/brw_eu_emit.c +++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c @@ -3402,7 +3402,7 @@ brw_broadcast(struct brw_codegen *p, */ inst = brw_MOV(p, brw_null_reg(), - stride(brw_swizzle1(idx, 0), 0, 4, 1)); + stride(brw_swizzle(idx, BRW_SWIZZLE_XXXX), 0, 4, 1)); brw_inst_set_pred_control(devinfo, inst, BRW_PREDICATE_NONE); brw_inst_set_cond_modifier(devinfo, inst, BRW_CONDITIONAL_NZ); brw_inst_set_flag_reg_nr(devinfo, inst, 1); diff --git a/src/mesa/drivers/dri/i965/brw_formatquery.c b/src/mesa/drivers/dri/i965/brw_formatquery.c new file mode 100644 index 00000000000..210109b39f7 --- /dev/null +++ b/src/mesa/drivers/dri/i965/brw_formatquery.c @@ -0,0 +1,171 @@ +/* + * Copyright © 2015 Intel Corporation + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. 
+ */ + +#include "brw_context.h" +#include "brw_state.h" +#include "main/formatquery.h" +#include "main/glformats.h" + +static size_t +brw_query_samples_for_format(struct gl_context *ctx, GLenum target, + GLenum internalFormat, int samples[16]) +{ + struct brw_context *brw = brw_context(ctx); + + (void) target; + (void) internalFormat; + + switch (brw->gen) { + case 9: + samples[0] = 16; + samples[1] = 8; + samples[2] = 4; + samples[3] = 2; + return 4; + + case 8: + samples[0] = 8; + samples[1] = 4; + samples[2] = 2; + return 3; + + case 7: + samples[0] = 8; + samples[1] = 4; + return 2; + + case 6: + samples[0] = 4; + return 1; + + default: + assert(brw->gen < 6); + samples[0] = 1; + return 1; + } +} + +/** + * Returns a generic GL type from an internal format, so that it can be used + * together with the base format to obtain a mesa_format by calling + * mesa_format_from_format_and_type(). + */ +static GLenum +get_generic_type_for_internal_format(GLenum internalFormat) +{ + if (_mesa_is_color_format(internalFormat)) { + if (_mesa_is_enum_format_unsigned_int(internalFormat)) + return GL_UNSIGNED_BYTE; + else if (_mesa_is_enum_format_signed_int(internalFormat)) + return GL_BYTE; + } else { + switch (internalFormat) { + case GL_STENCIL_INDEX: + case GL_STENCIL_INDEX8: + return GL_UNSIGNED_BYTE; + case GL_DEPTH_COMPONENT: + case GL_DEPTH_COMPONENT16: + return GL_UNSIGNED_SHORT; + case GL_DEPTH_COMPONENT24: + case GL_DEPTH_COMPONENT32: + return GL_UNSIGNED_INT; + case GL_DEPTH_COMPONENT32F: + return GL_FLOAT; + case GL_DEPTH_STENCIL: + case GL_DEPTH24_STENCIL8: + return GL_UNSIGNED_INT_24_8; + case GL_DEPTH32F_STENCIL8: + return GL_FLOAT_32_UNSIGNED_INT_24_8_REV; + default: + /* fall-through */ + break; + } + } + + return GL_FLOAT; +} + +void +brw_query_internal_format(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, GLint *params) +{ + /* The Mesa layer gives us a temporary params buffer that is guaranteed + * to be non-NULL and to have at least 16 elements. + */ + assert(params != NULL); + + switch (pname) { + case GL_SAMPLES: + brw_query_samples_for_format(ctx, target, internalFormat, params); + break; + + case GL_NUM_SAMPLE_COUNTS: { + size_t num_samples; + GLint dummy_buffer[16]; + + num_samples = brw_query_samples_for_format(ctx, target, internalFormat, + dummy_buffer); + params[0] = (GLint) num_samples; + break; + } + + case GL_INTERNALFORMAT_PREFERRED: { + params[0] = GL_NONE; + + /* We need to resolve an internal format that is compatible with + * the passed internal format, and optimal for the driver. For now, + * we just validate that the passed internal format is supported by + * the driver, and if so return the same internal format, otherwise + * return GL_NONE. + * + * For validating the internal format, we use the + * ctx->TextureFormatSupported map to check that a BRW surface format + * exists that can be derived from the internal format. But this + * expects a mesa_format, not an internal format. So we need to "come up" + * with a type that is generic enough to resolve the mesa_format first. + */ + GLenum type = get_generic_type_for_internal_format(internalFormat); + + /* Get a mesa_format from the internal format and type.
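For example (illustrative of the helper above), GL_RGBA8UI resolves to the generic type GL_UNSIGNED_BYTE, while normalized color formats such as GL_RGBA8 fall through to GL_FLOAT.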
*/ + GLint base_format = _mesa_base_tex_format(ctx, internalFormat); + if (base_format != -1) { + mesa_format mesa_format = + _mesa_format_from_format_and_type(base_format, type); + + if (mesa_format < MESA_FORMAT_COUNT && + ctx->TextureFormatSupported[mesa_format]) { + params[0] = internalFormat; + } + } + break; + } + + default: + /* By default, we call the driver hook's fallback function from the + * frontend, which has a generic implementation for all pnames. + */ + _mesa_query_internal_format_default(ctx, target, internalFormat, pname, + params); + break; + } +} diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp index 0f9de30f05b..b5f1a874368 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs.cpp @@ -1931,8 +1931,8 @@ fs_visitor::compact_virtual_grfs() void fs_visitor::assign_constant_locations() { - /* Only the first compile (SIMD8 mode) gets to decide on locations. */ - if (dispatch_width != 8) + /* Only the first compile gets to decide on locations. */ + if (dispatch_width != min_dispatch_width) return; bool is_live[uniforms]; @@ -2474,8 +2474,10 @@ fs_visitor::opt_sampler_eot() * we have enough space, but it will make sure the dead code eliminator kills * the instruction that this will replace. */ - if (tex_inst->header_size != 0) + if (tex_inst->header_size != 0) { + invalidate_live_intervals(); return true; + } fs_reg send_header = ibld.vgrf(BRW_REGISTER_TYPE_F, load_payload->sources + 1); @@ -2506,6 +2508,7 @@ fs_visitor::opt_sampler_eot() tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload); tex_inst->src[0] = send_header; + invalidate_live_intervals(); return true; } @@ -5236,12 +5239,18 @@ fs_visitor::optimize() void fs_visitor::fixup_3src_null_dest() { + bool progress = false; + foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { if (inst->is_3src() && inst->dst.is_null()) { inst->dst = fs_reg(VGRF, alloc.allocate(dispatch_width / 8), inst->dst.type); + progress = true; } } + + if (progress) + invalidate_live_intervals(); } void @@ -5277,7 +5286,7 @@ fs_visitor::allocate_registers() * SIMD8. There's probably actually some intermediate point where * SIMD16 with a couple of spills is still better. */ - if (dispatch_width == 16) { + if (dispatch_width == 16 && min_dispatch_width <= 8) { fail("Failure to register allocate.
Reduce number of " "live scalar values to avoid this."); } else { @@ -5519,6 +5528,13 @@ fs_visitor::run_cs() if (shader_time_index >= 0) emit_shader_time_begin(); + if (devinfo->is_haswell && prog_data->total_shared > 0) { + /* Move SLM index from g0.0[27:24] to sr0.1[11:8] */ + const fs_builder abld = bld.exec_all().group(1, 0); + abld.MOV(retype(suboffset(brw_sr0_reg(), 1), BRW_REGISTER_TYPE_UW), + suboffset(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW), 1)); + } + emit_nir_code(); if (failed) @@ -5782,6 +5798,7 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, shader->info.cs.local_size[2]; unsigned max_cs_threads = compiler->devinfo->max_cs_threads; + unsigned simd_required = DIV_ROUND_UP(local_workgroup_size, max_cs_threads); cfg_t *cfg = NULL; const char *fail_msg = NULL; @@ -5791,11 +5808,13 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, fs_visitor v8(compiler, log_data, mem_ctx, key, &prog_data->base, NULL, /* Never used in core profile */ shader, 8, shader_time_index); - if (!v8.run_cs()) { - fail_msg = v8.fail_msg; - } else if (local_workgroup_size <= 8 * max_cs_threads) { - cfg = v8.cfg; - prog_data->simd_size = 8; + if (simd_required <= 8) { + if (!v8.run_cs()) { + fail_msg = v8.fail_msg; + } else { + cfg = v8.cfg; + prog_data->simd_size = 8; + } } fs_visitor v16(compiler, log_data, mem_ctx, key, &prog_data->base, @@ -5805,7 +5824,8 @@ brw_compile_cs(const struct brw_compiler *compiler, void *log_data, !fail_msg && !v8.simd16_unsupported && local_workgroup_size <= 16 * max_cs_threads) { /* Try a SIMD16 compile */ - v16.import_uniforms(&v8); + if (simd_required <= 8) + v16.import_uniforms(&v8); if (!v16.run_cs()) { compiler->shader_perf_log(log_data, "SIMD16 shader failed to compile: %s", diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h index f1a81c13ef9..2b00129b4ba 100644 --- a/src/mesa/drivers/dri/i965/brw_fs.h +++ b/src/mesa/drivers/dri/i965/brw_fs.h @@ -272,6 +272,8 @@ public: void emit_percomp(const brw::fs_builder &bld, const fs_inst &inst, unsigned wr_mask); + bool optimize_extract_to_float(nir_alu_instr *instr, + const fs_reg &result); bool optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result); @@ -405,6 +407,7 @@ public: bool spilled_any_registers; const unsigned dispatch_width; /**< 8 or 16 */ + unsigned min_dispatch_width; int shader_time_index; diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp index 9dbe13df514..2616e65fc62 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp @@ -737,8 +737,7 @@ fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block, foreach_in_list(acp_entry, entry, &acp[inst->src[i].nr % ACP_HASH_SIZE]) { if (try_constant_propagate(inst, entry)) progress = true; - - if (try_copy_propagate(inst, i, entry)) + else if (try_copy_propagate(inst, i, entry)) progress = true; } } diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp index 8611b8dc443..29ef609fce3 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp @@ -491,6 +491,49 @@ fs_visitor::nir_emit_instr(nir_instr *instr) } } +/** + * Recognizes a parent instruction of nir_op_extract_* and changes the type to + * match instr. 
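+ * + * For example (an illustrative sketch): u2f(extract_u8(x, 2)) is emitted as a single SHADER_OPCODE_EXTRACT_BYTE that reads byte 2 of x and writes the converted float directly, instead of an extract followed by a separate conversion MOV.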
+ */ +bool +fs_visitor::optimize_extract_to_float(nir_alu_instr *instr, + const fs_reg &result) +{ + if (!instr->src[0].src.is_ssa || + !instr->src[0].src.ssa->parent_instr) + return false; + + if (instr->src[0].src.ssa->parent_instr->type != nir_instr_type_alu) + return false; + + nir_alu_instr *src0 = + nir_instr_as_alu(instr->src[0].src.ssa->parent_instr); + + if (src0->op != nir_op_extract_u8 && src0->op != nir_op_extract_u16 && + src0->op != nir_op_extract_i8 && src0->op != nir_op_extract_i16) + return false; + + nir_const_value *element = nir_src_as_const_value(src0->src[1].src); + assert(element != NULL); + + enum opcode extract_op; + if (src0->op == nir_op_extract_u16 || src0->op == nir_op_extract_i16) { + assert(element->u[0] <= 1); + extract_op = SHADER_OPCODE_EXTRACT_WORD; + } else { + assert(element->u[0] <= 3); + extract_op = SHADER_OPCODE_EXTRACT_BYTE; + } + + fs_reg op0 = get_nir_src(src0->src[0].src); + op0.type = brw_type_for_nir_type(nir_op_infos[src0->op].input_types[0]); + op0 = offset(op0, bld, src0->src[0].swizzle[0]); + + set_saturate(instr->dest.saturate, + bld.emit(extract_op, result, op0, brw_imm_ud(element->u[0]))); + return true; +} + bool fs_visitor::optimize_frontfacing_ternary(nir_alu_instr *instr, const fs_reg &result) @@ -662,6 +705,9 @@ fs_visitor::nir_emit_alu(const fs_builder &bld, nir_alu_instr *instr) switch (instr->op) { case nir_op_i2f: case nir_op_u2f: + if (optimize_extract_to_float(instr, result)) + return; + inst = bld.MOV(result, op[0]); inst->saturate = instr->dest.saturate; break; @@ -2458,8 +2504,6 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr case nir_intrinsic_atomic_counter_inc: case nir_intrinsic_atomic_counter_dec: case nir_intrinsic_atomic_counter_read: { - using namespace surface_access; - /* Get the arguments of the atomic intrinsic. 
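The surface index used below is the ABO section of the binding table (binding_table.abo_start) plus the intrinsic's constant index.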
*/ const fs_reg offset = get_nir_src(instr->src[0]); const unsigned surface = (stage_prog_data->binding_table.abo_start + @@ -2985,12 +3029,11 @@ fs_visitor::nir_emit_ssbo_atomic(const fs_builder &bld, /* Emit the actual atomic operation */ - fs_reg atomic_result = - surface_access::emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); + fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); dest.type = atomic_result.type; bld.MOV(dest, atomic_result); } @@ -3012,12 +3055,11 @@ fs_visitor::nir_emit_shared_atomic(const fs_builder &bld, /* Emit the actual atomic operation */ - fs_reg atomic_result = - surface_access::emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); + fs_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); dest.type = atomic_result.type; bld.MOV(dest, atomic_result); } diff --git a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp index dc2b0c8aa8d..f59fdbddfa6 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_saturate_propagation.cpp @@ -73,7 +73,7 @@ opt_saturate_propagation_local(fs_visitor *v, bblock_t *block) if (scan_inst->saturate) { inst->saturate = false; progress = true; - } else if (src_end_ip <= ip || inst->dst.equals(inst->src[0])) { + } else if (src_end_ip == ip || inst->dst.equals(inst->src[0])) { if (scan_inst->can_do_saturate()) { if (scan_inst->dst.type != inst->dst.type) { scan_inst->dst.type = inst->dst.type; diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp index b4b430dc140..f1da218ba63 100644 --- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp @@ -1021,6 +1021,18 @@ fs_visitor::init() unreachable("unhandled shader stage"); } + if (stage == MESA_SHADER_COMPUTE) { + const brw_cs_prog_data *cs_prog_data = + (const brw_cs_prog_data *) prog_data; + unsigned size = cs_prog_data->local_size[0] * + cs_prog_data->local_size[1] * + cs_prog_data->local_size[2]; + size = DIV_ROUND_UP(size, devinfo->max_cs_threads); + min_dispatch_width = size > 16 ?
16 : 8); + } else { + min_dispatch_width = 8; + } + this->prog_data = this->stage_prog_data; this->failed = false; diff --git a/src/mesa/drivers/dri/i965/brw_ir_vec4.h b/src/mesa/drivers/dri/i965/brw_ir_vec4.h index 660becaafa7..2b6872e6d31 100644 --- a/src/mesa/drivers/dri/i965/brw_ir_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_ir_vec4.h @@ -76,7 +76,11 @@ offset(src_reg reg, unsigned delta) static inline src_reg swizzle(src_reg reg, unsigned swizzle) { - reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + if (reg.file == IMM) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swizzle); + else + reg.swizzle = brw_compose_swizzle(swizzle, reg.swizzle); + return reg; } diff --git a/src/mesa/drivers/dri/i965/brw_reg.h b/src/mesa/drivers/dri/i965/brw_reg.h index a2a4a40f373..4f9b2526e45 100644 --- a/src/mesa/drivers/dri/i965/brw_reg.h +++ b/src/mesa/drivers/dri/i965/brw_reg.h @@ -81,7 +81,9 @@ struct brw_device_info; #define BRW_SWIZZLE_ZZZZ BRW_SWIZZLE4(2,2,2,2) #define BRW_SWIZZLE_WWWW BRW_SWIZZLE4(3,3,3,3) #define BRW_SWIZZLE_XYXY BRW_SWIZZLE4(0,1,0,1) +#define BRW_SWIZZLE_XZXZ BRW_SWIZZLE4(0,2,0,2) #define BRW_SWIZZLE_YZXW BRW_SWIZZLE4(1,2,0,3) +#define BRW_SWIZZLE_YWYW BRW_SWIZZLE4(1,3,1,3) #define BRW_SWIZZLE_ZXYW BRW_SWIZZLE4(2,0,1,3) #define BRW_SWIZZLE_ZWZW BRW_SWIZZLE4(2,3,2,3) #define BRW_SWIZZLE_WZYX BRW_SWIZZLE4(3,2,1,0) @@ -221,6 +223,7 @@ enum PACKED brw_reg_type { unsigned brw_reg_type_to_hw_type(const struct brw_device_info *devinfo, enum brw_reg_type type, enum brw_reg_file file); const char *brw_reg_type_letters(unsigned brw_reg_type); +uint32_t brw_swizzle_immediate(enum brw_reg_type type, uint32_t x, unsigned swz); #define REG_SIZE (8*4) @@ -737,6 +740,22 @@ brw_notification_reg(void) } static inline struct brw_reg +brw_sr0_reg(void) +{ + return brw_reg(BRW_ARCHITECTURE_REGISTER_FILE, + BRW_ARF_STATE, + 0, + 0, + 0, + BRW_REGISTER_TYPE_UD, + BRW_VERTICAL_STRIDE_8, + BRW_WIDTH_8, + BRW_HORIZONTAL_STRIDE_1, + BRW_SWIZZLE_XYZW, + WRITEMASK_XYZW); +} + +static inline struct brw_reg brw_acc_reg(unsigned width) { return brw_vecn_reg(width, BRW_ARCHITECTURE_REGISTER_FILE, @@ -871,24 +890,17 @@ get_element_d(struct brw_reg reg, unsigned elt) return vec1(suboffset(retype(reg, BRW_REGISTER_TYPE_D), elt)); } - static inline struct brw_reg -brw_swizzle(struct brw_reg reg, unsigned x, unsigned y, unsigned z, unsigned w) +brw_swizzle(struct brw_reg reg, unsigned swz) { - assert(reg.file != BRW_IMMEDIATE_VALUE); + if (reg.file == BRW_IMMEDIATE_VALUE) + reg.ud = brw_swizzle_immediate(reg.type, reg.ud, swz); + else + reg.swizzle = brw_compose_swizzle(swz, reg.swizzle); - reg.swizzle = brw_compose_swizzle(BRW_SWIZZLE4(x, y, z, w), - reg.swizzle); return reg; } - -static inline struct brw_reg -brw_swizzle1(struct brw_reg reg, unsigned x) -{ - return brw_swizzle(reg, x, x, x, x); -} - static inline struct brw_reg brw_writemask(struct brw_reg reg, unsigned mask) { diff --git a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp index 4f97577515a..5b54b51395c 100644 --- a/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp +++ b/src/mesa/drivers/dri/i965/brw_schedule_instructions.cpp @@ -1057,12 +1057,7 @@ fs_instruction_scheduler::calculate_deps() last_accumulator_write = NULL; last_fixed_grf_write = NULL; - exec_node *node; - exec_node *prev; - for (node = instructions.get_tail(), prev = node->prev; - !node->is_head_sentinel(); - node = prev, prev = node->prev) { - schedule_node *n = (schedule_node *)node; + 
foreach_in_list_reverse_safe(schedule_node, n, &instructions) { fs_inst *inst = (fs_inst *)n->inst; /* write-after-read deps. */ @@ -1284,12 +1279,7 @@ vec4_instruction_scheduler::calculate_deps() last_accumulator_write = NULL; last_fixed_grf_write = NULL; - exec_node *node; - exec_node *prev; - for (node = instructions.get_tail(), prev = node->prev; - !node->is_head_sentinel(); - node = prev, prev = node->prev) { - schedule_node *n = (schedule_node *)node; + foreach_in_list_reverse_safe(schedule_node, n, &instructions) { vec4_instruction *inst = (vec4_instruction *)n->inst; /* write-after-read deps. */ diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp index dfe6afcf6d0..21977a23130 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.cpp +++ b/src/mesa/drivers/dri/i965/brw_shader.cpp @@ -1046,13 +1046,6 @@ backend_shader::calculate_cfg() cfg = new(mem_ctx) cfg_t(&this->instructions); } -void -backend_shader::invalidate_cfg() -{ - ralloc_free(this->cfg); - this->cfg = NULL; -} - /** * Sets up the starting offsets for the groups of binding table entries * common to all pipeline stages. diff --git a/src/mesa/drivers/dri/i965/brw_shader.h b/src/mesa/drivers/dri/i965/brw_shader.h index 82374a46c18..15bed78cb7c 100644 --- a/src/mesa/drivers/dri/i965/brw_shader.h +++ b/src/mesa/drivers/dri/i965/brw_shader.h @@ -217,7 +217,6 @@ public: virtual void dump_instructions(const char *name); void calculate_cfg(); - void invalidate_cfg(); virtual void invalidate_live_intervals() = 0; }; diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp index 0032634f023..65e57ba5e62 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp @@ -321,6 +321,28 @@ src_reg::equals(const src_reg &r) const } bool +vec4_visitor::vectorize_mov(bblock_t *block, vec4_instruction *inst, + uint8_t imm[4], vec4_instruction *imm_inst[4], + int inst_count, unsigned writemask) +{ + if (inst_count < 2) + return false; + + unsigned vf; + memcpy(&vf, imm, sizeof(vf)); + vec4_instruction *mov = MOV(imm_inst[0]->dst, brw_imm_vf(vf)); + mov->dst.type = BRW_REGISTER_TYPE_F; + mov->dst.writemask = writemask; + inst->insert_before(block, mov); + + for (int i = 0; i < inst_count; i++) { + imm_inst[i]->remove(block); + } + + return true; +} + +bool vec4_visitor::opt_vector_float() { bool progress = false; @@ -328,27 +350,38 @@ vec4_visitor::opt_vector_float() int last_reg = -1, last_reg_offset = -1; enum brw_reg_file last_reg_file = BAD_FILE; - int remaining_channels = 0; - uint8_t imm[4]; + uint8_t imm[4] = { 0 }; int inst_count = 0; vec4_instruction *imm_inst[4]; + unsigned writemask = 0; foreach_block_and_inst_safe(block, vec4_instruction, inst, cfg) { if (last_reg != inst->dst.nr || last_reg_offset != inst->dst.reg_offset || last_reg_file != inst->dst.file) { + progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count, + writemask); + inst_count = 0; + writemask = 0; last_reg = inst->dst.nr; last_reg_offset = inst->dst.reg_offset; last_reg_file = inst->dst.file; - remaining_channels = WRITEMASK_XYZW; - inst_count = 0; + for (int i = 0; i < 4; i++) { + imm[i] = 0; + } } if (inst->opcode != BRW_OPCODE_MOV || inst->dst.writemask == WRITEMASK_XYZW || inst->src[0].file != IMM || + inst->predicate != BRW_PREDICATE_NONE) { + progress |= vectorize_mov(block, inst, imm, imm_inst, inst_count, + writemask); + inst_count = 0; + last_reg = -1; continue; + } int vf =
brw_float_to_vf(inst->src[0].f); if (vf == -1) @@ -363,23 +396,8 @@ vec4_visitor::opt_vector_float() if ((inst->dst.writemask & WRITEMASK_W) != 0) imm[3] = vf; + writemask |= inst->dst.writemask; imm_inst[inst_count++] = inst; - - remaining_channels &= ~inst->dst.writemask; - if (remaining_channels == 0) { - unsigned vf; - memcpy(&vf, imm, sizeof(vf)); - vec4_instruction *mov = MOV(inst->dst, brw_imm_vf(vf)); - mov->dst.type = BRW_REGISTER_TYPE_F; - mov->dst.writemask = WRITEMASK_XYZW; - inst->insert_after(block, mov); - last_reg = -1; - - for (int i = 0; i < inst_count; i++) { - imm_inst[i]->remove(block); - } - progress = true; - } } if (progress) @@ -1027,6 +1045,7 @@ vec4_visitor::opt_register_coalesce() if (is_nop_mov) { inst->remove(block); + progress = true; continue; } } diff --git a/src/mesa/drivers/dri/i965/brw_vec4.h b/src/mesa/drivers/dri/i965/brw_vec4.h index 7b86e1bc050..6143f65efa1 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4.h +++ b/src/mesa/drivers/dri/i965/brw_vec4.h @@ -274,13 +274,6 @@ public: void emit_shader_time_end(); void emit_shader_time_write(int shader_time_subindex, src_reg value); - void emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg offset, src_reg src0, - src_reg src1); - - void emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg offset); - src_reg get_scratch_offset(bblock_t *block, vec4_instruction *inst, src_reg *reladdr, int reg_offset); void emit_scratch_read(bblock_t *block, vec4_instruction *inst, @@ -366,6 +359,10 @@ protected: virtual void gs_end_primitive(); private: + bool vectorize_mov(bblock_t *block, vec4_instruction *inst, + uint8_t imm[4], vec4_instruction *imm_inst[4], + int inst_count, unsigned writemask); + /** * If true, then register allocation should fail instead of spilling. */ diff --git a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp index 6bd992882b8..92423e1f942 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_copy_propagation.cpp @@ -76,22 +76,6 @@ is_channel_updated(vec4_instruction *inst, src_reg *values[4], int ch) inst->dst.writemask & (1 << BRW_GET_SWZ(src->swizzle, ch))); } -static unsigned -swizzle_vf_imm(unsigned vf4, unsigned swizzle) -{ - union { - unsigned vf4; - uint8_t vf[4]; - } v = { vf4 }, ret; - - ret.vf[0] = v.vf[BRW_GET_SWZ(swizzle, 0)]; - ret.vf[1] = v.vf[BRW_GET_SWZ(swizzle, 1)]; - ret.vf[2] = v.vf[BRW_GET_SWZ(swizzle, 2)]; - ret.vf[3] = v.vf[BRW_GET_SWZ(swizzle, 3)]; - - return ret.vf4; -} - static bool is_logic_op(enum opcode opcode) { @@ -101,21 +85,66 @@ is_logic_op(enum opcode opcode) opcode == BRW_OPCODE_NOT); } +/** + * Get the origin of a copy as a single register if all components present in + * the given readmask originate from the same register and have compatible + * regions, otherwise return a BAD_FILE register. + */ +static src_reg +get_copy_value(const copy_entry &entry, unsigned readmask) +{ + unsigned swz[4] = {}; + src_reg value; + + for (unsigned i = 0; i < 4; i++) { + if (readmask & (1 << i)) { + if (entry.value[i]) { + src_reg src = *entry.value[i]; + + if (src.file == IMM) { + swz[i] = i; + } else { + swz[i] = BRW_GET_SWZ(src.swizzle, i); + /* Overwrite the original swizzle so the src_reg::equals call + * below doesn't care about it, the correct swizzle will be + * calculated once the swizzles of all components are known. 
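+ * (For instance, if channel .x of the copy reads r1.z and channel .y reads r1.w, both compare equal here as plain r1, and the final .zw swizzle is recomposed by the brw_compose_swizzle() call at the end of this function.)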
+ */ + src.swizzle = BRW_SWIZZLE_XYZW; + } + + if (value.file == BAD_FILE) { + value = src; + } else if (!value.equals(src)) { + return src_reg(); + } + } else { + return src_reg(); + } + } + } + + return swizzle(value, + brw_compose_swizzle(brw_swizzle_for_mask(readmask), + BRW_SWIZZLE4(swz[0], swz[1], + swz[2], swz[3]))); +} + static bool try_constant_propagate(const struct brw_device_info *devinfo, vec4_instruction *inst, - int arg, struct copy_entry *entry) + int arg, const copy_entry *entry) { /* For constant propagation, we only handle the same constant * across all 4 channels. Some day, we should handle the 8-bit * float vector format, which would let us constant propagate * vectors better. + * We could be more aggressive here -- some channels might not get used + * based on the destination writemask. */ - src_reg value = *entry->value[0]; - for (int i = 1; i < 4; i++) { - if (!value.equals(*entry->value[i])) - return false; - } + src_reg value = + get_copy_value(*entry, + brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, + WRITEMASK_XYZW)); if (value.file != IMM) return false; @@ -144,8 +173,7 @@ try_constant_propagate(const struct brw_device_info *devinfo, } } - if (value.type == BRW_REGISTER_TYPE_VF) - value.ud = swizzle_vf_imm(value.ud, inst->src[arg].swizzle); + value = swizzle(value, inst->src[arg].swizzle); switch (inst->opcode) { case BRW_OPCODE_MOV: @@ -255,38 +283,15 @@ try_constant_propagate(const struct brw_device_info *devinfo, static bool try_copy_propagate(const struct brw_device_info *devinfo, vec4_instruction *inst, int arg, - struct copy_entry *entry, int attributes_per_reg) + const copy_entry *entry, int attributes_per_reg) { /* Build up the value we are propagating as if it were the source of a * single MOV */ - /* For constant propagation, we only handle the same constant - * across all 4 channels. Some day, we should handle the 8-bit - * float vector format, which would let us constant propagate - * vectors better. - */ - src_reg value = *entry->value[0]; - for (int i = 1; i < 4; i++) { - /* This is equals() except we don't care about the swizzle. */ - if (value.file != entry->value[i]->file || - value.nr != entry->value[i]->nr || - value.reg_offset != entry->value[i]->reg_offset || - value.type != entry->value[i]->type || - value.negate != entry->value[i]->negate || - value.abs != entry->value[i]->abs) { - return false; - } - } - - /* Compute the swizzle of the original register by swizzling the - * component loaded from each value according to the swizzle of - * operand we're going to change. - */ - int s[4]; - for (int i = 0; i < 4; i++) { - s[i] = BRW_GET_SWZ(entry->value[i]->swizzle, i); - } - value.swizzle = BRW_SWIZZLE4(s[0], s[1], s[2], s[3]); + src_reg value = + get_copy_value(*entry, + brw_apply_inv_swizzle_to_mask(inst->src[arg].swizzle, + WRITEMASK_XYZW)); /* Check that we can propagate that value */ if (value.file != UNIFORM && @@ -435,43 +440,13 @@ vec4_visitor::opt_copy_propagation(bool do_constant_prop) if (inst->regs_read(i) != 1) continue; - int reg = (alloc.offsets[inst->src[i].nr] + - inst->src[i].reg_offset); - - /* Find the regs that each swizzle component came from. - */ - struct copy_entry entry; - memset(&entry, 0, sizeof(copy_entry)); - int c; - for (c = 0; c < 4; c++) { - int channel = BRW_GET_SWZ(inst->src[i].swizzle, c); - entry.value[c] = entries[reg].value[channel]; - - /* If there's no available copy for this channel, bail. 
- * We could be more aggressive here -- some channels might - * not get used based on the destination writemask. - */ - if (!entry.value[c]) - break; - - entry.saturatemask |= - (entries[reg].saturatemask & (1 << channel) ? 1 : 0) << c; - - /* We'll only be able to copy propagate if the sources are - * all from the same file -- there's no ability to swizzle - * 0 or 1 constants in with source registers like in i915. - */ - if (c > 0 && entry.value[c - 1]->file != entry.value[c]->file) - break; - } - - if (c != 4) - continue; + const unsigned reg = (alloc.offsets[inst->src[i].nr] + + inst->src[i].reg_offset); + const copy_entry &entry = entries[reg]; if (do_constant_prop && try_constant_propagate(devinfo, inst, i, &entry)) progress = true; - - if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) + else if (try_copy_propagate(devinfo, inst, i, &entry, attributes_per_reg)) progress = true; } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp index 17d5f2aeff4..4686f2014c6 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_nir.cpp @@ -717,24 +717,34 @@ vec4_visitor::nir_emit_intrinsic(nir_intrinsic_instr *instr) (unsigned) instr->const_index[0]; src_reg offset = get_nir_src(instr->src[0], nir_type_int, instr->num_components); + const src_reg surface = brw_imm_ud(surf_index); + const vec4_builder bld = + vec4_builder(this).at_end().annotate(current_annotation, base_ir); + src_reg tmp; + dest = get_nir_dest(instr->dest); switch (instr->intrinsic) { - case nir_intrinsic_atomic_counter_inc: - emit_untyped_atomic(BRW_AOP_INC, surf_index, dest, offset, - src_reg(), src_reg()); - break; - case nir_intrinsic_atomic_counter_dec: - emit_untyped_atomic(BRW_AOP_PREDEC, surf_index, dest, offset, - src_reg(), src_reg()); - break; - case nir_intrinsic_atomic_counter_read: - emit_untyped_surface_read(surf_index, dest, offset); - break; - default: - unreachable("Unreachable"); + case nir_intrinsic_atomic_counter_inc: + tmp = emit_untyped_atomic(bld, surface, offset, + src_reg(), src_reg(), + 1, 1, + BRW_AOP_INC); + break; + case nir_intrinsic_atomic_counter_dec: + tmp = emit_untyped_atomic(bld, surface, offset, + src_reg(), src_reg(), + 1, 1, + BRW_AOP_PREDEC); + break; + case nir_intrinsic_atomic_counter_read: + tmp = emit_untyped_read(bld, surface, offset, 1, 1); + break; + default: + unreachable("Unreachable"); } + bld.MOV(retype(dest, tmp.type), tmp); brw_mark_surface_used(stage_prog_data, surf_index); break; } @@ -861,12 +871,11 @@ vec4_visitor::nir_emit_ssbo_atomic(int op, nir_intrinsic_instr *instr) const vec4_builder bld = vec4_builder(this).at_end().annotate(current_annotation, base_ir); - src_reg atomic_result = - surface_access::emit_untyped_atomic(bld, surface, offset, - data1, data2, - 1 /* dims */, 1 /* rsize */, - op, - BRW_PREDICATE_NONE); + src_reg atomic_result = emit_untyped_atomic(bld, surface, offset, + data1, data2, + 1 /* dims */, 1 /* rsize */, + op, + BRW_PREDICATE_NONE); dest.type = atomic_result.type; bld.MOV(dest, atomic_result); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp b/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp index 28002c56cdc..1db349ab8ce 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_surface_builder.cpp @@ -221,7 +221,7 @@ namespace brw { emit_insert(bld, addr, dims, has_simd4x2), has_simd4x2 ? 
1 : dims, emit_insert(bld, src_reg(srcs), size, has_simd4x2), - has_simd4x2 ? 1 : size, + has_simd4x2 && size ? 1 : size, surface, op, rsize, pred); } diff --git a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp index 8418a3cdc01..4cfbc143d5a 100644 --- a/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp +++ b/src/mesa/drivers/dri/i965/brw_vec4_visitor.cpp @@ -1115,61 +1115,6 @@ vec4_visitor::gs_end_primitive() } void -vec4_visitor::emit_untyped_atomic(unsigned atomic_op, unsigned surf_index, - dst_reg dst, src_reg surf_offset, - src_reg src0, src_reg src1) -{ - unsigned mlen = 1 + (src0.file != BAD_FILE) + (src1.file != BAD_FILE); - src_reg src_payload(this, glsl_type::uint_type, mlen); - dst_reg payload(src_payload); - payload.writemask = WRITEMASK_X; - - /* Set the atomic operation offset. */ - emit(MOV(offset(payload, 0), surf_offset)); - unsigned i = 1; - - /* Set the atomic operation arguments. */ - if (src0.file != BAD_FILE) { - emit(MOV(offset(payload, i), src0)); - i++; - } - - if (src1.file != BAD_FILE) { - emit(MOV(offset(payload, i), src1)); - i++; - } - - /* Emit the instruction. Note that this maps to the normal SIMD8 - * untyped atomic message on Ivy Bridge, but that's OK because - * unused channels will be masked out. - */ - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_ATOMIC, dst, - src_payload, - brw_imm_ud(surf_index), brw_imm_ud(atomic_op)); - inst->mlen = mlen; -} - -void -vec4_visitor::emit_untyped_surface_read(unsigned surf_index, dst_reg dst, - src_reg surf_offset) -{ - dst_reg offset(this, glsl_type::uint_type); - offset.writemask = WRITEMASK_X; - - /* Set the surface read offset. */ - emit(MOV(offset, surf_offset)); - - /* Emit the instruction. Note that this maps to the normal SIMD8 - * untyped surface read message, but that's OK because unused - * channels will be masked out. 
- */ - vec4_instruction *inst = emit(SHADER_OPCODE_UNTYPED_SURFACE_READ, dst, - src_reg(offset), - brw_imm_ud(surf_index), brw_imm_d(1)); - inst->mlen = 1; -} - -void vec4_visitor::emit_ndc_computation() { if (output_reg[VARYING_SLOT_POS].file == BAD_FILE) diff --git a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp index 9935557ae70..08f9bb3330a 100644 --- a/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp +++ b/src/mesa/drivers/dri/i965/gen6_gs_visitor.cpp @@ -611,11 +611,11 @@ gen6_gs_visitor::xfb_write() emit(CMP(dst_null_d(), sol_temp, this->max_svbi, BRW_CONDITIONAL_LE)); emit(IF(BRW_PREDICATE_NORMAL)); { - src_reg destination_indices_uw = - retype(destination_indices, BRW_REGISTER_TYPE_UW); - - vec4_instruction *inst = emit(MOV(dst_reg(destination_indices_uw), - brw_imm_v(0x00020100))); /* (0, 1, 2) */ + vec4_instruction *inst = emit(MOV(dst_reg(destination_indices), + brw_imm_vf4(brw_float_to_vf(0.0), + brw_float_to_vf(1.0), + brw_float_to_vf(2.0), + brw_float_to_vf(0.0)))); inst->force_writemask_all = true; emit(ADD(dst_reg(this->destination_indices), diff --git a/src/mesa/drivers/dri/i965/intel_extensions.c b/src/mesa/drivers/dri/i965/intel_extensions.c index a39693b68f7..60ac124ecd0 100644 --- a/src/mesa/drivers/dri/i965/intel_extensions.c +++ b/src/mesa/drivers/dri/i965/intel_extensions.c @@ -196,6 +196,7 @@ intelInitExtensions(struct gl_context *ctx) ctx->Extensions.ARB_half_float_vertex = true; ctx->Extensions.ARB_instanced_arrays = true; ctx->Extensions.ARB_internalformat_query = true; + ctx->Extensions.ARB_internalformat_query2 = true; ctx->Extensions.ARB_map_buffer_range = true; ctx->Extensions.ARB_occlusion_query = true; ctx->Extensions.ARB_occlusion_query2 = true; diff --git a/src/mesa/drivers/dri/i965/intel_fbo.c b/src/mesa/drivers/dri/i965/intel_fbo.c index 3a4a53a07e6..b7b679686e5 100644 --- a/src/mesa/drivers/dri/i965/intel_fbo.c +++ b/src/mesa/drivers/dri/i965/intel_fbo.c @@ -289,7 +289,7 @@ intel_alloc_private_renderbuffer_storage(struct gl_context * ctx, struct gl_rend rb->NumSamples = intel_quantize_num_samples(screen, rb->NumSamples); rb->Width = width; rb->Height = height; - rb->_BaseFormat = _mesa_base_fbo_format(ctx, internalFormat); + rb->_BaseFormat = _mesa_get_format_base_format(rb->Format); intel_miptree_release(&irb->mt); diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c index ee7c1d7bc2c..c6eb50aaba8 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.c +++ b/src/mesa/drivers/dri/i965/intel_screen.c @@ -891,7 +891,7 @@ brw_query_renderer_string(__DRIscreen *psp, int param, const char **value) value[0] = brw_vendor_string; return 0; case __DRI2_RENDERER_DEVICE_ID: - value[0] = brw_get_renderer_string(intelScreen->deviceID); + value[0] = brw_get_renderer_string(intelScreen); return 0; default: break; @@ -1082,6 +1082,7 @@ static bool intel_init_bufmgr(struct intel_screen *intelScreen) { __DRIscreen *spriv = intelScreen->driScrnPriv; + bool devid_override = getenv("INTEL_DEVID_OVERRIDE") != NULL; intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL; @@ -1099,6 +1100,25 @@ intel_init_bufmgr(struct intel_screen *intelScreen) return false; } + intelScreen->subslice_total = -1; + intelScreen->eu_total = -1; + + /* Everything below this is for real hardware only */ + if (intelScreen->no_hw || devid_override) + return true; + + intel_get_param(spriv, I915_PARAM_SUBSLICE_TOTAL, + &intelScreen->subslice_total); + intel_get_param(spriv, I915_PARAM_EU_TOTAL, 
&intelScreen->eu_total); + + /* Without this information, we cannot get the right Braswell brandstrings, + * and we have to use conservative numbers for GPGPU on many platforms, but + * otherwise, things will just work. + */ + if (intelScreen->subslice_total == -1 || intelScreen->eu_total == -1) + _mesa_warning(NULL, + "Kernel 4.1 required to properly query GPU properties.\n"); + return true; } diff --git a/src/mesa/drivers/dri/i965/intel_screen.h b/src/mesa/drivers/dri/i965/intel_screen.h index 3a5f22c3a67..01d45d0c016 100644 --- a/src/mesa/drivers/dri/i965/intel_screen.h +++ b/src/mesa/drivers/dri/i965/intel_screen.h @@ -81,7 +81,17 @@ struct intel_screen * I915_PARAM_CMD_PARSER_VERSION parameter */ int cmd_parser_version; - }; + + /** + * Number of subslices reported by the I915_PARAM_SUBSLICE_TOTAL parameter + */ + int subslice_total; + + /** + * Number of EUs reported by the I915_PARAM_EU_TOTAL parameter + */ + int eu_total; +}; extern void intelDestroyContext(__DRIcontext * driContextPriv); diff --git a/src/mesa/drivers/dri/i965/intel_tex_image.c b/src/mesa/drivers/dri/i965/intel_tex_image.c index e21c3ac543f..1601edddef6 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_image.c +++ b/src/mesa/drivers/dri/i965/intel_tex_image.c @@ -50,7 +50,7 @@ intel_miptree_create_for_teximage(struct brw_context *brw, width <<= 1; if (height != 1) height <<= 1; - if (depth != 1) + if (intelObj->base.Target == GL_TEXTURE_3D) depth <<= 1; } @@ -111,7 +111,6 @@ intelTexImage(struct gl_context * ctx, texImage->Width, texImage->Height, texImage->Depth, format, type, pixels, - false /*allocate_storage*/, tex_busy, unpack); if (ok) return; diff --git a/src/mesa/drivers/dri/i965/intel_tex_subimage.c b/src/mesa/drivers/dri/i965/intel_tex_subimage.c index 573f701acdd..4849a4151e2 100644 --- a/src/mesa/drivers/dri/i965/intel_tex_subimage.c +++ b/src/mesa/drivers/dri/i965/intel_tex_subimage.c @@ -214,7 +214,7 @@ intelTexSubImage(struct gl_context * ctx, ok = _mesa_meta_pbo_TexSubImage(ctx, dims, texImage, xoffset, yoffset, zoffset, width, height, depth, format, type, - pixels, false, tex_busy, packing); + pixels, tex_busy, packing); if (ok) return; diff --git a/src/mesa/drivers/x11/xmesa.h b/src/mesa/drivers/x11/xmesa.h index b6a2576d492..cc878e7402e 100644 --- a/src/mesa/drivers/x11/xmesa.h +++ b/src/mesa/drivers/x11/xmesa.h @@ -64,15 +64,15 @@ and create a window, you must do the following to use the X/Mesa interface: #ifndef XMESA_H #define XMESA_H -#ifdef __cplusplus -extern "C" { -#endif - #include <X11/Xlib.h> #include <X11/Xutil.h> #include "xmesa_x.h" #include "GL/gl.h" +#ifdef __cplusplus +extern "C" { +#endif + #define XMESA_MAJOR_VERSION 6 #define XMESA_MINOR_VERSION 3 diff --git a/src/mesa/main/config.h b/src/mesa/main/config.h index 820ae072da6..bc5e56923b8 100644 --- a/src/mesa/main/config.h +++ b/src/mesa/main/config.h @@ -132,11 +132,6 @@ */ #define MAX_TEXTURE_UNITS ((MAX_TEXTURE_COORD_UNITS > MAX_TEXTURE_IMAGE_UNITS) ? 
MAX_TEXTURE_COORD_UNITS : MAX_TEXTURE_IMAGE_UNITS) - - /** Maximum viewport size */ -#define MAX_VIEWPORT_WIDTH 16384 -#define MAX_VIEWPORT_HEIGHT 16384 - /** Maximum number of viewports supported with ARB_viewport_array */ #define MAX_VIEWPORTS 16 diff --git a/src/mesa/main/context.c b/src/mesa/main/context.c index 26eee28db4e..dbba136f526 100644 --- a/src/mesa/main/context.c +++ b/src/mesa/main/context.c @@ -582,8 +582,8 @@ _mesa_init_constants(struct gl_constants *consts, gl_api api) consts->MaxLights = MAX_LIGHTS; consts->MaxShininess = 128.0; consts->MaxSpotExponent = 128.0; - consts->MaxViewportWidth = MAX_VIEWPORT_WIDTH; - consts->MaxViewportHeight = MAX_VIEWPORT_HEIGHT; + consts->MaxViewportWidth = 16384; + consts->MaxViewportHeight = 16384; consts->MinMapBufferAlignment = 64; /* Driver must override these values if ARB_viewport_array is supported. */ diff --git a/src/mesa/main/dd.h b/src/mesa/main/dd.h index 3f5aa5db051..60bc8ef4411 100644 --- a/src/mesa/main/dd.h +++ b/src/mesa/main/dd.h @@ -184,22 +184,24 @@ struct dd_function_table { GLenum srcFormat, GLenum srcType ); /** - * Determine sample counts support for a particular target and format + * Queries different driver parameters for a particular target and format. + * Since ARB_internalformat_query2 introduced several new query parameters + * over ARB_internalformat_query, having one driver hook for each parameter + * is no longer feasible. So this is the generic entry-point for calls + * to glGetInternalformativ and glGetInternalformati64v, after Mesa has + * checked errors and default values. * * \param ctx GL context * \param target GL target enum * \param internalFormat GL format enum - * \param samples Buffer to hold the returned sample counts. - * Drivers \b must \b not return more than 16 counts. - * - * \returns - * The number of sample counts actually written to \c samples. If - * \c internaFormat is not renderable, zero is returned. - */ - size_t (*QuerySamplesForFormat)(struct gl_context *ctx, - GLenum target, - GLenum internalFormat, - int samples[16]); + * \param pname GL enum that specifies the info to query. + * \param params Buffer to hold the result of the query.
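+ * + * Mesa guarantees that \c params is non-NULL and holds at least 16 elements, so driver implementations (the i965 hook above relies on this) need not validate the buffer again.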
+ */ + void (*QueryInternalFormat)(struct gl_context *ctx, + GLenum target, + GLenum internalFormat, + GLenum pname, + GLint *params); /** * Called by glTexImage[123]D() and glCopyTexImage[12]D() diff --git a/src/mesa/main/debug_output.c b/src/mesa/main/debug_output.c index 10ee6757cc1..c2b9f053352 100644 --- a/src/mesa/main/debug_output.c +++ b/src/mesa/main/debug_output.c @@ -761,15 +761,11 @@ _mesa_set_debug_state_int(struct gl_context *ctx, GLenum pname, GLint val) GLint _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) { - struct gl_debug_state *debug; GLint val; - mtx_lock(&ctx->DebugMutex); - debug = ctx->Debug; - if (!debug) { - mtx_unlock(&ctx->DebugMutex); + struct gl_debug_state *debug = _mesa_lock_debug_state(ctx); + if (!debug) return 0; - } switch (pname) { case GL_DEBUG_OUTPUT: @@ -794,7 +790,7 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) break; } - mtx_unlock(&ctx->DebugMutex); + _mesa_unlock_debug_state(ctx); return val; } @@ -806,15 +802,11 @@ _mesa_get_debug_state_int(struct gl_context *ctx, GLenum pname) void * _mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname) { - struct gl_debug_state *debug; void *val; + struct gl_debug_state *debug = _mesa_lock_debug_state(ctx); - mtx_lock(&ctx->DebugMutex); - debug = ctx->Debug; - if (!debug) { - mtx_unlock(&ctx->DebugMutex); + if (!debug) return NULL; - } switch (pname) { case GL_DEBUG_CALLBACK_FUNCTION_ARB: @@ -829,7 +821,7 @@ _mesa_get_debug_state_ptr(struct gl_context *ctx, GLenum pname) break; } - mtx_unlock(&ctx->DebugMutex); + _mesa_unlock_debug_state(ctx); return val; } diff --git a/src/mesa/main/extensions_table.h b/src/mesa/main/extensions_table.h index 50e050e9009..54a5bb057a3 100644 --- a/src/mesa/main/extensions_table.h +++ b/src/mesa/main/extensions_table.h @@ -73,6 +73,7 @@ EXT(ARB_half_float_vertex , ARB_half_float_vertex EXT(ARB_indirect_parameters , ARB_indirect_parameters , x , GLC, x , x , 2013) EXT(ARB_instanced_arrays , ARB_instanced_arrays , GLL, GLC, x , x , 2008) EXT(ARB_internalformat_query , ARB_internalformat_query , GLL, GLC, x , x , 2011) +EXT(ARB_internalformat_query2 , ARB_internalformat_query2 , GLL, GLC, x , x , 2013) EXT(ARB_invalidate_subdata , dummy_true , GLL, GLC, x , x , 2012) EXT(ARB_map_buffer_alignment , dummy_true , GLL, GLC, x , x , 2011) EXT(ARB_map_buffer_range , ARB_map_buffer_range , GLL, GLC, x , x , 2008) @@ -95,6 +96,7 @@ EXT(ARB_sampler_objects , dummy_true EXT(ARB_seamless_cube_map , ARB_seamless_cube_map , GLL, GLC, x , x , 2009) EXT(ARB_seamless_cubemap_per_texture , AMD_seamless_cubemap_per_texture , GLL, GLC, x , x , 2013) EXT(ARB_separate_shader_objects , dummy_true , GLL, GLC, x , x , 2010) +EXT(ARB_shader_atomic_counter_ops , ARB_shader_atomic_counter_ops , GLL, GLC, x , x , 2015) EXT(ARB_shader_atomic_counters , ARB_shader_atomic_counters , GLL, GLC, x , x , 2011) EXT(ARB_shader_bit_encoding , ARB_shader_bit_encoding , GLL, GLC, x , x , 2010) EXT(ARB_shader_clock , ARB_shader_clock , GLL, GLC, x , x , 2015) diff --git a/src/mesa/main/fbobject.c b/src/mesa/main/fbobject.c index c9e1518ab23..d490918b816 100644 --- a/src/mesa/main/fbobject.c +++ b/src/mesa/main/fbobject.c @@ -3580,8 +3580,22 @@ _mesa_get_framebuffer_attachment_parameter(struct gl_context *ctx, const struct gl_renderbuffer_attachment *att; GLenum err; - /* The error differs in GL and GLES. */ - err = _mesa_is_desktop_gl(ctx) ? GL_INVALID_OPERATION : GL_INVALID_ENUM; + /* The error code for an attachment type of GL_NONE differs between APIs. 
+ * + * From the ES 2.0.25 specification, page 127: + * "If the value of FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is NONE, then + * querying any other pname will generate INVALID_ENUM." + * + * From the OpenGL 3.0 specification, page 337, or identically, + * the OpenGL ES 3.0.4 specification, page 240: + * + * "If the value of FRAMEBUFFER_ATTACHMENT_OBJECT_TYPE is NONE, no + * framebuffer is bound to target. In this case querying pname + * FRAMEBUFFER_ATTACHMENT_OBJECT_NAME will return zero, and all other + * queries will generate an INVALID_OPERATION error." + */ + err = ctx->API == API_OPENGLES2 && ctx->Version < 30 ? + GL_INVALID_ENUM : GL_INVALID_OPERATION; if (_mesa_is_winsys_fbo(buffer)) { /* Page 126 (page 136 of the PDF) of the OpenGL ES 2.0.25 spec @@ -4170,7 +4184,8 @@ _mesa_InvalidateFramebuffer(GLenum target, GLsizei numAttachments, */ invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments, 0, 0, - MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT, + ctx->Const.MaxViewportWidth, + ctx->Const.MaxViewportHeight, "glInvalidateFramebuffer"); } @@ -4210,7 +4225,8 @@ _mesa_InvalidateNamedFramebufferData(GLuint framebuffer, */ invalidate_framebuffer_storage(ctx, fb, numAttachments, attachments, 0, 0, - MAX_VIEWPORT_WIDTH, MAX_VIEWPORT_HEIGHT, + ctx->Const.MaxViewportWidth, + ctx->Const.MaxViewportHeight, "glInvalidateNamedFramebufferData"); } diff --git a/src/mesa/main/formatquery.c b/src/mesa/main/formatquery.c index 816f12bf9e2..215c14f889f 100644 --- a/src/mesa/main/formatquery.c +++ b/src/mesa/main/formatquery.c @@ -28,155 +28,1495 @@ #include "enums.h" #include "fbobject.h" #include "formatquery.h" +#include "teximage.h" +#include "texparam.h" +#include "texobj.h" +#include "get.h" +#include "genmipmap.h" +#include "shaderimage.h" +#include "texcompress.h" +#include "textureview.h" -/* default implementation of QuerySamplesForFormat driverfunc, for - * non-multisample-capable drivers. */ -size_t -_mesa_query_samples_for_format(struct gl_context *ctx, GLenum target, - GLenum internalFormat, int samples[16]) +static bool +_is_renderable(struct gl_context *ctx, GLenum internalformat) { - (void) target; - (void) internalFormat; - (void) ctx; + /* Section 4.4.4 on page 212 of the GLES 3.0.4 spec says: + * + * "An internal format is color-renderable if it is one of the + * formats from table 3.13 noted as color-renderable or if it + * is unsized format RGBA or RGB." + * + * Therefore, we must accept GL_RGB and GL_RGBA here. + */ + if (internalformat != GL_RGB && internalformat != GL_RGBA && + _mesa_base_fbo_format(ctx, internalformat) == 0) + return false; - samples[0] = 1; - return 1; + return true; } +/* Handles the cases where either ARB_internalformat_query or + * ARB_internalformat_query2 has to return an error.
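+ * + * Returns false after recording the GL error when validation fails, so the callers can simply bail out.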
+ */ +static bool +_legal_parameters(struct gl_context *ctx, GLenum target, GLenum internalformat, + GLenum pname, GLsizei bufSize, GLint *params) -void GLAPIENTRY -_mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, - GLsizei bufSize, GLint *params) { - GLint buffer[16]; - GLsizei count = 0; - GET_CURRENT_CONTEXT(ctx); + bool query2 = _mesa_has_ARB_internalformat_query2(ctx); - ASSERT_OUTSIDE_BEGIN_END(ctx); - - if (!ctx->Extensions.ARB_internalformat_query) { - _mesa_error(ctx, GL_INVALID_OPERATION, "glGetInternalformativ"); - return; - } - - assert(ctx->Driver.QuerySamplesForFormat != NULL); - - /* The ARB_internalformat_query spec says: + /* The ARB_internalformat_query2 spec says: * - * "If the <target> parameter to GetInternalformativ is not one of - * TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY or RENDERBUFFER - * then an INVALID_ENUM error is generated." + * "The INVALID_ENUM error is generated if the <target> parameter to + * GetInternalformati*v is not one of the targets listed in Table 6.xx. */ - switch (target) { + switch(target){ + case GL_TEXTURE_1D: + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_3D: + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_RECTANGLE: + case GL_TEXTURE_BUFFER: + if (!query2) { + /* The ARB_internalformat_query spec says: + * + * "If the <target> parameter to GetInternalformativ is not one of + * TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY + * or RENDERBUFFER then an INVALID_ENUM error is generated. + */ + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(target=%s)", + _mesa_enum_to_string(target)); + + return false; + } + break; + case GL_RENDERBUFFER: break; case GL_TEXTURE_2D_MULTISAMPLE: case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: - /* These enums are only valid if ARB_texture_multisample is supported */ - if ((_mesa_is_desktop_gl(ctx) && - ctx->Extensions.ARB_texture_multisample) || - _mesa_is_gles31(ctx)) - break; + /* The non-existence of ARB_texture_multisample is treated in + * ARB_internalformat_query implementation like an error. + */ + if (!query2 && + !(_mesa_has_ARB_texture_multisample(ctx) || _mesa_is_gles31(ctx))) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(target=%s)", + _mesa_enum_to_string(target)); + + return false; + } + break; default: _mesa_error(ctx, GL_INVALID_ENUM, "glGetInternalformativ(target=%s)", _mesa_enum_to_string(target)); - return; + return false; } - /* The ARB_internalformat_query spec says: - * - * "If the <internalformat> parameter to GetInternalformativ is not - * color-, depth- or stencil-renderable, then an INVALID_ENUM error is - * generated." - * - * Page 243 of the GLES 3.0.4 spec says this for GetInternalformativ: - * - * "internalformat must be color-renderable, depth-renderable or - * stencilrenderable (as defined in section 4.4.4)." - * - * Section 4.4.4 on page 212 of the same spec says: - * - * "An internal format is color-renderable if it is one of the - * formats from table 3.13 noted as color-renderable or if it - * is unsized format RGBA or RGB." + + /* The ARB_internalformat_query2 spec says: * - * Therefore, we must accept GL_RGB and GL_RGBA here. + * "The INVALID_ENUM error is generated if the <pname> parameter is + * not one of the listed possibilities. 
*/ - if (internalformat != GL_RGB && internalformat != GL_RGBA && - _mesa_base_fbo_format(ctx, internalformat) == 0) { + switch(pname){ + case GL_SAMPLES: + case GL_NUM_SAMPLE_COUNTS: + break; + + case GL_SRGB_DECODE_ARB: + /* The ARB_internalformat_query2 spec says: + * + * "If ARB_texture_sRGB_decode or EXT_texture_sRGB_decode or + * equivalent functionality is not supported, queries for the + * SRGB_DECODE_ARB <pname> set the INVALID_ENUM error. + */ + if (!_mesa_has_EXT_texture_sRGB_decode(ctx)) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(pname=%s)", + _mesa_enum_to_string(pname)); + return false; + } + /* fallthrough */ + case GL_INTERNALFORMAT_SUPPORTED: + case GL_INTERNALFORMAT_PREFERRED: + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_SHARED_SIZE: + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_DEPTH_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: + case GL_MAX_WIDTH: + case GL_MAX_HEIGHT: + case GL_MAX_DEPTH: + case GL_MAX_LAYERS: + case GL_MAX_COMBINED_DIMENSIONS: + case GL_COLOR_COMPONENTS: + case GL_DEPTH_COMPONENTS: + case GL_STENCIL_COMPONENTS: + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + case GL_FRAMEBUFFER_BLEND: + case GL_READ_PIXELS: + case GL_READ_PIXELS_FORMAT: + case GL_READ_PIXELS_TYPE: + case GL_TEXTURE_IMAGE_FORMAT: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_TYPE: + case GL_MIPMAP: + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + case GL_COLOR_ENCODING: + case GL_SRGB_READ: + case GL_SRGB_WRITE: + case GL_FILTER: + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + case GL_TEXTURE_SHADOW: + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + case GL_SHADER_IMAGE_ATOMIC: + case GL_IMAGE_TEXEL_SIZE: + case GL_IMAGE_COMPATIBILITY_CLASS: + case GL_IMAGE_PIXEL_FORMAT: + case GL_IMAGE_PIXEL_TYPE: + case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + case GL_TEXTURE_COMPRESSED: + case GL_TEXTURE_COMPRESSED_BLOCK_WIDTH: + case GL_TEXTURE_COMPRESSED_BLOCK_HEIGHT: + case GL_TEXTURE_COMPRESSED_BLOCK_SIZE: + case GL_CLEAR_BUFFER: + case GL_TEXTURE_VIEW: + case GL_VIEW_COMPATIBILITY_CLASS: + /* The ARB_internalformat_query spec says: + * + * "If the <pname> parameter to GetInternalformativ is not SAMPLES + * or NUM_SAMPLE_COUNTS, then an INVALID_ENUM error is generated." 
+ */ + if (!query2) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(pname=%s)", + _mesa_enum_to_string(pname)); + + return false; + } + break; + + default: + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(pname=%s)", + _mesa_enum_to_string(pname)); + return false; + } /* The ARB_internalformat_query spec says: * * "If the <bufSize> parameter to GetInternalformativ is negative, then * an INVALID_VALUE error is generated." + * + * Nothing is said in ARB_internalformat_query2 but we assume the same. */ if (bufSize < 0) { _mesa_error(ctx, GL_INVALID_VALUE, "glGetInternalformativ(target=%s)", _mesa_enum_to_string(target)); + return false; + } + + /* The ARB_internalformat_query spec says: + * + * "If the <internalformat> parameter to GetInternalformativ is not + * color-, depth- or stencil-renderable, then an INVALID_ENUM error is + * generated." + */ + if (!query2 && !_is_renderable(ctx, internalformat)) { + _mesa_error(ctx, GL_INVALID_ENUM, + "glGetInternalformativ(internalformat=%s)", + _mesa_enum_to_string(internalformat)); + return false; + } + + return true; +} + +/* Sets the appropriate "unsupported" response as defined by the + * ARB_internalformat_query2 spec for each <pname>. + */ +static void +_set_default_response(GLenum pname, GLint buffer[16]) +{ + /* The ARB_internalformat_query2 spec defines which response best + * represents "not supported" or "not applicable" for each <pname>. + * + * " In general: + * - size- or count-based queries will return zero, + * - support-, format- or type-based queries will return NONE, + * - boolean-based queries will return FALSE, and + * - list-based queries return no entries." + */ + switch(pname) { + case GL_SAMPLES: + break; + + case GL_MAX_COMBINED_DIMENSIONS: + /* This value can be a 64-bit value. As the default is the 32-bit query, + * we pack 2 32-bit integers.
So we need to clean both */ + buffer[0] = 0; + buffer[1] = 0; + break; + + case GL_NUM_SAMPLE_COUNTS: + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_SHARED_SIZE: + case GL_MAX_WIDTH: + case GL_MAX_HEIGHT: + case GL_MAX_DEPTH: + case GL_MAX_LAYERS: + case GL_IMAGE_TEXEL_SIZE: + case GL_TEXTURE_COMPRESSED_BLOCK_WIDTH: + case GL_TEXTURE_COMPRESSED_BLOCK_HEIGHT: + case GL_TEXTURE_COMPRESSED_BLOCK_SIZE: + buffer[0] = 0; + break; + + case GL_INTERNALFORMAT_PREFERRED: + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_DEPTH_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + case GL_FRAMEBUFFER_BLEND: + case GL_READ_PIXELS: + case GL_READ_PIXELS_FORMAT: + case GL_READ_PIXELS_TYPE: + case GL_TEXTURE_IMAGE_FORMAT: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_TYPE: + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + case GL_COLOR_ENCODING: + case GL_SRGB_READ: + case GL_SRGB_WRITE: + case GL_SRGB_DECODE_ARB: + case GL_FILTER: + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + case GL_TEXTURE_SHADOW: + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + case GL_SHADER_IMAGE_ATOMIC: + case GL_IMAGE_COMPATIBILITY_CLASS: + case GL_IMAGE_PIXEL_FORMAT: + case GL_IMAGE_PIXEL_TYPE: + case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + case GL_CLEAR_BUFFER: + case GL_TEXTURE_VIEW: + case GL_VIEW_COMPATIBILITY_CLASS: + buffer[0] = GL_NONE; + break; + + case GL_INTERNALFORMAT_SUPPORTED: + case GL_COLOR_COMPONENTS: + case GL_DEPTH_COMPONENTS: + case GL_STENCIL_COMPONENTS: + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + case GL_MIPMAP: + case GL_TEXTURE_COMPRESSED: + buffer[0] = GL_FALSE; + break; + + default: + unreachable("invalid 'pname'"); + } +} + +static bool +_is_target_supported(struct gl_context *ctx, GLenum target) +{ + /* The ARB_internalformat_query2 spec says: + * + * "if a particular type of <target> is not supported by the + * implementation the "unsupported" answer should be given. + * This is not an error." 
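+ * + * Accordingly, each target below is gated on the extension or API version that introduces it; when this helper returns false, the caller is expected to write the pname's default "unsupported" response rather than raise an error.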
+ */ + switch(target){ + case GL_TEXTURE_2D: + case GL_TEXTURE_3D: + break; + + case GL_TEXTURE_1D: + if (!_mesa_is_desktop_gl(ctx)) + return false; + break; + + case GL_TEXTURE_1D_ARRAY: + if (!_mesa_has_EXT_texture_array(ctx)) + return false; + break; + + case GL_TEXTURE_2D_ARRAY: + if (!(_mesa_has_EXT_texture_array(ctx) || _mesa_is_gles3(ctx))) + return false; + break; + + case GL_TEXTURE_CUBE_MAP: + if (!_mesa_has_ARB_texture_cube_map(ctx)) + return false; + break; + + case GL_TEXTURE_CUBE_MAP_ARRAY: + if (!_mesa_has_ARB_texture_cube_map_array(ctx)) + return false; + break; + + case GL_TEXTURE_RECTANGLE: + if (!_mesa_has_NV_texture_rectangle(ctx)) + return false; + break; + + case GL_TEXTURE_BUFFER: + if (!_mesa_has_ARB_texture_buffer_object(ctx)) + return false; + break; + + case GL_RENDERBUFFER: + if (!(_mesa_has_ARB_framebuffer_object(ctx) || + _mesa_is_gles3(ctx))) + return false; + break; + + case GL_TEXTURE_2D_MULTISAMPLE: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + if (!(_mesa_has_ARB_texture_multisample(ctx) || + _mesa_is_gles31(ctx))) + return false; + break; + + default: + unreachable("invalid target"); + } + + return true; +} + +static bool +_is_resource_supported(struct gl_context *ctx, GLenum target, + GLenum internalformat, GLenum pname) +{ + /* From the ARB_internalformat_query2 spec: + * + * In the following descriptions, the term /resource/ is used to generically + * refer to an object of the appropriate type that has been created with + * <internalformat> and <target>. If the particular <target> and + * <internalformat> combination do not make sense, ... the "unsupported" + * answer should be given. This is not an error. + */ + + /* In the ARB_internalformat_query2 spec wording, some <pnames> do not care + * about the /resource/ being supported or not, so we return 'true' for those. + */ + switch (pname) { + case GL_INTERNALFORMAT_SUPPORTED: + case GL_INTERNALFORMAT_PREFERRED: + case GL_COLOR_COMPONENTS: + case GL_DEPTH_COMPONENTS: + case GL_STENCIL_COMPONENTS: + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + return true; + default: + break; + } + + switch(target){ + case GL_TEXTURE_1D: + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_3D: + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_RECTANGLE: + /* Based on what Mesa does for glTexImage1D/2D/3D and + * glCompressedTexImage1D/2D/3D functions. + */ + if (_mesa_base_tex_format(ctx, internalformat) < 0) + return false; + + /* additional checks for depth textures */ + if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalformat)) + return false; + + /* additional checks for compressed textures */ + if (_mesa_is_compressed_format(ctx, internalformat) && + (!_mesa_target_can_be_compressed(ctx, target, internalformat, NULL) || + _mesa_format_no_online_compression(ctx, internalformat))) + return false; + + break; + case GL_TEXTURE_2D_MULTISAMPLE: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + /* Based on what Mesa does for glTexImage2D/3DMultisample, + * glTexStorage2D/3DMultisample and + * glTextureStorage2D/3DMultisample functions. + */ + if (!_mesa_is_renderable_texture_format(ctx, internalformat)) + return false; + + break; + case GL_TEXTURE_BUFFER: + /* Based on what Mesa does for the glTexBuffer function.
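+ * For example (hypothetical query, not part of this patch), asking for
+ * GL_MAX_WIDTH of a compressed format on GL_TEXTURE_BUFFER:
+ *
+ *    GLint max_width;
+ *    glGetInternalformativ(GL_TEXTURE_BUFFER, GL_COMPRESSED_RGB8_ETC2,
+ *                          GL_MAX_WIDTH, 1, &max_width);
+ *
+ * yields the "unsupported" default of 0, because compressed formats are
+ * not valid texture buffer formats.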
*/ + if (_mesa_validate_texbuffer_format(ctx, internalformat) == + MESA_FORMAT_NONE) + return false; + + break; + case GL_RENDERBUFFER: + /* Based on what Mesa does for glRenderbufferStorage(Multisample) and + * glNamedRenderbufferStorage functions. + */ + if (!_mesa_base_fbo_format(ctx, internalformat)) + return false; + + break; + default: + unreachable("bad target"); + } + + return true; +} + +static bool +_is_internalformat_supported(struct gl_context *ctx, GLenum target, + GLenum internalformat) +{ + /* From the ARB_internalformat_query2 specification: + * + * "- INTERNALFORMAT_SUPPORTED: If <internalformat> is an internal format + * that is supported by the implementation in at least some subset of + * possible operations, TRUE is written to <params>. If <internalformat> + * is not a valid token for any internal format usage, FALSE is returned. + * + * <internalformats> that must be supported (in GL 4.2 or later) include + * the following: + * - "sized internal formats" from Table 3.12, 3.13, and 3.15, + * - any specific "compressed internal format" from Table 3.14, + * - any "image unit format" from Table 3.21. + * - any generic "compressed internal format" from Table 3.14, if the + * implementation accepts it for any texture specification commands, and + * - unsized or base internal format, if the implementation accepts + * it for texture or image specification. + */ + GLint buffer[1]; + + /* At this point an internalformat is valid if it is valid as a texture or + * as a renderbuffer format. The checks are different because those methods + * return different values when passed unsupported internalformats */ + if (_mesa_base_tex_format(ctx, internalformat) < 0 && + _mesa_base_fbo_format(ctx, internalformat) == 0) + return false; + + /* Let the driver have the final word */ + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, + GL_INTERNALFORMAT_SUPPORTED, buffer); + + return (buffer[0] == GL_TRUE); +} + +static bool +_legal_target_for_framebuffer_texture_layer(struct gl_context *ctx, + GLenum target) +{ + switch (target) { + case GL_TEXTURE_3D: + case GL_TEXTURE_1D_ARRAY: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + case GL_TEXTURE_CUBE_MAP: + return true; + default: + return false; + } +} + +static GLenum +_mesa_generic_type_for_internal_format(GLenum internalFormat) +{ + if (_mesa_is_enum_format_unsigned_int(internalFormat)) + return GL_UNSIGNED_BYTE; + else if (_mesa_is_enum_format_signed_int(internalFormat)) + return GL_BYTE; + else + return GL_FLOAT; +} + +/* Default implementation of the QueryInternalFormat driverfunc, for + * drivers not implementing ARB_internalformat_query2.
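+ *
+ * A driver opts into this fallback simply by pointing its function table
+ * at it; a minimal sketch (hypothetical driver initialization code):
+ *
+ *    functions->QueryInternalFormat = _mesa_query_internal_format_default;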
+ */ +void +_mesa_query_internal_format_default(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, + GLint *params) +{ + (void) target; + + switch (pname) { + case GL_SAMPLES: + case GL_NUM_SAMPLE_COUNTS: + params[0] = 1; + break; + + case GL_INTERNALFORMAT_SUPPORTED: + params[0] = GL_TRUE; + break; + + case GL_INTERNALFORMAT_PREFERRED: + params[0] = internalFormat; + break; + + case GL_READ_PIXELS_FORMAT: { + GLenum base_format = _mesa_base_tex_format(ctx, internalFormat); + switch (base_format) { + case GL_STENCIL_INDEX: + case GL_DEPTH_COMPONENT: + case GL_DEPTH_STENCIL: + case GL_RED: + case GL_RGB: + case GL_BGR: + case GL_RGBA: + case GL_BGRA: + params[0] = base_format; + break; + default: + params[0] = GL_NONE; + break; + } + break; + } + + case GL_READ_PIXELS_TYPE: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_TYPE: { + GLenum base_format = _mesa_base_tex_format(ctx, internalFormat); + if (base_format > 0) + params[0] = _mesa_generic_type_for_internal_format(internalFormat); + else + params[0] = GL_NONE; + break; + } + + case GL_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_FORMAT: { + GLenum format = GL_NONE; + GLenum base_format = _mesa_base_tex_format(ctx, internalFormat); + if (base_format > 0) { + if (_mesa_is_enum_format_integer(internalFormat)) + format = _mesa_base_format_to_integer_format(base_format); + else + format = base_format; + } + + params[0] = format; + break; + } + + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + case GL_SRGB_READ: + case GL_SRGB_WRITE: + case GL_SRGB_DECODE_ARB: + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + case GL_SHADER_IMAGE_ATOMIC: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + case GL_CLEAR_BUFFER: + case GL_TEXTURE_VIEW: + case GL_TEXTURE_SHADOW: + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + case GL_FRAMEBUFFER_BLEND: + case GL_FILTER: + params[0] = GL_FULL_SUPPORT; + break; + + default: + _set_default_response(pname, params); + break; + } +} + +/* + * For MAX_WIDTH/MAX_HEIGHT/MAX_DEPTH it returns the equivalent GetInteger + * pname for a GetInternalformat pname/target combination. target/pname + * combinations that would return 0 due to the dimension count or to an + * unsupported status should already be filtered out. + * + * Note that this means that the returned value would be independent of the + * internalformat. This possibility is already mentioned in Issue 7 of the + * arb_internalformat_query2 spec.
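+ *
+ * For example:
+ *
+ *    equivalentSizePname(GL_TEXTURE_1D_ARRAY, GL_MAX_WIDTH)
+ *       == GL_MAX_TEXTURE_SIZE
+ *    equivalentSizePname(GL_TEXTURE_1D_ARRAY, GL_MAX_HEIGHT)
+ *       == GL_MAX_ARRAY_TEXTURE_LAYERS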
+ */ +static GLenum +equivalentSizePname(GLenum target, + GLenum pname) +{ + switch (target) { + case GL_TEXTURE_1D: + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_MULTISAMPLE: + return GL_MAX_TEXTURE_SIZE; + case GL_TEXTURE_3D: + return GL_MAX_3D_TEXTURE_SIZE; + case GL_TEXTURE_CUBE_MAP: + return GL_MAX_CUBE_MAP_TEXTURE_SIZE; + case GL_TEXTURE_RECTANGLE: + return GL_MAX_RECTANGLE_TEXTURE_SIZE; + case GL_RENDERBUFFER: + return GL_MAX_RENDERBUFFER_SIZE; + case GL_TEXTURE_1D_ARRAY: + if (pname == GL_MAX_HEIGHT) + return GL_MAX_ARRAY_TEXTURE_LAYERS; + else + return GL_MAX_TEXTURE_SIZE; + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + if (pname == GL_MAX_DEPTH) + return GL_MAX_ARRAY_TEXTURE_LAYERS; + else + return GL_MAX_TEXTURE_SIZE; + case GL_TEXTURE_CUBE_MAP_ARRAY: + if (pname == GL_MAX_DEPTH) + return GL_MAX_ARRAY_TEXTURE_LAYERS; + else + return GL_MAX_CUBE_MAP_TEXTURE_SIZE; + case GL_TEXTURE_BUFFER: + return GL_MAX_TEXTURE_BUFFER_SIZE; + default: + return 0; + } +} + +/* + * Returns the number of dimensions associated with a target. GL_TEXTURE_BUFFER and + * GL_RENDERBUFFER have an associated dimension, but they are not textures + * per se, so we can't just call _mesa_get_texture_dimensions directly. + */ +static GLint +get_target_dimensions(GLenum target) +{ + switch(target) { + case GL_TEXTURE_BUFFER: + return 1; + case GL_RENDERBUFFER: + return 2; + default: + return _mesa_get_texture_dimensions(target); + } +} + +/* + * Returns the minimum number of dimensions associated with a pname. For + * example, if querying GL_MAX_HEIGHT, it is assumed that the target has at + * least 2 dimensions. + * + * Useful to handle sentences like this from the query2 spec: + * + * "MAX_HEIGHT: + * <skip> + * If the resource does not have at least two dimensions + * <skip>." + */ +static GLint +get_min_dimensions(GLenum pname) +{ + switch(pname) { + case GL_MAX_WIDTH: + return 1; + case GL_MAX_HEIGHT: + return 2; + case GL_MAX_DEPTH: + return 3; + default: + return 0; + } +} + +/* + * Similar to teximage.c:check_multisample_target, but independent of the + * dimensions. + */ +static bool +is_multisample_target(GLenum target) +{ + switch(target) { + case GL_TEXTURE_2D_MULTISAMPLE: + case GL_TEXTURE_2D_MULTISAMPLE_ARRAY: + return true; + default: + return false; + } +} + +void GLAPIENTRY +_mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, + GLsizei bufSize, GLint *params) +{ + GLint buffer[16]; + GET_CURRENT_CONTEXT(ctx); + + ASSERT_OUTSIDE_BEGIN_END(ctx); + + /* ARB_internalformat_query is also mandatory for ARB_internalformat_query2 */ + if (!(_mesa_has_ARB_internalformat_query(ctx) || + _mesa_is_gles3(ctx))) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glGetInternalformativ"); return; } + assert(ctx->Driver.QueryInternalFormat != NULL); + + if (!_legal_parameters(ctx, target, internalformat, pname, bufSize, params)) + return; + + /* initialize the contents of the temporary buffer */ + memcpy(buffer, params, MIN2(bufSize, 16) * sizeof(GLint)); + + /* Use the 'unsupported' response defined by the spec for every pname + * as the default answer.
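+ * That way every early exit below ("goto end") still copies back a
+ * well-defined value, e.g. 0 for GL_NUM_SAMPLE_COUNTS, GL_NONE for
+ * GL_COLOR_ENCODING or GL_FALSE for GL_MIPMAP (see _set_default_response()
+ * above).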
+ */ + _set_default_response(pname, buffer); + + if (!_is_target_supported(ctx, target) || + !_is_internalformat_supported(ctx, target, internalformat) || + !_is_resource_supported(ctx, target, internalformat, pname)) + goto end; + switch (pname) { case GL_SAMPLES: - count = ctx->Driver.QuerySamplesForFormat(ctx, target, - internalformat, buffer); - break; - case GL_NUM_SAMPLE_COUNTS: { - if ((ctx->API == API_OPENGLES2 && ctx->Version == 30) && - _mesa_is_enum_format_integer(internalformat)) { - /* From GL ES 3.0 specification, section 6.1.15 page 236: "Since - * multisampling is not supported for signed and unsigned integer - * internal formats, the value of NUM_SAMPLE_COUNTS will be zero - * for such formats. - * - * Such a restriction no longer exists in GL ES 3.1. - */ - buffer[0] = 0; - count = 1; + /* fall-through */ + case GL_NUM_SAMPLE_COUNTS: + /* The ARB_internalformat_query2 spec sets the response as 'unsupported' for + * SAMPLES and NUM_SAMPLE_COUNTS: + * + * "If <internalformat> is not color-renderable, depth-renderable, or + * stencil-renderable (as defined in section 4.4.4), or if <target> + * does not support multiple samples (ie other than + * TEXTURE_2D_MULTISAMPLE, TEXTURE_2D_MULTISAMPLE_ARRAY, + * or RENDERBUFFER)." + */ + if ((target != GL_RENDERBUFFER && + target != GL_TEXTURE_2D_MULTISAMPLE && + target != GL_TEXTURE_2D_MULTISAMPLE_ARRAY) || + !_is_renderable(ctx, internalformat)) + goto end; + + /* The GL ES 3.0 specification, section 6.1.15 page 236 says: + * + * "Since multisampling is not supported for signed and unsigned + * integer internal formats, the value of NUM_SAMPLE_COUNTS will be + * zero for such formats." + */ + if (pname == GL_NUM_SAMPLE_COUNTS && ctx->API == API_OPENGLES2 && + ctx->Version == 30 && _mesa_is_enum_format_integer(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_INTERNALFORMAT_SUPPORTED: + /* Having a supported <internalformat> is implemented as a prerequisite + * for all the <pnames>. Thus, if we reach this point, the internalformat is + * supported. + */ + buffer[0] = GL_TRUE; + break; + + case GL_INTERNALFORMAT_PREFERRED: + /* The ARB_internalformat_query2 spec says: + * + * "- INTERNALFORMAT_PREFERRED: The implementation-preferred internal + * format for representing resources of the specified <internalformat> is + * returned in <params>." + * + * Therefore, we let the driver answer. + */ + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_SHARED_SIZE: + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_DEPTH_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: { + GLint baseformat; + mesa_format texformat; + + if (target != GL_RENDERBUFFER) { + if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, true)) + goto end; + + baseformat = _mesa_base_tex_format(ctx, internalformat); } else { - size_t num_samples; + baseformat = _mesa_base_fbo_format(ctx, internalformat); + } - /* The driver can return 0, and we should pass that along to the - * application.
The ARB decided that ARB_internalformat_query should - behave as ARB_internalformat_query2 in this situation. - * - * The ARB_internalformat_query2 spec says: - * - * "- NUM_SAMPLE_COUNTS: The number of sample counts that would be - * returned by querying SAMPLES is returned in <params>. - * * If <internalformat> is not color-renderable, - * depth-renderable, or stencil-renderable (as defined in - * section 4.4.4), or if <target> does not support multiple - * samples (ie other than TEXTURE_2D_MULTISAMPLE, - * TEXTURE_2D_MULTISAMPLE_ARRAY, or RENDERBUFFER), 0 is - * returned." - */ - num_samples = ctx->Driver.QuerySamplesForFormat(ctx, target, internalformat, buffer); + /* Let the driver choose the texture format. + * + * Disclaimer: we assume that drivers use the same format-choice logic + * for renderbuffers as for textures. + */ + texformat = ctx->Driver.ChooseTextureFormat(ctx, target, internalformat, + GL_NONE /*format */, GL_NONE /* type */); + + if (texformat == MESA_FORMAT_NONE || baseformat <= 0) + goto end; + + /* Implementation based on what Mesa does for glGetTexLevelParameteriv + * and glGetRenderbufferParameteriv functions. + */ + if (pname == GL_INTERNALFORMAT_SHARED_SIZE) { + if (_mesa_has_EXT_texture_shared_exponent(ctx) && + target != GL_TEXTURE_BUFFER && + target != GL_RENDERBUFFER && + texformat == MESA_FORMAT_R9G9B9E5_FLOAT) { + buffer[0] = 5; + } + goto end; + } + + if (!_mesa_base_format_has_channel(baseformat, pname)) + goto end; - /* QuerySamplesForFormat writes some stuff to buffer, so we have to - * separately over-write it with the requested value. + switch (pname) { + case GL_INTERNALFORMAT_DEPTH_SIZE: + if (!_mesa_has_ARB_depth_texture(ctx) && + target != GL_RENDERBUFFER && + target != GL_TEXTURE_BUFFER) + goto end; + /* fallthrough */ + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + buffer[0] = _mesa_get_format_bits(texformat, pname); + break; + + case GL_INTERNALFORMAT_DEPTH_TYPE: + if (!_mesa_has_ARB_texture_float(ctx)) + goto end; + /* fallthrough */ + case GL_INTERNALFORMAT_RED_TYPE: + case GL_INTERNALFORMAT_GREEN_TYPE: + case GL_INTERNALFORMAT_BLUE_TYPE: + case GL_INTERNALFORMAT_ALPHA_TYPE: + case GL_INTERNALFORMAT_STENCIL_TYPE: + buffer[0] = _mesa_get_format_datatype(texformat); + break; + + default: + break; + + } + break; + } + + /* For WIDTH/HEIGHT/DEPTH/LAYERS there is no reason to think that the + * returned values should be different from the values returned by + * GetInteger with MAX_TEXTURE_SIZE, MAX_3D_TEXTURE_SIZE, etc. */ + case GL_MAX_WIDTH: + case GL_MAX_HEIGHT: + case GL_MAX_DEPTH: { + GLenum get_pname; + GLint dimensions; + GLint min_dimensions; + + /* From the query2 MAX_HEIGHT spec (as an example): + * + * "If the resource does not have at least two dimensions, or if the + * resource is unsupported, zero is returned."
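+ *
+ * E.g. a GL_MAX_DEPTH query on GL_TEXTURE_2D returns 0, as a 2D texture
+ * has fewer than three dimensions, while the same query on GL_TEXTURE_3D
+ * is forwarded to glGetIntegerv(GL_MAX_3D_TEXTURE_SIZE).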
+ */ + dimensions = get_target_dimensions(target); + min_dimensions = get_min_dimensions(pname); + if (dimensions < min_dimensions) + goto end; + + get_pname = equivalentSizePname(target, pname); + if (get_pname == 0) + goto end; + + _mesa_GetIntegerv(get_pname, buffer); + break; + } + + case GL_MAX_LAYERS: + if (!_mesa_has_EXT_texture_array(ctx)) + goto end; + + if (!_mesa_is_array_texture(target)) + goto end; + + _mesa_GetIntegerv(GL_MAX_ARRAY_TEXTURE_LAYERS, buffer); + break; + + case GL_MAX_COMBINED_DIMENSIONS:{ + GLint64 combined_value = 1; + GLenum max_dimensions_pnames[] = { + GL_MAX_WIDTH, + GL_MAX_HEIGHT, + GL_MAX_DEPTH, + GL_SAMPLES + }; + unsigned i; + GLint current_value; + + /* Combining the dimensions. Note that for array targets, this would + * automatically include the value of MAX_LAYERS, as that value is + * returned as MAX_HEIGHT or MAX_DEPTH */ + for (i = 0; i < 4; i++) { + if (max_dimensions_pnames[i] == GL_SAMPLES && + !is_multisample_target(target)) + continue; + + _mesa_GetInternalformativ(target, internalformat, + max_dimensions_pnames[i], + 1, &current_value); + + if (current_value != 0) + combined_value *= current_value; + } + + if (_mesa_is_cube_map_texture(target)) + combined_value *= 6; + + /* We pack the 64-bit value into two 32-bit values. For the 32-bit + * query this works as long as the value fits in a 32-bit signed + * integer. For the 64-bit query, the wrapper around the 32-bit + * query will unpack the value */ + memcpy(buffer, &combined_value, sizeof(GLint64)); + break; + } + + case GL_COLOR_COMPONENTS: + /* The ARB_internalformat_query2 spec says: + * + * "- COLOR_COMPONENTS: If the internal format contains any color + * components (R, G, B, or A), TRUE is returned in <params>. + * If the internal format is unsupported or contains no color + * components, FALSE is returned." + */ + if (_mesa_is_color_format(internalformat)) + buffer[0] = GL_TRUE; + break; + + case GL_DEPTH_COMPONENTS: + /* The ARB_internalformat_query2 spec says: + * + * "- DEPTH_COMPONENTS: If the internal format contains a depth + * component (D), TRUE is returned in <params>. If the internal format + * is unsupported or contains no depth component, FALSE is returned." + */ + if (_mesa_is_depth_format(internalformat) || + _mesa_is_depthstencil_format(internalformat)) + buffer[0] = GL_TRUE; + break; + + case GL_STENCIL_COMPONENTS: + /* The ARB_internalformat_query2 spec says: + * + * "- STENCIL_COMPONENTS: If the internal format contains a stencil + * component (S), TRUE is returned in <params>. If the internal format + * is unsupported or contains no stencil component, FALSE is returned."
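+ *
+ * E.g. GL_DEPTH24_STENCIL8 and GL_STENCIL_INDEX8 both report GL_TRUE
+ * here, while GL_RGBA8 keeps the GL_FALSE default.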
+ */ + if (_mesa_is_stencil_format(internalformat) || + _mesa_is_depthstencil_format(internalformat)) + buffer[0] = GL_TRUE; + break; + + case GL_COLOR_RENDERABLE: + case GL_DEPTH_RENDERABLE: + case GL_STENCIL_RENDERABLE: + if (!_is_renderable(ctx, internalformat)) + goto end; + + if (pname == GL_COLOR_RENDERABLE) { + if (!_mesa_is_color_format(internalformat)) + goto end; + } else { + GLenum baseFormat = _mesa_base_fbo_format(ctx, internalformat); + if (baseFormat != GL_DEPTH_STENCIL && + ((pname == GL_DEPTH_RENDERABLE && baseFormat != GL_DEPTH_COMPONENT) || + (pname == GL_STENCIL_RENDERABLE && baseFormat != GL_STENCIL_INDEX))) + goto end; + } + + buffer[0] = GL_TRUE; + break; + + case GL_FRAMEBUFFER_RENDERABLE_LAYERED: + if (!_mesa_has_EXT_texture_array(ctx) || + !_legal_target_for_framebuffer_texture_layer(ctx, target)) + goto end; + /* fallthrough */ + case GL_FRAMEBUFFER_RENDERABLE: + case GL_FRAMEBUFFER_BLEND: + if (!_mesa_has_ARB_framebuffer_object(ctx)) + goto end; + + if (target == GL_TEXTURE_BUFFER || + !_is_renderable(ctx, internalformat)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_READ_PIXELS: + case GL_READ_PIXELS_FORMAT: + case GL_READ_PIXELS_TYPE: + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_IMAGE_FORMAT: + case GL_GET_TEXTURE_IMAGE_FORMAT: + case GL_TEXTURE_IMAGE_TYPE: + case GL_GET_TEXTURE_IMAGE_TYPE: + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_MIPMAP: + case GL_MANUAL_GENERATE_MIPMAP: + case GL_AUTO_GENERATE_MIPMAP: + if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, target) || + !_mesa_is_valid_generate_texture_mipmap_internalformat(ctx, + internalformat)) { + goto end; + } + + if (pname == GL_MIPMAP) { + buffer[0] = GL_TRUE; + goto end; + } + else if (pname == GL_MANUAL_GENERATE_MIPMAP) { + if (!_mesa_has_ARB_framebuffer_object(ctx)) + goto end; + } + else { + /* From ARB_internalformat_query2: + * "Dependencies on OpenGL 3.2 (Core Profile) + * In core profiles for OpenGL 3.2 and later versions, queries + * for the AUTO_GENERATE_MIPMAP <pname> return the appropriate + * unsupported response." */ - buffer[0] = (GLint) num_samples; - count = 1; + if (_mesa_is_desktop_gl(ctx) && ctx->Version >= 32) + goto end; } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_COLOR_ENCODING: + if (!_mesa_is_color_format(internalformat)) + goto end; + + if (_mesa_is_srgb_format(internalformat)) + buffer[0] = GL_SRGB; + else + buffer[0] = GL_LINEAR; + break; + + case GL_SRGB_READ: + if (!_mesa_has_EXT_texture_sRGB(ctx) || + !_mesa_is_srgb_format(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SRGB_WRITE: + if (!_mesa_has_EXT_framebuffer_sRGB(ctx) || + !_mesa_is_color_format(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SRGB_DECODE_ARB: + /* Presence of EXT_texture_sRGB_decode was already verified */ + if (!_mesa_has_EXT_texture_sRGB(ctx) || + target == GL_RENDERBUFFER || + !_mesa_is_srgb_format(internalformat)) { + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_FILTER: + /* If the target doesn't allow setting sampler parameters, then it + * doesn't allow setting a filter other than GL_NEAREST.
In practice, this check + * only filters out MULTISAMPLE/MULTISAMPLE_ARRAY */ + if (!_mesa_target_allows_setting_sampler_parameters(target)) + goto end; + + if (_mesa_is_enum_format_integer(internalformat)) + goto end; + + if (target == GL_TEXTURE_BUFFER) + goto end; + + /* At this point we know that multi-texel filtering is supported. We + * need to call the driver to know whether it is CAVEAT_SUPPORT or + * FULL_SUPPORT. + */ + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_VERTEX_TEXTURE: + case GL_TESS_CONTROL_TEXTURE: + case GL_TESS_EVALUATION_TEXTURE: + case GL_GEOMETRY_TEXTURE: + case GL_FRAGMENT_TEXTURE: + case GL_COMPUTE_TEXTURE: + if (target == GL_RENDERBUFFER) + goto end; + + if ((pname == GL_TESS_CONTROL_TEXTURE || + pname == GL_TESS_EVALUATION_TEXTURE) && + !_mesa_has_tessellation(ctx)) + goto end; + + if (pname == GL_GEOMETRY_TEXTURE && !_mesa_has_geometry_shaders(ctx)) + goto end; + + if (pname == GL_COMPUTE_TEXTURE && !_mesa_has_compute_shaders(ctx)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_GATHER: + case GL_TEXTURE_GATHER_SHADOW: + if (!_mesa_has_ARB_texture_gather(ctx)) + goto end; + + /* fallthrough */ + case GL_TEXTURE_SHADOW: + /* Only depth or depth-stencil image formats make sense in shadow + * samplers */ + if (pname != GL_TEXTURE_GATHER && + !_mesa_is_depth_format(internalformat) && + !_mesa_is_depthstencil_format(internalformat)) + goto end; + + /* Validate the target for shadow and gather operations */ + switch (target) { + case GL_TEXTURE_2D: + case GL_TEXTURE_2D_ARRAY: + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + case GL_TEXTURE_RECTANGLE: + break; + + case GL_TEXTURE_1D: + case GL_TEXTURE_1D_ARRAY: + /* 1D and 1DArray textures are not allowed in gather operations */ + if (pname != GL_TEXTURE_SHADOW) + goto end; + break; + + default: + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SHADER_IMAGE_LOAD: + case GL_SHADER_IMAGE_STORE: + if (!_mesa_has_ARB_shader_image_load_store(ctx)) + goto end; + + /* We call _mesa_is_shader_image_format_supported + * with "internalformat" as the parameter, because + * the ARB_internalformat_query2 spec says: + * "In this case the <internalformat> is the value of the <format> + * parameter that is passed to BindImageTexture."
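+ *
+ * A hypothetical caller (not part of this patch) would check image-store
+ * support for a format with:
+ *
+ *    GLint support;
+ *    glGetInternalformativ(GL_TEXTURE_2D, GL_RGBA16F,
+ *                          GL_SHADER_IMAGE_STORE, 1, &support);
+ *
+ * and get back GL_NONE, GL_CAVEAT_SUPPORT or GL_FULL_SUPPORT.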
+ */ + if (target == GL_RENDERBUFFER || + !_mesa_is_shader_image_format_supported(ctx, internalformat)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_SHADER_IMAGE_ATOMIC: + if (!_mesa_has_ARB_shader_image_load_store(ctx)) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_IMAGE_TEXEL_SIZE: { + mesa_format image_format; + + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER) + goto end; + + image_format = _mesa_get_shader_image_format(internalformat); + if (image_format == MESA_FORMAT_NONE) + goto end; + + /* We return bits */ + buffer[0] = (_mesa_get_format_bytes(image_format) * 8); break; } + + case GL_IMAGE_COMPATIBILITY_CLASS: + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER) + goto end; + + buffer[0] = _mesa_get_image_format_class(internalformat); + break; + + case GL_IMAGE_PIXEL_FORMAT: { + GLint base_format; + + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER || + !_mesa_is_shader_image_format_supported(ctx, internalformat)) + goto end; + + base_format = _mesa_base_tex_format(ctx, internalformat); + if (base_format == -1) + goto end; + + if (_mesa_is_enum_format_integer(internalformat)) + buffer[0] = _mesa_base_format_to_integer_format(base_format); + else + buffer[0] = base_format; + break; + } + + case GL_IMAGE_PIXEL_TYPE: { + mesa_format image_format; + GLenum datatype; + GLuint comps; + + if (!_mesa_has_ARB_shader_image_load_store(ctx) || + target == GL_RENDERBUFFER) + goto end; + + image_format = _mesa_get_shader_image_format(internalformat); + if (image_format == MESA_FORMAT_NONE) + goto end; + + _mesa_uncompressed_format_to_type_and_comps(image_format, &datatype, + &comps); + if (!datatype) + goto end; + + buffer[0] = datatype; + break; + } + + case GL_IMAGE_FORMAT_COMPATIBILITY_TYPE: { + if (!_mesa_has_ARB_shader_image_load_store(ctx)) + goto end; + + if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, true)) + goto end; + + /* From spec: "Equivalent to calling GetTexParameter with <value> set + * to IMAGE_FORMAT_COMPATIBILITY_TYPE." + * + * GetTexParameter just returns + * tex_obj->ImageFormatCompatibilityType. We create a fake tex_obj + * just with the purpose of getting the value. 
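+ * (A texture name of 0 is fine here: the object is never bound, and it is
+ * deleted again right after the default value has been read.)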
+ */ + struct gl_texture_object *tex_obj = _mesa_new_texture_object(ctx, 0, target); + buffer[0] = tex_obj->ImageFormatCompatibilityType; + _mesa_delete_texture_object(ctx, tex_obj); + + break; + } + + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST: + case GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE: + case GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE: + if (target == GL_RENDERBUFFER) + goto end; + + if (!_mesa_is_depthstencil_format(internalformat)) { + if (((pname == GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_TEST || + pname == GL_SIMULTANEOUS_TEXTURE_AND_DEPTH_WRITE) && + !_mesa_is_depth_format(internalformat)) || + ((pname == GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_TEST || + pname == GL_SIMULTANEOUS_TEXTURE_AND_STENCIL_WRITE) && + !_mesa_is_stencil_format(internalformat))) + goto end; + } + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_COMPRESSED: + buffer[0] = _mesa_is_compressed_format(ctx, internalformat); + break; + + case GL_TEXTURE_COMPRESSED_BLOCK_WIDTH: + case GL_TEXTURE_COMPRESSED_BLOCK_HEIGHT: + case GL_TEXTURE_COMPRESSED_BLOCK_SIZE: { + mesa_format mesaformat; + GLint block_size; + + mesaformat = _mesa_glenum_to_compressed_format(internalformat); + if (mesaformat == MESA_FORMAT_NONE) + goto end; + + block_size = _mesa_get_format_bytes(mesaformat); + assert(block_size > 0); + + if (pname == GL_TEXTURE_COMPRESSED_BLOCK_SIZE) { + buffer[0] = block_size; + } else { + GLuint bwidth, bheight; + + /* _mesa_get_format_block_size() returns the block width and height + * in pixels; these queries are expressed in bytes */ + _mesa_get_format_block_size(mesaformat, &bwidth, &bheight); + assert(bwidth > 0 && bheight > 0); + + if (pname == GL_TEXTURE_COMPRESSED_BLOCK_WIDTH) + buffer[0] = block_size / bheight; + else + buffer[0] = block_size / bwidth; + } + break; + } + + case GL_CLEAR_BUFFER: + if (target != GL_TEXTURE_BUFFER) + goto end; + + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + break; + + case GL_TEXTURE_VIEW: + case GL_VIEW_COMPATIBILITY_CLASS: + if (!_mesa_has_ARB_texture_view(ctx) || + target == GL_TEXTURE_BUFFER || + target == GL_RENDERBUFFER) + goto end; + + if (pname == GL_TEXTURE_VIEW) { + ctx->Driver.QueryInternalFormat(ctx, target, internalformat, pname, + buffer); + } else { + GLenum view_class = _mesa_texture_view_lookup_view_class(ctx, + internalformat); + if (view_class == GL_FALSE) + goto end; + + buffer[0] = view_class; + } + break; + default: - _mesa_error(ctx, GL_INVALID_ENUM, - "glGetInternalformativ(pname=%s)", - _mesa_enum_to_string(pname)); - return; + unreachable("bad param"); } + end: if (bufSize != 0 && params == NULL) { /* Emit a warning to aid application debugging, but go ahead and do the * memcpy (and probably crash) anyway. @@ -190,7 +1530,55 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, * application. Clamp the size of the copy to the size supplied by the * application.
*/ - memcpy(params, buffer, MIN2(count, bufSize) * sizeof(GLint)); + memcpy(params, buffer, MIN2(bufSize, 16) * sizeof(GLint)); return; } + +void GLAPIENTRY +_mesa_GetInternalformati64v(GLenum target, GLenum internalformat, + GLenum pname, GLsizei bufSize, GLint64 *params) +{ + GLint params32[16]; + unsigned i; + GLsizei realSize = MIN2(bufSize, 16); + GLsizei callSize; + + GET_CURRENT_CONTEXT(ctx); + + ASSERT_OUTSIDE_BEGIN_END(ctx); + + if (!_mesa_has_ARB_internalformat_query2(ctx)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glGetInternalformati64v"); + return; + } + + /* For SAMPLES there are cases where params needs to remain unmodified. As + * no pname can return a negative value, we fill params32 with negative + * values as sentinels, so we know which entries to copy back to + * params */ + memset(params32, -1, sizeof(params32)); + + /* For GL_MAX_COMBINED_DIMENSIONS we need to get back 2 32-bit integers, + * and no more than that. So for that pname, we call the 32-bit query + * with bufSize 2, except when bufSize is 0, which is basically a request + * not to get the value; but that is the caller's problem. */ + if (pname == GL_MAX_COMBINED_DIMENSIONS && bufSize > 0) + callSize = 2; + else + callSize = bufSize; + + _mesa_GetInternalformativ(target, internalformat, pname, callSize, params32); + + if (pname == GL_MAX_COMBINED_DIMENSIONS) { + memcpy(params, params32, sizeof(GLint64)); + } else { + for (i = 0; i < realSize; i++) { + /* We only copy back the values that changed */ + if (params32[i] < 0) + break; + params[i] = (GLint64) params32[i]; + } + } +} diff --git a/src/mesa/main/formatquery.h b/src/mesa/main/formatquery.h index 603400059e5..1061fd26753 100644 --- a/src/mesa/main/formatquery.h +++ b/src/mesa/main/formatquery.h @@ -32,8 +32,17 @@ size_t _mesa_query_samples_for_format(struct gl_context *ctx, GLenum target, GLenum internalFormat, int samples[16]); +void +_mesa_query_internal_format_default(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, + GLint *params); + extern void GLAPIENTRY _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname, GLsizei bufSize, GLint *params); +extern void GLAPIENTRY +_mesa_GetInternalformati64v(GLenum target, GLenum internalformat, + GLenum pname, GLsizei bufSize, GLint64 *params); + #endif /* FORMATQUERY_H */ diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c index 9f920075026..41d40a522fa 100644 --- a/src/mesa/main/formats.c +++ b/src/mesa/main/formats.c @@ -132,21 +132,25 @@ _mesa_get_format_bits(mesa_format format, GLenum pname) case GL_TEXTURE_RED_SIZE: case GL_RENDERBUFFER_RED_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE: + case GL_INTERNALFORMAT_RED_SIZE: return info->RedBits; case GL_GREEN_BITS: case GL_TEXTURE_GREEN_SIZE: case GL_RENDERBUFFER_GREEN_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: return info->GreenBits; case GL_BLUE_BITS: case GL_TEXTURE_BLUE_SIZE: case GL_RENDERBUFFER_BLUE_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: return info->BlueBits; case GL_ALPHA_BITS: case GL_TEXTURE_ALPHA_SIZE: case GL_RENDERBUFFER_ALPHA_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: return info->AlphaBits; case GL_TEXTURE_INTENSITY_SIZE: return info->IntensityBits; @@ -158,11 +162,13 @@ _mesa_get_format_bits(mesa_format format, GLenum pname) case GL_TEXTURE_DEPTH_SIZE_ARB: case GL_RENDERBUFFER_DEPTH_SIZE_EXT: case
GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: return info->DepthBits; case GL_STENCIL_BITS: case GL_TEXTURE_STENCIL_SIZE_EXT: case GL_RENDERBUFFER_STENCIL_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: return info->StencilBits; default: _mesa_problem(NULL, "bad pname in _mesa_get_format_bits()"); diff --git a/src/mesa/main/formats.csv b/src/mesa/main/formats.csv index 529de31212c..a663c1e867f 100644 --- a/src/mesa/main/formats.csv +++ b/src/mesa/main/formats.csv @@ -26,7 +26,7 @@ # ########################################################################### -# This CSV file has the input data for gen_format.h and gen_format.c +# This CSV file has the input data for format_parser.py # # Each format entry contains: # - name, per enum mesa_format diff --git a/src/mesa/main/genmipmap.c b/src/mesa/main/genmipmap.c index 6c2d31dbcf3..6eacd424df7 100644 --- a/src/mesa/main/genmipmap.c +++ b/src/mesa/main/genmipmap.c @@ -38,27 +38,18 @@ #include "texobj.h" #include "hash.h" -/** - * Implements glGenerateMipmap and glGenerateTextureMipmap. - * Generates all the mipmap levels below the base level. - */ -void -_mesa_generate_texture_mipmap(struct gl_context *ctx, - struct gl_texture_object *texObj, GLenum target, - bool dsa) +bool +_mesa_is_valid_generate_texture_mipmap_target(struct gl_context *ctx, + GLenum target) { - struct gl_texture_image *srcImage; - GLboolean error; - const char *suffix = dsa ? "Texture" : ""; - - FLUSH_VERTICES(ctx, 0); + bool error; switch (target) { case GL_TEXTURE_1D: error = _mesa_is_gles(ctx); break; case GL_TEXTURE_2D: - error = GL_FALSE; + error = false; break; case GL_TEXTURE_3D: error = ctx->API == API_OPENGLES; @@ -78,14 +69,35 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx, !ctx->Extensions.ARB_texture_cube_map_array; break; default: - error = GL_TRUE; + error = true; } - if (error) { - _mesa_error(ctx, GL_INVALID_ENUM, "glGenerate%sMipmap(target=%s)", - suffix, _mesa_enum_to_string(target)); - return; - } + return !error; +} + +bool +_mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx, + GLenum internalformat) +{ + return (!_mesa_is_enum_format_integer(internalformat) && + !_mesa_is_depthstencil_format(internalformat) && + !_mesa_is_astc_format(internalformat) && + !_mesa_is_stencil_format(internalformat)); +} + +/** + * Implements glGenerateMipmap and glGenerateTextureMipmap. + * Generates all the mipmap levels below the base level. + */ +void +_mesa_generate_texture_mipmap(struct gl_context *ctx, + struct gl_texture_object *texObj, GLenum target, + bool dsa) +{ + struct gl_texture_image *srcImage; + const char *suffix = dsa ? 
"Texture" : ""; + + FLUSH_VERTICES(ctx, 0); if (texObj->BaseLevel >= texObj->MaxLevel) { /* nothing to do */ @@ -109,10 +121,8 @@ _mesa_generate_texture_mipmap(struct gl_context *ctx, return; } - if (_mesa_is_enum_format_integer(srcImage->InternalFormat) || - _mesa_is_depthstencil_format(srcImage->InternalFormat) || - _mesa_is_astc_format(srcImage->InternalFormat) || - _mesa_is_stencil_format(srcImage->InternalFormat)) { + if (!_mesa_is_valid_generate_texture_mipmap_internalformat(ctx, + srcImage->InternalFormat)) { _mesa_unlock_texture(ctx, texObj); _mesa_error(ctx, GL_INVALID_OPERATION, "glGenerate%sMipmap(invalid internal format)", suffix); @@ -143,6 +153,12 @@ _mesa_GenerateMipmap(GLenum target) struct gl_texture_object *texObj; GET_CURRENT_CONTEXT(ctx); + if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, target)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glGenerateMipmap(target=%s)", + _mesa_enum_to_string(target)); + return; + } + texObj = _mesa_get_current_tex_object(ctx, target); if (!texObj) return; @@ -163,5 +179,11 @@ _mesa_GenerateTextureMipmap(GLuint texture) if (!texObj) return; + if (!_mesa_is_valid_generate_texture_mipmap_target(ctx, texObj->Target)) { + _mesa_error(ctx, GL_INVALID_ENUM, "glGenerateTextureMipmap(target=%s)", + _mesa_enum_to_string(texObj->Target)); + return; + } + _mesa_generate_texture_mipmap(ctx, texObj, texObj->Target, true); } diff --git a/src/mesa/main/genmipmap.h b/src/mesa/main/genmipmap.h index f4ef859511e..40b7f3636aa 100644 --- a/src/mesa/main/genmipmap.h +++ b/src/mesa/main/genmipmap.h @@ -32,6 +32,12 @@ extern void _mesa_generate_texture_mipmap(struct gl_context *ctx, struct gl_texture_object *texObj, GLenum target, bool dsa); +bool +_mesa_is_valid_generate_texture_mipmap_target(struct gl_context *ctx, + GLenum target); +bool +_mesa_is_valid_generate_texture_mipmap_internalformat(struct gl_context *ctx, + GLenum internalformat); extern void GLAPIENTRY _mesa_GenerateMipmap(GLenum target); diff --git a/src/mesa/main/get.c b/src/mesa/main/get.c index f40c5705813..b0fadc93aef 100644 --- a/src/mesa/main/get.c +++ b/src/mesa/main/get.c @@ -384,6 +384,13 @@ static const int extra_ARB_shader_storage_buffer_object_and_geometry_shader[] = EXTRA_END }; +static const int extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31[] = { + EXT(ARB_shader_image_load_store), + EXT(ARB_shader_storage_buffer_object), + EXTRA_API_ES31, + EXTRA_END +}; + static const int extra_ARB_framebuffer_no_attachments_and_geometry_shader[] = { EXTRA_EXT_FB_NO_ATTACH_GS, EXTRA_END @@ -1055,6 +1062,8 @@ find_custom_value(struct gl_context *ctx, const struct value_desc *d, union valu } break; /* GL_KHR_DEBUG */ + case GL_DEBUG_OUTPUT: + case GL_DEBUG_OUTPUT_SYNCHRONOUS: case GL_DEBUG_LOGGED_MESSAGES: case GL_DEBUG_NEXT_LOGGED_MESSAGE_LENGTH: case GL_DEBUG_GROUP_STACK_DEPTH: @@ -1715,19 +1724,19 @@ _mesa_GetInteger64v(GLenum pname, GLint64 *params) break; case TYPE_FLOATN_4: - params[3] = FLOAT_TO_INT64(((GLfloat *) p)[3]); + params[3] = FLOAT_TO_INT(((GLfloat *) p)[3]); case TYPE_FLOATN_3: - params[2] = FLOAT_TO_INT64(((GLfloat *) p)[2]); + params[2] = FLOAT_TO_INT(((GLfloat *) p)[2]); case TYPE_FLOATN_2: - params[1] = FLOAT_TO_INT64(((GLfloat *) p)[1]); + params[1] = FLOAT_TO_INT(((GLfloat *) p)[1]); case TYPE_FLOATN: - params[0] = FLOAT_TO_INT64(((GLfloat *) p)[0]); + params[0] = FLOAT_TO_INT(((GLfloat *) p)[0]); break; case TYPE_DOUBLEN_2: - params[1] = FLOAT_TO_INT64(((GLdouble *) p)[1]); + params[1] = FLOAT_TO_INT(((GLdouble *) p)[1]); case TYPE_DOUBLEN: - 
params[0] = FLOAT_TO_INT64(((GLdouble *) p)[0]); + params[0] = FLOAT_TO_INT(((GLdouble *) p)[0]); break; case TYPE_INT_4: diff --git a/src/mesa/main/get_hash_params.py b/src/mesa/main/get_hash_params.py index 07d2d20df7a..12c21899cb1 100644 --- a/src/mesa/main/get_hash_params.py +++ b/src/mesa/main/get_hash_params.py @@ -126,6 +126,8 @@ descriptor=[ [ "MAX_TEXTURE_MAX_ANISOTROPY_EXT", "CONTEXT_FLOAT(Const.MaxTextureMaxAnisotropy), extra_EXT_texture_filter_anisotropic" ], # GL_KHR_debug (GL 4.3)/ GL_ARB_debug_output + [ "DEBUG_OUTPUT", "LOC_CUSTOM, TYPE_BOOLEAN, 0, NO_EXTRA" ], + [ "DEBUG_OUTPUT_SYNCHRONOUS", "LOC_CUSTOM, TYPE_BOOLEAN, 0, NO_EXTRA" ], [ "DEBUG_LOGGED_MESSAGES", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], [ "DEBUG_NEXT_LOGGED_MESSAGE_LENGTH", "LOC_CUSTOM, TYPE_INT, 0, NO_EXTRA" ], [ "MAX_DEBUG_LOGGED_MESSAGES", "CONST(MAX_DEBUG_LOGGED_MESSAGES), NO_EXTRA" ], @@ -493,9 +495,12 @@ descriptor=[ [ "MAX_COMBINED_SHADER_STORAGE_BLOCKS", "CONTEXT_INT(Const.MaxCombinedShaderStorageBlocks), extra_ARB_shader_storage_buffer_object_es31" ], [ "MAX_SHADER_STORAGE_BLOCK_SIZE", "CONTEXT_INT(Const.MaxShaderStorageBlockSize), extra_ARB_shader_storage_buffer_object_es31" ], [ "MAX_SHADER_STORAGE_BUFFER_BINDINGS", "CONTEXT_INT(Const.MaxShaderStorageBufferBindings), extra_ARB_shader_storage_buffer_object_es31" ], - [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_storage_buffer_object_es31" ], [ "SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT", "CONTEXT_INT(Const.ShaderStorageBufferOffsetAlignment), extra_ARB_shader_storage_buffer_object_es31" ], [ "SHADER_STORAGE_BUFFER_BINDING", "LOC_CUSTOM, TYPE_INT, 0, extra_ARB_shader_storage_buffer_object_es31" ], + + # GL_ARB_shader_image_load_store / GL_ARB_shader_storage_buffer_object / GLES 3.1 + # (MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS in GL_ARB_shader_image_load_store) + [ "MAX_COMBINED_SHADER_OUTPUT_RESOURCES", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store_shader_storage_buffer_object_es31" ], ]}, # Enums in OpenGL Core profile and ES 3.1 @@ -773,6 +778,7 @@ descriptor=[ [ "DEPTH_CLAMP", "CONTEXT_BOOL(Transform.DepthClamp), extra_ARB_depth_clamp" ], # GL_ATI_fragment_shader + [ "FRAGMENT_SHADER_ATI", "CONTEXT_BOOL(ATIFragmentShader.Enabled), extra_ATI_fragment_shader" ], [ "NUM_FRAGMENT_REGISTERS_ATI", "CONST(6), extra_ATI_fragment_shader" ], [ "NUM_FRAGMENT_CONSTANTS_ATI", "CONST(8), extra_ATI_fragment_shader" ], [ "NUM_PASSES_ATI", "CONST(2), extra_ATI_fragment_shader" ], @@ -838,7 +844,6 @@ descriptor=[ [ "MAX_PROGRAM_TEXTURE_GATHER_COMPONENTS_ARB", "CONTEXT_INT(Const.MaxProgramTextureGatherComponents), extra_ARB_texture_gather"], # GL_ARB_shader_image_load_store - [ "MAX_COMBINED_IMAGE_UNITS_AND_FRAGMENT_OUTPUTS", "CONTEXT_INT(Const.MaxCombinedShaderOutputResources), extra_ARB_shader_image_load_store" ], [ "MAX_IMAGE_SAMPLES", "CONTEXT_INT(Const.MaxImageSamples), extra_ARB_shader_image_load_store" ], # GL_EXT_polygon_offset_clamp diff --git a/src/mesa/main/glformats.c b/src/mesa/main/glformats.c index 987cd0db45c..cf6495885b6 100644 --- a/src/mesa/main/glformats.c +++ b/src/mesa/main/glformats.c @@ -1339,6 +1339,51 @@ _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format) } /** + * Test if the given format represents an sRGB format. 
+ * \param format the GL format (can be an internal format) + * \return GL_TRUE if format is sRGB, GL_FALSE otherwise + */ +GLboolean +_mesa_is_srgb_format(GLenum format) +{ + switch (format) { + case GL_SRGB: + case GL_SRGB8: + case GL_SRGB_ALPHA: + case GL_SRGB8_ALPHA8: + case GL_COMPRESSED_SRGB: + case GL_COMPRESSED_SRGB_ALPHA: + case GL_COMPRESSED_SRGB_S3TC_DXT1_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT1_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT3_EXT: + case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT: + case GL_COMPRESSED_SRGB8_ETC2: + case GL_COMPRESSED_SRGB8_ALPHA8_ETC2_EAC: + case GL_COMPRESSED_SRGB8_PUNCHTHROUGH_ALPHA1_ETC2: + case GL_COMPRESSED_SRGB_ALPHA_BPTC_UNORM: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_4x4_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x4_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_5x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_6x6_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x6_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_8x8_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x5_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x6_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x8_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_10x10_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x10_KHR: + case GL_COMPRESSED_SRGB8_ALPHA8_ASTC_12x12_KHR: + return GL_TRUE; + default: + break; + } + + return GL_FALSE; +} + +/** * Convert various unpack formats to the corresponding base format. */ GLenum @@ -1430,6 +1475,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_RED_TYPE: case GL_RENDERBUFFER_RED_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_RED_SIZE: + case GL_INTERNALFORMAT_RED_SIZE: + case GL_INTERNALFORMAT_RED_TYPE: if (base_format == GL_RED || base_format == GL_RG || base_format == GL_RGB || @@ -1441,6 +1488,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_GREEN_TYPE: case GL_RENDERBUFFER_GREEN_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_GREEN_SIZE: + case GL_INTERNALFORMAT_GREEN_SIZE: + case GL_INTERNALFORMAT_GREEN_TYPE: if (base_format == GL_RG || base_format == GL_RGB || base_format == GL_RGBA) { @@ -1451,6 +1500,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_BLUE_TYPE: case GL_RENDERBUFFER_BLUE_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_BLUE_SIZE: + case GL_INTERNALFORMAT_BLUE_SIZE: + case GL_INTERNALFORMAT_BLUE_TYPE: if (base_format == GL_RGB || base_format == GL_RGBA) { return GL_TRUE; @@ -1460,6 +1511,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_ALPHA_TYPE: case GL_RENDERBUFFER_ALPHA_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_ALPHA_SIZE: + case GL_INTERNALFORMAT_ALPHA_SIZE: + case GL_INTERNALFORMAT_ALPHA_TYPE: if (base_format == GL_RGBA || base_format == GL_ALPHA || base_format == GL_LUMINANCE_ALPHA) { @@ -1483,6 +1536,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) case GL_TEXTURE_DEPTH_TYPE: case GL_RENDERBUFFER_DEPTH_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_DEPTH_SIZE: + case GL_INTERNALFORMAT_DEPTH_SIZE: + case GL_INTERNALFORMAT_DEPTH_TYPE: if (base_format == GL_DEPTH_STENCIL || base_format == GL_DEPTH_COMPONENT) { return GL_TRUE; @@ -1490,6 +1545,8 @@ _mesa_base_format_has_channel(GLenum base_format, GLenum pname) return GL_FALSE; case GL_RENDERBUFFER_STENCIL_SIZE_EXT: case GL_FRAMEBUFFER_ATTACHMENT_STENCIL_SIZE: + case GL_INTERNALFORMAT_STENCIL_SIZE: + case GL_INTERNALFORMAT_STENCIL_TYPE: if (base_format == 
GL_DEPTH_STENCIL || base_format == GL_STENCIL_INDEX) { return GL_TRUE; @@ -2546,6 +2603,10 @@ _mesa_es3_effective_internal_format_for_format_and_type(GLenum format, return GL_RGBA8; case GL_RGB: return GL_RGB8; + case GL_RG: + return GL_RG8; + case GL_RED: + return GL_R8; /* Although LUMINANCE_ALPHA, LUMINANCE and ALPHA appear in table 3.12, * (section 3.8 Texturing, page 128 of the OpenGL-ES 3.0.4) as effective * internal formats, they do not correspond to GL constants, so the base @@ -3464,6 +3525,27 @@ _mesa_format_from_format_and_type(GLenum format, GLenum type) case GL_UNSIGNED_INT_10F_11F_11F_REV: if (format == GL_RGB) return MESA_FORMAT_R11G11B10_FLOAT; + break; + case GL_FLOAT: + if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_Z_FLOAT32; + break; + case GL_UNSIGNED_INT: + if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_Z_UNORM32; + break; + case GL_UNSIGNED_SHORT: + if (format == GL_DEPTH_COMPONENT) + return MESA_FORMAT_Z_UNORM16; + break; + case GL_UNSIGNED_INT_24_8: + if (format == GL_DEPTH_STENCIL) + return MESA_FORMAT_Z24_UNORM_S8_UINT; + break; + case GL_FLOAT_32_UNSIGNED_INT_24_8_REV: + if (format == GL_DEPTH_STENCIL) + return MESA_FORMAT_Z32_FLOAT_S8X24_UINT; + break; default: break; } diff --git a/src/mesa/main/glformats.h b/src/mesa/main/glformats.h index b3668556da2..00d2767085d 100644 --- a/src/mesa/main/glformats.h +++ b/src/mesa/main/glformats.h @@ -101,6 +101,9 @@ _mesa_is_depth_or_stencil_format(GLenum format); extern GLboolean _mesa_is_compressed_format(const struct gl_context *ctx, GLenum format); +extern GLboolean +_mesa_is_srgb_format(GLenum format); + extern GLenum _mesa_base_format_to_integer_format(GLenum format); diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h index a2e3c26c321..2e43996f23a 100644 --- a/src/mesa/main/mtypes.h +++ b/src/mesa/main/mtypes.h @@ -3811,6 +3811,7 @@ struct gl_extensions GLboolean ARB_indirect_parameters; GLboolean ARB_instanced_arrays; GLboolean ARB_internalformat_query; + GLboolean ARB_internalformat_query2; GLboolean ARB_map_buffer_range; GLboolean ARB_occlusion_query; GLboolean ARB_occlusion_query2; @@ -3819,6 +3820,7 @@ struct gl_extensions GLboolean ARB_query_buffer_object; GLboolean ARB_sample_shading; GLboolean ARB_seamless_cube_map; + GLboolean ARB_shader_atomic_counter_ops; GLboolean ARB_shader_atomic_counters; GLboolean ARB_shader_bit_encoding; GLboolean ARB_shader_clock; diff --git a/src/mesa/main/multisample.c b/src/mesa/main/multisample.c index e7783ea5374..77773a20883 100644 --- a/src/mesa/main/multisample.c +++ b/src/mesa/main/multisample.c @@ -174,10 +174,15 @@ _mesa_check_sample_count(struct gl_context *ctx, GLenum target, * for <internalformat> then the error INVALID_OPERATION is generated." */ if (ctx->Extensions.ARB_internalformat_query) { - GLint buffer[16]; - int count = ctx->Driver.QuerySamplesForFormat(ctx, target, - internalFormat, buffer); - int limit = count ? buffer[0] : -1; + GLint buffer[16] = {-1}; + GLint limit; + + ctx->Driver.QueryInternalFormat(ctx, target, internalFormat, + GL_SAMPLES, buffer); + /* since the query returns samples sorted in descending order, + * the first element is the greatest supported sample value. + */ + limit = buffer[0]; return samples > limit ? 
GL_INVALID_OPERATION : GL_NO_ERROR; } diff --git a/src/mesa/main/samplerobj.h b/src/mesa/main/samplerobj.h index abc6e019046..8e9539d8d8f 100644 --- a/src/mesa/main/samplerobj.h +++ b/src/mesa/main/samplerobj.h @@ -27,14 +27,12 @@ #ifndef SAMPLEROBJ_H #define SAMPLEROBJ_H +#include "mtypes.h" + #ifdef __cplusplus extern "C" { #endif - -#include "mtypes.h" - - struct dd_function_table; static inline struct gl_sampler_object * diff --git a/src/mesa/main/shaderimage.c b/src/mesa/main/shaderimage.c index 040e9fd6e3c..fd5934f939f 100644 --- a/src/mesa/main/shaderimage.c +++ b/src/mesa/main/shaderimage.c @@ -331,12 +331,54 @@ get_image_format_class(mesa_format format) } } -/** - * Return whether an image format should be supported based on the current API - * version of the context. - */ -static bool -is_image_format_supported(const struct gl_context *ctx, GLenum format) +static GLenum +_image_format_class_to_glenum(enum image_format_class class) +{ + switch (class) { + case IMAGE_FORMAT_CLASS_NONE: + return GL_NONE; + case IMAGE_FORMAT_CLASS_1X8: + return GL_IMAGE_CLASS_1_X_8; + case IMAGE_FORMAT_CLASS_1X16: + return GL_IMAGE_CLASS_1_X_16; + case IMAGE_FORMAT_CLASS_1X32: + return GL_IMAGE_CLASS_1_X_32; + case IMAGE_FORMAT_CLASS_2X8: + return GL_IMAGE_CLASS_2_X_8; + case IMAGE_FORMAT_CLASS_2X16: + return GL_IMAGE_CLASS_2_X_16; + case IMAGE_FORMAT_CLASS_2X32: + return GL_IMAGE_CLASS_2_X_32; + case IMAGE_FORMAT_CLASS_10_11_11: + return GL_IMAGE_CLASS_11_11_10; + case IMAGE_FORMAT_CLASS_4X8: + return GL_IMAGE_CLASS_4_X_8; + case IMAGE_FORMAT_CLASS_4X16: + return GL_IMAGE_CLASS_4_X_16; + case IMAGE_FORMAT_CLASS_4X32: + return GL_IMAGE_CLASS_4_X_32; + case IMAGE_FORMAT_CLASS_2_10_10_10: + return GL_IMAGE_CLASS_10_10_10_2; + default: + assert(!"Invalid image_format_class"); + return GL_NONE; + } +} + +GLenum +_mesa_get_image_format_class(GLenum format) +{ + mesa_format tex_format = _mesa_get_shader_image_format(format); + if (tex_format == MESA_FORMAT_NONE) + return GL_NONE; + + enum image_format_class class = get_image_format_class(tex_format); + return _image_format_class_to_glenum(class); +} + +bool +_mesa_is_shader_image_format_supported(const struct gl_context *ctx, + GLenum format) { switch (format) { /* Formats supported on both desktop and ES GL, c.f. table 8.27 of the @@ -503,7 +545,7 @@ validate_bind_image_texture(struct gl_context *ctx, GLuint unit, return GL_FALSE; } - if (!is_image_format_supported(ctx, format)) { + if (!_mesa_is_shader_image_format_supported(ctx, format)) { _mesa_error(ctx, GL_INVALID_VALUE, "glBindImageTexture(format)"); return GL_FALSE; } @@ -668,7 +710,7 @@ _mesa_BindImageTextures(GLuint first, GLsizei count, const GLuint *textures) tex_format = image->InternalFormat; } - if (!is_image_format_supported(ctx, tex_format)) { + if (!_mesa_is_shader_image_format_supported(ctx, tex_format)) { /* The ARB_multi_bind spec says: * * "An INVALID_OPERATION error is generated if the internal diff --git a/src/mesa/main/shaderimage.h b/src/mesa/main/shaderimage.h index 94ee814a716..85193e104f6 100644 --- a/src/mesa/main/shaderimage.h +++ b/src/mesa/main/shaderimage.h @@ -43,6 +43,20 @@ mesa_format _mesa_get_shader_image_format(GLenum format); /** + * Get the GL image format class for a shader image format GL enum + */ +GLenum +_mesa_get_image_format_class(GLenum format); + +/** + * Return whether an image format should be supported based on the current API + * version of the context. 
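+ *
+ * E.g. GL_R11F_G11F_B10F is a desktop-only image format, so this is
+ * expected to return false for it on a GLES 3.1 context.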
+ */ +bool +_mesa_is_shader_image_format_supported(const struct gl_context *ctx, + GLenum format); + +/** + * Get a single image unit struct with the default state. + */ struct gl_image_unit diff --git a/src/mesa/main/tests/dispatch_sanity.cpp b/src/mesa/main/tests/dispatch_sanity.cpp index 24e3d189091..09b97c33074 100644 --- a/src/mesa/main/tests/dispatch_sanity.cpp +++ b/src/mesa/main/tests/dispatch_sanity.cpp @@ -857,6 +857,9 @@ const struct function common_desktop_functions_possible[] = { /* GL_ARB_internalformat_query */ { "glGetInternalformativ", 30, -1 }, + /* GL_ARB_internalformat_query2 */ + { "glGetInternalformati64v", 30, -1 }, + /* GL_ARB_multi_bind */ { "glBindBuffersBase", 44, -1 }, { "glBindBuffersRange", 44, -1 }, @@ -2355,6 +2358,7 @@ const struct function gles3_functions_possible[] = { { "glGetInteger64v", 30, -1 }, { "glGetIntegeri_v", 30, -1 }, { "glGetInternalformativ", 30, -1 }, + { "glGetInternalformati64v", 30, -1 }, // glGetProgramBinary aliases glGetProgramBinaryOES in GLES 2 { "glGetQueryiv", 30, -1 }, { "glGetQueryObjectuiv", 30, -1 }, diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c index a8ac19e40d7..3ae64521513 100644 --- a/src/mesa/main/texcompress.c +++ b/src/mesa/main/texcompress.c @@ -443,7 +443,7 @@ _mesa_get_compressed_formats(struct gl_context *ctx, GLint *formats) /** - * Convert a compressed MESA_FORMAT_x to a GLenum. + * Convert GLenum to a compressed MESA_FORMAT_x. */ mesa_format _mesa_glenum_to_compressed_format(GLenum format) diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c index 8a4c6286cbe..616a92953e7 100644 --- a/src/mesa/main/teximage.c +++ b/src/mesa/main/teximage.c @@ -242,6 +242,24 @@ _mesa_is_array_texture(GLenum target) }; } +/** + * Test if a target is a cube map. + * + * \param target texture target. + * + * \return true if the target is a cube map, false otherwise. + */ +bool +_mesa_is_cube_map_texture(GLenum target) +{ + switch(target) { + case GL_TEXTURE_CUBE_MAP: + case GL_TEXTURE_CUBE_MAP_ARRAY: + return true; + default: + return false; + } +} /** * Return the proxy target which corresponds to the given texture target @@ -1266,7 +1284,7 @@ compressedteximage_only_format(const struct gl_context *ctx, GLenum format) /** * Return true if the format doesn't support online compression. */ -static bool +bool _mesa_format_no_online_compression(const struct gl_context *ctx, GLenum format) { return _mesa_is_astc_format(format) || @@ -1552,19 +1570,12 @@ compressed_tex_size(GLsizei width, GLsizei height, GLsizei depth, * \param ctx GL context * \param target Texture target * \param internalFormat Internal format of the texture image - * \param dimensions Dimensionality at the caller. This is \b not used - * in the validation. It is only used when logging - * error messages. - * \param caller Base name of the calling function (e.g., - * "glTexImage" or "glTexStorage"). * * \returns true if the combination is legal, false otherwise.
*/ bool _mesa_legal_texture_base_format_for_target(struct gl_context *ctx, - GLenum target, GLenum internalFormat, - unsigned dimensions, - const char *caller) + GLenum target, GLenum internalFormat) { if (_mesa_base_tex_format(ctx, internalFormat) == GL_DEPTH_COMPONENT || _mesa_base_tex_format(ctx, internalFormat) == GL_DEPTH_STENCIL @@ -1603,9 +1614,6 @@ _mesa_legal_texture_base_format_for_target(struct gl_context *ctx, !((target == GL_TEXTURE_CUBE_MAP_ARRAY || target == GL_PROXY_TEXTURE_CUBE_MAP_ARRAY) && ctx->Extensions.ARB_texture_cube_map_array)) { - _mesa_error(ctx, GL_INVALID_OPERATION, - "%s%dD(bad target for depth texture)", - caller, dimensions); return false; } } @@ -1849,9 +1857,11 @@ texture_error_check( struct gl_context *ctx, } /* additional checks for depth textures */ - if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalFormat, - dimensions, "glTexImage")) + if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalFormat)) { + _mesa_error(ctx, GL_INVALID_OPERATION, + "glTexImage%dD(bad target for texture)", dimensions); return GL_TRUE; + } /* additional checks for compressed textures */ if (_mesa_is_compressed_format(ctx, internalFormat)) { @@ -5148,8 +5158,8 @@ _mesa_TextureBufferRange(GLuint texture, GLenum internalFormat, GLuint buffer, bufObj, offset, size, "glTextureBufferRange"); } -static GLboolean -is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat) +GLboolean +_mesa_is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat) { /* Everything that is allowed for renderbuffers, * except for a base format of GL_STENCIL_INDEX, unless supported. @@ -5229,7 +5239,7 @@ texture_image_multisample(struct gl_context *ctx, GLuint dims, return; } - if (!is_renderable_texture_format(ctx, internalformat)) { + if (!_mesa_is_renderable_texture_format(ctx, internalformat)) { /* Page 172 of OpenGL ES 3.1 spec says: * "An INVALID_ENUM error is generated if sizedinternalformat is not * color-renderable, depth-renderable, or stencil-renderable (as diff --git a/src/mesa/main/teximage.h b/src/mesa/main/teximage.h index 17f2c908ecc..8b687062a67 100644 --- a/src/mesa/main/teximage.h +++ b/src/mesa/main/teximage.h @@ -210,9 +210,13 @@ _mesa_validate_texbuffer_format(const struct gl_context *ctx, bool _mesa_legal_texture_base_format_for_target(struct gl_context *ctx, GLenum target, - GLenum internalFormat, - unsigned dimensions, - const char *caller); + GLenum internalFormat); + +bool +_mesa_format_no_online_compression(const struct gl_context *ctx, GLenum format); + +GLboolean +_mesa_is_renderable_texture_format(struct gl_context *ctx, GLenum internalformat); extern void _mesa_texture_sub_image(struct gl_context *ctx, GLuint dims, @@ -252,6 +256,10 @@ _mesa_texture_buffer_range(struct gl_context *ctx, struct gl_buffer_object *bufObj, GLintptr offset, GLsizeiptr size, const char *caller); + +bool +_mesa_is_cube_map_texture(GLenum target); + /*@}*/ diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c index 3b769f436b7..9350ca5c035 100644 --- a/src/mesa/main/texparam.c +++ b/src/mesa/main/texparam.c @@ -252,8 +252,8 @@ incomplete(struct gl_context *ctx, struct gl_texture_object *texObj) } -static GLboolean -target_allows_setting_sampler_parameters(GLenum target) +GLboolean +_mesa_target_allows_setting_sampler_parameters(GLenum target) { switch (target) { case GL_TEXTURE_2D_MULTISAMPLE: @@ -279,7 +279,7 @@ set_tex_parameteri(struct gl_context *ctx, switch (pname) { case GL_TEXTURE_MIN_FILTER: - if 
(!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MinFilter == params[0]) @@ -307,7 +307,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_MAG_FILTER: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MagFilter == params[0]) @@ -324,7 +324,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_WRAP_S: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.WrapS == params[0]) @@ -337,7 +337,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_WRAP_T: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.WrapT == params[0]) @@ -350,7 +350,7 @@ set_tex_parameteri(struct gl_context *ctx, return GL_FALSE; case GL_TEXTURE_WRAP_R: - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.WrapR == params[0]) @@ -438,7 +438,7 @@ set_tex_parameteri(struct gl_context *ctx, if ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_shadow) || _mesa_is_gles3(ctx)) { - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.CompareMode == params[0]) @@ -457,7 +457,7 @@ set_tex_parameteri(struct gl_context *ctx, if ((_mesa_is_desktop_gl(ctx) && ctx->Extensions.ARB_shadow) || _mesa_is_gles3(ctx)) { - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.CompareFunc == params[0]) @@ -571,7 +571,7 @@ set_tex_parameteri(struct gl_context *ctx, if (ctx->Extensions.EXT_texture_sRGB_decode) { GLenum decode = params[0]; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (decode == GL_DECODE_EXT || decode == GL_SKIP_DECODE_EXT) { @@ -589,7 +589,7 @@ set_tex_parameteri(struct gl_context *ctx, && ctx->Extensions.AMD_seamless_cubemap_per_texture) { GLenum param = params[0]; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (param != GL_TRUE && param != GL_FALSE) { @@ -645,7 +645,7 @@ set_tex_parameterf(struct gl_context *ctx, if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx)) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MinLod == params[0]) @@ -658,7 +658,7 @@ set_tex_parameterf(struct gl_context *ctx, if (!_mesa_is_desktop_gl(ctx) && !_mesa_is_gles3(ctx)) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MaxLod == params[0]) @@ -677,7 +677,7 @@ set_tex_parameterf(struct gl_context *ctx, case 
GL_TEXTURE_MAX_ANISOTROPY_EXT: if (ctx->Extensions.EXT_texture_filter_anisotropic) { - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.MaxAnisotropy == params[0]) @@ -705,7 +705,7 @@ set_tex_parameterf(struct gl_context *ctx, if (_mesa_is_gles(ctx)) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; if (texObj->Sampler.LodBias != params[0]) { @@ -720,7 +720,7 @@ set_tex_parameterf(struct gl_context *ctx, !ctx->Extensions.ARB_texture_border_clamp) goto invalid_pname; - if (!target_allows_setting_sampler_parameters(texObj->Target)) + if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) goto invalid_enum; flush(ctx); @@ -1202,9 +1202,9 @@ _mesa_TextureParameterIuiv(GLuint texture, GLenum pname, const GLuint *params) _mesa_texture_parameterIuiv(ctx, texObj, pname, params, true); } -static GLboolean -legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, - bool dsa) +GLboolean +_mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, + bool dsa) { /* Common targets for desktop GL and GLES 3.1. */ switch (target) { @@ -1578,7 +1578,7 @@ valid_tex_level_parameteriv_target(struct gl_context *ctx, GLenum target, bool dsa) { const char *suffix = dsa ? "ture" : ""; - if (!legal_get_tex_level_parameter_target(ctx, target, dsa)) { + if (!_mesa_legal_get_tex_level_parameter_target(ctx, target, dsa)) { _mesa_error(ctx, GL_INVALID_ENUM, "glGetTex%sLevelParameter[if]v(target=%s)", suffix, _mesa_enum_to_string(target)); diff --git a/src/mesa/main/texparam.h b/src/mesa/main/texparam.h index 96defbec213..b2f6a847337 100644 --- a/src/mesa/main/texparam.h +++ b/src/mesa/main/texparam.h @@ -65,6 +65,13 @@ _mesa_texture_parameterIuiv(struct gl_context *ctx, struct gl_texture_object *texObj, GLenum pname, const GLuint *params, bool dsa); +GLboolean +_mesa_legal_get_tex_level_parameter_target(struct gl_context *ctx, GLenum target, + bool dsa); + +GLboolean +_mesa_target_allows_setting_sampler_parameters(GLenum target); + /*@}*/ /** diff --git a/src/mesa/main/texstorage.c b/src/mesa/main/texstorage.c index 9fd969fbc53..f4a076028fb 100644 --- a/src/mesa/main/texstorage.c +++ b/src/mesa/main/texstorage.c @@ -358,11 +358,11 @@ tex_storage_error_check(struct gl_context *ctx, } /* additional checks for depth textures */ - if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalformat, - dims, dsa ? - "glTextureStorage" : - "glTexStorage")) + if (!_mesa_legal_texture_base_format_for_target(ctx, target, internalformat)) { + _mesa_error(ctx, GL_INVALID_OPERATION, "glTex%sStorage%uD(bad target for texture)", + suffix, dims); return GL_TRUE; + } return GL_FALSE; } diff --git a/src/mesa/main/textureview.c b/src/mesa/main/textureview.c index 316d8280338..419fbebf2f0 100644 --- a/src/mesa/main/textureview.c +++ b/src/mesa/main/textureview.c @@ -162,12 +162,8 @@ static const struct internal_format_class_info s3tc_compatible_internal_formats[ {GL_VIEW_CLASS_S3TC_DXT5_RGBA, GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT}, }; -/** - * Lookup format view class based on internalformat - * \return VIEW_CLASS if internalformat found in table, false otherwise. 
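Renaming target_allows_setting_sampler_parameters() into the _mesa_ namespace lets other modules share the rule that certain targets carry no sampler state. A sketch of the intended external use (the caller and error string are hypothetical):

    /* GL_TEXTURE_2D_MULTISAMPLE and similar targets have no sampler
     * state, so pnames such as GL_TEXTURE_MIN_FILTER must raise
     * GL_INVALID_ENUM for them. */
    if (!_mesa_target_allows_setting_sampler_parameters(texObj->Target)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glExampleParameteri(target)");
       return;
    }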
- */ -static GLenum -lookup_view_class(const struct gl_context *ctx, GLenum internalformat) +GLenum +_mesa_texture_view_lookup_view_class(const struct gl_context *ctx, GLenum internalformat) { GLuint i; @@ -336,8 +332,8 @@ _mesa_texture_view_compatible_format(const struct gl_context *ctx, if (origInternalFormat == newInternalFormat) return true; - origViewClass = lookup_view_class(ctx, origInternalFormat); - newViewClass = lookup_view_class(ctx, newInternalFormat); + origViewClass = _mesa_texture_view_lookup_view_class(ctx, origInternalFormat); + newViewClass = _mesa_texture_view_lookup_view_class(ctx, newInternalFormat); if ((origViewClass == newViewClass) && origViewClass != false) return true; diff --git a/src/mesa/main/textureview.h b/src/mesa/main/textureview.h index 59e24b68dd0..39b415d8793 100644 --- a/src/mesa/main/textureview.h +++ b/src/mesa/main/textureview.h @@ -34,6 +34,14 @@ _mesa_texture_view_compatible_format(const struct gl_context *ctx, GLenum origInternalFormat, GLenum newInternalFormat); +/** + * Lookup format view class based on internalformat + * \return VIEW_CLASS if internalformat found in table, false otherwise. + */ +GLenum +_mesa_texture_view_lookup_view_class(const struct gl_context *ctx, + GLenum internalformat); + extern void GLAPIENTRY _mesa_TextureView(GLuint texture, GLenum target, GLuint origtexture, GLenum internalformat, diff --git a/src/mesa/main/transformfeedback.c b/src/mesa/main/transformfeedback.c index 976b2686a60..f73a89f6c0f 100644 --- a/src/mesa/main/transformfeedback.c +++ b/src/mesa/main/transformfeedback.c @@ -1136,6 +1136,11 @@ _mesa_DeleteTransformFeedbacks(GLsizei n, const GLuint *names) } _mesa_HashRemove(ctx->TransformFeedback.Objects, names[i]); /* unref, but object may not be deleted until later */ + if (obj == ctx->TransformFeedback.CurrentObject) { + reference_transform_feedback_object( + &ctx->TransformFeedback.CurrentObject, + ctx->TransformFeedback.DefaultObject); + } reference_transform_feedback_object(&obj, NULL); } } diff --git a/src/mesa/program/prog_execute.c b/src/mesa/program/prog_execute.c index 8f167be60cd..aadf82116e5 100644 --- a/src/mesa/program/prog_execute.c +++ b/src/mesa/program/prog_execute.c @@ -650,11 +650,9 @@ _mesa_execute_program(struct gl_context * ctx, program->Instructions[inst->BranchTarget].Opcode == OPCODE_ENDIF); /* eval condition */ - if (inst->SrcReg[0].File != PROGRAM_UNDEFINED) { - GLfloat a[4]; - fetch_vector1(&inst->SrcReg[0], machine, a); - cond = (a[0] != 0.0F); - } + GLfloat a[4]; + fetch_vector1(&inst->SrcReg[0], machine, a); + cond = (a[0] != 0.0F); if (DEBUG_PROG) { printf("IF: %d\n", cond); } diff --git a/src/mesa/program/prog_optimize.c b/src/mesa/program/prog_optimize.c index c6d22644419..401f26ec0d0 100644 --- a/src/mesa/program/prog_optimize.c +++ b/src/mesa/program/prog_optimize.c @@ -289,8 +289,7 @@ _mesa_remove_dead_code_global(struct gl_program *prog) /* check dst reg */ if (inst->DstReg.File == PROGRAM_TEMPORARY) { - const GLuint index = inst->DstReg.Index; - assert(index < REG_ALLOCATE_MAX_PROGRAM_TEMPS); + assert(inst->DstReg.Index < REG_ALLOCATE_MAX_PROGRAM_TEMPS); if (inst->DstReg.RelAddr) { if (dbg) diff --git a/src/mesa/program/program.c b/src/mesa/program/program.c index 27867c48d52..67672164f59 100644 --- a/src/mesa/program/program.c +++ b/src/mesa/program/program.c @@ -219,7 +219,6 @@ _mesa_new_program(struct gl_context *ctx, GLenum target, GLuint id) struct gl_vertex_program *prog = CALLOC_STRUCT(gl_vertex_program); return _mesa_init_gl_program(&prog->Base, target, id); 
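The transform feedback hunk above fixes deletion of the currently bound object: before the final unreference, the binding is retargeted at the default object so ctx->TransformFeedback.CurrentObject can never dangle. The bind-default-before-delete idiom, condensed from the diff:

    if (obj == ctx->TransformFeedback.CurrentObject) {
       /* Rebind the default object first; the deleted name must not
        * remain current. */
       reference_transform_feedback_object(&ctx->TransformFeedback.CurrentObject,
                                           ctx->TransformFeedback.DefaultObject);
    }
    reference_transform_feedback_object(&obj, NULL);  /* may defer the free */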
} - case GL_FRAGMENT_PROGRAM_NV: case GL_FRAGMENT_PROGRAM_ARB: { struct gl_fragment_program *prog = CALLOC_STRUCT(gl_fragment_program); return _mesa_init_gl_program(&prog->Base, target, id); diff --git a/src/mesa/program/program_parse_extra.c b/src/mesa/program/program_parse_extra.c index 1c5f5794dd5..c82c9c1b751 100644 --- a/src/mesa/program/program_parse_extra.c +++ b/src/mesa/program/program_parse_extra.c @@ -40,7 +40,6 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state, { inst->Saturate = GL_FALSE; - /* The only possible suffix element is the saturation selector from * ARB_fragment_program. */ @@ -51,7 +50,6 @@ _mesa_parse_instruction_suffix(const struct asm_parser_state *state, } } - /* It is an error for all of the suffix string not to be consumed. */ return suffix[0] == '\0'; @@ -85,7 +83,6 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option) */ option += 4; - if (strncmp(option, "fog_", 4) == 0) { option += 4; @@ -136,10 +133,12 @@ _mesa_ARBfp_parse_option(struct asm_parser_state *state, const char *option) * program options will fail to load. */ - if (strcmp(option, "nicest") == 0 && state->option.PrecisionHint != OPTION_FASTEST) { + if (strcmp(option, "nicest") == 0 && + state->option.PrecisionHint != OPTION_FASTEST) { state->option.PrecisionHint = OPTION_NICEST; return 1; - } else if (strcmp(option, "fastest") == 0 && state->option.PrecisionHint != OPTION_NICEST) { + } else if (strcmp(option, "fastest") == 0 && + state->option.PrecisionHint != OPTION_NICEST) { state->option.PrecisionHint = OPTION_FASTEST; return 1; } diff --git a/src/mesa/state_tracker/st_atom.c b/src/mesa/state_tracker/st_atom.c index 622621bdcbb..fc80adf6f8d 100644 --- a/src/mesa/state_tracker/st_atom.c +++ b/src/mesa/state_tracker/st_atom.c @@ -62,7 +62,12 @@ static const struct st_tracked_state *render_atoms[] = &st_update_tessctrl_texture, &st_update_tesseval_texture, &st_update_sampler, /* depends on update_*_texture for swizzle */ - &st_update_framebuffer, + &st_bind_vs_images, + &st_bind_tcs_images, + &st_bind_tes_images, + &st_bind_gs_images, + &st_bind_fs_images, + &st_update_framebuffer, /* depends on update_*_texture and bind_*_images */ &st_update_msaa, &st_update_sample_shading, &st_update_vs_constants, @@ -85,11 +90,6 @@ static const struct st_tracked_state *render_atoms[] = &st_bind_tes_ssbos, &st_bind_fs_ssbos, &st_bind_gs_ssbos, - &st_bind_vs_images, - &st_bind_tcs_images, - &st_bind_tes_images, - &st_bind_gs_images, - &st_bind_fs_images, &st_update_pixel_transfer, &st_update_tess, diff --git a/src/mesa/state_tracker/st_atom_image.c b/src/mesa/state_tracker/st_atom_image.c index 4b48bc30b69..e96d10a196c 100644 --- a/src/mesa/state_tracker/st_atom_image.c +++ b/src/mesa/state_tracker/st_atom_image.c @@ -25,6 +25,7 @@ **************************************************************************/ #include "main/imports.h" +#include "main/shaderimage.h" #include "program/prog_parameter.h" #include "program/prog_print.h" #include "compiler/glsl/ir_uniform.h" @@ -48,17 +49,19 @@ st_bind_images(struct st_context *st, struct gl_shader *shader, { unsigned i; struct pipe_image_view images[MAX_IMAGE_UNIFORMS]; - struct gl_program_constants *c = &st->ctx->Const.Program[shader->Stage]; + struct gl_program_constants *c; if (!shader || !st->pipe->set_shader_images) return; + c = &st->ctx->Const.Program[shader->Stage]; + for (i = 0; i < shader->NumImages; i++) { struct gl_image_unit *u = &st->ctx->ImageUnits[shader->ImageUnits[i]]; struct st_texture_object *stObj = 
st_texture_object(u->TexObj); struct pipe_image_view *img = &images[i]; - if (!stObj || + if (!_mesa_is_image_unit_valid(st->ctx, u) || !st_finalize_texture(st->ctx, st->pipe, u->TexObj) || !stObj->pt) { memset(img, 0, sizeof(*img)); @@ -67,6 +70,21 @@ st_bind_images(struct st_context *st, struct gl_shader *shader, img->resource = stObj->pt; img->format = st_mesa_format_to_pipe_format(st, u->_ActualFormat); + + switch (u->Access) { + case GL_READ_ONLY: + img->access = PIPE_IMAGE_ACCESS_READ; + break; + case GL_WRITE_ONLY: + img->access = PIPE_IMAGE_ACCESS_WRITE; + break; + case GL_READ_WRITE: + img->access = PIPE_IMAGE_ACCESS_READ_WRITE; + break; + default: + unreachable("bad gl_image_unit::Access"); + } + if (stObj->pt->target == PIPE_BUFFER) { unsigned base, size; unsigned f, n; diff --git a/src/mesa/state_tracker/st_atom_shader.c b/src/mesa/state_tracker/st_atom_shader.c index a88f0352746..ff90bd61d5b 100644 --- a/src/mesa/state_tracker/st_atom_shader.c +++ b/src/mesa/state_tracker/st_atom_shader.c @@ -37,6 +37,7 @@ #include "main/imports.h" #include "main/mtypes.h" +#include "main/framebuffer.h" #include "program/program.h" #include "pipe/p_context.h" @@ -70,16 +71,13 @@ update_fp( struct st_context *st ) key.clamp_color = st->clamp_frag_color_in_shader && st->ctx->Color._ClampFragmentColor; - /* Don't set it if the driver can force the interpolation by itself. - * If SAMPLE_ID or SAMPLE_POS are used, the interpolation is set - * automatically. - * Ignore sample qualifier while computing this flag. - */ + /* _NEW_MULTISAMPLE | _NEW_BUFFERS */ key.persample_shading = st->force_persample_in_shader && - !(stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS)) && - _mesa_get_min_invocations_per_fragment(st->ctx, &stfp->Base, true) > 1; + st->ctx->Multisample._Enabled && + st->ctx->Multisample.SampleShading && + st->ctx->Multisample.MinSampleShadingValue * + _mesa_geometric_samples(st->ctx->DrawBuffer) > 1; st->fp_variant = st_get_fp_variant(st, stfp, &key); diff --git a/src/mesa/state_tracker/st_cb_compute.c b/src/mesa/state_tracker/st_cb_compute.c index 364159d62d8..bfc6d96cd57 100644 --- a/src/mesa/state_tracker/st_cb_compute.c +++ b/src/mesa/state_tracker/st_cb_compute.c @@ -47,7 +47,7 @@ static void st_dispatch_compute_common(struct gl_context *ctx, if (ctx->NewState) _mesa_update_state(ctx); - if (st->dirty_cp.st || ctx->NewDriverState) + if (st->dirty_cp.st || st->dirty_cp.mesa || ctx->NewDriverState) st_validate_state(st, ST_PIPELINE_COMPUTE); for (unsigned i = 0; i < 3; i++) { diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c index cfec627f10c..bffa4d026cb 100644 --- a/src/mesa/state_tracker/st_cb_texture.c +++ b/src/mesa/state_tracker/st_cb_texture.c @@ -3104,7 +3104,7 @@ void st_init_texture_functions(struct dd_function_table *functions) { functions->ChooseTextureFormat = st_ChooseTextureFormat; - functions->QuerySamplesForFormat = st_QuerySamplesForFormat; + functions->QueryInternalFormat = st_QueryInternalFormat; functions->TexImage = st_TexImage; functions->TexSubImage = st_TexSubImage; functions->CompressedTexSubImage = st_CompressedTexSubImage; diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c index e3ddee660f7..f5a6f8598ca 100644 --- a/src/mesa/state_tracker/st_context.c +++ b/src/mesa/state_tracker/st_context.c @@ -141,9 +141,7 @@ void st_invalidate_state(struct gl_context * ctx, GLbitfield new_state) /* Invalidate render and compute pipelines. 
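The rewritten persample_shading key drops the SAMPLE_ID/SAMPLE_POS exemption and computes the forced-per-sample condition directly from GL state. A worked example of the new expression (numbers illustrative):

    /* key.persample_shading is set when all of the following hold:
     *   st->force_persample_in_shader     (driver needs shader-side help)
     *   ctx->Multisample._Enabled
     *   ctx->Multisample.SampleShading
     *   MinSampleShadingValue * samples > 1
     *
     * e.g. MinSampleShadingValue = 0.5, DrawBuffer samples = 4:
     *        0.5 * 4 = 2.0 > 1  -> per-sample shader variant required
     *      MinSampleShadingValue = 0.25, samples = 2:
     *        0.25 * 2 = 0.5     -> center/centroid interpolation suffices */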
*/ st->dirty.mesa |= new_state; - st->dirty.st |= ST_NEW_MESA; st->dirty_cp.mesa |= new_state; - st->dirty_cp.st |= ST_NEW_MESA; /* This is the only core Mesa module we depend upon. * No longer use swrast, swsetup, tnl. diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h index f960c64cbe8..ba51a9c6248 100644 --- a/src/mesa/state_tracker/st_context.h +++ b/src/mesa/state_tracker/st_context.h @@ -50,7 +50,7 @@ struct st_perf_monitor_group; struct u_upload_mgr; -#define ST_NEW_MESA (1 << 0) /* Mesa state has changed */ +/* gap */ #define ST_NEW_FRAGMENT_PROGRAM (1 << 1) #define ST_NEW_VERTEX_PROGRAM (1 << 2) #define ST_NEW_FRAMEBUFFER (1 << 3) diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c index 2de6620602d..fdd59a383a9 100644 --- a/src/mesa/state_tracker/st_draw.c +++ b/src/mesa/state_tracker/st_draw.c @@ -201,7 +201,7 @@ st_draw_vbo(struct gl_context *ctx, st_flush_bitmap_cache(st); /* Validate state. */ - if (st->dirty.st || ctx->NewDriverState) { + if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) { st_validate_state(st, ST_PIPELINE_RENDER); #if 0 @@ -314,7 +314,7 @@ st_indirect_draw_vbo(struct gl_context *ctx, assert(stride); /* Validate state. */ - if (st->dirty.st || ctx->NewDriverState) { + if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState) { st_validate_state(st, ST_PIPELINE_RENDER); } diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c index 24c64447f44..3666ece8ee7 100644 --- a/src/mesa/state_tracker/st_extensions.c +++ b/src/mesa/state_tracker/st_extensions.c @@ -109,23 +109,20 @@ void st_init_limits(struct pipe_screen *screen, _clamp(screen->get_param(screen, PIPE_CAP_MAX_RENDER_TARGETS), 1, MAX_DRAW_BUFFERS); - c->MaxDualSourceDrawBuffers - = _clamp(screen->get_param(screen, PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS), - 0, MAX_DRAW_BUFFERS); - - c->MaxLineWidth - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_LINE_WIDTH)); - c->MaxLineWidthAA - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_LINE_WIDTH_AA)); - - c->MaxPointSize - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_POINT_WIDTH)); - c->MaxPointSizeAA - = _maxf(1.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_POINT_WIDTH_AA)); + c->MaxDualSourceDrawBuffers = + _clamp(screen->get_param(screen, + PIPE_CAP_MAX_DUAL_SOURCE_RENDER_TARGETS), + 0, MAX_DRAW_BUFFERS); + + c->MaxLineWidth = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_LINE_WIDTH)); + c->MaxLineWidthAA = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_LINE_WIDTH_AA)); + + c->MaxPointSize = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_POINT_WIDTH)); + c->MaxPointSizeAA = + _maxf(1.0f, screen->get_paramf(screen, PIPE_CAPF_MAX_POINT_WIDTH_AA)); /* these are not queryable. Note that GL basically mandates a 1.0 minimum * for non-aa sizes, but we can go down to 0.0 for aa points. 
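With ST_NEW_MESA gone, Mesa-side dirtiness is carried in its own per-pipeline bitfield rather than being folded into a single st flag, which is why the draw and compute paths above now test st->dirty.mesa (or st->dirty_cp.mesa) explicitly. The resulting idiom, restated:

    /* st_invalidate_state() accumulates raw _NEW_* bits per pipeline: */
    st->dirty.mesa |= new_state;      /* render  */
    st->dirty_cp.mesa |= new_state;   /* compute */

    /* and each entry point validates against its own copy: */
    if (st->dirty.st || st->dirty.mesa || ctx->NewDriverState)
       st_validate_state(st, ST_PIPELINE_RENDER);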
@@ -133,15 +130,16 @@ void st_init_limits(struct pipe_screen *screen, c->MinPointSize = 1.0f; c->MinPointSizeAA = 0.0f; - c->MaxTextureMaxAnisotropy - = _maxf(2.0f, screen->get_paramf(screen, - PIPE_CAPF_MAX_TEXTURE_ANISOTROPY)); + c->MaxTextureMaxAnisotropy = + _maxf(2.0f, + screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_ANISOTROPY)); - c->MaxTextureLodBias - = screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_LOD_BIAS); + c->MaxTextureLodBias = + screen->get_paramf(screen, PIPE_CAPF_MAX_TEXTURE_LOD_BIAS); - c->QuadsFollowProvokingVertexConvention = screen->get_param( - screen, PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION); + c->QuadsFollowProvokingVertexConvention = + screen->get_param(screen, + PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION); c->MaxUniformBlockSize = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, @@ -195,21 +193,31 @@ void st_init_limits(struct pipe_screen *screen, PIPE_SHADER_CAP_MAX_TEXTURE_SAMPLERS), MAX_TEXTURE_IMAGE_UNITS); - pc->MaxInstructions = pc->MaxNativeInstructions = + pc->MaxInstructions = + pc->MaxNativeInstructions = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS); - pc->MaxAluInstructions = pc->MaxNativeAluInstructions = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS); - pc->MaxTexInstructions = pc->MaxNativeTexInstructions = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS); - pc->MaxTexIndirections = pc->MaxNativeTexIndirections = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS); - pc->MaxAttribs = pc->MaxNativeAttribs = + pc->MaxAluInstructions = + pc->MaxNativeAluInstructions = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_ALU_INSTRUCTIONS); + pc->MaxTexInstructions = + pc->MaxNativeTexInstructions = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_TEX_INSTRUCTIONS); + pc->MaxTexIndirections = + pc->MaxNativeTexIndirections = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_TEX_INDIRECTIONS); + pc->MaxAttribs = + pc->MaxNativeAttribs = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INPUTS); - pc->MaxTemps = pc->MaxNativeTemps = + pc->MaxTemps = + pc->MaxNativeTemps = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_TEMPS); - pc->MaxAddressRegs = pc->MaxNativeAddressRegs = - sh == PIPE_SHADER_VERTEX ? 1 : 0; - pc->MaxParameters = pc->MaxNativeParameters = + pc->MaxAddressRegs = + pc->MaxNativeAddressRegs = sh == PIPE_SHADER_VERTEX ? 1 : 0; + pc->MaxParameters = + pc->MaxNativeParameters = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONST_BUFFER_SIZE) / sizeof(float[4]); pc->MaxInputComponents = @@ -217,10 +225,12 @@ void st_init_limits(struct pipe_screen *screen, pc->MaxOutputComponents = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_OUTPUTS) * 4; - pc->MaxUniformComponents = 4 * MIN2(pc->MaxNativeParameters, MAX_UNIFORMS); + pc->MaxUniformComponents = + 4 * MIN2(pc->MaxNativeParameters, MAX_UNIFORMS); pc->MaxUniformBlocks = - screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONST_BUFFERS); + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_CONST_BUFFERS); if (pc->MaxUniformBlocks) pc->MaxUniformBlocks -= 1; /* The first one is for ordinary uniforms. 
*/ pc->MaxUniformBlocks = _min(pc->MaxUniformBlocks, MAX_UNIFORM_BUFFERS); @@ -246,21 +256,33 @@ void st_init_limits(struct pipe_screen *screen, options->EmitNoNoise = TRUE; /* TODO: make these more fine-grained if anyone needs it */ - options->MaxIfDepth = screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); - options->EmitNoLoops = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); - options->EmitNoFunctions = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); - options->EmitNoMainReturn = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); - - options->EmitNoCont = !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED); - - options->EmitNoIndirectInput = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR); - options->EmitNoIndirectOutput = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR); - options->EmitNoIndirectTemp = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR); - options->EmitNoIndirectUniform = !screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_INDIRECT_CONST_ADDR); + options->MaxIfDepth = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); + options->EmitNoLoops = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_CONTROL_FLOW_DEPTH); + options->EmitNoFunctions = + !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); + options->EmitNoMainReturn = + !screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_SUBROUTINES); + + options->EmitNoCont = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_TGSI_CONT_SUPPORTED); + + options->EmitNoIndirectInput = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_INPUT_ADDR); + options->EmitNoIndirectOutput = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_OUTPUT_ADDR); + options->EmitNoIndirectTemp = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_TEMP_ADDR); + options->EmitNoIndirectUniform = + !screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_INDIRECT_CONST_ADDR); if (pc->MaxNativeInstructions && (options->EmitNoIndirectUniform || pc->MaxUniformBlocks < 12)) { @@ -268,10 +290,14 @@ void st_init_limits(struct pipe_screen *screen, } if (options->EmitNoLoops) - options->MaxUnrollIterations = MIN2(screen->get_shader_param(screen, sh, PIPE_SHADER_CAP_MAX_INSTRUCTIONS), 65536); + options->MaxUnrollIterations = + MIN2(screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_INSTRUCTIONS), + 65536); else - options->MaxUnrollIterations = screen->get_shader_param(screen, sh, - PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT); + options->MaxUnrollIterations = + screen->get_shader_param(screen, sh, + PIPE_SHADER_CAP_MAX_UNROLL_ITERATIONS_HINT); options->LowerClipDistance = true; options->LowerBufferInterfaceBlocks = true; @@ -293,37 +319,50 @@ void st_init_limits(struct pipe_screen *screen, /* This depends on program constants. 
*/ c->MaxTextureCoordUnits - = _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, MAX_TEXTURE_COORD_UNITS); + = _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, + MAX_TEXTURE_COORD_UNITS); - c->MaxTextureUnits = _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, c->MaxTextureCoordUnits); + c->MaxTextureUnits = + _min(c->Program[MESA_SHADER_FRAGMENT].MaxTextureImageUnits, + c->MaxTextureCoordUnits); - c->Program[MESA_SHADER_VERTEX].MaxAttribs = MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); + c->Program[MESA_SHADER_VERTEX].MaxAttribs = + MIN2(c->Program[MESA_SHADER_VERTEX].MaxAttribs, 16); /* PIPE_SHADER_CAP_MAX_INPUTS for the FS specifies the maximum number * of inputs. It's always 2 colors + N generic inputs. */ c->MaxVarying = screen->get_shader_param(screen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_MAX_INPUTS); c->MaxVarying = MIN2(c->MaxVarying, MAX_VARYING); - c->MaxGeometryOutputVertices = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); - c->MaxGeometryTotalOutputComponents = screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS); + c->MaxGeometryOutputVertices = + screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_OUTPUT_VERTICES); + c->MaxGeometryTotalOutputComponents = + screen->get_param(screen, PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS); c->MaxTessPatchComponents = MAX2(screen->get_param(screen, PIPE_CAP_MAX_SHADER_PATCH_VARYINGS), MAX_VARYING) * 4; - c->MinProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET); - c->MaxProgramTexelOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET); + c->MinProgramTexelOffset = + screen->get_param(screen, PIPE_CAP_MIN_TEXEL_OFFSET); + c->MaxProgramTexelOffset = + screen->get_param(screen, PIPE_CAP_MAX_TEXEL_OFFSET); - c->MaxProgramTextureGatherComponents = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS); - c->MinProgramTextureGatherOffset = screen->get_param(screen, PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET); - c->MaxProgramTextureGatherOffset = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET); + c->MaxProgramTextureGatherComponents = + screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_COMPONENTS); + c->MinProgramTextureGatherOffset = + screen->get_param(screen, PIPE_CAP_MIN_TEXTURE_GATHER_OFFSET); + c->MaxProgramTextureGatherOffset = + screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_GATHER_OFFSET); c->MaxTransformFeedbackBuffers = screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS); - c->MaxTransformFeedbackBuffers = MIN2(c->MaxTransformFeedbackBuffers, MAX_FEEDBACK_BUFFERS); + c->MaxTransformFeedbackBuffers = MIN2(c->MaxTransformFeedbackBuffers, + MAX_FEEDBACK_BUFFERS); c->MaxTransformFeedbackSeparateComponents = screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_SEPARATE_COMPONENTS); c->MaxTransformFeedbackInterleavedComponents = - screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS); + screen->get_param(screen, + PIPE_CAP_MAX_STREAM_OUTPUT_INTERLEAVED_COMPONENTS); c->MaxVertexStreams = MAX2(1, screen->get_param(screen, PIPE_CAP_MAX_VERTEX_STREAMS)); @@ -368,8 +407,10 @@ void st_init_limits(struct pipe_screen *screen, c->Program[MESA_SHADER_FRAGMENT].MaxAtomicBuffers; assert(c->MaxCombinedAtomicBuffers <= MAX_COMBINED_ATOMIC_BUFFERS); - if (c->MaxCombinedAtomicBuffers > 0) + if (c->MaxCombinedAtomicBuffers > 0) { extensions->ARB_shader_atomic_counters = GL_TRUE; + extensions->ARB_shader_atomic_counter_ops = GL_TRUE; + } c->MaxCombinedShaderOutputResources = c->MaxDrawBuffers; 
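Advertising ARB_shader_atomic_counter_ops alongside ARB_shader_atomic_counters is safe here because the glsl_to_tgsi lowering added later in this diff expresses every counter-ops builtin with existing TGSI atomic opcodes. Illustrative shader source this unlocks, quoted in a comment:

    /* #extension GL_ARB_shader_atomic_counter_ops : require
     * layout(binding = 0) uniform atomic_uint c;
     * ...
     * uint prev = atomicCounterAddARB(c, 4u);  // lowered to TGSI ATOMUADD */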
c->ShaderStorageBufferOffsetAlignment = @@ -1028,12 +1069,14 @@ void st_init_extensions(struct pipe_screen *screen, extensions->ARB_ES3_compatibility = GL_TRUE; } +#ifdef HAVE_ST_VDPAU if (screen->get_video_param && screen->get_video_param(screen, PIPE_VIDEO_PROFILE_UNKNOWN, PIPE_VIDEO_ENTRYPOINT_BITSTREAM, PIPE_VIDEO_CAP_SUPPORTS_INTERLACED)) { extensions->NV_vdpau_interop = GL_TRUE; } +#endif if (screen->get_shader_param(screen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_DOUBLES) && diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c index 82bf3a185ad..5392c23ec00 100644 --- a/src/mesa/state_tracker/st_format.c +++ b/src/mesa/state_tracker/st_format.c @@ -42,6 +42,7 @@ #include "main/texstore.h" #include "main/image.h" #include "main/macros.h" +#include "main/formatquery.h" #include "pipe/p_context.h" #include "pipe/p_defines.h" @@ -2308,9 +2309,9 @@ st_ChooseTextureFormat(struct gl_context *ctx, GLenum target, /** - * Called via ctx->Driver.ChooseTextureFormat(). + * Called via ctx->Driver.QueryInternalFormat(). */ -size_t +static size_t st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target, GLenum internalFormat, int samples[16]) { @@ -2349,6 +2350,39 @@ st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target, return num_sample_counts; } +/** + * ARB_internalformat_query2 driver hook. + */ +void +st_QueryInternalFormat(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, GLint *params) +{ + /* The API entry-point gives us a temporary params buffer that is non-NULL + * and guaranteed to have at least 16 elements. + */ + assert(params != NULL); + + switch (pname) { + case GL_SAMPLES: + st_QuerySamplesForFormat(ctx, target, internalFormat, params); + break; + + case GL_NUM_SAMPLE_COUNTS: { + size_t num_samples; + num_samples = st_QuerySamplesForFormat(ctx, target, internalFormat, + params); + params[0] = (GLint) num_samples; + break; + } + + default: + /* For the rest of the pnames, we call back the Mesa's default + * function for drivers that don't implement ARB_internalformat_query2. 
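st_QueryInternalFormat() handles only the sample-count pnames itself and defers everything else to core Mesa's default implementation, so the state tracker gets conservative but correct answers for the full ARB_internalformat_query2 pname set. A hedged sketch of the calling convention the hook assumes (core-side shape, not part of this hunk):

    GLint buffer[16];   /* the hook may write up to 16 values */
    ctx->Driver.QueryInternalFormat(ctx, target, internalformat,
                                    pname, buffer);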
+ */ + _mesa_query_internal_format_default(ctx, target, internalFormat, pname, + params); + } +} /** * This is used for translating texture border color and the clear diff --git a/src/mesa/state_tracker/st_format.h b/src/mesa/state_tracker/st_format.h index 3e10aa64bc6..6ba61df7e4e 100644 --- a/src/mesa/state_tracker/st_format.h +++ b/src/mesa/state_tracker/st_format.h @@ -70,11 +70,9 @@ st_ChooseTextureFormat(struct gl_context * ctx, GLenum target, GLint internalFormat, GLenum format, GLenum type); -size_t -st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target, - GLenum internalFormat, int samples[16]); - - +void +st_QueryInternalFormat(struct gl_context *ctx, GLenum target, + GLenum internalFormat, GLenum pname, GLint *params); extern void st_translate_color(const union gl_color_union *colorIn, diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp index 26e463e0437..18414055549 100644 --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp @@ -3158,8 +3158,8 @@ void glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir) { const char *callee = ir->callee->function_name(); - ir_dereference *deref = static_cast<ir_dereference *>( - ir->actual_parameters.get_head()); + exec_node *param = ir->actual_parameters.get_head(); + ir_dereference *deref = static_cast<ir_dereference *>(param); ir_variable *location = deref->variable_referenced(); st_src_reg buffer( @@ -3188,17 +3188,56 @@ glsl_to_tgsi_visitor::visit_atomic_counter_intrinsic(ir_call *ir) if (!strcmp("__intrinsic_atomic_read", callee)) { inst = emit_asm(ir, TGSI_OPCODE_LOAD, dst, offset); - inst->buffer = buffer; } else if (!strcmp("__intrinsic_atomic_increment", callee)) { inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset, st_src_reg_for_int(1)); - inst->buffer = buffer; } else if (!strcmp("__intrinsic_atomic_predecrement", callee)) { inst = emit_asm(ir, TGSI_OPCODE_ATOMUADD, dst, offset, st_src_reg_for_int(-1)); - inst->buffer = buffer; emit_asm(ir, TGSI_OPCODE_ADD, dst, this->result, st_src_reg_for_int(-1)); + } else { + param = param->get_next(); + ir_rvalue *val = ((ir_instruction *)param)->as_rvalue(); + val->accept(this); + + st_src_reg data = this->result, data2 = undef_src; + unsigned opcode; + if (!strcmp("__intrinsic_atomic_add", callee)) + opcode = TGSI_OPCODE_ATOMUADD; + else if (!strcmp("__intrinsic_atomic_min", callee)) + opcode = TGSI_OPCODE_ATOMIMIN; + else if (!strcmp("__intrinsic_atomic_max", callee)) + opcode = TGSI_OPCODE_ATOMIMAX; + else if (!strcmp("__intrinsic_atomic_and", callee)) + opcode = TGSI_OPCODE_ATOMAND; + else if (!strcmp("__intrinsic_atomic_or", callee)) + opcode = TGSI_OPCODE_ATOMOR; + else if (!strcmp("__intrinsic_atomic_xor", callee)) + opcode = TGSI_OPCODE_ATOMXOR; + else if (!strcmp("__intrinsic_atomic_exchange", callee)) + opcode = TGSI_OPCODE_ATOMXCHG; + else if (!strcmp("__intrinsic_atomic_comp_swap", callee)) { + opcode = TGSI_OPCODE_ATOMCAS; + param = param->get_next(); + val = ((ir_instruction *)param)->as_rvalue(); + val->accept(this); + data2 = this->result; + } else if (!strcmp("__intrinsic_atomic_sub", callee)) { + opcode = TGSI_OPCODE_ATOMUADD; + st_src_reg res = get_temp(glsl_type::uvec4_type); + st_dst_reg dstres = st_dst_reg(res); + dstres.writemask = dst.writemask; + emit_asm(ir, TGSI_OPCODE_INEG, dstres, data); + data = res; + } else { + assert(!"Unexpected intrinsic"); + return; + } + + inst = emit_asm(ir, opcode, dst, offset, data, data2); } + + inst->buffer = buffer; 
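Of the new counter-ops lowerings above, only subtraction lacks a direct TGSI opcode; it leans on two's-complement arithmetic instead. In outline:

    /* atomicCounterSubtractARB(c, v) has no ATOMUSUB equivalent, so the
     * visitor emits:
     *     INEG     tmp, v         // tmp = (unsigned) -v == 2^32 - v
     *     ATOMUADD dst, off, tmp
     * which is exact subtraction modulo 2^32, matching GL's wrapping
     * semantics for unsigned atomic counters. */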
} void @@ -3577,6 +3616,13 @@ glsl_to_tgsi_visitor::visit_image_intrinsic(ir_call *ir) inst->image_format = st_mesa_format_to_pipe_format(st_context(ctx), _mesa_get_shader_image_format(imgvar->data.image_format)); + + if (imgvar->data.image_coherent) + inst->buffer_access |= TGSI_MEMORY_COHERENT; + if (imgvar->data.image_restrict) + inst->buffer_access |= TGSI_MEMORY_RESTRICT; + if (imgvar->data.image_volatile) + inst->buffer_access |= TGSI_MEMORY_VOLATILE; } void @@ -3591,7 +3637,16 @@ glsl_to_tgsi_visitor::visit(ir_call *ir) /* Filter out intrinsics */ if (!strcmp("__intrinsic_atomic_read", callee) || !strcmp("__intrinsic_atomic_increment", callee) || - !strcmp("__intrinsic_atomic_predecrement", callee)) { + !strcmp("__intrinsic_atomic_predecrement", callee) || + !strcmp("__intrinsic_atomic_add", callee) || + !strcmp("__intrinsic_atomic_sub", callee) || + !strcmp("__intrinsic_atomic_min", callee) || + !strcmp("__intrinsic_atomic_max", callee) || + !strcmp("__intrinsic_atomic_and", callee) || + !strcmp("__intrinsic_atomic_or", callee) || + !strcmp("__intrinsic_atomic_xor", callee) || + !strcmp("__intrinsic_atomic_exchange", callee) || + !strcmp("__intrinsic_atomic_comp_swap", callee)) { visit_atomic_counter_intrinsic(ir); return; } @@ -5524,7 +5579,7 @@ compile_tgsi_instruction(struct st_translate *t, int num_dst; int num_src; - unsigned tex_target; + unsigned tex_target = 0; num_dst = num_inst_dst_regs(inst); num_src = num_inst_src_regs(inst); @@ -5599,32 +5654,38 @@ compile_tgsi_instruction(struct st_translate *t, for (i = num_src - 1; i >= 0; i--) src[i + 1] = src[i]; num_src++; - if (inst->buffer.file == PROGRAM_MEMORY) + if (inst->buffer.file == PROGRAM_MEMORY) { src[0] = t->shared_memory; - else if (inst->buffer.file == PROGRAM_BUFFER) + } else if (inst->buffer.file == PROGRAM_BUFFER) { src[0] = t->buffers[inst->buffer.index]; - else + } else { src[0] = t->images[inst->buffer.index]; + tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow); + } if (inst->buffer.reladdr) src[0] = ureg_src_indirect(src[0], ureg_src(t->address[2])); assert(src[0].File != TGSI_FILE_NULL); ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src, - inst->buffer_access); + inst->buffer_access, + tex_target, inst->image_format); break; case TGSI_OPCODE_STORE: - if (inst->buffer.file == PROGRAM_MEMORY) + if (inst->buffer.file == PROGRAM_MEMORY) { dst[0] = ureg_dst(t->shared_memory); - else if (inst->buffer.file == PROGRAM_BUFFER) + } else if (inst->buffer.file == PROGRAM_BUFFER) { dst[0] = ureg_dst(t->buffers[inst->buffer.index]); - else + } else { dst[0] = ureg_dst(t->images[inst->buffer.index]); + tex_target = st_translate_texture_target(inst->tex_target, inst->tex_shadow); + } dst[0] = ureg_writemask(dst[0], inst->dst[0].writemask); if (inst->buffer.reladdr) dst[0] = ureg_dst_indirect(dst[0], ureg_src(t->address[2])); assert(dst[0].File != TGSI_FILE_NULL); ureg_memory_insn(ureg, inst->op, dst, num_dst, src, num_src, - inst->buffer_access); + inst->buffer_access, + tex_target, inst->image_format); break; case TGSI_OPCODE_SCS: @@ -6060,6 +6121,9 @@ st_translate_program( } if (procType == TGSI_PROCESSOR_FRAGMENT) { + if (program->shader->EarlyFragmentTests) + ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1); + if (proginfo->InputsRead & VARYING_BIT_POS) { /* Must do this after setting up t->inputs. 
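The one-line TGSI property addition above is what finally plumbs GLSL's early_fragment_tests qualifier through the state tracker. The chain, roughly:

    /* GLSL:  layout(early_fragment_tests) in;
     * ->     gl_shader::EarlyFragmentTests = true (set by the compiler)
     * ->     ureg_property(ureg, TGSI_PROPERTY_FS_EARLY_DEPTH_STENCIL, 1)
     * so gallium drivers may run depth/stencil before the shader runs,
     * as image and SSBO stores require for well-defined results. */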
*/ emit_wpos(st_context(ctx), t, proginfo, ureg, diff --git a/src/mesa/state_tracker/st_program.c b/src/mesa/state_tracker/st_program.c index 2e21d02b8b5..c9f390aa9a2 100644 --- a/src/mesa/state_tracker/st_program.c +++ b/src/mesa/state_tracker/st_program.c @@ -573,10 +573,6 @@ st_translate_fragment_program(struct st_context *st, else interpLocation[slot] = TGSI_INTERPOLATE_LOC_CENTER; - if (stfp->Base.Base.SystemValuesRead & (SYSTEM_BIT_SAMPLE_ID | - SYSTEM_BIT_SAMPLE_POS)) - interpLocation[slot] = TGSI_INTERPOLATE_LOC_SAMPLE; - switch (attr) { case VARYING_SLOT_POS: input_semantic_name[slot] = TGSI_SEMANTIC_POSITION; diff --git a/src/mesa/state_tracker/st_vdpau.c b/src/mesa/state_tracker/st_vdpau.c index 63af1196af1..71dd15bc4fe 100644 --- a/src/mesa/state_tracker/st_vdpau.c +++ b/src/mesa/state_tracker/st_vdpau.c @@ -49,6 +49,8 @@ #include "st_format.h" #include "st_cb_flush.h" +#ifdef HAVE_ST_VDPAU + static void st_vdpau_map_surface(struct gl_context *ctx, GLenum target, GLenum access, GLboolean output, struct gl_texture_object *texObj, @@ -180,9 +182,13 @@ st_vdpau_unmap_surface(struct gl_context *ctx, GLenum target, GLenum access, st_flush(st, NULL, 0); } +#endif + void st_init_vdpau_functions(struct dd_function_table *functions) { +#ifdef HAVE_ST_VDPAU functions->VDPAUMapSurface = st_vdpau_map_surface; functions->VDPAUUnmapSurface = st_vdpau_unmap_surface; +#endif } diff --git a/src/mesa/swrast/s_context.c b/src/mesa/swrast/s_context.c index af24207e567..0a5fc7e9329 100644 --- a/src/mesa/swrast/s_context.c +++ b/src/mesa/swrast/s_context.c @@ -900,11 +900,16 @@ void _swrast_render_finish( struct gl_context *ctx ) { SWcontext *swrast = SWRAST_CONTEXT(ctx); + struct gl_query_object *query = ctx->Query.CurrentOcclusionObject; _swrast_flush(ctx); if (swrast->Driver.SpanRenderFinish) swrast->Driver.SpanRenderFinish( ctx ); + + if (query && (query->Target == GL_ANY_SAMPLES_PASSED || + query->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE)) + query->Result = !!query->Result; } diff --git a/src/util/mesa-sha1.h b/src/util/mesa-sha1.h index 1599405cd5a..0be5485f313 100644 --- a/src/util/mesa-sha1.h +++ b/src/util/mesa-sha1.h @@ -23,12 +23,12 @@ #ifndef SHA1_H #define SHA1_H +#include <stdlib.h> + #ifdef __cplusplus extern "C" { #endif -#include <stdlib.h> - struct mesa_sha1; struct mesa_sha1 * |
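The swrast hunk above makes boolean occlusion queries spec-conformant: GL_ANY_SAMPLES_PASSED and GL_ANY_SAMPLES_PASSED_CONSERVATIVE must report GL_TRUE or GL_FALSE, but swrast accumulates a raw passing-sample count in Result. Normalizing at render-finish handles both cases:

    /* e.g. 1234 samples passed -> !!1234 == 1 (GL_TRUE); 0 stays 0.
     * Only boolean targets are clamped; GL_SAMPLES_PASSED keeps the
     * full sample count. */
    if (query && (query->Target == GL_ANY_SAMPLES_PASSED ||
                  query->Target == GL_ANY_SAMPLES_PASSED_CONSERVATIVE))
       query->Result = !!query->Result;

The trailing mesa-sha1.h hunk mirrors the samplerobj.h change at the top of this section: includes are hoisted out of the extern "C" block so that C linkage is not forced onto declarations the included headers make visible to C++.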