Diffstat (limited to 'src/gallium/drivers/swr/rasterizer')
49 files changed, 2581 insertions, 1790 deletions
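The first file in the diff below, common/containers.hpp, is almost entirely a re-indentation of SWRL::UncheckedFixedVector, a fixed-capacity vector with no bounds checking beyond SWR_ASSERT in debug builds; the only functional change is the mSize{ 0 } member initializer. The std::hash specialization at the end of that file combines element hashes with the boost-style recipe x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2). As a reading aid, here is a minimal standalone sketch of the container pattern (hypothetical names, not the project's code):

#include <cassert>
#include <cstddef>

// Sketch of a fixed-capacity, unchecked vector in the spirit of
// SWRL::UncheckedFixedVector. Storage is inline, size() can never
// exceed N, and push_back past capacity is undefined behavior,
// which is why the real code pairs operations with SWR_ASSERT.
template <typename T, std::size_t N>
struct FixedVecSketch
{
    void push_back(T const& t) { assert(mSize < N); mElements[mSize++] = t; }
    void pop_back()            { assert(mSize > 0); --mSize; }
    T&       operator[](std::size_t i)       { return mElements[i]; }
    T const& operator[](std::size_t i) const { return mElements[i]; }
    std::size_t size() const { return mSize; }

private:
    std::size_t mSize = 0;
    T mElements[N];
};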
diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp index bc96c5f62fd..f3c05979144 100644 --- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp +++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp @@ -33,137 +33,137 @@ namespace SWRL template <typename T, int NUM_ELEMENTS> struct UncheckedFixedVector { - UncheckedFixedVector() : mSize(0) - { - } - - UncheckedFixedVector(std::size_t size, T const& exemplar) - { - this->mSize = 0; - for (std::size_t i = 0; i < size; ++i) - this->push_back(exemplar); - } - - template <typename Iter> - UncheckedFixedVector(Iter fst, Iter lst) - { - this->mSize = 0; - for ( ; fst != lst; ++fst) - this->push_back(*fst); - } - - UncheckedFixedVector(UncheckedFixedVector const& UFV) - { - this->mSize = 0; - for (std::size_t i = 0, N = UFV.size(); i < N; ++i) - (*this)[i] = UFV[i]; - this->mSize = UFV.size(); - } - - UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) - { - for (std::size_t i = 0, N = UFV.size(); i < N; ++i) - (*this)[i] = UFV[i]; - this->mSize = UFV.size(); - return *this; - } - - T* begin() { return &this->mElements[0]; } - T* end() { return &this->mElements[0] + this->mSize; } - T const* begin() const { return &this->mElements[0]; } - T const* end() const { return &this->mElements[0] + this->mSize; } - - friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) - { - if (L.size() != R.size()) return false; - for (std::size_t i = 0, N = L.size(); i < N; ++i) - { - if (L[i] != R[i]) return false; - } - return true; - } - - friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) - { - if (L.size() != R.size()) return true; - for (std::size_t i = 0, N = L.size(); i < N; ++i) - { - if (L[i] != R[i]) return true; - } - return false; - } - - T& operator[](std::size_t idx) - { - return this->mElements[idx]; - } - T const& operator[](std::size_t idx) const - { - return this->mElements[idx]; - } - void push_back(T const& t) - { - this->mElements[this->mSize] = t; - ++this->mSize; - } - void pop_back() - { - SWR_ASSERT(this->mSize > 0); - --this->mSize; - } - T& back() - { - return this->mElements[this->mSize-1]; - } - T const& back() const - { - return this->mElements[this->mSize-1]; - } - bool empty() const - { - return this->mSize == 0; - } - std::size_t size() const - { - return this->mSize; - } - void resize(std::size_t sz) - { - this->mSize = sz; - } - void clear() - { - this->resize(0); - } + UncheckedFixedVector() : mSize(0) + { + } + + UncheckedFixedVector(std::size_t size, T const& exemplar) + { + this->mSize = 0; + for (std::size_t i = 0; i < size; ++i) + this->push_back(exemplar); + } + + template <typename Iter> + UncheckedFixedVector(Iter fst, Iter lst) + { + this->mSize = 0; + for ( ; fst != lst; ++fst) + this->push_back(*fst); + } + + UncheckedFixedVector(UncheckedFixedVector const& UFV) + { + this->mSize = 0; + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + } + + UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV) + { + for (std::size_t i = 0, N = UFV.size(); i < N; ++i) + (*this)[i] = UFV[i]; + this->mSize = UFV.size(); + return *this; + } + + T* begin() { return &this->mElements[0]; } + T* end() { return &this->mElements[0] + this->mSize; } + T const* begin() const { return &this->mElements[0]; } + T const* end() const { return &this->mElements[0] + this->mSize; } + + friend bool 
operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return false; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return false; + } + return true; + } + + friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R) + { + if (L.size() != R.size()) return true; + for (std::size_t i = 0, N = L.size(); i < N; ++i) + { + if (L[i] != R[i]) return true; + } + return false; + } + + T& operator[](std::size_t idx) + { + return this->mElements[idx]; + } + T const& operator[](std::size_t idx) const + { + return this->mElements[idx]; + } + void push_back(T const& t) + { + this->mElements[this->mSize] = t; + ++this->mSize; + } + void pop_back() + { + SWR_ASSERT(this->mSize > 0); + --this->mSize; + } + T& back() + { + return this->mElements[this->mSize-1]; + } + T const& back() const + { + return this->mElements[this->mSize-1]; + } + bool empty() const + { + return this->mSize == 0; + } + std::size_t size() const + { + return this->mSize; + } + void resize(std::size_t sz) + { + this->mSize = sz; + } + void clear() + { + this->resize(0); + } private: - std::size_t mSize; - T mElements[NUM_ELEMENTS]; + std::size_t mSize{ 0 }; + T mElements[NUM_ELEMENTS]; }; template <typename T, int NUM_ELEMENTS> struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS> { - FixedStack() {} - - void push(T const& t) - { - this->push_back(t); - } - - void pop() - { - this->pop_back(); - } - - T& top() - { - return this->back(); - } - - T const& top() const - { - return this->back(); - } + FixedStack() {} + + void push(T const& t) + { + this->push_back(t); + } + + void pop() + { + this->pop_back(); + } + + T& top() + { + return this->back(); + } + + T const& top() const + { + return this->back(); + } }; template <typename T> @@ -190,16 +190,16 @@ namespace std template <typename T, int N> struct hash<SWRL::UncheckedFixedVector<T, N>> { - size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const - { - if (v.size() == 0) return 0; - std::hash<T> H; - size_t x = H(v[0]); - if (v.size() == 1) return x; - for (size_t i = 1; i < v.size(); ++i) - x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2); - return x; - } + size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const + { + if (v.size() == 0) return 0; + std::hash<T> H; + size_t x = H(v[0]); + if (v.size() == 1) return x; + for (size_t i = 1; i < v.size(); ++i) + x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2); + return x; + } }; diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 522ae0dd65f..5794f3f625a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -47,16 +47,18 @@ #define DEBUGBREAK __debugbreak() #define PRAGMA_WARNING_PUSH_DISABLE(...) 
\ - __pragma(warning(push));\ - __pragma(warning(disable:__VA_ARGS__)); + __pragma(warning(push));\ + __pragma(warning(disable:__VA_ARGS__)); #define PRAGMA_WARNING_POP() __pragma(warning(pop)) #if defined(_WIN32) #if defined(_WIN64) +#define BitScanReverseSizeT BitScanReverse64 #define BitScanForwardSizeT BitScanForward64 #define _mm_popcount_sizeT _mm_popcnt_u64 #else +#define BitScanReverseSizeT BitScanReverse #define BitScanForwardSizeT BitScanForward #define _mm_popcount_sizeT _mm_popcnt_u32 #endif @@ -68,29 +70,20 @@ #include <stdlib.h> #include <string.h> -#include <X11/Xmd.h> #include <x86intrin.h> #include <stdint.h> #include <sys/types.h> #include <unistd.h> #include <sys/stat.h> +#include <stdio.h> -typedef void VOID; +typedef void VOID; typedef void* LPVOID; -typedef CARD8 BOOL; -typedef wchar_t WCHAR; -typedef uint16_t UINT16; -typedef int INT; -typedef unsigned int UINT; -typedef uint32_t UINT32; -typedef uint64_t UINT64; -typedef int64_t INT64; -typedef void* HANDLE; -typedef float FLOAT; -typedef int LONG; -typedef CARD8 BYTE; -typedef unsigned char UCHAR; -typedef unsigned int DWORD; +typedef int INT; +typedef unsigned int UINT; +typedef void* HANDLE; +typedef int LONG; +typedef unsigned int DWORD; #undef FALSE #define FALSE 0 @@ -104,8 +97,11 @@ typedef unsigned int DWORD; #define INLINE __inline #endif #define DEBUGBREAK asm ("int $3") +#if !defined(__CYGWIN__) #define __cdecl +#define __stdcall #define __declspec(X) +#endif #define GCC_VERSION (__GNUC__ * 10000 \ + __GNUC_MINOR__ * 100 \ @@ -180,21 +176,13 @@ unsigned char _bittest(const LONG *a, LONG b) #define CreateDirectory(name, pSecurity) mkdir(name, 0777) -#if defined(_WIN32) -static inline -unsigned int _mm_popcnt_u32(unsigned int v) -{ - return __builtin_popcount(v); -} -#endif - #define _aligned_free free #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) +#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1) #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1) #define _ReadWriteBarrier() asm volatile("" ::: "memory") -#define __stdcall #define PRAGMA_WARNING_PUSH_DISABLE(...) 
#define PRAGMA_WARNING_POP() @@ -206,7 +194,7 @@ unsigned int _mm_popcnt_u32(unsigned int v) #endif // Universal types -typedef BYTE KILOBYTE[1024]; +typedef uint8_t KILOBYTE[1024]; typedef KILOBYTE MEGABYTE[1024]; typedef MEGABYTE GIGABYTE[1024]; diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp index 454641b2751..c6768b4c566 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp @@ -64,12 +64,14 @@ void BucketManager::RegisterThread(const std::string& name) UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc) { + mThreadMutex.lock(); size_t id = mBuckets.size(); mBuckets.push_back(desc); + mThreadMutex.unlock(); return (UINT)id; } -void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket) +void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket) { const char *arrows[] = { "", @@ -88,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0); // compute average cycle count per invocation - UINT64 CPE = bucket.elapsed / bucket.count; + uint64_t CPE = bucket.elapsed / bucket.count; BUCKET_DESC &desc = mBuckets[bucket.id]; @@ -127,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread) // compute thread level total cycle counts across all buckets from root const BUCKET& root = thread.root; - UINT64 totalCycles = 0; + uint64_t totalCycles = 0; for (const BUCKET& child : root.children) { totalCycles += child.elapsed; @@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename) fclose(f); } } + +void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id) +{ + pBucketMgr->StartBucket(id); +} + +void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id) +{ + pBucketMgr->StopBucket(id); +} diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h index 99cb10ec6e8..9dfa7f694d0 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h @@ -70,7 +70,9 @@ public: // removes all registered buckets void ClearBuckets() { + mThreadMutex.lock(); mBuckets.clear(); + mThreadMutex.unlock(); } /// Registers a new thread with the manager. 
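The rdtsc_buckets hunks above add explicit mThreadMutex.lock()/unlock() pairs around mutations of the mBuckets vector in RegisterBucket() and ClearBuckets(). A sketch of the same guard in RAII form (not the project's code; BucketDesc stands in for BUCKET_DESC), which also releases the mutex if push_back throws:

#include <cstdint>
#include <mutex>
#include <vector>

struct BucketDesc { /* stand-in for BUCKET_DESC */ };

// RAII variant of the locking added to BucketManager::RegisterBucket:
// std::lock_guard unlocks on every exit path, including exceptions,
// unlike the raw lock()/unlock() pair in the hunk above.
uint32_t RegisterBucketSketch(std::mutex& mtx,
                              std::vector<BucketDesc>& buckets,
                              const BucketDesc& desc)
{
    std::lock_guard<std::mutex> lock(mtx);
    uint32_t id = static_cast<uint32_t>(buckets.size());
    buckets.push_back(desc);
    return id;
}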
@@ -209,7 +211,7 @@ public: } private: - void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket); + void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket); void PrintThread(FILE* f, const BUCKET_THREAD& thread); // list of active threads that have registered with this manager @@ -227,3 +229,8 @@ private: bool mThreadViz{ false }; std::string mThreadVizDir; }; + + +// C helpers for jitter +void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id); +void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id); diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h index 41c6d5dec79..34c322e5a85 100644 --- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h +++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h @@ -64,13 +64,13 @@ struct BUCKET_THREAD std::string name; // id for this thread, assigned by the thread manager - uint32_t id; + uint32_t id{ 0 }; // root of the bucket hierarchy for this thread BUCKET root; // currently executing bucket somewhere in the hierarchy - BUCKET* pCurrent; + BUCKET* pCurrent{ nullptr }; // currently executing hierarchy level uint32_t level{ 0 }; diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 8fa6d9ef408..fa792b42e1a 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -43,14 +43,14 @@ typedef uint8_t simdmask; // simd vector OSALIGNSIMD(union) simdvector { - simdscalar v[4]; - struct - { - simdscalar x, y, z, w; - }; - - simdscalar& operator[] (const int i) { return v[i]; } - const simdscalar& operator[] (const int i) const { return v[i]; } + simdscalar v[4]; + struct + { + simdscalar x, y, z, w; + }; + + simdscalar& operator[] (const int i) { return v[i]; } + const simdscalar& operator[] (const int i) const { return v[i]; } }; #if KNOB_SIMD_WIDTH == 8 @@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector #define _simd_load1_ps _mm256_broadcast_ss #define _simd_loadu_ps _mm256_loadu_ps #define _simd_setzero_ps _mm256_setzero_ps -#define _simd_set1_ps _mm256_set1_ps -#define _simd_blend_ps _mm256_blend_ps +#define _simd_set1_ps _mm256_set1_ps +#define _simd_blend_ps _mm256_blend_ps #define _simd_blendv_ps _mm256_blendv_ps #define _simd_store_ps _mm256_store_ps #define _simd_mul_ps _mm256_mul_ps @@ -100,21 +100,156 @@ OSALIGNSIMD(union) simdvector INLINE \ __m256i func(__m256i a, __m256i b)\ {\ - __m128i aHi = _mm256_extractf128_si256(a, 1);\ - __m128i bHi = _mm256_extractf128_si256(b, 1);\ - __m128i aLo = _mm256_castsi256_si128(a);\ - __m128i bLo = _mm256_castsi256_si128(b);\ + __m128i aHi = _mm256_extractf128_si256(a, 1);\ + __m128i bHi = _mm256_extractf128_si256(b, 1);\ + __m128i aLo = _mm256_castsi256_si128(a);\ + __m128i bLo = _mm256_castsi256_si128(b);\ \ - __m128i subLo = intrin(aLo, bLo);\ - __m128i subHi = intrin(aHi, bHi);\ + __m128i subLo = intrin(aLo, bLo);\ + __m128i subHi = intrin(aHi, bHi);\ \ - __m256i result = _mm256_castsi128_si256(subLo);\ - result = _mm256_insertf128_si256(result, subHi, 1);\ + __m256i result = _mm256_castsi128_si256(subLo);\ + result = _mm256_insertf128_si256(result, subHi, 1);\ \ - return result;\ + return result;\ } #if (KNOB_ARCH == KNOB_ARCH_AVX) +INLINE +__m256 _simdemu_permute_ps(__m256 a, __m256i b) +{ + __m128 aHi = 
_mm256_extractf128_ps(a, 1); + __m128i bHi = _mm256_extractf128_si256(b, 1); + __m128 aLo = _mm256_castps256_ps128(a); + __m128i bLo = _mm256_castsi256_si128(b); + + __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3)); + __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3))); + __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3))); + __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi)); + + indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3)); + resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3))); + resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3))); + __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi)); + + __m256 result = _mm256_castps128_ps256(blendLowRes); + result = _mm256_insertf128_ps(result, blendHiRes, 1); + + return result; +} + +INLINE +__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount) +{ + int32_t aHi, aLow, countHi, countLow; + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); + + aHi = _mm_extract_epi32(vAHi, 0); + countHi = _mm_extract_epi32(vCountHi, 0); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 0); + + aLow = _mm_extract_epi32(vALow, 0); + countLow = _mm_extract_epi32(vCountLow, 0); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 0); + + aHi = _mm_extract_epi32(vAHi, 1); + countHi = _mm_extract_epi32(vCountHi, 1); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 1); + + aLow = _mm_extract_epi32(vALow, 1); + countLow = _mm_extract_epi32(vCountLow, 1); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 1); + + aHi = _mm_extract_epi32(vAHi, 2); + countHi = _mm_extract_epi32(vCountHi, 2); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 2); + + aLow = _mm_extract_epi32(vALow, 2); + countLow = _mm_extract_epi32(vCountLow, 2); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 2); + + aHi = _mm_extract_epi32(vAHi, 3); + countHi = _mm_extract_epi32(vCountHi, 3); + aHi >>= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 3); + + aLow = _mm_extract_epi32(vALow, 3); + countLow = _mm_extract_epi32(vCountLow, 3); + aLow >>= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 3); + + __m256i ret = _mm256_set1_epi32(0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = _mm256_insertf128_si256(ret, vALow, 0); + return ret; +} + + +INLINE +__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount) +{ + int32_t aHi, aLow, countHi, countLow; + __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1)); + __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0)); + __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1)); + __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0)); + + aHi = _mm_extract_epi32(vAHi, 0); + countHi = _mm_extract_epi32(vCountHi, 0); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 0); + + aLow = _mm_extract_epi32(vALow, 0); + countLow = _mm_extract_epi32(vCountLow, 0); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 0); + + aHi = _mm_extract_epi32(vAHi, 1); + countHi = _mm_extract_epi32(vCountHi, 1); + aHi <<= countHi; 
+ vAHi = _mm_insert_epi32(vAHi, aHi, 1); + + aLow = _mm_extract_epi32(vALow, 1); + countLow = _mm_extract_epi32(vCountLow, 1); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 1); + + aHi = _mm_extract_epi32(vAHi, 2); + countHi = _mm_extract_epi32(vCountHi, 2); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 2); + + aLow = _mm_extract_epi32(vALow, 2); + countLow = _mm_extract_epi32(vCountLow, 2); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 2); + + aHi = _mm_extract_epi32(vAHi, 3); + countHi = _mm_extract_epi32(vCountHi, 3); + aHi <<= countHi; + vAHi = _mm_insert_epi32(vAHi, aHi, 3); + + aLow = _mm_extract_epi32(vALow, 3); + countLow = _mm_extract_epi32(vCountLow, 3); + aLow <<= countLow; + vALow = _mm_insert_epi32(vALow, aLow, 3); + + __m256i ret = _mm256_set1_epi32(0); + ret = _mm256_insertf128_si256(ret, vAHi, 1); + ret = _mm256_insertf128_si256(ret, vALow, 0); + return ret; +} + #define _simd_mul_epi32 _simdemu_mul_epi32 #define _simd_mullo_epi32 _simdemu_mullo_epi32 #define _simd_sub_epi32 _simdemu_sub_epi32 @@ -136,7 +271,14 @@ __m256i func(__m256i a, __m256i b)\ #define _simd_add_epi8 _simdemu_add_epi8 #define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64 #define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64 +#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8 +#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8 +#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16 +#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16 #define _simd_movemask_epi8 _simdemu_movemask_epi8 +#define _simd_permute_ps _simdemu_permute_ps +#define _simd_srlv_epi32 _simdemu_srlv_epi32 +#define _simd_sllv_epi32 _simdemu_sllv_epi32 SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32) SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32) @@ -158,6 +300,10 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8) SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8) SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64) SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64) +SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8) +SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8) +SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16) +SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16) #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) @@ -176,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8) INLINE __m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c) { - __m128 res = _mm_mul_ps(a, b); - res = _mm_add_ps(res, c); - return res; + __m128 res = _mm_mul_ps(a, b); + res = _mm_add_ps(res, c); + return res; } INLINE __m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c) { - __m256 res = _mm256_mul_ps(a, b); - res = _mm256_add_ps(res, c); - return res; + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_add_ps(res, c); + return res; } INLINE __m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c) { - __m256 res = _mm256_mul_ps(a, b); - res = _mm256_sub_ps(res, c); - return res; + __m256 res = _mm256_mul_ps(a, b); + res = _mm256_sub_ps(res, c); + return res; } INLINE @@ -295,7 +441,14 @@ int _simdemu_movemask_epi8(__m256i a) #define _simd_cmpeq_epi64 _mm256_cmpeq_epi64 #define _simd_cmpgt_epi64 _mm256_cmpgt_epi64 +#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8 +#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8 +#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16 +#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16 #define _simd_movemask_epi8 _mm256_movemask_epi8 +#define 
_simd_permute_ps _mm256_permutevar8x32_ps +#define _simd_srlv_epi32 _mm256_srlv_epi32 +#define _simd_sllv_epi32 _mm256_sllv_epi32 #endif #define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm)) @@ -343,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i) { - __m128i aHi = _mm256_extractf128_si256(a, 1); - __m128i aLo = _mm256_castsi256_si128(a); + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); - __m128i resHi = _mm_slli_epi32(aHi, i); - __m128i resLo = _mm_slli_epi32(aLo, i); + __m128i resHi = _mm_slli_epi32(aHi, i); + __m128i resLo = _mm_slli_epi32(aLo, i); - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); - return result; + return result; } INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i) { - __m128i aHi = _mm256_extractf128_si256(a, 1); - __m128i aLo = _mm256_castsi256_si128(a); + __m128i aHi = _mm256_extractf128_si256(a, 1); + __m128i aLo = _mm256_castsi256_si128(a); - __m128i resHi = _mm_srai_epi32(aHi, i); - __m128i resLo = _mm_srai_epi32(aLo, i); + __m128i resHi = _mm_srai_epi32(aHi, i); + __m128i resLo = _mm_srai_epi32(aLo, i); - __m256i result = _mm256_castsi128_si256(resLo); - result = _mm256_insertf128_si256(result, resHi, 1); + __m256i result = _mm256_castsi128_si256(resLo); + result = _mm256_insertf128_si256(result, resHi, 1); - return result; + return result; } INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) @@ -386,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i) INLINE void _simdvec_transpose(simdvector &v) { - SWR_ASSERT(false, "Need to implement 8 wide version"); + SWR_ASSERT(false, "Need to implement 8 wide version"); } #else @@ -397,132 +550,132 @@ void _simdvec_transpose(simdvector &v) INLINE void _simdvec_load_ps(simdvector& r, const float *p) { - r[0] = _simd_set1_ps(p[0]); - r[1] = _simd_set1_ps(p[1]); - r[2] = _simd_set1_ps(p[2]); - r[3] = _simd_set1_ps(p[3]); + r[0] = _simd_set1_ps(p[0]); + r[1] = _simd_set1_ps(p[1]); + r[2] = _simd_set1_ps(p[2]); + r[3] = _simd_set1_ps(p[3]); } INLINE void _simdvec_mov(simdvector& r, const simdscalar& s) { - r[0] = s; - r[1] = s; - r[2] = s; - r[3] = s; + r[0] = s; + r[1] = s; + r[2] = s; + r[3] = s; } INLINE void _simdvec_mov(simdvector& r, const simdvector& v) { - r[0] = v[0]; - r[1] = v[1]; - r[2] = v[2]; - r[3] = v[3]; + r[0] = v[0]; + r[1] = v[1]; + r[2] = v[2]; + r[3] = v[3]; } // just move a lane from the source simdvector to dest simdvector INLINE void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane) { - _simd_mov(r[0], rlane, s[0], slane); - _simd_mov(r[1], rlane, s[1], slane); - _simd_mov(r[2], rlane, s[2], slane); - _simd_mov(r[3], rlane, s[3], slane); + _simd_mov(r[0], rlane, s[0], slane); + _simd_mov(r[1], rlane, s[1], slane); + _simd_mov(r[2], rlane, s[2], slane); + _simd_mov(r[3], rlane, s[3], slane); } INLINE void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) { - simdscalar tmp; - r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = _simd_mul_ps(v0[1], v1[1]); // 
(v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) } INLINE void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1) { - simdscalar tmp; - r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) + simdscalar tmp; + r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x) - tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) - tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) - tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) - r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w) + r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) } INLINE simdscalar _simdvec_rcp_length_ps(const simdvector& v) { - simdscalar length; - _simdvec_dp4_ps(length, v, v); - return _simd_rsqrt_ps(length); + simdscalar length; + _simdvec_dp4_ps(length, v, v); + return _simd_rsqrt_ps(length); } INLINE void _simdvec_normalize_ps(simdvector& r, const simdvector& v) { - simdscalar vecLength; - vecLength = _simdvec_rcp_length_ps(v); + simdscalar vecLength; + vecLength = _simdvec_rcp_length_ps(v); - r[0] = _simd_mul_ps(v[0], vecLength); - r[1] = _simd_mul_ps(v[1], vecLength); - r[2] = _simd_mul_ps(v[2], vecLength); - r[3] = _simd_mul_ps(v[3], vecLength); + r[0] = _simd_mul_ps(v[0], vecLength); + r[1] = _simd_mul_ps(v[1], vecLength); + r[2] = _simd_mul_ps(v[2], vecLength); + r[3] = _simd_mul_ps(v[3], vecLength); } INLINE void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s) { - r[0] = _simd_mul_ps(v[0], s); - r[1] = _simd_mul_ps(v[1], s); - r[2] = _simd_mul_ps(v[2], s); - r[3] = _simd_mul_ps(v[3], s); + r[0] = _simd_mul_ps(v[0], s); + r[1] = _simd_mul_ps(v[1], s); + r[2] = _simd_mul_ps(v[2], s); + r[3] = _simd_mul_ps(v[3], s); } INLINE void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1) { - r[0] = _simd_mul_ps(v0[0], v1[0]); - r[1] = _simd_mul_ps(v0[1], v1[1]); - r[2] = _simd_mul_ps(v0[2], v1[2]); - r[3] = _simd_mul_ps(v0[3], v1[3]); + r[0] = _simd_mul_ps(v0[0], v1[0]); + r[1] = _simd_mul_ps(v0[1], v1[1]); + r[2] = _simd_mul_ps(v0[2], v1[2]); + r[3] = _simd_mul_ps(v0[3], v1[3]); } INLINE void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1) { - r[0] = _simd_add_ps(v0[0], v1[0]); - r[1] = _simd_add_ps(v0[1], v1[1]); - r[2] = _simd_add_ps(v0[2], v1[2]); - r[3] = _simd_add_ps(v0[3], v1[3]); + r[0] = _simd_add_ps(v0[0], v1[0]); + r[1] = _simd_add_ps(v0[1], v1[1]); + r[2] = _simd_add_ps(v0[2], v1[2]); + r[3] = _simd_add_ps(v0[3], v1[3]); } INLINE void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s) { - r[0] = _simd_min_ps(v0[0], s); - r[1] = _simd_min_ps(v0[1], s); - r[2] = _simd_min_ps(v0[2], s); - r[3] = _simd_min_ps(v0[3], s); + r[0] = _simd_min_ps(v0[0], s); + r[1] = _simd_min_ps(v0[1], s); + r[2] = _simd_min_ps(v0[2], s); + r[3] = _simd_min_ps(v0[3], s); } INLINE void _simdvec_max_ps(simdvector& r, const simdvector& v0, const 
simdscalar& s) { - r[0] = _simd_max_ps(v0[0], s); - r[1] = _simd_max_ps(v0[1], s); - r[2] = _simd_max_ps(v0[2], s); - r[3] = _simd_max_ps(v0[3], s); + r[0] = _simd_max_ps(v0[0], s); + r[1] = _simd_max_ps(v0[1], s); + r[2] = _simd_max_ps(v0[2], s); + r[3] = _simd_max_ps(v0[3], s); } // Matrix4x4 * Vector4 @@ -532,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s) // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w) INLINE void _simd_mat4x4_vec4_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[2] = r0; - - m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] - r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) - result[3] = r0; + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = 
_simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[2] = r0; + + m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] + r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w) + result[3] = r0; } // Matrix4x4 * Vector3 - Direction Vector where w = 0. 
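_simd_mat4x4_vec4_multiply above is fully unrolled, but every row repeats one pattern: broadcast a single matrix element to all SIMD lanes with _simd_load1_ps, multiply by the matching vector component, and accumulate. The same computation in loop form (a sketch that relies on the simdvector type and _simd_* wrappers defined in this header):

// Loop form of the unrolled row/column pattern in _simd_mat4x4_vec4_multiply.
// pMatrix is a row-major 4x4; v carries x/y/z/w, each a full SIMD register.
INLINE void mat4x4_vec4_multiply_sketch(simdvector& result,
                                        const float* pMatrix,
                                        const simdvector& v)
{
    for (int row = 0; row < 4; ++row)
    {
        // r = m[row][0] * v.x
        simdscalar r = _simd_mul_ps(_simd_load1_ps(pMatrix + row * 4 + 0), v[0]);
        for (int col = 1; col < 4; ++col)
        {
            // r += m[row][col] * v[col]
            simdscalar t = _simd_mul_ps(_simd_load1_ps(pMatrix + row * 4 + col), v[col]);
            r = _simd_add_ps(r, t);
        }
        result[row] = r;
    }
}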
@@ -600,45 +753,45 @@ void _simd_mat4x4_vec4_multiply( // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0) INLINE void _simd_mat3x3_vec3_w0_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - result[2] = r0; - - result[3] = _simd_setzero_ps(); + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + result[2] = r0; + + result[3] = _simd_setzero_ps(); } // Matrix4x4 * Vector3 - Position vector where w = 1. 
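Earlier in this file, the AVX-only path (KNOB_ARCH == KNOB_ARCH_AVX) added _simdemu_srlv_epi32 and _simdemu_sllv_epi32, which emulate the AVX2 per-lane variable shifts by extracting all eight 32-bit lanes, shifting them in scalar code, and re-inserting them. For reference, the per-lane semantics being emulated are sketched below; note, as a caution, that _mm256_srlv_epi32 is a logical shift that yields 0 for counts of 32 or more, while the emulation above shifts a signed int32_t:

#include <cstdint>

// Per-lane reference for the AVX2 variable shifts emulated above.
// AVX2 defines the result to be 0 whenever the shift count is >= 32.
static void srlv_reference(uint32_t out[8], const uint32_t a[8], const uint32_t n[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = (n[i] < 32) ? (a[i] >> n[i]) : 0u;
}

static void sllv_reference(uint32_t out[8], const uint32_t a[8], const uint32_t n[8])
{
    for (int i = 0; i < 8; ++i)
        out[i] = (n[i] < 32) ? (a[i] << n[i]) : 0u;
}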
@@ -648,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply( // outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1) INLINE void _simd_mat4x4_vec3_w1_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - - m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] - result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 
1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[2] = r0; + + m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3] + result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) } INLINE void _simd_mat4x3_vec3_w1_multiply( - simdvector& result, - const float *pMatrix, - const simdvector& v) -{ - simdscalar m; - simdscalar r0; - simdscalar r1; - - m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[0] = r0; - - m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[1] = r0; - - m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] - r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) - m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] - r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) - m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] - r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) - r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) - m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] - r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) - result[2] = r0; - result[3] = _simd_set1_ps(1.0f); + simdvector& result, + const float *pMatrix, + const simdvector& v) +{ + simdscalar m; + simdscalar r0; + simdscalar r1; + + m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // 
(m00 * v.x) + m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[0] = r0; + + m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[1] = r0; + + m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0] + r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x) + m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1] + r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2] + r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z) + r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3] + r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1) + result[2] = r0; + result[3] = _simd_set1_ps(1.0f); } ////////////////////////////////////////////////////////////////////////// @@ -783,5 +936,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons return vplaneps(vA, vB, vC, vI, vJ); } +INLINE +UINT pdep_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pdep_u32(a, mask); +#else + UINT result = 0; + + // copied from http://wm.ite.pl/articles/pdep-soft-emu.html + // using bsf instead of funky loop + DWORD maskIndex; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. populate LSB from src + const UINT LSB = (UINT)((int)(a << 31) >> 31); + + // 3. copy bit from mask + result |= LSB & lowest; + + // 4. clear lowest bit + mask &= ~lowest; + + // 5. prepare for next iteration + a >>= 1; + } + + return result; +#endif +} + +INLINE +UINT pext_u32(UINT a, UINT mask) +{ +#if KNOB_ARCH==KNOB_ARCH_AVX2 + return _pext_u32(a, mask); +#else + UINT result = 0; + DWORD maskIndex; + uint32_t currentBit = 0; + while (_BitScanForward(&maskIndex, mask)) + { + // 1. isolate lowest set bit of mask + const UINT lowest = 1 << maskIndex; + + // 2. copy bit from mask + result |= ((a & lowest) > 0) << currentBit++; + + // 3. clear lowest bit + mask &= ~lowest; + } + return result; +#endif +} #endif//__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index fccccab503c..f0f7956b590 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext); /// @brief Create SWR Context. /// @param pCreateInfo - pointer to creation info. 
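The pdep_u32/pext_u32 helpers added to simdintrin.h above use the hardware BMI2 instructions on AVX2 builds and otherwise fall back to a bit-scan loop (credited in the code to wm.ite.pl's pdep software emulation). A small self-contained restatement with a worked example, using hypothetical values:

#include <cassert>
#include <cstdint>

// Scalar restatement of the fallback loop above: pdep scatters the low
// bits of 'a' into the set-bit positions of 'mask', from low to high.
static uint32_t pdep32(uint32_t a, uint32_t mask)
{
    uint32_t result = 0;
    for (uint32_t bit = 1; mask != 0; bit <<= 1)
    {
        const uint32_t lowest = mask & (0u - mask); // isolate lowest set mask bit
        if (a & bit)
            result |= lowest;                       // deposit the next source bit
        mask &= mask - 1;                           // clear that mask bit
    }
    return result;
}

int main()
{
    // Deposit the bits 1,0,1,1 into the four positions selected by 0xF0.
    assert(pdep32(0x0Bu, 0xF0u) == 0xB0u);
    return 0;
}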
HANDLE SwrCreateContext( - const SWR_CREATECONTEXT_INFO* pCreateInfo) + SWR_CREATECONTEXT_INFO* pCreateInfo) { RDTSC_RESET(); RDTSC_INIT(0); @@ -61,27 +61,16 @@ HANDLE SwrCreateContext( pContext->driverType = pCreateInfo->driver; pContext->privateStateSize = pCreateInfo->privateStateSize; - pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->numSubContexts = pCreateInfo->maxSubContexts; - if (pContext->numSubContexts > 1) - { - pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64); - memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts); - } + pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { - pContext->dcRing[dc].pArena = new Arena(); - pContext->dcRing[dc].inUse = false; + pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. - pContext->dsRing[dc].pArena = new Arena(); + pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } if (!KNOB_SINGLE_THREADED) @@ -108,9 +97,6 @@ HANDLE SwrCreateContext( pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); } - pContext->nextDrawId = 1; - pContext->DrawEnqueued = 1; - // State setup AFTER context is fully initialized SetupDefaultState(pContext); @@ -125,6 +111,13 @@ HANDLE SwrCreateContext( pContext->pfnStoreTile = pCreateInfo->pfnStoreTile; pContext->pfnClearTile = pCreateInfo->pfnClearTile; + // pass pointer to bucket manager back to caller +#ifdef KNOB_ENABLE_RDTSC + pCreateInfo->pBucketMgr = &gBucketMgr; +#endif + + pCreateInfo->contextSaveSize = sizeof(API_STATE); + return (HANDLE)pContext; } @@ -148,10 +141,6 @@ void SwrDestroyContext(HANDLE hContext) _aligned_free(pContext->pScratch[i]); } - _aligned_free(pContext->dcRing); - _aligned_free(pContext->dsRing); - _aligned_free(pContext->subCtxSave); - delete(pContext->pHotTileMgr); pContext->~SWR_CONTEXT(); @@ -168,49 +157,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext) pContext->FifosNotEmpty.notify_all(); } -bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) -{ - // For single thread nothing should still be drawing. - if (KNOB_SINGLE_THREADED) { return false; } - - if (pDC->isCompute) - { - if (pDC->doneCompute) - { - pDC->inUse = false; - return false; - } - } - - // Check if backend work is done. First make sure all triangles have been binned. - if (pDC->doneFE == true) - { - // ensure workers have all moved passed this draw - if (pDC->threadsDoneFE != pContext->NumWorkerThreads) - { - return true; - } - - if (pDC->threadsDoneBE != pContext->NumWorkerThreads) - { - return true; - } - - pDC->inUse = false; // all work is done. 
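In the QueueWork template that this hunk continues into below, each draw context's threadsDone counter is primed with NumWorkerThreads * 2 because every worker visits a DC twice, once for front-end and once for back-end work; the single-threaded path primes it with 2 and drains it by spinning on CompleteDrawContext(). CompleteDrawContext itself is not shown in this diff, so the following countdown is an assumption about its shape, sketched with an atomic:

#include <atomic>
#include <cstdint>

struct DrawContextSketch
{
    // Primed to numWorkers * 2: one decrement per worker per pass (FE, BE).
    std::atomic<int32_t> threadsDone{ 0 };
};

// Hypothetical stand-in for CompleteDrawContext(): a worker calls this after
// finishing its look at the DC for one pass. Whoever drives the counter to
// zero knows all workers are past the DC, so its arenas can be recycled and
// its ring slot dequeued. Returns the number of passes still outstanding.
static int32_t CompletePass(DrawContextSketch& dc)
{
    int32_t remaining = dc.threadsDone.fetch_sub(1, std::memory_order_acq_rel) - 1;
    if (remaining == 0)
    {
        // ... release arenas, dequeue the dcRing slot ...
    }
    return remaining;
}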
- } - - return pDC->inUse; -} - -void QueueDraw(SWR_CONTEXT *pContext) +template<bool IsDraw> +void QueueWork(SWR_CONTEXT *pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; + // Each worker thread looks at a DC for both FE and BE work at different times and so we + // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers + // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and + // then moved on if all work is done.) + pContext->pCurDrawContext->threadsDone = + pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2; _ReadWriteBarrier(); { std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; + pContext->dcRing.Enqueue(); } if (KNOB_SINGLE_THREADED) @@ -219,10 +179,21 @@ void QueueDraw(SWR_CONTEXT *pContext) uint32_t mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - std::unordered_set<uint32_t> lockedTiles; - uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); - WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + if (IsDraw) + { + static TileSet lockedTiles; + uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); + } + else + { + uint64_t curDispatch = pContext->pCurDrawContext->drawId; + WorkOnCompute(pContext, 0, curDispatch); + } + + // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). + while (CompleteDrawContext(pContext, pContext->pCurDrawContext) > 0) {} // restore csr _mm_setcsr(mxcsr); @@ -239,40 +210,14 @@ void QueueDraw(SWR_CONTEXT *pContext) pContext->pCurDrawContext = nullptr; } -///@todo Combine this with QueueDraw -void QueueDispatch(SWR_CONTEXT *pContext) +INLINE void QueueDraw(SWR_CONTEXT* pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; - - _ReadWriteBarrier(); - { - std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; - } - - if (KNOB_SINGLE_THREADED) - { - // flush denormals to 0 - uint32_t mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - - uint64_t curDispatch = pContext->pCurDrawContext->drawId; - WorkOnCompute(pContext, 0, curDispatch); - - // restore csr - _mm_setcsr(mxcsr); - } - else - { - RDTSC_START(APIDrawWakeAllThreads); - WakeAllThreads(pContext); - RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); - } + QueueWork<true>(pContext); +} - // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. - pContext->pPrevDrawContext = pContext->pCurDrawContext; - pContext->pCurDrawContext = nullptr; +INLINE void QueueDispatch(SWR_CONTEXT* pContext) +{ + QueueWork<false>(pContext); } DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) @@ -281,23 +226,21 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If current draw context is null then need to obtain a new draw context to use from ring. 
if (pContext->pCurDrawContext == nullptr) { - uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; - - DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; - pContext->pCurDrawContext = pCurDrawContext; - - // Need to wait until this draw context is available to use. - while (StillDrawing(pContext, pCurDrawContext)) + // Need to wait for a free entry. + while (pContext->dcRing.IsFull()) { _mm_pause(); } + uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT; + + DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; + pContext->pCurDrawContext = pCurDrawContext; + // Assign next available entry in DS ring to this DC. uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; - Arena& stateArena = *(pCurDrawContext->pState->pArena); - // Copy previous state to current state. if (pContext->pPrevDrawContext) { @@ -310,7 +253,9 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) { CopyState(*pCurDrawContext->pState, *pPrevDrawContext->pState); - stateArena.Reset(true); // Reset memory. + // Should have been cleaned up previously + SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); + pCurDrawContext->pState->pPrivateState = nullptr; pContext->curStateId++; // Progress state ring index forward. @@ -320,30 +265,31 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If its a split draw then just copy the state pointer over // since its the same draw. pCurDrawContext->pState = pPrevDrawContext->pState; + SWR_ASSERT(pPrevDrawContext->cleanupState == false); } } else { - stateArena.Reset(); // Reset memory. + SWR_ASSERT(pCurDrawContext->pState->pArena->IsEmpty() == true); pContext->curStateId++; // Progress state ring index forward. } + SWR_ASSERT(pCurDrawContext->pArena->IsEmpty() == true); + pCurDrawContext->dependency = 0; - pCurDrawContext->pArena->Reset(); pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. 
- pCurDrawContext->inUse = false; - pCurDrawContext->doneCompute = false; pCurDrawContext->doneFE = false; pCurDrawContext->FeLock = 0; - pCurDrawContext->threadsDoneFE = 0; - pCurDrawContext->threadsDoneBE = 0; + pCurDrawContext->threadsDone = 0; pCurDrawContext->pTileMgr->initialize(); // Assign unique drawId for this DC - pCurDrawContext->drawId = pContext->nextDrawId++; + pCurDrawContext->drawId = pContext->dcRing.GetHead(); + + pCurDrawContext->cleanupState = true; } else { @@ -354,38 +300,36 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) return pContext->pCurDrawContext; } -void SWR_API SwrSetActiveSubContext( - HANDLE hContext, - uint32_t subContextIndex) +API_STATE* GetDrawState(SWR_CONTEXT *pContext) { - SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; - if (subContextIndex >= pContext->numSubContexts) - { - return; - } + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + SWR_ASSERT(pDC->pState != nullptr); - if (subContextIndex != pContext->curSubCtxId) - { - // Save and restore draw state - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - CopyState( - pContext->subCtxSave[pContext->curSubCtxId], - *(pDC->pState)); + return &pDC->pState->state; +} - CopyState( - *(pDC->pState), - pContext->subCtxSave[subContextIndex]); +void SWR_API SwrSaveState( + HANDLE hContext, + void* pOutputStateBlock, + size_t memSize) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + auto pSrc = GetDrawState(pContext); + SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc)); - pContext->curSubCtxId = subContextIndex; - } + memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc)); } -API_STATE* GetDrawState(SWR_CONTEXT *pContext) +void SWR_API SwrRestoreState( + HANDLE hContext, + const void* pStateBlock, + size_t memSize) { - DRAW_CONTEXT* pDC = GetDrawContext(pContext); - SWR_ASSERT(pDC->pState != nullptr); + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + auto pDst = GetDrawState(pContext); + SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst)); - return &pDC->pState->state; + memcpy(pDst, pStateBlock, sizeof(*pDst)); } void SetupDefaultState(SWR_CONTEXT *pContext) @@ -431,16 +375,12 @@ void SwrWaitForIdle(HANDLE hContext) SWR_CONTEXT *pContext = GetContext(hContext); RDTSC_START(APIWaitForIdle); - // Wait for all work to complete. 
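
SwrSaveState and SwrRestoreState replace the fixed sub-context array with caller-owned blobs, so drivers size and manage the storage themselves. A hypothetical caller-side sketch (createInfo, the malloc/free pairing, and the surrounding driver code are assumptions, not part of the patch):

// 'contextSaveSize' is an output filled in by SwrCreateContext (see api.h below).
SWR_CREATECONTEXT_INFO createInfo = {};
HANDLE hContext = SwrCreateContext(&createInfo);

void* pSaved = malloc(createInfo.contextSaveSize);
SwrSaveState(hContext, pSaved, createInfo.contextSaveSize);

// ... bind temporary state and issue internal draws (e.g. a blit path) ...

SwrRestoreState(hContext, pSaved, createInfo.contextSaveSize);
free(pSaved);
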
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) - { - DRAW_CONTEXT *pDC = &pContext->dcRing[dc]; - while (StillDrawing(pContext, pDC)) - { - _mm_pause(); - } + while (!pContext->dcRing.IsEmpty()) + { + _mm_pause(); } + RDTSC_STOP(APIWaitForIdle, 1, 0); } @@ -770,16 +710,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1; } } - +// templated backend function tables +extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; +extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX]; +extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; +extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; +extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; const SWR_RASTSTATE &rastState = pState->state.rastState; + const SWR_PS_STATE &psState = pState->state.psState; BACKEND_FUNCS& backendFuncs = pState->backendFuncs; const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0; // setup backend - if (pState->state.psState.pfnPixelShader == nullptr) + if (psState.pfnPixelShader == nullptr) { backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount]; // always need to generate I & J per sample for Z interpolation @@ -788,41 +737,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC) else { const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0; - const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; + const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 
1 : 0; // currently only support 'normal' input coverage - SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || - pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); + SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL || + psState.inputCoverage == SWR_INPUT_COVERAGE_NONE); - SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask; + SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; // select backend function - switch(pState->state.psState.shadingRate) + switch(psState.shadingRate) { case SWR_SHADING_RATE_PIXEL: if(bMultisampleEnable) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid]; - backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid]; + backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount]; break; - case SWR_SHADING_RATE_COARSE: default: SWR_ASSERT(0 && "Invalid shading rate"); break; @@ -913,7 +861,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC) uint32_t numRTs = pState->state.psState.numRenderTargets; pState->state.colorHottileEnable = 0; - if(pState->state.psState.pfnPixelShader != nullptr) + if (psState.pfnPixelShader != nullptr) { for (uint32_t rt = 0; rt < numRTs; ++rt) { @@ -1005,6 +953,11 @@ uint32_t MaxVertsPerDraw( } break; + // The Primitive Assembly code can only handle 1 RECT at a time. + case TOP_RECT_LIST: + vertsPerDraw = 3; + break; + default: // We are not splitting up draws for other topologies. 
break; @@ -1116,6 +1069,8 @@ void DrawInstanced( pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; pDC->FeWork.desc.draw.startVertexID = draw * maxVertsPerDraw; + pDC->cleanupState = (remainingVerts == numVertsForDraw); + //enqueue DC QueueDraw(pContext); @@ -1250,6 +1205,8 @@ void DrawIndexedInstance( pDC->FeWork.desc.draw.baseVertex = baseVertex; pDC->FeWork.desc.draw.startPrimID = draw * primsPerDraw; + pDC->cleanupState = (remainingIndices == numIndicesForDraw); + //enqueue DC QueueDraw(pContext); @@ -1305,7 +1262,10 @@ void SwrDrawIndexedInstanced( DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance); } -// Attach surfaces to pipeline +////////////////////////////////////////////////////////////////////////// +/// @brief SwrInvalidateTiles +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate. void SwrInvalidateTiles( HANDLE hContext, uint32_t attachmentMask) @@ -1313,10 +1273,39 @@ void SwrInvalidateTiles( SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; DRAW_CONTEXT* pDC = GetDrawContext(pContext); + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; + memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT)); + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID; + pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false; + + //enqueue + QueueDraw(pContext); +} + +////////////////////////////////////////////////////////////////////////// +/// @brief SwrDiscardRect +/// @param hContext - Handle passed back from SwrCreateContext +/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard. 
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded +void SwrDiscardRect( + HANDLE hContext, + uint32_t attachmentMask, + SWR_RECT rect) +{ + SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext; + DRAW_CONTEXT* pDC = GetDrawContext(pContext); + // Queue a load to the hottile - pDC->FeWork.type = INVALIDATETILES; - pDC->FeWork.pfnWork = ProcessInvalidateTiles; - pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask; + pDC->FeWork.type = DISCARDINVALIDATETILES; + pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles; + pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask; + pDC->FeWork.desc.discardInvalidateTiles.rect = rect; + pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED; + pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true; + pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true; //enqueue QueueDraw(pContext); @@ -1391,7 +1380,7 @@ void SwrClearRenderTarget( uint32_t clearMask, const float clearColor[4], float z, - BYTE stencil) + uint8_t stencil) { RDTSC_START(APIClearRenderTarget); diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h index 72fae8b2c21..90c2f038c46 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.h +++ b/src/gallium/drivers/swr/rasterizer/core/api.h @@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t /// @param pDstHotTile - pointer to the hot tile surface typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile); + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile); ////////////////////////////////////////////////////////////////////////// /// @brief Function signature for store hot tiles @@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma /// @param pSrcHotTile - pointer to the hot tile surface typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat, SWR_RENDERTARGET_ATTACHMENT renderTargetIndex, - uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile); + uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile); /// @brief Function signature for clearing from the hot tiles clear value /// @param hPrivateContext - handle to private data @@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext, SWR_RENDERTARGET_ATTACHMENT rtIndex, uint32_t x, uint32_t y, const float* pClearColor); +class BucketManager; + ////////////////////////////////////////////////////////////////////////// /// SWR_CREATECONTEXT_INFO ///////////////////////////////////////////////////////////////////////// @@ -88,13 +90,17 @@ struct SWR_CREATECONTEXT_INFO // Use SwrGetPrivateContextState() to access private state. uint32_t privateStateSize; - // Each SWR context can have multiple sets of active state - uint32_t maxSubContexts; - - // tile manipulation functions + // Tile manipulation functions PFN_LOAD_TILE pfnLoadTile; PFN_STORE_TILE pfnStoreTile; PFN_CLEAR_TILE pfnClearTile; + + // Pointer to rdtsc buckets mgr returned to the caller. 
+    // Only populated when KNOB_ENABLE_RDTSC is set
+    BucketManager* pBucketMgr;
+
+    // Output: size of the memory block that must be passed to SwrSaveState / SwrRestoreState
+    size_t contextSaveSize;
 };

 //////////////////////////////////////////////////////////////////////////
@@ -112,7 +118,7 @@ struct SWR_RECT
 /// @brief Create SWR Context.
 /// @param pCreateInfo - pointer to creation info.
 HANDLE SWR_API SwrCreateContext(
-    const SWR_CREATECONTEXT_INFO* pCreateInfo);
+    SWR_CREATECONTEXT_INFO* pCreateInfo);

 //////////////////////////////////////////////////////////////////////////
 /// @brief Destroys SWR Context.
@@ -121,12 +127,24 @@ void SWR_API SwrDestroyContext(
     HANDLE hContext);

 //////////////////////////////////////////////////////////////////////////
-/// @brief Set currently active state context
-/// @param subContextIndex - value from 0 to
-///     SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0.
-void SWR_API SwrSetActiveSubContext(
+/// @brief Saves API state associated with hContext
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pOutputStateBlock - Memory block to receive API state data
+/// @param memSize - Size of memory pointed to by pOutputStateBlock
+void SWR_API SwrSaveState(
     HANDLE hContext,
-    uint32_t subContextIndex);
+    void* pOutputStateBlock,
+    size_t memSize);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores API state to hContext previously saved with SwrSaveState
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStateBlock - Memory block to read API state data from
+/// @param memSize - Size of memory pointed to by pStateBlock
+void SWR_API SwrRestoreState(
+    HANDLE hContext,
+    const void* pStateBlock,
+    size_t memSize);

 //////////////////////////////////////////////////////////////////////////
 /// @brief Sync cmd. Executes the callback func when all rendering up to this sync
@@ -391,6 +409,16 @@ void SWR_API SwrInvalidateTiles(
     uint32_t attachmentMask);

 //////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SWR_API SwrDiscardRect(
+    HANDLE hContext,
+    uint32_t attachmentMask,
+    SWR_RECT rect);
+
+//////////////////////////////////////////////////////////////////////////
 /// @brief SwrDispatch
 /// @param hContext - Handle passed back from SwrCreateContext
 /// @param threadGroupCountX - Number of thread groups dispatched in X direction
@@ -419,9 +447,9 @@ void SWR_API SwrStoreTiles(
 void SWR_API SwrClearRenderTarget(
     HANDLE hContext,
     uint32_t clearMask,
-    const FLOAT clearColor[4],
+    const float clearColor[4],
     float z,
-    BYTE stencil);
+    uint8_t stencil);

 void SWR_API SwrSetRastState(
     HANDLE hContext,
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
deleted file mode 100644
index 8184c8d3f4c..00000000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-* -* Permission is hereby granted, free of charge, to any person obtaining a -* copy of this software and associated documentation files (the "Software"), -* to deal in the Software without restriction, including without limitation -* the rights to use, copy, modify, merge, publish, distribute, sublicense, -* and/or sell copies of the Software, and to permit persons to whom the -* Software is furnished to do so, subject to the following conditions: -* -* The above copyright notice and this permission notice (including the next -* paragraph) shall be included in all copies or substantial portions of the -* Software. -* -* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL -* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING -* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS -* IN THE SOFTWARE. -* -* @file arena.cpp -* -* @brief Arena memory manager -* The arena is convenient and fast for managing allocations for any of -* our allocations that are associated with operations and can all be freed -* once when their operation has completed. Allocations are cheap since -* most of the time its simply an increment of an offset. Also, no need to -* free individual allocations. All of the arena memory can be freed at once. -* -******************************************************************************/ - -#include "context.h" -#include "arena.h" - -#include <cmath> - -Arena::Arena() - : m_pCurBlock(nullptr), m_size(0) -{ - m_pMutex = new std::mutex(); -} - -Arena::~Arena() -{ - Reset(); // Reset just in case to avoid leaking memory. - - if (m_pCurBlock) - { - _aligned_free(m_pCurBlock->pMem); - delete m_pCurBlock; - } - - delete m_pMutex; -} - -///@todo Remove this when all users have stopped using this. -void Arena::Init() -{ - m_size = 0; - m_pCurBlock = nullptr; - - m_pMutex = new std::mutex(); -} - -void* Arena::AllocAligned(size_t size, size_t align) -{ - if (m_pCurBlock) - { - ArenaBlock* pCurBlock = m_pCurBlock; - pCurBlock->offset = AlignUp(pCurBlock->offset, align); - - if ((pCurBlock->offset + size) <= pCurBlock->blockSize) - { - void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset); - pCurBlock->offset += size; - m_size += size; - return pMem; - } - - // Not enough memory in this block, fall through to allocate - // a new block - } - - static const size_t ArenaBlockSize = 1024*1024; - size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize)); - blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4); - - void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned. 
- SWR_ASSERT(pMem != nullptr); - - ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock(); - SWR_ASSERT(pNewBlock != nullptr); - - if (pNewBlock != nullptr) - { - pNewBlock->pNext = m_pCurBlock; - - m_pCurBlock = pNewBlock; - m_pCurBlock->pMem = pMem; - m_pCurBlock->blockSize = blockSize; - - } - - return AllocAligned(size, align); -} - -void* Arena::Alloc(size_t size) -{ - return AllocAligned(size, 1); -} - -void* Arena::AllocAlignedSync(size_t size, size_t align) -{ - void* pAlloc = nullptr; - - SWR_ASSERT(m_pMutex != nullptr); - - m_pMutex->lock(); - pAlloc = AllocAligned(size, align); - m_pMutex->unlock(); - - return pAlloc; -} - -void* Arena::AllocSync(size_t size) -{ - void* pAlloc = nullptr; - - SWR_ASSERT(m_pMutex != nullptr); - - m_pMutex->lock(); - pAlloc = Alloc(size); - m_pMutex->unlock(); - - return pAlloc; -} - -void Arena::Reset(bool removeAll) -{ - if (m_pCurBlock) - { - m_pCurBlock->offset = 0; - - ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; - m_pCurBlock->pNext = nullptr; - while(pUsedBlocks) - { - ArenaBlock* pBlock = pUsedBlocks; - pUsedBlocks = pBlock->pNext; - - _aligned_free(pBlock->pMem); - delete pBlock; - } - - if (removeAll) - { - _aligned_free(m_pCurBlock->pMem); - delete m_pCurBlock; - m_pCurBlock = nullptr; - } - } - - m_size = 0; -} diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h index 76eee11fb08..67d81a44347 100644 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -33,37 +33,308 @@ #pragma once #include <mutex> +#include <algorithm> +#include <atomic> +#include "core/utils.h" -class Arena +class DefaultAllocator { public: - Arena(); - ~Arena(); + void* AllocateAligned(size_t size, size_t align) + { + void* p = _aligned_malloc(size, align); + return p; + } + void Free(void* pMem) + { + _aligned_free(pMem); + } +}; - void Init(); +static const size_t ARENA_BLOCK_ALIGN = 64; - void* AllocAligned(size_t size, size_t align); - void* Alloc(size_t size); +struct ArenaBlock +{ + size_t blockSize = 0; + ArenaBlock* pNext = nullptr; +}; +static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, + "Increase BLOCK_ALIGN size"); - void* AllocAlignedSync(size_t size, size_t align); - void* AllocSync(size_t size); +// Caching Allocator for Arena +template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16> +struct CachingAllocatorT : DefaultAllocator +{ + static uint32_t GetBucketId(size_t blockSize) + { + uint32_t bucketId = 0; - void Reset(bool removeAll = false); - size_t Size() { return m_size; } +#if defined(BitScanReverseSizeT) + BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); + bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); +#endif -private: + return bucketId; + } + + void* AllocateAligned(size_t size, size_t align) + { + SWR_ASSERT(size >= sizeof(ArenaBlock)); + SWR_ASSERT(size <= uint32_t(-1)); + + size_t blockSize = size - ARENA_BLOCK_ALIGN; + + { + // search cached blocks + std::lock_guard<std::mutex> l(m_mutex); + ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)]; + ArenaBlock* pBlock = pPrevBlock->pNext; + ArenaBlock* pPotentialBlock = nullptr; + ArenaBlock* pPotentialPrev = nullptr; + + while (pBlock) + { + if (pBlock->blockSize >= blockSize) + { + if (pBlock == AlignUp(pBlock, align)) + { + if (pBlock->blockSize == blockSize) + { + // Won't find a better match + break; + } + + // We could use this as it is larger than we wanted, but + // continue to search for a 
better match + pPotentialBlock = pBlock; + pPotentialPrev = pPrevBlock; + } + } + else + { + // Blocks are sorted by size (biggest first) + // So, if we get here, there are no blocks + // large enough, fall through to allocation. + pBlock = nullptr; + break; + } + + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + if (!pBlock) + { + // Couldn't find an exact match, use next biggest size + pBlock = pPotentialBlock; + pPrevBlock = pPotentialPrev; + } + + if (pBlock) + { + SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock); + pPrevBlock->pNext = pBlock->pNext; + pBlock->pNext = nullptr; + + return pBlock; + } + + m_totalAllocated += size; + +#if 0 + { + static uint32_t count = 0; + char buf[128]; + sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated)); + OutputDebugStringA(buf); + } +#endif + } + + return this->DefaultAllocator::AllocateAligned(size, align); + } + + void Free(void* pMem) + { + if (pMem) + { + ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem); + SWR_ASSERT(pNewBlock->blockSize >= 0); + + std::unique_lock<std::mutex> l(m_mutex); + ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)]; + ArenaBlock* pBlock = pPrevBlock->pNext; + + while (pBlock) + { + if (pNewBlock->blockSize >= pBlock->blockSize) + { + // Insert here + break; + } + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + // Insert into list + SWR_ASSERT(pPrevBlock); + pPrevBlock->pNext = pNewBlock; + pNewBlock->pNext = pBlock; + } + } + + ~CachingAllocatorT() + { + // Free all cached blocks + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + ArenaBlock* pBlock = m_cachedBlocks[i].pNext; + while (pBlock) + { + ArenaBlock* pNext = pBlock->pNext; + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; + } + } + } + + // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... + static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; + static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; + + ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; + std::mutex m_mutex; + + size_t m_totalAllocated = 0; +}; +typedef CachingAllocatorT<> CachingAllocator; + +template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)> +class TArena +{ +public: + TArena(T& in_allocator) : m_allocator(in_allocator) {} + TArena() : m_allocator(m_defAllocator) {} + ~TArena() + { + Reset(true); + } + + void* AllocAligned(size_t size, size_t align) + { + if (0 == size) + { + return nullptr; + } + + SWR_ASSERT(align <= ARENA_BLOCK_ALIGN); + + if (m_pCurBlock) + { + ArenaBlock* pCurBlock = m_pCurBlock; + size_t offset = AlignUp(m_offset, align); + + if ((offset + size) <= pCurBlock->blockSize) + { + void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN); + m_offset = offset + size; + return pMem; + } + + // Not enough memory in this block, fall through to allocate + // a new block + } + + static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN; + size_t blockSize = std::max(size, ArenaBlockSize); + + // Add in one BLOCK_ALIGN unit to store ArenaBlock in. + blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN); + + void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned. 
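
For reference, GetBucketId above leans on BitScanReverse; with the default template parameters (NumBucketsT = 4, StartBucketBitT = 16) the mapping reduces to a clamped floor(log2). A portable restatement of the same math, offered as a sketch:

#include <algorithm>
#include <cstdint>

// Same result as GetBucketId() for CachingAllocatorT<4, 16>: blocks under
// 128KB share bucket 0, then one bucket per power-of-two doubling, clamped
// to the last bucket.
uint32_t GetBucketIdPortable(size_t blockSize)
{
    uint32_t bucketId = 0;
    size_t v = blockSize >> 16;   // StartBucketBitT
    while (v >>= 1)               // floor(log2), like BitScanReverse
    {
        ++bucketId;
    }
    return std::min<uint32_t>(bucketId, 4 - 1);   // NumBucketsT - 1
}

This matters at the call sites because DRAW_CONTEXT::pArena becomes a CachingArena (see context.h below), so per-draw blocks get recycled through these buckets instead of round-tripping through _aligned_malloc on every draw.
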
+ SWR_ASSERT(pMem != nullptr); + + ArenaBlock* pNewBlock = new (pMem) ArenaBlock(); + + if (pNewBlock != nullptr) + { + m_offset = 0; + pNewBlock->pNext = m_pCurBlock; + + m_pCurBlock = pNewBlock; + m_pCurBlock->blockSize = blockSize; + } + + return AllocAligned(size, align); + } + + void* Alloc(size_t size) + { + return AllocAligned(size, 1); + } - struct ArenaBlock + void* AllocAlignedSync(size_t size, size_t align) { - void* pMem = nullptr; - size_t blockSize = 0; - size_t offset = 0; - ArenaBlock* pNext = nullptr; - }; + void* pAlloc = nullptr; - ArenaBlock* m_pCurBlock = nullptr; - size_t m_size = 0; + m_mutex.lock(); + pAlloc = AllocAligned(size, align); + m_mutex.unlock(); + + return pAlloc; + } + + void* AllocSync(size_t size) + { + void* pAlloc = nullptr; + + m_mutex.lock(); + pAlloc = Alloc(size); + m_mutex.unlock(); + + return pAlloc; + } + + void Reset(bool removeAll = false) + { + m_offset = 0; + + if (m_pCurBlock) + { + ArenaBlock *pUsedBlocks = m_pCurBlock->pNext; + m_pCurBlock->pNext = nullptr; + while (pUsedBlocks) + { + ArenaBlock* pBlock = pUsedBlocks; + pUsedBlocks = pBlock->pNext; + + m_allocator.Free(pBlock); + } + + if (removeAll) + { + m_allocator.Free(m_pCurBlock); + m_pCurBlock = nullptr; + } + } + } + + bool IsEmpty() + { + return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr); + } + +private: + + ArenaBlock* m_pCurBlock = nullptr; + size_t m_offset = 0; /// @note Mutex is only used by sync allocation functions. - std::mutex* m_pMutex; + std::mutex m_mutex; + + DefaultAllocator m_defAllocator; + T& m_allocator; }; + +using StdArena = TArena<DefaultAllocator>; +using CachingArena = TArena<CachingAllocator>; diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 4a472bc9e5c..7fb83edf169 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil } template<SWR_FORMAT format> -void ClearRasterTile(BYTE *pTileBuffer, simdvector &value) +void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value) { auto lambda = [&](int comp) { @@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo /// @todo clear data should come in as RGBA32_FLOAT DWORD clearData[4]; float clearFloat[4]; - clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f; - clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f; - clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f; - clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f; + clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f; + clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f; + clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f; + clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f; clearData[0] = *(DWORD*)&clearFloat[0]; clearData[1] = *(DWORD*)&clearFloat[1]; clearData[2] = *(DWORD*)&clearFloat[2]; @@ -399,30 +399,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile } -void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData) { - INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData; + DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData; 
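
    // Aside: both new API entry points funnel into this one backend op; as
    // configured in api.cpp above, they differ only in the descriptor:
    //
    //   SwrInvalidateTiles: newTileState = SWR_TILE_INVALID,
    //                       createNewTiles = false, fullTilesOnly = false
    //   SwrDiscardRect:     newTileState = SWR_TILE_RESOLVED,
    //                       createNewTiles = true,  fullTilesOnly = true
    //
    // The naming suggests invalidate simply drops whatever is resident, while
    // discard marks tiles resolved so their stale contents are treated as
    // already in sync with memory.
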
SWR_CONTEXT *pContext = pDC->pContext; + const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount); + for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i) { if (pDesc->attachmentMask & (1 << i)) { - HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false); + HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad( + pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples); if (pHotTile) { - pHotTile->state = HOTTILE_INVALID; + pHotTile->state = (HOTTILE_STATE)pDesc->newTileState; } } } } #if KNOB_SIMD_WIDTH == 8 -const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 }; -const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 }; -const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; -const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; -#define MASK 0xff +const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5}; +const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5}; +const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0}; +const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0}; #else #error Unsupported vector width #endif @@ -457,155 +459,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala return _simd_movemask_ps(vClipMask); } -template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) -{ - - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - __m256i mask[2]; - __m256i sampleCoverage[2]; - if(bIsStandardPattern) - { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; - - if(MultisampleTraits<sampleCountT>::numSamples == 1) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 2) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 4) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 8) - { - mask[0] = _mm256_set1_epi32(-1); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 16) - { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); - } - - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); - } - } - else - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(MultisampleTraits<sampleCountT>::numSamples == 1) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 2) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); - } - else 
if(MultisampleTraits<sampleCountT>::numSamples == 4) - { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 8) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - } - else if(MultisampleTraits<sampleCountT>::numSamples == 16) - { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); - } - } - - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); - // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); - - __m256i packedCoverage1; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); - } - -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - __m256i packedSampleCoverage; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#else - __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - - __m256i packedSampleCoverage; - if(MultisampleTraits<sampleCountT>::numSamples > 8) - { - permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); - - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else - { - packedSampleCoverage = packedCoverage0; - } -#endif - - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) - { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if(!bForcedSampleCount) - { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; - } - - // shift to the next pixel 
in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); - } -} - -template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); -} - template<bool perspMask> INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext) { @@ -766,6 +619,8 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND // type safety guaranteed from template instantiation in BEChooser<>::GetFunc static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample); + simdvector blendOut; + for(uint32_t rt = 0; rt < NumRT; ++rt) { uint8_t *pColorSample; @@ -779,6 +634,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND } const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; // Blend outputs and update coverage mask for alpha test if(pfnBlendFunc[rt] != nullptr) @@ -789,7 +647,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND psContext.shaded[1], sample, pColorSample, - psContext.shaded[rt], + blendOut, &psContext.oMask, (simdscalari*)&coverageMask); } @@ -805,19 +663,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND // store with color mask if(!pRTBlend->writeDisableRed) { - _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x); + _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x); } if(!pRTBlend->writeDisableGreen) { - _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y); + _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y); } if(!pRTBlend->writeDisableBlue) { - _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z); + _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z); } if(!pRTBlend->writeDisableAlpha) { - _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w); + _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w); } } } @@ -884,9 +742,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { @@ -898,9 +756,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(coverageMask & MASK) { RDTSC_START(BEBarycentric); - 
psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1077,15 +935,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); // pixel center - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // pixel center - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); @@ -1313,14 +1171,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { - psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); - psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); + psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy)); for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { - simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]; - psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 }; + psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // set pixel center positions - psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx)); + psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); if (bInputCoverage) { @@ -1353,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t } else { - psContext.activeMask = _simd_set1_epi32(-1); + psContext.activeMask = _simd_set1_epi32(-1); } // need to declare enough space for all samples @@ -1552,9 +1410,11 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_START(BESetup); static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT; + SWR_CONTEXT *pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; + const SWR_RASTSTATE& rastState = pDC->pState->state.rastState; // broadcast scalars BarycentricCoeffs coeffs; @@ -1572,7 +1432,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet); - BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; + uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil; RDTSC_STOP(BESetup, 0, 0); @@ 
-1580,12 +1440,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM) { // UL pixel corner - simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy)); + simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy)); for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM) { // UL pixel corners - simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx)); + simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx)); // iterate over active samples unsigned long sample = 0; @@ -1593,7 +1453,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, while (_BitScanForward(&sample, sampleMask)) { sampleMask &= ~(1 << sample); - if (work.coverageMask[sample] & MASK) + simdmask coverageMask = work.coverageMask[sample] & MASK; + if (coverageMask) { RDTSC_START(BEBarycentric); // calculate per sample positions @@ -1607,7 +1468,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, RDTSC_STOP(BEBarycentric, 0, 0); - simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK); + // interpolate user clip distance if available + if (rastState.clipDistanceMask) + { + coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer, + psContext.vI.sample, psContext.vJ.sample); + } + + simdscalar vCoverageMask = vMask(coverageMask); simdscalar stencilPassMask = vCoverageMask; // offset depth/stencil buffers current sample diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 53089e5047b..2fa18953cad 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -29,16 +29,20 @@ #pragma once #include "common/os.h" -#include "core/context.h" +#include "core/context.h" +#include "core/multisample.h" void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); -void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); +void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData); void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers); void InitClearTilesTable(); +simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ); +void InitBackendFuncTables(); +void InitCPSFuncTables(); enum SWR_BACKEND_FUNCS { @@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS SWR_BACKEND_MSAA_SAMPLE_RATE, SWR_BACKEND_FUNCS_MAX, }; -void InitBackendFuncTables(); -extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2]; -extern PFN_BACKEND_FUNC 
gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2]; -extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX]; -extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2]; -extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2]; -extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2]; +#if KNOB_SIMD_WIDTH == 8 +extern const __m256 vCenterOffsetsX; +extern const __m256 vCenterOffsetsY; +extern const __m256 vULOffsetsX; +extern const __m256 vULOffsetsY; +#define MASK 0xff +#endif + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +{ + + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(bIsStandardPattern) + { + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 4) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 8) + { + mask[0] = _mm256_set1_epi32(-1); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 16) + { + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + } + } + else + { + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + if(MultisampleTraits<sampleCountT>::numSamples == 1) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 2) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 4) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 8) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + } + else if(MultisampleTraits<sampleCountT>::numSamples == 16) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + } + } + + mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane + __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + + __m256i 
packedCoverage1; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); + } + +#if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); + __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(MultisampleTraits<sampleCountT>::numSamples > 8) + { + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); + } + else + { + packedSampleCoverage = packedCoverage0; + } +#endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + { + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); + + if(!bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } + + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } +} + +template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount> +INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) +{ + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); +} diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index ce27bf71d3c..3a2a8b35be8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ 
b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -31,6 +31,9 @@ #include "common/os.h" #include "core/clip.h" +// Temp storage used by the clipper +THREAD simdvertex tlsTempVertices[7]; + float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1) { return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1)); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index 49494a4e374..ba5870a92bb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -32,6 +32,9 @@ #include "core/pa.h" #include "rdtsc_core.h" +// Temp storage used by the clipper +extern THREAD simdvertex tlsTempVertices[7]; + enum SWR_CLIPCODES { // Shift clip codes out of the mantissa to prevent denormalized values when used in float compare. @@ -354,6 +357,25 @@ public: } } + // assemble user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i]; + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector); + for (uint32_t i = 0; i < NumVertsPerPrim; ++i) + { + vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i]; + } + } + uint32_t numAttribs = maxSlot + 1; simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs); @@ -436,6 +458,27 @@ public: } } + // transpose user clip distances if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim; + for (uint32_t c = 0; c < 4; ++c) + { + transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1); + pBase += sizeof(simdscalar); + } + } + PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology); while (clipPa.GetNextStreamOutput()) @@ -630,6 +673,31 @@ private: ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); } } + + // interpolate clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c); + simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c); + simdscalar vOutAttrib = 
_simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0); + ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib); + } + } } template<SWR_CLIPCODES ClippingPlane> @@ -700,6 +768,27 @@ private: } } + // store clip distance if enabled + if (this->state.rastState.clipDistanceMask & 0xf) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + + if (this->state.rastState.clipDistanceMask & 0xf0) + { + uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT; + for (uint32_t c = 0; c < 4; ++c) + { + simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c); + ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib); + } + } + // increment outIndex vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in); } @@ -818,8 +907,7 @@ private: simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs) { // temp storage - simdvertex tempVertices[7]; - float* pTempVerts = (float*)&tempVertices[0]; + float* pTempVerts = (float*)&tlsTempVertices[0]; // zero out num input verts for non-active lanes simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim); @@ -854,9 +942,9 @@ private: return vNumOutPts; } - const uint32_t workerId; - const DRIVER_TYPE driverType; - DRAW_CONTEXT* pDC; + const uint32_t workerId{ 0 }; + const DRIVER_TYPE driverType{ DX }; + DRAW_CONTEXT* pDC{ nullptr }; const API_STATE& state; simdscalar clipCodes[NumVertsPerPrim]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 4a214aff1c8..39f23372a18 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -41,6 +41,7 @@ #include "core/knobs.h" #include "common/simdintrin.h" #include "core/threads.h" +#include "ringbuffer.h" // x.8 fixed point precision values #define FIXED_POINT_SHIFT 8 @@ -82,6 +83,7 @@ struct SWR_TRIANGLE_DESC float *pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; + uint64_t anyCoveredSamples; TRI_FLAGS triFlags; }; @@ -109,12 +111,16 @@ struct CLEAR_DESC CLEAR_FLAGS flags; float clearRTColor[4]; // RGBA_32F float clearDepth; // [0..1] - BYTE clearStencil; + uint8_t clearStencil; }; -struct INVALIDATE_TILES_DESC +struct DISCARD_INVALIDATE_TILES_DESC { uint32_t attachmentMask; + SWR_RECT rect; + SWR_TILE_STATE newTileState; + bool createNewTiles; + bool fullTilesOnly; }; struct SYNC_DESC @@ -150,7 +156,7 @@ enum WORK_TYPE SYNC, DRAW, CLEAR, - INVALIDATETILES, + DISCARDINVALIDATETILES, STORETILES, QUERYSTATS, }; @@ -164,7 +170,7 @@ struct BE_WORK SYNC_DESC sync; TRIANGLE_WORK_DESC tri; CLEAR_DESC clear; - INVALIDATE_TILES_DESC invalidateTiles; + DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; QUERY_DESC queryStats; } desc; @@ -201,7 +207,7 @@ struct FE_WORK SYNC_DESC sync; DRAW_WORK draw; CLEAR_DESC clear; - INVALIDATE_TILES_DESC invalidateTiles; + DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles; STORE_TILES_DESC storeTiles; QUERY_DESC queryStats; } desc; @@ -354,6 +360,7 @@ struct BACKEND_FUNCS PFN_OUTPUT_MERGER pfnOutputMerger; }; + // Draw State struct DRAW_STATE { @@ -365,7 +372,7 @@ struct DRAW_STATE BACKEND_FUNCS backendFuncs; PFN_PROCESS_PRIMS pfnProcessPrims; - Arena* pArena; // This 
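Every clip-distance hunk above repeats the same two-step scalar math that ComputeInterpFactor and the fused multiply-add express in SIMD. A sketch with illustrative helper names:

// t = d0 / (d0 - d1): parametric point where the edge crosses the boundary,
// given boundary coordinates (e.g. clip distances) d0 and d1 at the endpoints.
static inline float InterpFactorRef(float d0, float d1)
{
    return d0 / (d0 - d1);
}

// a0 + t * (a1 - a0): the scalar form of
// _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0).
static inline float InterpAttribRef(float a0, float a1, float t)
{
    return a0 + t * (a1 - a0);
}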
should only be used by API thread. + CachingArena* pArena; // This should only be used by API thread. }; // Draw Context @@ -381,25 +388,22 @@ struct DRAW_CONTEXT FE_WORK FeWork; volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) inUse; volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - - // Have all worker threads moved past draw in DC ring? - volatile OSALIGNLINE(uint32_t) threadsDoneFE; - volatile OSALIGNLINE(uint32_t) threadsDoneBE; + volatile OSALIGNLINE(int64_t) threadsDone; uint64_t dependency; MacroTileMgr* pTileMgr; // The following fields are valid if isCompute is true. - volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute) DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) DRAW_STATE* pState; - Arena* pArena; + CachingArena* pArena; uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. + + bool cleanupState; // True if this is the last draw using an entry in the state ring. }; INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) @@ -438,7 +442,7 @@ struct SWR_CONTEXT // 3. State - When an applications sets state after draw // a. Same as step 1. // b. State is copied from prev draw context to current. - DRAW_CONTEXT* dcRing; + RingBuffer<DRAW_CONTEXT> dcRing; DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. @@ -448,14 +452,10 @@ struct SWR_CONTEXT // These split draws all have identical state. So instead of storing the state directly // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs // to reference a single entry in the DS ring. - DRAW_STATE* dsRing; + RingBuffer<DRAW_STATE> dsRing; uint32_t curStateId; // Current index to the next available entry in the DS ring. - DRAW_STATE* subCtxSave; // Save area for inactive contexts. - uint32_t curSubCtxId; // Current index for active state subcontext. - uint32_t numSubContexts; // Number of available subcontexts - uint32_t NumWorkerThreads; THREAD_POOL threadPool; // Thread pool associated with this context @@ -463,13 +463,6 @@ struct SWR_CONTEXT std::condition_variable FifosNotEmpty; std::mutex WaitLock; - // Draw Contexts will get a unique drawId generated from this - uint64_t nextDrawId; - - // most recent draw id enqueued by the API thread - // written by api thread, read by multiple workers - OSALIGNLINE(volatile uint64_t) DrawEnqueued; - DRIVER_TYPE driverType; uint32_t privateStateSize; @@ -486,6 +479,8 @@ struct SWR_CONTEXT // Scratch space for workers. 
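A hedged sketch of filling the new descriptor; the field meanings follow the frontend handler later in this patch, while the rect values and the tile-state constant are illustrative (the SWR_TILE_STATE enumerators are not shown in this diff):

DISCARD_INVALIDATE_TILES_DESC desc = {};
desc.attachmentMask = (1 << SWR_ATTACHMENT_COLOR0); // which hot-tile attachments to touch
desc.rect.left   = 0;     // an all-zero rect makes the handler fall back to the viewport
desc.rect.top    = 0;
desc.rect.right  = 512;
desc.rect.bottom = 512;
desc.newTileState   = (SWR_TILE_STATE)0; // state applied to affected tiles (value illustrative)
desc.createNewTiles = false;             // don't allocate tiles that don't exist yet
desc.fullTilesOnly  = true;              // skip macrotiles only partially inside the rect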
uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; + + CachingAllocator cachingArenaAllocator; }; void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index 4f245c8c53e..2cc9d4054ac 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds INLINE simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase, + bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase, simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); @@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC INLINE void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, - bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, - BYTE *pStencilBase, const simdscalar& stencilMask) + bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask, + uint8_t *pStencilBase, const simdscalar& stencilMask) { if (pDSState->depthWriteEnable) { diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index 7e556012e6b..ccf0b70544f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -49,7 +49,8 @@ struct QUEUE static const uint32_t mBlockSizeShift = 6; static const uint32_t mBlockSize = 1 << mBlockSizeShift; - void clear(Arena& arena) + template <typename ArenaT> + void clear(ArenaT& arena) { mHead = 0; mTail = 0; @@ -102,7 +103,8 @@ struct QUEUE mNumEntries --; } - bool enqueue_try_nosync(Arena& arena, const T* entry) + template <typename ArenaT> + bool enqueue_try_nosync(ArenaT& arena, const T* entry) { memcpy(&mCurBlock[mTail], entry, sizeof(T)); diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h index 83d85fc86d8..344758eefe5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -34,7 +34,7 @@ /// @param pSrc - source data in SOA form /// @param dst - output data in SOA form template<SWR_FORMAT SrcFormat> -INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst) +INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst) { // fast path for float32 if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) @@ -141,7 +141,7 @@ INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component) /// @param src - source data in SOA form /// @param dst - output data in SOA form template<SWR_FORMAT DstFormat> -INLINE void StoreSOA(const simdvector &src, BYTE *pDst) +INLINE void StoreSOA(const simdvector &src, uint8_t *pDst) { // fast path for float32 if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32)) diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h 
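LoadSOA/StoreSOA specialize per format at compile time, so a conversion is just a pair of calls. A usage sketch; the two SWR_FORMAT values are assumptions and the SOA buffer pointers are placeholders:

simdvector pixels;                          // 4 channels x SIMD-width floats, SOA layout
LoadSOA<R32G32B32A32_FLOAT>(pSrc, pixels);  // fast path: 32-bit float channels load directly
// ... operate on the float data ...
StoreSOA<R8G8B8A8_UNORM>(pixels, pDst);     // normalizes and packs each channel to 8 bits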
b/src/gallium/drivers/swr/rasterizer/core/format_types.h index aa350259a15..9acf846a7f0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false> struct PackTraits { static const uint32_t MyNumBits = NumBits; - static simdscalar loadSOA(const BYTE *pSrc) = delete; - static void storeSOA(BYTE *pDst, simdscalar src) = delete; + static simdscalar loadSOA(const uint8_t *pSrc) = delete; + static void storeSOA(uint8_t *pDst, simdscalar src) = delete; static simdscalar unpack(simdscalar &in) = delete; static simdscalar pack(simdscalar &in) = delete; }; @@ -48,8 +48,8 @@ struct PackTraits<0, false> { static const uint32_t MyNumBits = 0; - static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); } - static void storeSOA(BYTE *pDst, simdscalar src) { return; } + static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); } + static void storeSOA(uint8_t *pDst, simdscalar src) { return; } static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); } static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); } }; @@ -63,7 +63,7 @@ struct PackTraits<8, false> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -74,7 +74,7 @@ struct PackTraits<8, false> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -125,7 +125,7 @@ struct PackTraits<8, true> { static const uint32_t MyNumBits = 8; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -136,7 +136,7 @@ struct PackTraits<8, true> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { // store simd bytes #if KNOB_SIMD_WIDTH == 8 @@ -188,7 +188,7 @@ struct PackTraits<16, false> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -199,7 +199,7 @@ struct PackTraits<16, false> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -249,7 +249,7 @@ struct PackTraits<16, true> { static const uint32_t MyNumBits = 16; - static simdscalar loadSOA(const BYTE *pSrc) + static simdscalar loadSOA(const uint8_t *pSrc) { #if KNOB_SIMD_WIDTH == 8 __m256 result = _mm256_setzero_ps(); @@ -260,7 +260,7 @@ struct PackTraits<16, true> #endif } - static void storeSOA(BYTE *pDst, simdscalar src) + static void storeSOA(uint8_t *pDst, simdscalar src) { #if KNOB_SIMD_WIDTH == 8 // store 16B (2B * 8) @@ -311,8 +311,8 @@ struct PackTraits<32, false> { static const uint32_t MyNumBits = 32; - static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); } - static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } + static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); } + static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); } static simdscalar unpack(simdscalar &in) { return in; } static 
simdscalar pack(simdscalar &in) { return in; } }; @@ -984,7 +984,7 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::fromFloat(); } - INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc) + INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc) { switch (comp) { @@ -1001,7 +1001,7 @@ return TypeTraits<X, NumBitsX>::loadSOA(pSrc); } - INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src) + INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src) { switch (comp) { diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index f43a672bd82..36721e00beb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -193,35 +193,71 @@ void ProcessStoreTiles( /// @param workerId - thread's worker id. Every thread has a unique id. /// @param pUserData - Pointer to user data passed back to callback. /// @todo This should go away when we switch this to use compute threading. -void ProcessInvalidateTiles( +void ProcessDiscardInvalidateTiles( SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData) { RDTSC_START(FEProcessInvalidateTiles); - INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData; + DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData; MacroTileMgr *pTileMgr = pDC->pTileMgr; - const API_STATE& state = GetApiState(pDC); + SWR_RECT rect; + + if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left) + { + // Valid rect + rect = pInv->rect; + } + else + { + // Use viewport dimensions + const API_STATE& state = GetApiState(pDC); + + rect.left = (uint32_t)state.vp[0].x; + rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width); + rect.top = (uint32_t)state.vp[0].y; + rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height); + } // queue a store to each macro tile // compute macro tile bounds for the current render target uint32_t macroWidth = KNOB_MACROTILE_X_DIM; uint32_t macroHeight = KNOB_MACROTILE_Y_DIM; - uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth; - uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight; + // Setup region assuming full tiles + uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth; + uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight; + + uint32_t macroTileEndX = rect.right / macroWidth; + uint32_t macroTileEndY = rect.bottom / macroHeight; + + if (pInv->fullTilesOnly == false) + { + // include partial tiles + macroTileStartX = rect.left / macroWidth; + macroTileStartY = rect.top / macroHeight; + + macroTileEndX = (rect.right + macroWidth - 1) / macroWidth; + macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight; + } + + SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y); + + macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X); + macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y); // load tiles BE_WORK work; - work.type = INVALIDATETILES; - work.pfnWork = ProcessInvalidateTilesBE; - work.desc.invalidateTiles = *pInv; + work.type = DISCARDINVALIDATETILES; + work.pfnWork = ProcessDiscardInvalidateTilesBE; + work.desc.discardInvalidateTiles = *pInv; - for (uint32_t x = 0; x < numMacroTilesX; ++x) + for
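The start/end computations above reduce to the usual round-up/round-down index math, with the end index exclusive. A scalar sketch using the same names:

// fullTilesOnly == true: only macrotiles completely inside the rect
uint32_t fullStartX = (rect.left + macroWidth - 1) / macroWidth;  // round up: skip a partial left column
uint32_t fullEndX   = rect.right / macroWidth;                    // round down: skip a partial right column
// fullTilesOnly == false: every macrotile the rect touches
uint32_t anyStartX  = rect.left / macroWidth;                     // round down: include the partial left column
uint32_t anyEndX    = (rect.right + macroWidth - 1) / macroWidth; // round up: include the partial right column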
(uint32_t x = macroTileStartX; x < macroTileEndX; ++x) { - for (uint32_t y = 0; y < numMacroTilesY; ++y) + for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y) { pTileMgr->enqueue(x, y, &work); } @@ -630,6 +666,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num } } +THREAD SWR_GS_CONTEXT tlsGsContext; + ////////////////////////////////////////////////////////////////////////// /// @brief Implements GS stage. /// @param pDC - pointer to draw context. @@ -651,7 +689,6 @@ static void GeometryShaderStage( { RDTSC_START(FEGeometryShader); - SWR_GS_CONTEXT gsContext; SWR_CONTEXT* pContext = pDC->pContext; const API_STATE& state = GetApiState(pDC); @@ -660,9 +697,9 @@ static void GeometryShaderStage( SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized"); SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized"); - gsContext.pStream = (uint8_t*)pGsOut; - gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; - gsContext.PrimitiveID = primID; + tlsGsContext.pStream = (uint8_t*)pGsOut; + tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer; + tlsGsContext.PrimitiveID = primID; uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true); simdvector attrib[MAX_ATTRIBUTES]; @@ -675,7 +712,7 @@ static void GeometryShaderStage( for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.vert[i].attrib[attribSlot] = attrib[i]; + tlsGsContext.vert[i].attrib[attribSlot] = attrib[i]; } } @@ -683,7 +720,7 @@ static void GeometryShaderStage( pa.Assemble(VERTEX_POSITION_SLOT, attrib); for (uint32_t i = 0; i < numVertsPerPrim; ++i) { - gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; + tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i]; } const uint32_t vertexStride = sizeof(simdvertex); @@ -710,14 +747,14 @@ static void GeometryShaderStage( for (uint32_t instance = 0; instance < pState->instanceCount; ++instance) { - gsContext.InstanceID = instance; - gsContext.mask = GenerateMask(numInputPrims); + tlsGsContext.InstanceID = instance; + tlsGsContext.mask = GenerateMask(numInputPrims); // execute the geometry shader - state.pfnGsFunc(GetPrivateState(pDC), &gsContext); + state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext); - gsContext.pStream += instanceStride; - gsContext.pCutOrStreamIdBuffer += cutInstanceStride; + tlsGsContext.pStream += instanceStride; + tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride; } // set up new binner and state for the GS output topology @@ -736,7 +773,7 @@ static void GeometryShaderStage( // foreach input prim: // - setup a new PA based on the emitted verts for that prim // - loop over the new verts, calling PA to assemble each prim - uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount; + uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount; uint32_t* pPrimitiveId = (uint32_t*)&primID; uint32_t totalPrimsGenerated = 0; @@ -844,7 +881,7 @@ static void GeometryShaderStage( static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer, void **ppStreamCutBuffer) { - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); SWR_ASSERT(state.gsState.gsEnable); // allocate arena space to hold GS output verts @@ -1186,7 +1223,7 @@ void ProcessDraw( // if the entire index buffer isn't being consumed, set the last index // so that fetches < a SIMD wide will be masked off - fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size); + 
fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); if (pLastRequestedIndex < fetchInfo.pLastIndex) { fetchInfo.pLastIndex = pLastRequestedIndex; @@ -1362,7 +1399,7 @@ void ProcessDraw( i += KNOB_SIMD_WIDTH; if (IsIndexedT) { - fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); + fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); } else { @@ -1776,7 +1813,7 @@ void BinTriangles( work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X]; } - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs @@ -1948,7 +1985,7 @@ void BinPoints( work.pfnWork = RasterizeSimplePoint; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store attributes @@ -2082,7 +2119,7 @@ void BinPoints( work.pfnWork = RasterizeTriPoint; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs @@ -2299,7 +2336,7 @@ void BinLines( work.pfnWork = RasterizeLine; - Arena* pArena = pDC->pArena; + auto pArena = pDC->pArena; SWR_ASSERT(pArena != nullptr); // store active attribs diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index acb935fc251..f92f88c3226 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB) //vMul = [A1*B2 - B1*A2] vMul = _mm_sub_epi64(vMul, vMul2); - // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned - OSALIGN(int64_t, 16) result; - _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul)); + int64_t result; + _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul)); - double fResult = (double)result; - fResult = fResult * (1.0 / FIXED_POINT16_SCALE); + double dResult = (double)result; + dResult = dResult * (1.0 / FIXED_POINT16_SCALE); - return (float)fResult; + return (float)dResult; } INLINE @@ -316,7 +315,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); -void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); +void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData); diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h index 3f19555557f..adf738c1bed 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h @@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue) } } +static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue) +{ + knobValue = pOverride; +} + template <typename T> static inline void InitKnob(T& knob) { diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 
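The _mm_store1_pd to _mm_store_sd change drops the 16-byte-alignment requirement: store_sd writes a single 64-bit lane and accepts an unaligned address, so the OSALIGN scratch variable is no longer needed. The value itself is plain fixed-point math; a scalar sketch:

// The vertex deltas are x.8 fixed point, so their cross product carries
// 16 fractional bits; FIXED_POINT16_SCALE divides them back out.
static inline float CalcDeterminantRef(int32_t ax, int32_t ay, int32_t bx, int32_t by)
{
    int64_t det = (int64_t)ax * by - (int64_t)bx * ay;  // 64-bit to avoid overflow
    return (float)((double)det * (1.0 / FIXED_POINT16_SCALE));
}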
2028d9fbcfe..f8f1a33b7e3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -34,12 +34,12 @@ struct PA_STATE { - DRAW_CONTEXT *pDC; // draw context - uint8_t* pStreamBase; // vertex stream - uint32_t streamSizeInVerts; // total size of the input stream in verts + DRAW_CONTEXT *pDC{ nullptr }; // draw context + uint8_t* pStreamBase{ nullptr }; // vertex stream + uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts // The topology the binner will use. In some cases the FE changes the topology from the api state. - PRIMITIVE_TOPOLOGY binTopology; + PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; PA_STATE() {} PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : @@ -76,37 +76,37 @@ struct PA_STATE // cuts struct PA_STATE_OPT : public PA_STATE { - simdvertex leadingVertex; // For tri-fan - uint32_t numPrims; // Total number of primitives for draw. - uint32_t numPrimsComplete; // Total number of complete primitives. + simdvertex leadingVertex; // For tri-fan + uint32_t numPrims{ 0 }; // Total number of primitives for draw. + uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives. - uint32_t numSimdPrims; // Number of prims in current simd. + uint32_t numSimdPrims{ 0 }; // Number of prims in current simd. - uint32_t cur; // index to current VS output. - uint32_t prev; // index to prev VS output. Not really needed in the state. - uint32_t first; // index to first VS output. Used for trifan. + uint32_t cur{ 0 }; // index to current VS output. + uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state. + uint32_t first{ 0 }; // index to first VS output. Used for trifan. - uint32_t counter; // state counter - bool reset; // reset state + uint32_t counter{ 0 }; // state counter + bool reset{ false }; // reset state - uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2}) + uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) simdscalari primID; typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); - PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles. - PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle. - PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset + PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. + PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. + PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset // state used to advance the PA when Next is called - PFN_PA_FUNC pfnPaNextFunc; - uint32_t nextNumSimdPrims; - uint32_t nextNumPrimsIncrement; - bool nextReset; - bool isStreaming; + PFN_PA_FUNC pfnPaNextFunc{ nullptr }; + uint32_t nextNumSimdPrims{ 0 }; + uint32_t nextNumPrimsIncrement{ 0 }; + bool nextReset{ false }; + bool isStreaming{ false }; - simdmask tmpIndices; // temporary index store for unused virtual function + simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, @@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane) // Cut-aware primitive assembler. 
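The brace initializers added throughout pa.h are C++11 non-static data member initializers: every member gets a deterministic value even through the empty PA_STATE_OPT() {} constructor, so a default-constructed PA no longer carries garbage. A minimal illustration of the pattern:

struct Example
{
    uint32_t count{ 0 };      // initialized even when Example() {} is used
    bool     ready{ false };
    void*    pData{ nullptr };
    Example() {}              // no init list needed; members above are already set
};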
struct PA_STATE_CUT : public PA_STATE { - simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex - uint32_t numVerts; // number of vertices available in buffer store - uint32_t numAttribs; // number of attributes - int32_t numRemainingVerts; // number of verts remaining to be assembled - uint32_t numVertsToAssemble; // total number of verts to assemble for the draw + simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex + uint32_t numVerts{ 0 }; // number of vertices available in buffer store + uint32_t numAttribs{ 0 }; // number of attributes + int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled + uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd - uint32_t numPrimsAssembled; // number of primitives that are fully assembled - uint32_t headVertex; // current unused vertex slot in vertex buffer store - uint32_t tailVertex; // beginning vertex currently assembling - uint32_t curVertex; // current unprocessed vertex - uint32_t startPrimId; // starting prim id - simdscalari vPrimId; // vector of prim ID - bool needOffsets; // need to compute gather offsets for current SIMD - uint32_t vertsPerPrim; - simdvertex tmpVertex; // temporary simdvertex for unimplemented API - bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they - // are ignored. Fetch shader sends invalid verts on cuts that should be ignored - // while the GS sends valid verts for every index + uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled + uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store + uint32_t tailVertex{ 0 }; // beginning vertex currently assembling + uint32_t curVertex{ 0 }; // current unprocessed vertex + uint32_t startPrimId{ 0 }; // starting prim id + simdscalari vPrimId; // vector of prim ID + bool needOffsets{ false }; // need to compute gather offsets for current SIMD + uint32_t vertsPerPrim{ 0 }; + simdvertex tmpVertex; // temporary simdvertex for unimplemented API + bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they + // are ignored. 
Fetch shader sends invalid verts on cuts that should be ignored + // while the GS sends valid verts for every index // Topology state tracking uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; - uint32_t curIndex; - bool reverseWinding; // indicates reverse winding for strips - int32_t adjExtraVert; // extra vert used for tristrip w/ adj + uint32_t curIndex{ 0 }; + bool reverseWinding{ false }; // indicates reverse winding for strips + int32_t adjExtraVert{ 0 }; // extra vert used for tristrip w/ adj typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish); - PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert + PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert PA_STATE_CUT() {} PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts, @@ -1199,9 +1199,9 @@ struct PA_FACTORY PA_STATE_OPT paOpt; PA_STATE_CUT paCut; - bool cutPA; + bool cutPA{ false }; - PRIMITIVE_TOPOLOGY topo; + PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM]; simdmask indexStore[MAX_NUM_VERTS_PER_PRIM]; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 587e336d87d..52fb7c88cdd 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // Evaluate edge equations at sample positions of each of the 4 corners of a raster tile // used for testing if entire raster tile is inside a triangle - vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets); - vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets); - vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets); + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets); + } // at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox // step sample positions to the raster tile bbox of multisample points @@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // | | // | | // min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples) - __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox; + __m256d vEdgeTileBbox[3]; if (sampleCount > SWR_MULTISAMPLE_1X) { __m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX(); @@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // step edge equation tests from Tile // used for testing if entire raster tile is inside a triangle - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8); - vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8); - vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8); - vEdge2TileBbox = _mm256_add_pd(vResultAxFix16,
vResultByFix16); + for (uint32_t e = 0; e < 3; ++e) + { + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); + vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + } } RDTSC_STOP(BEStepSetup, 0, pDC->drawId); @@ -756,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, for (uint32_t tileX = tX; tileX <= maxX; ++tileX) { - uint64_t anyCoveredSamples = 0; + triDesc.anyCoveredSamples = 0; // is the corner of the edge outside of the raster tile? (vEdge < 0) int mask0, mask1, mask2; @@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, { __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; // evaluate edge equations at the tile multisample bounding box - vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]); - vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]); - vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]); + vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]); + vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]); + vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]); mask0 = _mm256_movemask_pd(vSampleBboxTest0); mask1 = _mm256_movemask_pd(vSampleBboxTest1); mask2 = _mm256_movemask_pd(vSampleBboxTest2); @@ -789,20 +785,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; if ((mask0 & mask1 & mask2) == 0xf) { - anyCoveredSamples = triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; // trivial accept, all 4 corners of all 3 edges are negative // i.e. 
raster tile completely inside triangle RDTSC_EVENT(BETrivialAccept, 1, 0); } else { - __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample; + __m256d vEdgeAtSample[numEdges]; if(sampleCount == SWR_MULTISAMPLE_1X) { // should get optimized out for single sample case (global value numbering or copy propagation) - vEdge0AtSample = vEdgeFix16[0]; - vEdge1AtSample = vEdgeFix16[1]; - vEdge2AtSample = vEdgeFix16[2]; + for (uint32_t e = 0; e < numEdges; ++e) + { + vEdgeAtSample[e] = vEdgeFix16[e]; + } } else { @@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // for each edge and broadcasts it before offsetting to individual pixel quads // step edge equation tests from UL tile corner to pixel sample position - __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX); - __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY); - vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY); - vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample); - - vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX); - vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY); - vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16); - vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample); + for (uint32_t e = 0; e < numEdges; ++e) + { + __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX); + __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY); + vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]); + } } double startQuadEdges[numEdges]; const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample); - _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample); - _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample); - - for (uint32_t e = 3; e < numEdges; ++e) + for (uint32_t e = 0; e < numEdges; ++e) { - _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]); + _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]); } // not trivial accept or reject, must rasterize full tile @@ -854,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } RDTSC_STOP(BERasterizePartial, 0, 0); - anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; } } else @@ -875,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } else #endif - if(anyCoveredSamples) + if(triDesc.anyCoveredSamples) { RDTSC_START(BEPixelBackend); backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers); diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h new file mode 100644 index 00000000000..7ff109d4fe8 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h @@ -0,0 +1,102 @@ 
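The rasterizer hunks above all evaluate the same affine edge function at tile corners and sample offsets. A scalar sketch of the trivial accept test; note that in this code's convention a negative edge value means "inside", so a tile is trivially accepted when all four corners are negative for all three edges (the movemask checks collect exactly those sign bits):

// corners[e][k]: edge e evaluated at corner k of the raster tile.
static inline bool TrivialAcceptRef(const double corners[3][4])
{
    for (uint32_t e = 0; e < 3; ++e)
    {
        for (uint32_t k = 0; k < 4; ++k)
        {
            if (corners[e][k] >= 0.0)
            {
                return false; // a corner is outside this edge: rasterize or reject
            }
        }
    }
    return true; // raster tile completely inside the triangle: full coverage
}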
+/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file ringbuffer.h +* +* @brief RingBuffer +* The RingBuffer class manages all aspects of the ring buffer including +* the head/tail indices, etc. +* +******************************************************************************/ +#pragma once + +template<typename T> +class RingBuffer +{ +public: + RingBuffer() + : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) + { + } + + ~RingBuffer() + { + Destroy(); + } + + void Init(uint32_t numEntries) + { + SWR_ASSERT(numEntries > 0); + mNumEntries = numEntries; + mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64); + SWR_ASSERT(mpRingBuffer != nullptr); + memset(mpRingBuffer, 0, sizeof(T)*numEntries); + } + + void Destroy() + { + _aligned_free(mpRingBuffer); + mpRingBuffer = nullptr; + } + + T& operator[](const uint32_t index) + { + SWR_ASSERT(index < mNumEntries); + return mpRingBuffer[index]; + } + + INLINE void Enqueue() + { + mRingHead++; // There's only one producer. + } + + INLINE void Dequeue() + { + InterlockedIncrement(&mRingTail); // There are multiple consumers. + } + + INLINE bool IsEmpty() + { + return (GetHead() == GetTail()); + } + + INLINE bool IsFull() + { + ///@note We don't handle wrap case due to using 64-bit indices. + /// It would take 11 million years to wrap at 50,000 DCs per sec. + /// If we used 32-bit indices then it's about 23 hours to wrap. + uint64_t numEnqueued = GetHead() - GetTail(); + SWR_ASSERT(numEnqueued <= mNumEntries); + + return (numEnqueued == mNumEntries); + } + + INLINE volatile uint64_t GetTail() { return mRingTail; } + INLINE volatile uint64_t GetHead() { return mRingHead; } + +protected: + T* mpRingBuffer; + uint32_t mNumEntries; + + OSALIGNLINE(volatile uint64_t) mRingHead; // Producer Counter + OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer Counter +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 2758555fd4b..5752094ca10 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -307,6 +307,8 @@ struct PixelPositions simdscalar centroid; }; +#define SWR_MAX_NUM_MULTISAMPLES 16 + ////////////////////////////////////////////////////////////////////////// /// SWR_PS_CONTEXT /// @brief Input to pixel shader.
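A hedged usage sketch of the new RingBuffer in its single-producer/multi-consumer role; the knob name and the modulo slot math mirror how the DC ring is indexed elsewhere in this patch, but treat them as assumptions:

RingBuffer<DRAW_CONTEXT> ring;
ring.Init(KNOB_MAX_DRAWS_IN_FLIGHT);

// Producer (API thread): claim the head slot, fill it, then publish.
if (!ring.IsFull())
{
    DRAW_CONTEXT& dc = ring[ring.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT];
    // ... set up the draw in dc ...
    ring.Enqueue();  // plain increment of the head; there is only one producer
}

// Consumer (the last worker to retire a draw):
ring.Dequeue();      // InterlockedIncrement on the tail; there are many consumers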
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT uint32_t frontFace; // IN: front- 1, back- 0 uint32_t primID; // IN: primitive ID uint32_t sampleIndex; // IN: sampleIndex + }; ////////////////////////////////////////////////////////////////////////// @@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE }; static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size"); -#define SWR_MAX_NUM_MULTISAMPLES 16 enum SWR_MULTISAMPLE_COUNT { SWR_MULTISAMPLE_1X = 0, @@ -786,7 +788,8 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext); typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); -typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*); +typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); +typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*); ////////////////////////////////////////////////////////////////////////// /// FRONTEND_STATE @@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE uint8_t numComponents[KNOB_NUM_ATTRIBUTES]; }; + union SWR_DEPTH_STENCIL_STATE { struct @@ -980,7 +984,6 @@ enum SWR_SHADING_RATE { SWR_SHADING_RATE_PIXEL, SWR_SHADING_RATE_SAMPLE, - SWR_SHADING_RATE_COARSE, SWR_SHADING_RATE_MAX, }; @@ -1024,4 +1027,5 @@ struct SWR_PS_STATE uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with uint32_t usesUAV : 1; // pixel shader accesses UAV uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test + }; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 24c5588bfec..07bc94a1a54 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -24,7 +24,6 @@ #include <stdio.h> #include <thread> #include <algorithm> -#include <unordered_set> #include <float.h> #include <vector> #include <utility> @@ -44,7 +43,6 @@ #include "rasterizer.h" #include "rdtsc_core.h" #include "tilemgr.h" -#include "core/multisample.h" @@ -265,9 +263,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup= INLINE uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) { - //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); - //return result; - return pContext->DrawEnqueued; + return pContext->dcRing.GetHead(); } INLINE @@ -283,170 +279,27 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti return (pDC->dependency > lastRetiredDraw); } -void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { - // Load clear color into SIMD register... 
- float *pClearData = (float*)(pHotTile->clearData); - simdscalar valR = _simd_broadcast_ss(&pClearData[0]); - simdscalar valG = _simd_broadcast_ss(&pClearData[1]); - simdscalar valB = _simd_broadcast_ss(&pClearData[2]); - simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + int64_t result = InterlockedDecrement64(&pDC->threadsDone); + SWR_ASSERT(result >= 0); - float *pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + if (result == 0) { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + // Cleanup memory allocations + pDC->pArena->Reset(true); + pDC->pTileMgr->initialize(); + if (pDC->cleanupState) { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) - { - _simd_store_ps(pfBuf, valR); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valG); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valB); - pfBuf += KNOB_SIMD_WIDTH; - _simd_store_ps(pfBuf, valA); - pfBuf += KNOB_SIMD_WIDTH; - } + pDC->pState->pArena->Reset(true); } - } -} - -void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. -{ - // Load clear color into SIMD register... - float *pClearData = (float*)(pHotTile->clearData); - simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); - float *pfBuf = (float*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) - { - _simd_store_ps(pfBuf, valZ); - pfBuf += KNOB_SIMD_WIDTH; - } - } - } -} - -void ClearStencilHotTile(const HOTTILE* pHotTile) -{ - // convert from F32 to U8. - uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); - //broadcast 32x into __m256i... - simdscalari valS = _simd_set1_epi8(clearVal); - - simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; - uint32_t numSamples = pHotTile->numSamples; - - for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) - { - for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) - { - // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. - for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) - { - _simd_store_si(pBuf, valS); - pBuf += 1; - } - } - } -} - -// for draw calls, we initialize the active hot tiles and perform deferred -// load on them if tile is in invalid state. 
we do this in the outer thread loop instead of inside -// the draw routine itself mainly for performance, to avoid unnecessary setup -// every triangle -// @todo support deferred clear -INLINE -void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork) -{ - const API_STATE& state = GetApiState(pDC); - HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; - - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - x *= KNOB_MACROTILE_X_DIM; - y *= KNOB_MACROTILE_Y_DIM; - - uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); - - // check RT if enabled - unsigned long rtSlot = 0; - uint32_t colorHottileEnableMask = state.colorHottileEnable; - while(_BitScanForward(&rtSlot, colorHottileEnableMask)) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); - - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. - ClearColorHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - colorHottileEnableMask &= ~(1 << rtSlot); - } + _ReadWriteBarrier(); - // check depth if enabled - if (state.depthHottileEnable) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. - ClearDepthHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } + pContext->dcRing.Dequeue(); // Remove from tail } - // check stencil if enabled - if (state.stencilHottileEnable) - { - HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); - if (pHotTile->state == HOTTILE_INVALID) - { - RDTSC_START(BELoadTiles); - // invalid hottile before draw requires a load from surface before we can draw to it - pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - else if (pHotTile->state == HOTTILE_CLEAR) - { - RDTSC_START(BELoadTiles); - // Clear the tile. 
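The load-or-clear branches above (and their new home in HotTileMgr later in this patch) implement a small per-tile state machine. A sketch using the states and calls visible in this diff, with attachment/x/y as in the surrounding code:

// HOTTILE_INVALID : contents stale    -> load from the surface, then mark DIRTY
// HOTTILE_CLEAR   : deferred clear    -> splat the stored clear value, then mark DIRTY
// HOTTILE_DIRTY   : rendered to       -> must be stored back before reuse
// HOTTILE_RESOLVED: in sync with the surface
switch (pHotTile->state)
{
case HOTTILE_INVALID:
    pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT,
        attachment, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
    pHotTile->state = HOTTILE_DIRTY;
    break;
case HOTTILE_CLEAR:
    ClearColorHotTile(pHotTile);   // broadcast clearData across the whole tile
    pHotTile->state = HOTTILE_DIRTY;
    break;
default:
    break;                         // DIRTY or RESOLVED: nothing to do before drawing
}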
- ClearStencilHotTile(pHotTile); - pHotTile->state = HOTTILE_DIRTY; - RDTSC_STOP(BELoadTiles, 0, 0); - } - } + return result; } INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) @@ -466,7 +319,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) if (isWorkComplete) { curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); + CompleteDrawContext(pContext, pDC); } else { @@ -496,7 +349,9 @@ void WorkOnFifoBE( SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, - std::unordered_set<uint32_t>& lockedTiles) + TileSet& lockedTiles, + uint32_t numaNode, + uint32_t numaMask) { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. @@ -537,68 +392,78 @@ for (uint32_t tileID : macroTiles) { + // Only work on tiles for this numa node + uint32_t x, y; + pDC->pTileMgr->getTileIndices(tileID, x, y); + if (((x ^ y) & numaMask) != numaNode) + { + continue; + } + MacroTileQueue &tile = pDC->pTileMgr->getMacroTileQueue(tileID); + if (!tile.getNumQueued()) + { + continue; + } + // can only work on this draw if it's not in use by other threads - if (lockedTiles.find(tileID) == lockedTiles.end()) + if (lockedTiles.find(tileID) != lockedTiles.end()) { - if (tile.getNumQueued()) + continue; + } + + if (tile.tryLock()) + { + BE_WORK *pWork; + + RDTSC_START(WorkerFoundWork); + + uint32_t numWorkItems = tile.getNumQueued(); + SWR_ASSERT(numWorkItems); + + pWork = tile.peek(); + SWR_ASSERT(pWork); + if (pWork->type == DRAW) { - if (tile.tryLock()) - { - BE_WORK *pWork; - - RDTSC_START(WorkerFoundWork); - - uint32_t numWorkItems = tile.getNumQueued(); - - if (numWorkItems != 0) - { - pWork = tile.peek(); - SWR_ASSERT(pWork); - if (pWork->type == DRAW) - { - InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc); - } - } - - while ((pWork = tile.peek()) != nullptr) - { - pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); - tile.dequeue(); - } - RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); - - _ReadWriteBarrier(); - - pDC->pTileMgr->markTileComplete(tileID); - - // Optimization: If the draw is complete and we're the last one to have worked on it then - // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete. - if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) - { - // We can increment the current BE and safely move to next draw since we know this draw is complete. - curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); - - lastRetiredDraw++; - - lockedTiles.clear(); - break; - } - } - else - { - // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. - lockedTiles.insert(tileID); - } + pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID); + } + + while ((pWork = tile.peek()) != nullptr) + { + pWork->pfnWork(pDC, workerId, tileID, &pWork->desc); + tile.dequeue(); + } + RDTSC_STOP(WorkerFoundWork, numWorkItems, pDC->drawId); + + _ReadWriteBarrier(); + + pDC->pTileMgr->markTileComplete(tileID); + + // Optimization: If the draw is complete and we're the last one to have worked on it then + // we can reset the locked list as we know that all previous draws before the next are guaranteed to be complete.
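Both the worker loop above and the hot-tile allocator in tilemgr.cpp below hash macrotile coordinates to a NUMA node the same way; this only partitions correctly for power-of-two node counts, which is why numaMask is set to numNodes - 1 later in this file. A sketch:

// XOR-ing the tile coordinates checkerboards ownership across nodes, so
// adjacent tiles (and their hot-tile memory) spread over all NUMA domains.
static inline uint32_t OwningNumaNode(uint32_t tileX, uint32_t tileY, uint32_t numaMask)
{
    return (tileX ^ tileY) & numaMask;
}
// A worker simply skips tiles it does not own:
//     if (OwningNumaNode(x, y, numaMask) != numaNode) continue;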
+ if ((curDrawBE == i) && pDC->pTileMgr->isWorkComplete()) + { + // We can increment the current BE and safely move to next draw since we know this draw is complete. + curDrawBE++; + CompleteDrawContext(pContext, pDC); + + lastRetiredDraw++; + + lockedTiles.clear(); + break; + } + } + else + { + // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again. + lockedTiles.insert(tileID); } } } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -608,8 +473,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; if (pDC->isCompute || pDC->doneFE || pDC->FeLock) { + CompleteDrawContext(pContext, pDC); curDrawFE++; - InterlockedIncrement(&pDC->threadsDoneFE); } else { @@ -673,22 +538,12 @@ void WorkOnCompute( // Is there any work remaining? if (queue.getNumQueued() > 0) { - bool lastToComplete = false; - uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { ProcessComputeBE(pDC, workerId, threadGroupId); - lastToComplete = queue.finishedWork(); - } - - _ReadWriteBarrier(); - - if (lastToComplete) - { - SWR_ASSERT(queue.isWorkComplete() == true); - pDC->doneCompute = true; + queue.finishedWork(); } } } @@ -704,14 +559,15 @@ DWORD workerThreadMain(LPVOID pData) RDTSC_INIT(threadId); - int numaNode = (int)pThreadData->numaId; + uint32_t numaNode = pThreadData->numaId; + uint32_t numaMask = pContext->threadPool.numaMask; // flush denormals to 0 _mm_setcsr(_mm_getcsr() | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); // Track tiles locked by other threads. If we try to lock a macrotile and find its already // locked then we'll add it to this list so that we don't try and lock it again. - std::unordered_set<uint32_t> lockedTiles; + TileSet lockedTiles; // each worker has the ability to work on any of the queued draws as long as certain // conditions are met. the data associated @@ -732,10 +588,10 @@ DWORD workerThreadMain(LPVOID pData) // the worker can safely increment its oldestDraw counter and move on to the next draw. std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); - auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; + auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); }; - uint64_t curDrawBE = 1; - uint64_t curDrawFE = 1; + uint64_t curDrawBE = 0; + uint64_t curDrawFE = 0; while (pContext->threadPool.inThreadShutdown == false) { @@ -776,7 +632,7 @@ DWORD workerThreadMain(LPVOID pData) } RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); WorkOnCompute(pContext, workerId, curDrawBE); @@ -853,9 +709,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numThreads, KNOB_MAX_NUM_THREADS); } + uint32_t numAPIReservedThreads = 1; + + if (numThreads == 1) { - // If only 1 worker thread, try to move it to an available + // If only 1 worker threads, try to move it to an available // HW thread. If that fails, use the API thread. 
if (numCoresPerNode < numHWCoresPerNode) { @@ -878,8 +737,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { - // Save a HW thread for the API thread. - numThreads--; + // Save HW threads for the API if we can + if (numThreads > numAPIReservedThreads) + { + numThreads -= numAPIReservedThreads; + } + else + { + numAPIReservedThreads = 0; + } } pPool->numThreads = numThreads; @@ -887,6 +753,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->inThreadShutdown = false; pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA)); + pPool->numaMask = 0; if (KNOB_MAX_WORKER_THREADS) { @@ -907,6 +774,8 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) } else { + pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.) + uint32_t workerId = 0; for (uint32_t n = 0; n < numNodes; ++n) { @@ -918,9 +787,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) auto& core = node.cores[c]; for (uint32_t t = 0; t < numHyperThreads; ++t) { - if (c == 0 && n == 0 && t == 0) + if (numAPIReservedThreads) { - // Skip core 0, thread0 on node 0 to reserve for API thread + --numAPIReservedThreads; continue; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 0fa7196f5ac..821d7dcb16e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -34,6 +34,7 @@ typedef std::thread* THREAD_PTR; struct SWR_CONTEXT; +struct DRAW_CONTEXT; struct THREAD_DATA { @@ -50,14 +51,18 @@ struct THREAD_POOL { THREAD_PTR threads[KNOB_MAX_NUM_THREADS]; uint32_t numThreads; + uint32_t numaMask; volatile bool inThreadShutdown; THREAD_DATA *pThreadData; }; +typedef std::unordered_set<uint32_t> TileSet; + void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode); -void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); +int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
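// The numaMask stored on THREAD_POOL is what WorkOnFifoBE uses to bind
// macrotiles to nodes, and, as the comment in CreateThreadPool warns, the
// numNodes - 1 mask only distributes evenly for power-of-two node counts.
// A self-contained illustration of the assignment:
#include <cassert>
#include <cstdint>

uint32_t OwningNumaNode(uint32_t tileX, uint32_t tileY, uint32_t numNodes)
{
    assert((numNodes & (numNodes - 1)) == 0); // mask math requires 2**n nodes
    const uint32_t numaMask = numNodes - 1;
    // XOR-folding the coordinates checkerboards adjacent tiles across nodes,
    // keeping each node's hot-tile memory traffic local to its workers.
    return (tileX ^ tileY) & numaMask;
}
// numNodes == 2: (0,0)->0, (1,0)->1, (0,1)->1, (1,1)->0, (2,0)->0, ...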
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 860393661e2..794577270cf 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -29,7 +29,9 @@ #include <unordered_map> #include "fifo.hpp" -#include "tilemgr.h" +#include "core/tilemgr.h" +#include "core/multisample.h" +#include "rdtsc_core.h" #define TILE_ID(x,y) ((x << 16 | y)) @@ -54,24 +56,21 @@ void DispatchQueue::operator delete(void *p) _aligned_free(p); } -MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena) +MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } -void MacroTileMgr::initialize() -{ - mWorkItemsProduced = 0; - mWorkItemsConsumed = 0; - - mDirtyTiles.clear(); -} - void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork) { // Should not enqueue more then what we have backing for in the hot tile manager. SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1))) + { + return; + } + uint32_t id = TILE_ID(x, y); MacroTileQueue &tile = mTiles[id]; @@ -103,3 +102,284 @@ void MacroTileMgr::markTileComplete(uint32_t id) tile.mWorkItemsFE = 0; tile.mWorkItemsBE = 0; } + +HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples, + uint32_t renderTargetArrayIndex) +{ + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + } + else + { + return NULL; + } + } + else + { + // free the old tile and create a new one with enough space to hold all samples + if (numSamples > hotTile.numSamples) + { + // tile should be either uninitialized or resolved if we're deleting and switching to a + // new sample count + SWR_ASSERT((hotTile.state == HOTTILE_INVALID) || + (hotTile.state == HOTTILE_RESOLVED) || + (hotTile.state == HOTTILE_CLEAR)); + FreeHotTileMem(hotTile.pBuffer); + + uint32_t size = numSamples * mHotTileSize[attachment]; + uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask); + hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, KNOB_SIMD_WIDTH * 4, numaNode); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + } + + // if requested render target array index isn't currently loaded, need to store out the current hottile + // and load the requested array slice + if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) + { + SWR_FORMAT format; + switch (attachment) + { + case SWR_ATTACHMENT_COLOR0: + case SWR_ATTACHMENT_COLOR1: + case SWR_ATTACHMENT_COLOR2: + case SWR_ATTACHMENT_COLOR3: + case SWR_ATTACHMENT_COLOR4: + case SWR_ATTACHMENT_COLOR5: + case SWR_ATTACHMENT_COLOR6: + case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; + case SWR_ATTACHMENT_STENCIL: 
format = KNOB_STENCIL_HOT_TILE_FORMAT; break; + default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; + } + + if (hotTile.state == HOTTILE_DIRTY) + { + pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); + } + + pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, + x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); + + hotTile.renderTargetArrayIndex = renderTargetArrayIndex; + hotTile.state = HOTTILE_DIRTY; + } + } + return &tile.Attachment[attachment]; +} + +HOTTILE* HotTileMgr::GetHotTileNoLoad( + SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, + SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples) +{ + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + + SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X); + SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y); + + HotTileSet &tile = mHotTiles[x][y]; + HOTTILE& hotTile = tile.Attachment[attachment]; + if (hotTile.pBuffer == NULL) + { + if (create) + { + uint32_t size = numSamples * mHotTileSize[attachment]; + hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); + hotTile.state = HOTTILE_INVALID; + hotTile.numSamples = numSamples; + hotTile.renderTargetArrayIndex = 0; + } + else + { + return NULL; + } + } + + return &hotTile; +} + +void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valR = _simd_broadcast_ss(&pClearData[0]); + simdscalar valG = _simd_broadcast_ss(&pClearData[1]); + simdscalar valB = _simd_broadcast_ss(&pClearData[2]); + simdscalar valA = _simd_broadcast_ss(&pClearData[3]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++) + { + _simd_store_ps(pfBuf, valR); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valG); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valB); + pfBuf += KNOB_SIMD_WIDTH; + _simd_store_ps(pfBuf, valA); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data. +{ + // Load clear color into SIMD register... + float *pClearData = (float*)(pHotTile->clearData); + simdscalar valZ = _simd_broadcast_ss(&pClearData[0]); + + float *pfBuf = (float*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) + { + _simd_store_ps(pfBuf, valZ); + pfBuf += KNOB_SIMD_WIDTH; + } + } + } +} + +void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) +{ + // convert from F32 to U8. + uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]); + //broadcast 32x into __m256i... 
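// A quick consistency check on the 4x stride in the loop below, assuming the
// AVX build's KNOB_SIMD_WIDTH == 8 and SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM == 8:
// stencil is one byte per sample, so a single 256-bit _simd_store_si covers
// 8 * 4 = 32 samples per iteration, four SIMD tiles' worth, versus one SIMD
// tile per store for the 32-bit float color and depth clears above.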
+ simdscalari valS = _simd_set1_epi8(clearVal); + + simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer; + uint32_t numSamples = pHotTile->numSamples; + + for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM) + { + for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM) + { + // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly. + for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4) + { + _simd_store_si(pBuf, valS); + pBuf += 1; + } + } + } +} + +////////////////////////////////////////////////////////////////////////// +/// @brief InitializeHotTiles +/// for draw calls, we initialize the active hot tiles and perform deferred +/// load on them if tile is in invalid state. we do this in the outer thread +/// loop instead of inside the draw routine itself mainly for performance, +/// to avoid unnecessary setup every triangle +/// @todo support deferred clear +/// @param pCreateInfo - pointer to creation info. +void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID) +{ + const API_STATE& state = GetApiState(pDC); + HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; + + uint32_t x, y; + MacroTileMgr::getTileIndices(macroID, x, y); + x *= KNOB_MACROTILE_X_DIM; + y *= KNOB_MACROTILE_Y_DIM; + + uint32_t numSamples = GetNumSamples(state.rastState.sampleCount); + + // check RT if enabled + unsigned long rtSlot = 0; + uint32_t colorHottileEnableMask = state.colorHottileEnable; + while (_BitScanForward(&rtSlot, colorHottileEnableMask)) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples); + + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearColorHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + colorHottileEnableMask &= ~(1 << rtSlot); + } + + // check depth if enabled + if (state.depthHottileEnable) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. 
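// Depth repeats the same three-way pattern as the color attachments above,
// and stencil follows below: HOTTILE_INVALID means the hot tile must first
// be populated by pfnLoadTile from the surface; HOTTILE_CLEAR satisfies a
// deferred clear in place from clearData; both paths end in HOTTILE_DIRTY so
// the tile's contents are eventually written back through pfnStoreTile.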
+ ClearDepthHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } + + // check stencil if enabled + if (state.stencilHottileEnable) + { + HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples); + if (pHotTile->state == HOTTILE_INVALID) + { + RDTSC_START(BELoadTiles); + // invalid hottile before draw requires a load from surface before we can draw to it + pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + else if (pHotTile->state == HOTTILE_CLEAR) + { + RDTSC_START(BELoadTiles); + // Clear the tile. + ClearStencilHotTile(pHotTile); + pHotTile->state = HOTTILE_DIRTY; + RDTSC_STOP(BELoadTiles, 0, 0); + } + } +} diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index 9137941bad4..aa561badc1c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -59,7 +59,8 @@ struct MacroTileQueue ////////////////////////////////////////////////////////////////////////// /// @brief Clear fifo and unlock it. - void clear(Arena& arena) + template <typename ArenaT> + void clear(ArenaT& arena) { mFifo.clear(arena); } @@ -71,7 +72,8 @@ return mFifo.peek(); } - bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry) + template <typename ArenaT> + bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry) { return mFifo.enqueue_try_nosync(arena, entry); } @@ -104,7 +106,7 @@ private: class MacroTileMgr { public: - MacroTileMgr(Arena& arena); + MacroTileMgr(CachingArena& arena); ~MacroTileMgr() { for (auto &tile : mTiles) { } } - void initialize(); + INLINE void initialize() + { + mWorkItemsProduced = 0; + mWorkItemsConsumed = 0; + + mDirtyTiles.clear(); + } + INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; } INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; } void markTileComplete(uint32_t id); @@ -135,15 +144,14 @@ public: void operator delete (void *p); private: - Arena& mArena; - SWR_FORMAT mFormat; + CachingArena& mArena; std::unordered_map<uint32_t, MacroTileQueue> mTiles; // Any tile that has work queued to it is a dirty tile. std::vector<uint32_t> mDirtyTiles; - OSALIGNLINE(LONG) mWorkItemsProduced; - OSALIGNLINE(volatile LONG) mWorkItemsConsumed; + OSALIGNLINE(LONG) mWorkItemsProduced { 0 }; + OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 }; }; ////////////////////////////////////////////////////////////////////////// @@ -224,7 +232,7 @@ public: void *operator new(size_t size); void operator delete (void *p); - void* mpTaskData; // The API thread will set this up and the callback task function will interpet this. + void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpret this. OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 }; @@ -241,7 +249,7 @@ enum HOTTILE_STATE struct HOTTILE { - BYTE *pBuffer; + uint8_t *pBuffer; HOTTILE_STATE state; DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
uint32_t numSamples; @@ -283,108 +291,50 @@ public: { for (int a = 0; a < SWR_NUM_ATTACHMENTS; ++a) { - if (mHotTiles[x][y].Attachment[a].pBuffer != NULL) - { - _aligned_free(mHotTiles[x][y].Attachment[a].pBuffer); - mHotTiles[x][y].Attachment[a].pBuffer = NULL; - } + FreeHotTileMem(mHotTiles[x][y].Attachment[a].pBuffer); } } } } - HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, - uint32_t renderTargetArrayIndex = 0) - { - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); + void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID); - assert(x < KNOB_NUM_HOT_TILES_X); - assert(y < KNOB_NUM_HOT_TILES_Y); + HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1, + uint32_t renderTargetArrayIndex = 0); - HotTileSet &tile = mHotTiles[x][y]; - HOTTILE& hotTile = tile.Attachment[attachment]; - if (hotTile.pBuffer == NULL) - { - if (create) - { - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - } - else - { - return NULL; - } - } - else - { - // free the old tile and create a new one with enough space to hold all samples - if (numSamples > hotTile.numSamples) - { - // tile should be either uninitialized or resolved if we're deleting and switching to a - // new sample count - assert((hotTile.state == HOTTILE_INVALID) || - (hotTile.state == HOTTILE_RESOLVED) || - (hotTile.state == HOTTILE_CLEAR)); - _aligned_free(hotTile.pBuffer); - - uint32_t size = numSamples * mHotTileSize[attachment]; - hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4); - hotTile.state = HOTTILE_INVALID; - hotTile.numSamples = numSamples; - } + HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1); - // if requested render target array index isn't currently loaded, need to store out the current hottile - // and load the requested array slice - if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex) - { - SWR_FORMAT format; - switch (attachment) - { - case SWR_ATTACHMENT_COLOR0: - case SWR_ATTACHMENT_COLOR1: - case SWR_ATTACHMENT_COLOR2: - case SWR_ATTACHMENT_COLOR3: - case SWR_ATTACHMENT_COLOR4: - case SWR_ATTACHMENT_COLOR5: - case SWR_ATTACHMENT_COLOR6: - case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break; - case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break; - default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break; - } + static void ClearColorHotTile(const HOTTILE* pHotTile); + static void ClearDepthHotTile(const HOTTILE* pHotTile); + static void ClearStencilHotTile(const HOTTILE* pHotTile); - if (hotTile.state == HOTTILE_DIRTY) - { - pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer); - } - - pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment, - x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer); +private: + 
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; + uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; - hotTile.renderTargetArrayIndex = renderTargetArrayIndex; - hotTile.state = HOTTILE_DIRTY; - } - } - return &tile.Attachment[attachment]; + void* AllocHotTileMem(size_t size, uint32_t align, uint32_t numaNode) + { + void* p = nullptr; +#if defined(_WIN32) + HANDLE hProcess = GetCurrentProcess(); + p = VirtualAllocExNuma(hProcess, nullptr, size, MEM_COMMIT | MEM_RESERVE, PAGE_READWRITE, numaNode); +#else + p = _aligned_malloc(size, align); +#endif + + return p; } - HotTileSet &GetHotTile(uint32_t macroID) + void FreeHotTileMem(void* pBuffer) { - uint32_t x, y; - MacroTileMgr::getTileIndices(macroID, x, y); - assert(x < KNOB_NUM_HOT_TILES_X); - assert(y < KNOB_NUM_HOT_TILES_Y); - - return mHotTiles[x][y]; + if (pBuffer) + { +#if defined(_WIN32) + VirtualFree(pBuffer, 0, MEM_RELEASE); +#else + _aligned_free(pBuffer); +#endif + } } - -private: - HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y]; - uint32_t mHotTileSize[SWR_NUM_ATTACHMENTS]; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp index f36452f2cec..a1d665e77cc 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp @@ -27,6 +27,11 @@ ******************************************************************************/ #if defined(_WIN32) +#if defined(NOMINMAX) +// GDI Plus requires non-std min / max macros be defined :( +#undef NOMINMAX +#endif + #include<Windows.h> #include <Gdiplus.h> #include <Gdiplusheaders.h> diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index b9dc48c4fd7..60a3a6af19e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -46,8 +46,7 @@ void OpenBitmapFromFile( uint32_t *height); #endif -/// @todo assume linux is always 64 bit -#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__) +#if defined(_WIN64) || defined(__x86_64__) #define _MM_INSERT_EPI64 _mm_insert_epi64 #define _MM_EXTRACT_EPI64 _mm_extract_epi64 #else @@ -89,7 +88,10 @@ INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx) OSALIGNLINE(struct) BBOX { - int top, bottom, left, right; + int top{ 0 }; + int bottom{ 0 }; + int left{ 0 }; + int right{ 0 }; BBOX() {} BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {} @@ -110,7 +112,10 @@ OSALIGNLINE(struct) BBOX struct simdBBox { - simdscalari top, bottom, left, right; + simdscalari top; + simdscalari bottom; + simdscalari left; + simdscalari right; }; INLINE @@ -271,7 +276,7 @@ struct TransposeSingleComponent /// @brief Pass-thru for single component. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); } @@ -286,7 +291,7 @@ struct Transpose8_8_8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); #if KNOB_SIMD_WIDTH == 8 @@ -325,7 +330,7 @@ struct Transpose8_8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -337,7 +342,7 @@ struct Transpose8_8 /// @brief Performs an SOA to AOS conversion for packed 8_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalari src = _simd_load_si((const simdscalari*)pSrc); @@ -361,7 +366,7 @@ struct Transpose32_32_32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalar src0 = _simd_load_ps((const float*)pSrc); @@ -394,7 +399,7 @@ struct Transpose32_32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalar src0 = _simd_load_ps((const float*)pSrc); @@ -426,7 +431,7 @@ struct Transpose32_32 /// @brief Performs an SOA to AOS conversion for packed 32_32 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { const float* pfSrc = (const float*)pSrc; __m128 src_r0 = _mm_load_ps(pfSrc + 0); @@ -456,7 +461,7 @@ struct Transpose16_16_16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); @@ -496,7 +501,7 @@ struct Transpose16_16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16_16 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { #if KNOB_SIMD_WIDTH == 8 simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc); @@ -535,7 +540,7 @@ struct Transpose16_16 /// @brief Performs an SOA to AOS conversion for packed 16_16 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) + INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) { simdscalar src = _simd_load_ps((const float*)pSrc); @@ -566,7 +571,7 @@ struct Transpose24_8 /// @brief Performs an SOA to AOS conversion for packed 24_8 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -578,7 +583,7 @@ struct Transpose32_8_24 /// @brief Performs an SOA to AOS conversion for packed 32_8_24 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; @@ -592,7 +597,7 @@ struct Transpose4_4_4_4 /// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -604,7 +609,7 @@ struct Transpose5_6_5 /// @brief Performs an SOA to AOS conversion for packed 5_6_5 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -616,7 +621,7 @@ struct Transpose9_9_9_5 /// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -628,7 +633,7 @@ struct Transpose5_5_5_1 /// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -640,7 +645,7 @@ struct Transpose10_10_10_2 /// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data. /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -652,7 +657,7 @@ struct Transpose11_11_10 /// @brief Performs an SOA to AOS conversion for packed 11_11_10 data. 
/// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form - static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete; + static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; }; // helper function to unroll loops @@ -694,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size) } #endif - BYTE* pRemainderBytes = (BYTE*)pDataWords; + uint8_t* pRemainderBytes = (uint8_t*)pDataWords; for (uint32_t i = 0; i < sizeRemainderBytes; ++i) { crc = _mm_crc32_u8(crc, *pRemainderBytes++); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp index 734c89792f0..de856c4a095 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp @@ -47,6 +47,10 @@ #include "llvm/Analysis/CFGPrinter.h" #include "llvm/IRReader/IRReader.h" +#if LLVM_USE_INTEL_JITEVENTS +#include "llvm/ExecutionEngine/JITEventListener.h" +#endif + #include "core/state.h" #include "common/containers.hpp" diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h index c974a611224..4ffb0fbee01 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h @@ -53,6 +53,10 @@ #include "llvm/Config/config.h" #endif +#ifndef HAVE_LLVM +#define HAVE_LLVM (LLVM_VERSION_MAJOR << 8) || LLVM_VERSION_MINOR +#endif + #include "llvm/IR/Verifier.h" #include "llvm/ExecutionEngine/MCJIT.h" #include "llvm/Support/FileSystem.h" @@ -60,11 +64,10 @@ #include "llvm/Analysis/Passes.h" -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 #include "llvm/PassManager.h" #else #include "llvm/IR/LegacyPassManager.h" -using namespace llvm::legacy; #endif #include "llvm/CodeGen/Passes.h" @@ -166,7 +169,6 @@ struct JitManager FunctionType* mTrinaryFPTy; FunctionType* mUnaryIntTy; FunctionType* mBinaryIntTy; - FunctionType* mTrinaryIntTy; Type* mSimtFP32Ty; Type* mSimtInt32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp index 954524afd3a..a64f86006f4 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp @@ -576,9 +576,12 @@ struct BlendJit : public Builder src1[i] = LOAD(pSrc1, { i }); } Value* currentMask = VIMMED1(-1); - if(state.desc.alphaToCoverageEnable) + if (state.desc.alphaToCoverageEnable) { - currentMask = FP_TO_SI(FMUL(src[3], VBROADCAST(C((float)state.desc.numSamples))), mSimdInt32Ty); + Value* pClampedSrc = FCLAMP(src[3], 0.0f, 1.0f); + uint32_t bits = (1 << state.desc.numSamples) - 1; + currentMask = FMUL(pClampedSrc, VBROADCAST(C((float)bits))); + currentMask = FP_TO_SI(FADD(currentMask, VIMMED1(0.5f)), mSimdInt32Ty); } // alpha test @@ -702,6 +705,12 @@ struct BlendJit : public Builder currentMask = AND(sampleMask, currentMask); } + if (state.desc.alphaToCoverageEnable) + { + Value* sampleMasked = SHL(C(1), sampleNum); + currentMask = AND(currentMask, VBROADCAST(sampleMasked)); + } + if(state.desc.sampleMaskEnable || state.desc.alphaToCoverageEnable || state.desc.oMaskEnable) { @@ -717,7 +726,13 @@ struct BlendJit : public Builder JitManager::DumpToFile(blendFunc, ""); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + 
passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp index c15bdf1e756..757ea3fe39c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp @@ -38,6 +38,8 @@ using namespace llvm; Builder::Builder(JitManager *pJitMgr) : mpJitMgr(pJitMgr) { + mVWidth = pJitMgr->mVWidth; + mpIRBuilder = &pJitMgr->mBuilder; mVoidTy = Type::getVoidTy(pJitMgr->mContext); @@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr) mInt8Ty = Type::getInt8Ty(pJitMgr->mContext); mInt16Ty = Type::getInt16Ty(pJitMgr->mContext); mInt32Ty = Type::getInt32Ty(pJitMgr->mContext); + mInt8PtrTy = PointerType::get(mInt8Ty, 0); + mInt16PtrTy = PointerType::get(mInt16Ty, 0); + mInt32PtrTy = PointerType::get(mInt32Ty, 0); mInt64Ty = Type::getInt64Ty(pJitMgr->mContext); mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure) mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type - mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth); - mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth); - mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth); - mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth); - mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth); + mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth); + mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth); + mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth); + mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth); + mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth); + mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false); if (sizeof(uint32_t*) == 4) { diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h index 49216612cc9..239ef2ab49f 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h @@ -43,6 +43,8 @@ struct Builder JitManager* mpJitMgr; IRBuilder<>* mpIRBuilder; + uint32_t mVWidth; + // Built in types. 
Type* mVoidTy; Type* mInt1Ty; @@ -54,12 +56,16 @@ struct Builder Type* mFP16Ty; Type* mFP32Ty; Type* mDoubleTy; + Type* mInt8PtrTy; + Type* mInt16PtrTy; + Type* mInt32PtrTy; Type* mSimdFP16Ty; Type* mSimdFP32Ty; Type* mSimdInt16Ty; Type* mSimdInt32Ty; Type* mSimdInt64Ty; Type* mSimdIntPtrTy; + Type* mSimdVectorTy; StructType* mV4FP32Ty; StructType* mV4Int32Ty; diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp index 5394fc7bf5a..486dad8f04c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp @@ -28,6 +28,8 @@ * ******************************************************************************/ #include "builder.h" +#include "common/rdtsc_buckets.h" + #include "llvm/Support/DynamicLibrary.h" void __cdecl CallPrint(const char* fmt, ...); @@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred) Value *Builder::VIMMED1(int i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(uint32_t i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VIMMED1(float i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i))); } Value *Builder::VIMMED1(bool i) { - return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i))); + return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i))); } Value *Builder::VUNDEF_IPTR() { - return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth)); } Value *Builder::VUNDEF_I() { - return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mInt32Ty, mVWidth)); } Value *Builder::VUNDEF(Type *ty, uint32_t size) @@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size) Value *Builder::VUNDEF_F() { - return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth)); + return UndefValue::get(VectorType::get(mFP32Ty, mVWidth)); } Value *Builder::VUNDEF(Type* t) { - return UndefValue::get(VectorType::get(t, JM()->mVWidth)); + return UndefValue::get(VectorType::get(t, mVWidth)); } -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index) { return VINSERT(vec, val, C((int64_t)index)); @@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src) return src; } - return VECTOR_SPLAT(JM()->mVWidth, src); + return VECTOR_SPLAT(mVWidth, src); } uint32_t Builder::IMMED(Value* v) @@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v) return pValConst->getZExtValue(); } +int32_t Builder::S_IMMED(Value* v) +{ + SWR_ASSERT(isa<ConstantInt>(v)); + ConstantInt *pValConst = cast<ConstantInt>(v); + return pValConst->getSExtValue(); +} + Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList) { std::vector<Value*> indices; @@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask) else { Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256); - Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth)); - vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth)); + Value* 
fMask = BITCAST(mask,VectorType::get(mInt32Ty,mVWidth)); + vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth)); } return vResult; } @@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list // get a pointer to the first character in the constant string array std::vector<Constant*> geplist{C(0),C(0)}; -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false); #else Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false); @@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty)); Value *vOffsets = MUL(vIndices,vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets,C(i)); @@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty)); Value *vOffsets = MUL(vIndices, vScaleVec); Value *mask = MASK(vMask); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + for(uint32_t i = 0; i < mVWidth; ++i) { // single component byte index Value *offset = VEXTRACT(vOffsets, C(i)); @@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx) } else { - res = VSHUFFLE(a, a, idx); + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_I(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } } return res; } ////////////////////////////////////////////////////////////////////////// +/// @brief Generate a VPERMPS operation (shuffle 32 bit float values +/// across 128 bit lanes) in LLVM IR. If not supported on the underlying +/// platform, emulate it +/// @param a - 256bit SIMD lane(8x32bit) of float values. +/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values +Value *Builder::PERMPS(Value* a, Value* idx) +{ + Value* res; + // use avx2 permute instruction if available + if (JM()->mArch.AVX2()) + { + // llvm 3.6.0 swapped the order of the args to vpermd + res = VPERMPS(idx, a); + } + else + { + if (isa<Constant>(idx)) + { + res = VSHUFFLE(a, a, idx); + } + else + { + res = VUNDEF_F(); + for (uint32_t l = 0; l < JM()->mVWidth; ++l) + { + Value* pIndex = VEXTRACT(idx, C(l)); + Value* pVal = VEXTRACT(a, pIndex); + res = VINSERT(res, pVal, C(l)); + } + } + } + + return res; +} + +////////////////////////////////////////////////////////////////////////// /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion) /// in LLVM IR. If not supported on the underlying platform, emulate it /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format. 
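The new PERMPS mirrors PERMD for floats, down to the lane-by-lane VEXTRACT/VINSERT fallback when AVX2 is absent and the index vector is not a constant. A hypothetical use, reversing the eight lanes of a simd register; the index-constant idiom follows the C<char>({...}) calls used elsewhere in the builder, and vSrc stands in for any mSimdFP32Ty value:

// out[i] = src[idx[i]]; a constant index lets the non-AVX2 path lower the
// permute to a single VSHUFFLE instead of eight extract/insert pairs.
Value* vRevIdx   = C<int>({ 7, 6, 5, 4, 3, 2, 1, 0 });
Value* vReversed = PERMPS(vSrc, vRevIdx);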
@@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a) } Value* pResult = UndefValue::get(mSimdFP32Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc}); @@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding) } Value* pResult = UndefValue::get(mSimdInt16Ty); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value* pSrc = VEXTRACT(a, C(i)); Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc}); @@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits // input could either be float or int vector; do shuffle work in int vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty); @@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp if(bPackedOutput) { - Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15, @@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput) { // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth); + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits if(bPackedOutput) { - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15, 0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15}); @@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask) { Value* pStack = STACKSAVE(); + Type* pSrcTy = vSrc->getType()->getVectorElementType(); + // allocate tmp stack for masked off lanes - Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType()); + Value* vTmpPtr = ALLOCA(pSrcTy); Value *mask = MASK(vMask); - for (uint32_t i = 0; i < JM()->mVWidth; ++i) + for (uint32_t i = 0; i < mVWidth; ++i) { Value *offset = VEXTRACT(vOffsets, C(i)); // byte pointer to component Value *storeAddress = GEP(pDst, offset); - storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0)); + storeAddress = BITCAST(storeAddress, 
PointerType::get(pSrcTy, 0)); Value *selMask = VEXTRACT(mask, C(i)); Value *srcElem = VEXTRACT(vSrc, C(i)); // switch in a safe address to load if we're trying to access a vertex @@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high) Value* Builder::STACKSAVE() { Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 return CALL(pfnStackSave); #else return CALLA(pfnStackSave); @@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...) vsnprintf_s(strBuf, _TRUNCATE, fmt, args); OutputDebugString(strBuf); #endif + + va_end(args); } Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vextractf128_si_256); @@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx.push_back(C(flag ? i + mVWidth / 2 : i)); } return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx)); #endif @@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8) Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) { -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_vinsertf128_si_256); @@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8) #else bool flag = !imm8->isZeroValue(); SmallVector<Constant*,8> idx; - for (unsigned i = 0; i < JM()->mVWidth; i++) { + for (unsigned i = 0; i < mVWidth; i++) { idx.push_back(C(i)); } Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx)); SmallVector<Constant*,8> idx2; - for (unsigned i = 0; i < JM()->mVWidth / 2; i++) { - idx2.push_back(C(flag ? i : i + JM()->mVWidth)); + for (unsigned i = 0; i < mVWidth / 2; i++) { + idx2.push_back(C(flag ? i : i + mVWidth)); } - for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) { - idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i)); + for (unsigned i = mVWidth / 2; i < mVWidth; i++) { + idx2.push_back(C(flag ? 
i + mVWidth / 2 : i)); } return VSHUFFLE(a, inter, ConstantVector::get(idx2)); #endif } + +// rdtsc buckets macros +void Builder::RDTSC_START(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StartBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StartBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StartBucket", (void*)&BucketManager_StartBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + +void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId) +{ + std::vector<Type*> args{ + PointerType::get(mInt32Ty, 0), // pBucketMgr + mInt32Ty // id + }; + + FunctionType* pFuncTy = FunctionType::get(Type::getVoidTy(JM()->mContext), args, false); + Function* pFunc = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction("BucketManager_StopBucket", pFuncTy)); + if (sys::DynamicLibrary::SearchForAddressOfSymbol("BucketManager_StopBucket") == nullptr) + { + sys::DynamicLibrary::AddSymbol("BucketManager_StopBucket", (void*)&BucketManager_StopBucket); + } + + CALL(pFunc, { pBucketMgr, pId }); +} + diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h index 48e0558c4dd..f43ef69d1ed 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h +++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h @@ -59,7 +59,7 @@ Value *VUNDEF_F(); Value *VUNDEF_I(); Value *VUNDEF(Type* ty, uint32_t size); Value *VUNDEF_IPTR(); -#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6 +#if HAVE_LLVM == 0x306 Value *VINSERT(Value *vec, Value *val, uint64_t index); #endif Value *VBROADCAST(Value *src); @@ -67,6 +67,7 @@ Value *VRCP(Value *va); Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY); uint32_t IMMED(Value* i); +int32_t S_IMMED(Value* i); Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList); Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList); @@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b); Value *PMOVSXBD(Value* a); Value *PMOVSXWD(Value* a); Value *PERMD(Value* a, Value* idx); +Value *PERMPS(Value* a, Value* idx); Value *CVTPH2PS(Value* a); Value *CVTPS2PH(Value* a, Value* rounding); Value *PMAXSD(Value* a, Value* b); @@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); } Value *VEXTRACTI128(Value* a, Constant* imm8); Value *VINSERTI128(Value* a, Value* b, Constant* imm8); + +// rdtsc buckets macros +void RDTSC_START(Value* pBucketMgr, Value* pId); +void RDTSC_STOP(Value* pBucketMgr, Value* pId); diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp index c5a180e27cb..2c2c56bd151 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp @@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) std::vector<Value*> vtxInputIndices(2, C(0)); // GEP pVtxOut = GEP(pVtxOut, C(0)); - pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0)); + pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0)); // SWR_FETCH_CONTEXT::pStreams Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams}); 
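The BucketManager_StartBucket/StopBucket plumbing added to builder_misc.cpp above is the general recipe for letting JIT-compiled code call back into the driver: declare the function in the module being built, then register the host address with MCJIT's symbol resolver. A generic sketch of that pattern; the helper name and the void* parameter type are illustrative:

Function* DeclareHostCallback(JitManager* pJM, const char* name, void* pfnHost)
{
    FunctionType* pFuncTy = FunctionType::get(
        Type::getVoidTy(pJM->mContext),
        { PointerType::get(Type::getInt8Ty(pJM->mContext), 0) },
        false);

    Function* pFunc = cast<Function>(
        pJM->mpCurrentModule->getOrInsertFunction(name, pFuncTy));

    // Only bind the symbol once per process; MCJIT resolves it at finalize.
    if (sys::DynamicLibrary::SearchForAddressOfSymbol(name) == nullptr)
    {
        sys::DynamicLibrary::AddSymbol(name, pfnHost);
    }
    return pFunc;
}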
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) verifyFunction(*fetch); - FunctionPassManager setupPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + setupPasses(JM()->mpCurrentModule); ///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification) setupPasses.add(createBreakCriticalEdgesPass()); @@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState) JitManager::DumpToFile(fetch, "se"); - FunctionPassManager optPasses(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + optPasses(JM()->mpCurrentModule); ///@todo Haven't touched these either. Need to remove some of these and add others. optPasses.add(createCFGSimplificationPass()); @@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet SWRL::UncheckedFixedVector<Value*, 16> vectors; - std::vector<Constant*> pMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> pMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { pMask[i] = (C(i < 4 ? i : 4)); } @@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride); // Load from the stream. - for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane) + for(uint32_t lane = 0; lane < mVWidth; ++lane) { // Get index Value* index = VEXTRACT(vIndices, C(lane)); @@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet vectors.push_back(wvec); } - std::vector<Constant*> v01Mask(JM()->mVWidth); - std::vector<Constant*> v23Mask(JM()->mVWidth); - std::vector<Constant*> v02Mask(JM()->mVWidth); - std::vector<Constant*> v13Mask(JM()->mVWidth); + std::vector<Constant*> v01Mask(mVWidth); + std::vector<Constant*> v23Mask(mVWidth); + std::vector<Constant*> v02Mask(mVWidth); + std::vector<Constant*> v13Mask(mVWidth); // Concatenate the vectors together. 
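// For the AVX build (mVWidth == 8, so num4Wide == 2) the masks constructed
// by the loop below evaluate to the following (derived from the formulas;
// indices >= 8 select lanes from the second VSHUFFLE operand):
//    v01Mask = { 0,1, 8, 9,  4,5,12,13 }  rows 0/1 interleaved per 4-wide block
//    v23Mask = { 2,3,10,11,  6,7,14,15 }  rows 2/3 interleaved
//    v02Mask = { 0,2, 8,10,  4,6,12,14 }  even lanes of each pair
//    v13Mask = { 1,3, 9,11,  5,7,13,15 }  odd lanes of each pair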
elements[0] = VUNDEF_F(); elements[1] = VUNDEF_F(); elements[2] = VUNDEF_F(); elements[3] = VUNDEF_F(); - for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b) + for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b) { v01Mask[4 * b + 0] = C(0 + 4 * b); v01Mask[4 * b + 1] = C(1 + 4 * b); - v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth); + v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth); v23Mask[4 * b + 0] = C(2 + 4 * b); v23Mask[4 * b + 1] = C(3 + 4 * b); - v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth); - v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth); + v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); v02Mask[4 * b + 0] = C(0 + 4 * b); v02Mask[4 * b + 1] = C(2 + 4 * b); - v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth); - v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth); + v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth); + v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth); v13Mask[4 * b + 0] = C(1 + 4 * b); v13Mask[4 * b + 1] = C(3 + 4 * b); - v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth); - v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth); + v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth); + v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth); - std::vector<Constant*> iMask(JM()->mVWidth); - for(uint32_t i = 0; i < JM()->mVWidth; ++i) + std::vector<Constant*> iMask(mVWidth); + for(uint32_t i = 0; i < mVWidth; ++i) { if(((4 * b) <= i) && (i < (4 * (b + 1)))) { - iMask[i] = C(i % 4 + JM()->mVWidth); + iMask[i] = C(i % 4 + mVWidth); } else { @@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint8_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex) STORE(C((uint16_t)0), pZeroIndex); // Load a SIMD of index pointers - for(int64_t lane = 0; lane < JM()->mVWidth; lane++) + for(int64_t lane = 0; lane < mVWidth; lane++) { // Calculate the address of the requested index Value *pIndex = GEP(pIndices, C(lane)); @@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args) const uint32_t (&swizzle)[4] = std::get<9>(args); // cast types - Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth); - Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits + Type* vGatherTy = mSimdInt32Ty; + Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits // have to do extra work for sign extending if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){ - Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane - Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits + Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane + Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits // shuffle mask, including any swizzling const char x = (char)swizzle[0]; const char y = (char)swizzle[1]; @@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args) 
@@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint8_t)0), pZeroIndex);
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
     STORE(C((uint16_t)0), pZeroIndex);
     // Load a SIMD of index pointers
-    for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+    for(int64_t lane = 0; lane < mVWidth; lane++)
     {
         // Calculate the address of the requested index
         Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
     const uint32_t (&swizzle)[4] = std::get<9>(args);
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+    Type* vGatherTy = mSimdInt32Ty;
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
-        Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
-        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+        Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+        Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
         // shuffle mask, including any swizzling
         const char x = (char)swizzle[0];
         const char y = (char)swizzle[1];
@@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     Value* (&vVertexElements)[4] = std::get<8>(args);
     // cast types
-    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
-    Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+    Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+    Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
     // have to do extra work for sign extending
     if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
     bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
     Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
-    Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+    Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
     // shuffle mask
     Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
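The vConstMask constant above (shown truncated by the hunk) is a per-128-bit-lane byte shuffle: each gathered 32-bit element carries two 16-bit components, and the mask moves the low words of all four elements to the bottom half of the lane and the high words to the top half. A scalar model of one lane:

    #include <cstdint>

    // Applies the first 16 entries of vConstMask to one 128-bit lane.
    // in  = 4 gathered 32-bit elements, each holding two 16-bit components
    // out = low words packed into bytes 0..7, high words into bytes 8..15
    static void Deinterleave16bpcLane(const uint8_t in[16], uint8_t out[16])
    {
        static const uint8_t mask[16] = { 0, 1, 4, 5, 8,  9,  12, 13,
                                          2, 3, 6, 7, 10, 11, 14, 15 };
        for (int i = 0; i < 16; ++i)
            out[i] = in[mask[i]];
    }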
["a", "round"]], ["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]], diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py index 7bba435467b..0b53a929e6c 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py +++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py @@ -28,7 +28,7 @@ import operator header = r""" /**************************************************************************** -* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved. +* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved. * * Permission is hereby granted, free of charge, to any person obtaining a * copy of this software and associated documentation files (the "Software"), diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp index 6c5f22bc47c..36baa8d794b 100644 --- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp +++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp @@ -293,7 +293,13 @@ struct StreamOutJit : public Builder JitManager::DumpToFile(soFunc, "SoFunc"); - FunctionPassManager passes(JM()->mpCurrentModule); +#if HAVE_LLVM == 0x306 + FunctionPassManager +#else + llvm::legacy::FunctionPassManager +#endif + passes(JM()->mpCurrentModule); + passes.add(createBreakCriticalEdgesPass()); passes.add(createCFGSimplificationPass()); passes.add(createEarlyCSEPass()); diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp index ad73cd840a7..d001cb6b5cb 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp +++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp @@ -33,7 +33,7 @@ #include "memory/tilingtraits.h" #include "memory/Convert.h" -typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT); +typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT); ////////////////////////////////////////////////////////////////////////// /// Clear Raster Tile Function Tables. @@ -54,17 +54,17 @@ struct StoreRasterTileClear /// @param pDstSurface - Destination surface state /// @param x, y - Coordinates to raster tile. INLINE static void StoreClear( - const BYTE* dstFormattedColor, + const uint8_t* dstFormattedColor, UINT dstBytesPerPixel, SWR_SURFACE_STATE* pDstSurface, UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile. { // Compute destination address for raster tile. 
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435467b..0b53a929e6c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
 header = r"""
/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22bc47c..36baa8d794b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
     JitManager::DumpToFile(soFunc, "SoFunc");
-    FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+    FunctionPassManager
+#else
+    llvm::legacy::FunctionPassManager
+#endif
+        passes(JM()->mpCurrentModule);
+
     passes.add(createBreakCriticalEdgesPass());
     passes.add(createCFGSimplificationPass());
     passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ad73cd840a7..d001cb6b5cb 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -33,7 +33,7 @@
 #include "memory/tilingtraits.h"
 #include "memory/Convert.h"
-typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT);
 //////////////////////////////////////////////////////////////////////////
 /// Clear Raster Tile Function Tables.
@@ -54,17 +54,17 @@ struct StoreRasterTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to raster tile.
     INLINE static void StoreClear(
-        const BYTE* dstFormattedColor,
+        const uint8_t* dstFormattedColor,
         UINT dstBytesPerPixel,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
     {
         // Compute destination address for raster tile.
-        BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+        uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress +
             (y * pDstSurface->pitch) +
             (x * dstBytesPerPixel);
         // start of first row
-        BYTE* pDst = pDstTile;
+        uint8_t* pDst = pDstTile;
         UINT dstBytesPerRow = 0;
         // For each raster tile pixel in row 0 (rx, 0)
@@ -104,15 +104,15 @@ struct StoreMacroTileClear
     /// @param pDstSurface - Destination surface state
     /// @param x, y - Coordinates to macro tile
     static void StoreClear(
-        const FLOAT *pColor,
+        const float *pColor,
         SWR_SURFACE_STATE* pDstSurface,
         UINT x, UINT y)
     {
         UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
-        BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+        uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
-        FLOAT srcColor[4];
+        float srcColor[4];
         for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index 0f9e0ad4bd8..7c185e5e454 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -227,10 +227,10 @@ static uint16_t Convert32To16Float(float val)
 /// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
 template<SWR_FORMAT DstFormat>
 static void ConvertPixelFromFloat(
-    BYTE* pDstPixel,
+    uint8_t* pDstPixel,
     const float srcPixel[4])
 {
-    UINT outColor[4]; // typeless bits
+    uint32_t outColor[4] = { 0 }; // typeless bits
     // Store component
     for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
@@ -390,9 +390,9 @@ static void ConvertPixelFromFloat(
 template<SWR_FORMAT SrcFormat>
 INLINE static void ConvertPixelToFloat(
     float dstPixel[4],
-    const BYTE* pSrc)
+    const uint8_t* pSrc)
 {
-    UINT srcColor[4]; // typeless bits
+    uint32_t srcColor[4]; // typeless bits
     // unpack src pixel
     typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
@@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat(
     }
     // Convert components
-    for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+    for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
     {
         SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
-        UINT src = srcColor[comp];
+        uint32_t src = srcColor[comp];
         switch (type)
         {
@@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat(
         }
         case SWR_TYPE_UINT:
         {
-            UINT dst = (UINT)src;
+            uint32_t dst = (uint32_t)src;
             dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
             break;
         }
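The `= { 0 }` initializer on outColor is a real fix, not just a type rename: formats with fewer than four components previously left the unused entries uninitialized, so packing those typeless words could write stack garbage into the destination pixel. For orientation, a minimal model of one conversion path (an assumed 8-bit UNORM component; the real code dispatches on FormatTraits<DstFormat> and handles many more types):

    #include <algorithm>
    #include <cstdint>

    // Sketch of a single UNORM8 component conversion, one of the paths
    // ConvertPixelFromFloat can select. Not the driver's actual code.
    static uint32_t PackUnorm8(float v)
    {
        v = std::min(std::max(v, 0.0f), 1.0f); // clamp to [0, 1]
        return (uint32_t)(v * 255.0f + 0.5f);  // scale and round to 8 bits
    }

    // With uint32_t outColor[4] = { 0 }, components the format lacks stay 0
    // instead of contributing indeterminate bits to the packed pixel.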
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 50f8e57c22a..381ac89a7b8 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,6 +28,7 @@
 #pragma once
 #include "core/state.h"
+#include "common/simdintrin.h"
 template<SWR_TILE_MODE mode, int> struct TilingTraits
@@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
     static UINT GetPdepY() { return 0x1ea; }
 };
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pdep_u32(a, mask);
-#else
-    UINT result = 0;
-
-    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
-    // using bsf instead of funky loop
-    DWORD maskIndex;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. populate LSB from src
-        const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
-        // 3. copy bit from mask
-        result |= LSB & lowest;
-
-        // 4. clear lowest bit
-        mask &= ~lowest;
-
-        // 5. prepare for next iteration
-        a >>= 1;
-    }
-
-    return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
-    return _pext_u32(a, mask);
-#else
-    UINT result = 0;
-    DWORD maskIndex;
-    uint32_t currentBit = 0;
-    while (_BitScanForward(&maskIndex, mask))
-    {
-        // 1. isolate lowest set bit of mask
-        const UINT lowest = 1 << maskIndex;
-
-        // 2. copy bit from mask
-        result |= ((a & lowest) > 0) << currentBit++;
-
-        // 3. clear lowest bit
-        mask &= ~lowest;
-    }
-    return result;
-#endif
-}
-
 //////////////////////////////////////////////////////////////////////////
 /// @brief Computes the tileID for 2D tiled surfaces
 /// @param pitch - surface pitch in bytes
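The pdep/pext emulation removed here is not lost; the new common/simdintrin.h include suggests it moves next to the other SIMD helpers. For reference, the same algorithms in a portable form, without the MSVC-style _BitScanForward dependency (a sketch, not the relocated code verbatim):

    #include <cstdint>

    // Software PDEP: deposit successive low bits of 'a' into the set bit
    // positions of 'mask', from least to most significant.
    static uint32_t pdep_u32_soft(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        while (mask)
        {
            uint32_t lowest = mask & (0u - mask); // isolate lowest set bit
            if (a & 1)
                result |= lowest;                 // deposit current source bit
            a >>= 1;
            mask &= mask - 1;                     // clear lowest set bit
        }
        return result;
    }

    // Software PEXT: extract the bits of 'a' selected by 'mask' and pack
    // them contiguously starting at bit 0.
    static uint32_t pext_u32_soft(uint32_t a, uint32_t mask)
    {
        uint32_t result = 0;
        for (uint32_t bit = 0; mask; mask &= mask - 1, ++bit)
        {
            uint32_t lowest = mask & (0u - mask);
            if (a & lowest)
                result |= 1u << bit;
        }
        return result;
    }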
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
index 44ab69815b1..3d003fb4a33 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 8c51e1e8e73..0f3ded68544 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
 #
 # Permission is hereby granted, free of charge, to any person obtaining a
 # copy of this software and associated documentation files (the "Software"),
@@ -21,24 +21,20 @@
 # Python source
 KNOBS = [
-    ['ENABLE_ASSERT_DIALOGS', {
-        'type' : 'bool',
-        'default' : 'true',
-        'desc' : ['Use dialogs when asserts fire.',
-                  'Asserts are only enabled in debug builds'],
-    }],
     ['SINGLE_THREADED', {
         'type' : 'bool',
         'default' : 'false',
         'desc' : ['If enabled will perform all rendering on the API thread.',
                   'This is useful mainly for debugging purposes.'],
+        'category' : 'debug',
     }],
     ['DUMP_SHADER_IR', {
-        'type' : 'bool',
-        'default' : 'false',
-        'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'type' : 'bool',
+        'default' : 'false',
+        'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+        'category' : 'debug',
     }],
     ['USE_GENERIC_STORETILE', {
@@ -46,6 +42,7 @@ KNOBS = [
         'default' : 'false',
         'desc' : ['Always use generic function for performing StoreTile.',
                   'Will be slightly slower than using optimized (jitted) path'],
+        'category' : 'debug',
     }],
     ['FAST_CLEAR', {
@@ -53,6 +50,7 @@ KNOBS = [
         'default' : 'true',
         'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
                   'defer clear execution to first backend op on hottile, or hottile store'],
+        'category' : 'perf',
     }],
     ['MAX_NUMA_NODES', {
@@ -61,6 +59,7 @@ KNOBS = [
         'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
                   '  0 == ALL NUMA-nodes in the system',
                   '  N == Use at most N NUMA-nodes for rendering'],
+        'category' : 'perf',
     }],
     ['MAX_CORES_PER_NUMA_NODE', {
@@ -69,6 +68,7 @@ KNOBS = [
         'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
                   '  0 == ALL non-API thread cores per NUMA-node',
                   '  N == Use at most N cores per NUMA-node'],
+        'category' : 'perf',
     }],
     ['MAX_THREADS_PER_CORE', {
@@ -77,6 +77,7 @@ KNOBS = [
         'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
                   '  0 == ALL hyper-threads per core',
                   '  N == Use at most N hyper-threads per physical core'],
+        'category' : 'perf',
     }],
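The new 'category' field only tags each knob as debug- or perf-oriented for the generator; the generated C++ surface is unchanged, and every entry still becomes a KNOB_<NAME> global through DEFINE_KNOB (see the knobs.template hunk further down). A rough sketch of how such a macro could expand and be consumed; the expansion below is assumed, not taken from the generated header:

    #include <cstdint>

    // Assumed expansion: one global per knob, initialized to the table default.
    #define DEFINE_KNOB(name, type, defaultValue) type KNOB_##name = defaultValue

    DEFINE_KNOB(SINGLE_THREADED, bool, false);
    DEFINE_KNOB(MAX_NUMA_NODES, uint32_t, 0);

    // Typical consumption in driver code:
    // if (KNOB_SINGLE_THREADED) { /* run front and back end on the API thread */ }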
    ['MAX_WORKER_THREADS', {
@@ -87,6 +88,7 @@ KNOBS = [
                   'IMPORTANT: If this is non-zero, no worker threads will be bound to',
                   'specific HW threads.  They will all be "floating" SW threads.',
                   'In this case, the above 3 KNOBS will be ignored.'],
+        'category' : 'perf',
     }],
     ['BUCKETS_START_FRAME', {
@@ -96,6 +98,7 @@ KNOBS = [
                   '',
                   'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                   'for this to have an effect.'],
+        'category' : 'perf',
     }],
     ['BUCKETS_END_FRAME', {
@@ -105,6 +108,7 @@ KNOBS = [
                   '',
                   'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
                   'for this to have an effect.'],
+        'category' : 'perf',
     }],
     ['WORKER_SPIN_LOOP_COUNT', {
@@ -112,46 +116,32 @@ KNOBS = [
         'default' : '5000',
         'desc' : ['Number of spin-loop iterations worker threads will perform',
                   'before going to sleep when waiting for work'],
+        'category' : 'perf',
     }],
     ['MAX_DRAWS_IN_FLIGHT', {
         'type' : 'uint32_t',
-        'default' : '160',
+        'default' : '96',
         'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
+        'category' : 'perf',
     }],
     ['MAX_PRIMS_PER_DRAW', {
-        'type' : 'uint32_t',
-        'default' : '2040',
-        'desc' : ['Maximum primitives in a single Draw().',
+        'type' : 'uint32_t',
+        'default' : '2040',
+        'desc' : ['Maximum primitives in a single Draw().',
                   'Larger primitives are split into smaller Draw calls.',
                   'Should be a multiple of (3 * vectorWidth).'],
+        'category' : 'perf',
     }],
     ['MAX_TESS_PRIMS_PER_DRAW', {
-        'type' : 'uint32_t',
-        'default' : '16',
-        'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
+        'type' : 'uint32_t',
+        'default' : '16',
+        'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
                   'Larger primitives are split into smaller Draw calls.',
                   'Should be a multiple of (vectorWidth).'],
-    }],
-
-    ['MAX_FRAC_ODD_TESS_FACTOR', {
-        'type' : 'float',
-        'default' : '63.0f',
-        'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
-    }],
-
-    ['MAX_FRAC_EVEN_TESS_FACTOR', {
-        'type' : 'float',
-        'default' : '64.0f',
-        'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
-    }],
-
-    ['MAX_INTEGER_TESS_FACTOR', {
-        'type' : 'uint32_t',
-        'default' : '64',
-        'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+        'category' : 'perf',
     }],
@@ -159,12 +149,14 @@ KNOBS = [
         'type' : 'bool',
         'default' : 'false',
         'desc' : ['Enable threadviz output.'],
+        'category' : 'perf',
     }],
     ['TOSS_DRAW', {
         'type' : 'bool',
         'default' : 'false',
         'desc' : ['Disable per-draw/dispatch execution'],
+        'category' : 'perf',
     }],
     ['TOSS_QUEUE_FE', {
@@ -173,6 +165,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at worker FE',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_FETCH', {
@@ -181,6 +174,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at vertex fetch',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_IA', {
@@ -189,6 +183,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at input assembler',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_VS', {
@@ -197,6 +192,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at vertex shader',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_SETUP_TRIS', {
@@ -205,6 +201,7 @@ KNOBS = [
         'desc' : ['Stop per-draw execution at primitive setup',
                   '',
                   'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
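Two behavioral changes hide in the formatting churn above: MAX_DRAWS_IN_FLIGHT drops from 160 to 96, and the three (DEBUG) tessellation-factor knobs are deleted outright. Run through the template's DEFINE_KNOB expansion (shown in the knobs.template hunk below), the surviving entries would generate declarations along these lines (presumed generator output, shape assumed from the template):

    // Presumed generated declarations for the updated defaults:
    DEFINE_KNOB(MAX_DRAWS_IN_FLIGHT, uint32_t, 96);
    DEFINE_KNOB(MAX_PRIMS_PER_DRAW, uint32_t, 2040);
    DEFINE_KNOB(MAX_TESS_PRIMS_PER_DRAW, uint32_t, 16);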
    ['TOSS_BIN_TRIS', {
@@ -213,6 +210,7 @@ KNOBS = [
        'desc' : ['Stop per-draw execution at primitive binning',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+        'category' : 'perf',
     }],
     ['TOSS_RS', {
@@ -221,6 +219,5 @@ KNOBS = [
        'desc' : ['Stop per-draw execution at rasterizer',
                  '',
                  'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
-    }],
-
-]
+        'category' : 'perf',
+    }],]
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 922117e7e16..521346ca833 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -10,7 +10,7 @@ return ' '*(max_len - knob_len)
 %>/******************************************************************************
 *
-* Copyright 2015
+* Copyright 2015-2016
 * Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,11 @@ struct GlobalKnobs
     % for line in knob[1]['desc']:
     // ${line}
     % endfor
+    % if knob[1]['type'] == 'std::string':
+    DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}");
+    % else:
     DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+    % endif
     % endfor
     GlobalKnobs();
@@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
     str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
     % if knob[1]['type'] == 'bool':
     str << (KNOB_${knob[0]} ? "+\n" : "-\n");
-    % elif knob[1]['type'] != 'float':
+    % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
     str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
     str << std::dec << KNOB_${knob[0]} << "\n";
     % else:
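The template changes teach the generator about std::string knobs in two places: DEFINE_KNOB quotes the default (repr() yields a quoted Python literal, and [1:-1] strips those quotes so the escaped body can be re-wrapped in C++ double quotes), and ToString() skips the hex column, which only makes sense for integer knobs. A scalar model of the resulting ToString branching (a hand-written mirror, not generator output):

    #include <cstdint>
    #include <iomanip>
    #include <sstream>
    #include <string>

    // Integer knobs: hex column, then decimal (mirrors the non-float,
    // non-string branch in the template above).
    static void AppendKnob(std::ostringstream& s, const char* name, uint32_t v)
    {
        s << name << ": " << std::hex << std::setw(11) << std::left << v
          << std::dec << v << "\n";
    }

    // String (and float) knobs: plain operator<<; hex formatting would be
    // meaningless here, hence the extra type check in the template.
    static void AppendKnob(std::ostringstream& s, const char* name,
                           const std::string& v)
    {
        s << name << ": " << v << "\n";
    }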