Merge remote-tracking branch 'public/master' into vulkan

author: Jason Ekstrand <[email protected]> 2016-04-13 20:25:39 -0700
committer: Jason Ekstrand <[email protected]> 2016-04-13 20:25:39 -0700
commit: 12f88ba32a14ea79134f4e995a55149f078a2f27 (patch)
tree: 9070861dced23d0ad7dbec598bfd96b686eb7bf1 /src/gallium/drivers/swr/rasterizer
parent: 79fbec30fc16399ede9385ef52cb62cefbb388f4 (diff)
parent: 171a570f388b2895d14f6d5418c99573cffd6369 (diff)
19 files changed, 506 insertions, 210 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 5794f3f625a..180a0560822 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -30,10 +30,6 @@
 
 #define SWR_API __cdecl
 
-#ifndef _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_WARNINGS
-#endif
-
 #ifndef NOMINMAX
 #define NOMINMAX
 #endif
@@ -52,7 +48,6 @@
 
 #define PRAGMA_WARNING_POP() __pragma(warning(pop))
 
-#if defined(_WIN32)
 #if defined(_WIN64)
 #define BitScanReverseSizeT BitScanReverse64
 #define BitScanForwardSizeT BitScanForward64
@@ -62,7 +57,6 @@
 #define BitScanForwardSizeT BitScanForward
 #define _mm_popcount_sizeT _mm_popcnt_u32
 #endif
-#endif
 
 #elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
 
@@ -199,9 +193,7 @@ typedef KILOBYTE    MEGABYTE[1024];
 typedef MEGABYTE    GIGABYTE[1024];
 
 #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
-#if KNOB_SIMD_WIDTH == 8
-#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32)
-#endif
+#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
 
 #include "common/swr_assert.h"
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index f0f7956b590..ca9cfdb629e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -29,10 +29,12 @@
 #include <cfloat>
 #include <cmath>
 #include <cstdio>
+#include <new>
 
 #include "core/api.h"
 #include "core/backend.h"
 #include "core/context.h"
+#include "core/depthstencil.h"
 #include "core/frontend.h"
 #include "core/rasterizer.h"
 #include "core/rdtsc_core.h"
@@ -64,11 +66,14 @@ HANDLE SwrCreateContext(
     pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
     pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
 
+    pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+    pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+
     for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
     {
         pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
-        pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
-        pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+        new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+        new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
 
         pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
     }
@@ -86,15 +91,26 @@ HANDLE SwrCreateContext(
     // Calling createThreadPool() above can set SINGLE_THREADED
     if (KNOB_SINGLE_THREADED)
     {
+        SET_KNOB(HYPERTHREADED_FE, false);
         pContext->NumWorkerThreads = 1;
+        pContext->NumFEThreads = 1;
+        pContext->NumBEThreads = 1;
     }
 
     // Allocate scratch space for workers.
     ///@note We could lazily allocate this but its rather small amount of memory.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
-        ///@todo Use numa API for allocations using numa information from thread data (if exists).
-        pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
+#if defined(_WIN32)
+        uint32_t numaNode = pContext->threadPool.pThreadData ?
+            pContext->threadPool.pThreadData[i].numaId : 0;
+        pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+            GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
+            MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
+            numaNode);
+#else
+        pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+#endif
     }
 
     // State setup AFTER context is fully initialized
@@ -131,14 +147,21 @@ void SwrDestroyContext(HANDLE hContext)
     {
         delete pContext->dcRing[i].pArena;
         delete pContext->dsRing[i].pArena;
-        delete(pContext->dcRing[i].pTileMgr);
-        delete(pContext->dcRing[i].pDispatch);
+        pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+        pContext->pDispatchQueueArray[i].~DispatchQueue();
     }
 
+    _aligned_free(pContext->pDispatchQueueArray);
+    _aligned_free(pContext->pMacroTileManagerArray);
+
     // Free scratch space.
     for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
     {
+#if defined(_WIN32)
+        VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
+#else
         _aligned_free(pContext->pScratch[i]);
+#endif
     }
 
     delete(pContext->pHotTileMgr);
@@ -160,12 +183,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
 template<bool IsDraw>
 void QueueWork(SWR_CONTEXT *pContext)
 {
+    DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+    if (IsDraw)
+    {
+        pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+        pDC->pTileMgr->initialize();
+    }
+
     // Each worker thread looks at a DC for both FE and BE work at different times and so we
     // multiply threadDone by 2.  When the threadDone counter has reached 0 then all workers
     // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
     // then moved on if all work is done.)
-    pContext->pCurDrawContext->threadsDone =
-        pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
+    pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
 
     _ReadWriteBarrier();
     {
@@ -183,7 +214,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         {
             static TileSet lockedTiles;
             uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
-            WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+            WorkOnFifoFE(pContext, 0, curDraw[0]);
             WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
         }
         else
@@ -232,7 +263,20 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
             _mm_pause();
         }
 
-        uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+        uint64_t curDraw = pContext->dcRing.GetHead();
+        uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+        static uint64_t lastDrawChecked;
+        static uint32_t lastFrameChecked;
+        if ((pContext->frameCount - lastFrameChecked) > 2 ||
+            (curDraw - lastDrawChecked) > 0x10000)
+        {
+            // Take this opportunity to clean-up old arena allocations
+            pContext->cachingArenaAllocator.FreeOldBlocks();
+
+            lastFrameChecked = pContext->frameCount;
+            lastDrawChecked = curDraw;
+        }
 
         DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
         pContext->pCurDrawContext = pCurDrawContext;
@@ -284,8 +328,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
         pCurDrawContext->FeLock = 0;
         pCurDrawContext->threadsDone = 0;
 
-        pCurDrawContext->pTileMgr->initialize();
-
         // Assign unique drawId for this DC
         pCurDrawContext->drawId = pContext->dcRing.GetHead();
 
@@ -872,6 +914,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
                  !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
         }
     }
+
+    // Setup depth quantization function
+    if (pState->state.depthHottileEnable)
+    {
+        switch (pState->state.rastState.depthFormat)
+        {
+        case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break;
+        case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break;
+        case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break;
+        case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break;
+        default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion.");
+            pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
+        }
+    }
+    else
+    {
+        // set up pass-through quantize if depth isn't enabled
+        pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ;
+    }
 }
 
 //////////////////////////////////////////////////////////////////////////
@@ -1029,9 +1090,9 @@ void DrawInstanced(
     SWR_CONTEXT *pContext = GetContext(hContext);
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
 
-    int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
+    uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
-    int32_t remainingVerts = numVertices;
+    uint32_t remainingVerts = numVertices;
 
     API_STATE    *pState = &pDC->pState->state;
     pState->topology = topology;
@@ -1149,9 +1210,9 @@ void DrawIndexedInstance(
     DRAW_CONTEXT* pDC = GetDrawContext(pContext);
     API_STATE* pState = &pDC->pState->state;
 
-    int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
+    uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
     uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
-    int32_t remainingIndices = numIndices;
+    uint32_t remainingIndices = numIndices;
 
     uint32_t indexSize = 0;
     switch (pState->indexBuffer.format)
@@ -1334,9 +1395,6 @@ void SwrDispatch(
 
     pDC->isCompute = true;      // This is a compute context.
 
-    // Ensure spill fill pointers are initialized to nullptr.
-    memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
-
     COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
 
     pTaskData->threadGroupCountX = threadGroupCountX;
@@ -1344,6 +1402,8 @@ void SwrDispatch(
     pTaskData->threadGroupCountZ = threadGroupCountZ;
 
     uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+    uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+    pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
     pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
 
     QueueDispatch(pContext);
@@ -1497,4 +1557,6 @@ void SWR_API SwrEndFrame(
     HANDLE hContext)
 {
     RDTSC_ENDFRAME();
+    SWR_CONTEXT *pContext = GetContext(hContext);
+    pContext->frameCount++;
 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 67d81a44347..64184e16865 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -65,69 +65,41 @@ static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
 template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
 struct CachingAllocatorT : DefaultAllocator
 {
-    static uint32_t GetBucketId(size_t blockSize)
-    {
-        uint32_t bucketId = 0;
-
-#if defined(BitScanReverseSizeT)
-        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
-        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
-#endif
-
-        return bucketId;
-    }
-
     void* AllocateAligned(size_t size, size_t align)
     {
         SWR_ASSERT(size >= sizeof(ArenaBlock));
         SWR_ASSERT(size <= uint32_t(-1));
 
         size_t blockSize = size - ARENA_BLOCK_ALIGN;
+        uint32_t bucket = GetBucketId(blockSize);
 
         {
             // search cached blocks
             std::lock_guard<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
-            ArenaBlock* pBlock = pPrevBlock->pNext;
-            ArenaBlock* pPotentialBlock = nullptr;
-            ArenaBlock* pPotentialPrev = nullptr;
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
+            ArenaBlock* pBlock = SearchBlocks(pPrevBlock, blockSize, align);
 
-            while (pBlock)
+            if (pBlock)
             {
-                if (pBlock->blockSize >= blockSize)
-                {
-                    if (pBlock == AlignUp(pBlock, align))
-                    {
-                        if (pBlock->blockSize == blockSize)
-                        {
-                            // Won't find a better match
-                            break;
-                        }
-
-                        // We could use this as it is larger than we wanted, but
-                        // continue to search for a better match
-                        pPotentialBlock = pBlock;
-                        pPotentialPrev = pPrevBlock;
-                    }
-                }
-                else
+                m_cachedSize -= pBlock->blockSize;
+                if (pBlock == m_pLastCachedBlocks[bucket])
                 {
-                    // Blocks are sorted by size (biggest first)
-                    // So, if we get here, there are no blocks 
-                    // large enough, fall through to allocation.
-                    pBlock = nullptr;
-                    break;
+                    m_pLastCachedBlocks[bucket] = pPrevBlock;
                 }
-
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
             }
-
-            if (!pBlock)
+            else
             {
-                // Couldn't find an exact match, use next biggest size
-                pBlock = pPotentialBlock;
-                pPrevBlock = pPotentialPrev;
+                pPrevBlock = &m_oldCachedBlocks[bucket];
+                pBlock = SearchBlocks(pPrevBlock, blockSize, align);
+                // Hit in the old list: adjust the OLD list's size and tail pointer, never the current list's.
+                if (pBlock)
+                {
+                    m_oldCachedSize -= pBlock->blockSize;
+                    if (pBlock == m_pOldLastCachedBlocks[bucket])
+                    {
+                        m_pOldLastCachedBlocks[bucket] = pPrevBlock;
+                    }
+                }
             }
 
             if (pBlock)
@@ -154,7 +126,7 @@ struct CachingAllocatorT : DefaultAllocator
         return this->DefaultAllocator::AllocateAligned(size, align);
     }
 
-    void  Free(void* pMem)
+    void Free(void* pMem)
     {
         if (pMem)
         {
@@ -162,24 +134,57 @@ struct CachingAllocatorT : DefaultAllocator
             SWR_ASSERT(pNewBlock->blockSize >= 0);
 
             std::unique_lock<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
-            ArenaBlock* pBlock = pPrevBlock->pNext;
+            InsertCachedBlock(GetBucketId(pNewBlock->blockSize), pNewBlock);
+        }
+    }
 
-            while (pBlock)
+    void FreeOldBlocks()
+    {
+        if (!m_cachedSize) { return; }
+        std::lock_guard<std::mutex> l(m_mutex);
+
+        bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
+
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            if (doFree)
             {
-                if (pNewBlock->blockSize >= pBlock->blockSize)
+                ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
+                while (pBlock)
                 {
-                    // Insert here
-                    break;
+                    ArenaBlock* pNext = pBlock->pNext;
+                    m_oldCachedSize -= pBlock->blockSize;
+                    m_totalAllocated -= (pBlock->blockSize + ARENA_BLOCK_ALIGN);
+                    this->DefaultAllocator::Free(pBlock);
+                    pBlock = pNext;
                 }
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
+                m_oldCachedBlocks[i].pNext = nullptr;
+                m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
             }
 
-            // Insert into list
-            SWR_ASSERT(pPrevBlock);
-            pPrevBlock->pNext = pNewBlock;
-            pNewBlock->pNext = pBlock;
+            if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
+            {
+                m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
+                m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext;
+                m_cachedBlocks[i].pNext = nullptr;
+                if (m_pOldLastCachedBlocks[i]->pNext)
+                {
+                    m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
+                }
+                m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
+            }
+        }
+
+        m_oldCachedSize += m_cachedSize;
+        m_cachedSize = 0;
+    }
+
+    CachingAllocatorT()
+    {
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
+            m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
         }
     }
 
@@ -195,21 +200,126 @@ struct CachingAllocatorT : DefaultAllocator
                 this->DefaultAllocator::Free(pBlock);
                 pBlock = pNext;
             }
+            pBlock = m_oldCachedBlocks[i].pNext;
+            while (pBlock)
+            {
+                ArenaBlock* pNext = pBlock->pNext;
+                this->DefaultAllocator::Free(pBlock);
+                pBlock = pNext;
+            }
         }
     }
 
+private:
+    static uint32_t GetBucketId(size_t blockSize)
+    {
+        uint32_t bucketId = 0;
+
+#if defined(BitScanReverseSizeT)
+        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+        bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
+
+        return bucketId;
+    }
+    // Links pNewBlock into m_cachedBlocks[bucketId], keeping the list ordered largest blockSize first.
+    void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock)
+    {
+        SWR_ASSERT(bucketId < CACHE_NUM_BUCKETS);
+
+        ArenaBlock* pPrevBlock = &m_cachedBlocks[bucketId];
+        ArenaBlock* pBlock = pPrevBlock->pNext;
+        // Walk until the first entry that is not larger than the new block (or the end of the list).
+        while (pBlock)
+        {
+            if (pNewBlock->blockSize >= pBlock->blockSize)
+            {
+                // Insert here
+                break;
+            }
+            pPrevBlock = pBlock;
+            pBlock = pBlock->pNext;
+        }
+
+        // Insert into list
+        SWR_ASSERT(pPrevBlock);
+        pPrevBlock->pNext = pNewBlock;
+        pNewBlock->pNext = pBlock;
+        // If the new block was linked right after the recorded tail, it becomes the new tail.
+        if (m_pLastCachedBlocks[bucketId] == pPrevBlock)
+        {
+            m_pLastCachedBlocks[bucketId] = pNewBlock;
+        }
+        // Add the block's size to the running total for the current (not old) cache lists.
+        m_cachedSize += pNewBlock->blockSize;
+    }
+    // Returns the best-fitting block after pPrevBlock (or nullptr); pPrevBlock is updated to that block's predecessor.
+    static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
+    {
+        ArenaBlock* pBlock = pPrevBlock->pNext;
+        ArenaBlock* pPotentialBlock = nullptr;
+        ArenaBlock* pPotentialPrev = nullptr;
+        // An exact-size, aligned block ends the search; larger aligned blocks are remembered as fallbacks.
+        while (pBlock)
+        {
+            if (pBlock->blockSize >= blockSize)
+            {
+                if (pBlock == AlignUp(pBlock, align))
+                {
+                    if (pBlock->blockSize == blockSize)
+                    {
+                        // Won't find a better match
+                        break;
+                    }
+
+                    // We could use this as it is larger than we wanted, but
+                    // continue to search for a better match
+                    pPotentialBlock = pBlock;
+                    pPotentialPrev = pPrevBlock;
+                }
+            }
+            else
+            {
+                // Blocks are sorted by size (biggest first)
+                // So, if we get here, there are no blocks 
+                // large enough, fall through to allocation.
+                pBlock = nullptr;
+                break;
+            }
+
+            pPrevBlock = pBlock;
+            pBlock = pBlock->pNext;
+        }
+        // On a miss both pBlock and pPrevBlock end up nullptr, so callers must not reuse pPrevBlock.
+        if (!pBlock)
+        {
+            // Couldn't find an exact match, use next biggest size
+            pBlock = pPotentialBlock;
+            pPrevBlock = pPotentialPrev;
+        }
+
+        return pBlock;
+    }
+
     // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
     static const uint32_t   CACHE_NUM_BUCKETS       = NumBucketsT;
     static const uint32_t   CACHE_START_BUCKET_BIT  = StartBucketBitT;
+    static const size_t     MAX_UNUSED_SIZE         = 20 * sizeof(MEGABYTE);
 
     ArenaBlock              m_cachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock*             m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock              m_oldCachedBlocks[CACHE_NUM_BUCKETS];
+    ArenaBlock*             m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
     std::mutex              m_mutex;
 
     size_t                  m_totalAllocated = 0;
+
+    size_t                  m_cachedSize = 0;
+    size_t                  m_oldCachedSize = 0;
 };
 typedef CachingAllocatorT<> CachingAllocator;
 
-template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
+template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
 class TArena
 {
 public:
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 7fb83edf169..b2d3d9ef4f4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
 /// @param pDC - pointer to draw context (dispatch).
 /// @param workerId - The unique worker ID that is assigned to this thread.
 /// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
 {
     RDTSC_START(BEDispatch);
 
@@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     SWR_ASSERT(pTaskData != nullptr);
 
     // Ensure spill fill memory has been allocated.
-    if (pDC->pSpillFill[workerId] == nullptr)
+    if (pSpillFillBuffer == nullptr)
     {
         ///@todo Add state which indicates the spill fill size.
-        pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
+        pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
     }
 
     const API_STATE& state = GetApiState(pDC);
@@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
     csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
     csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
     csContext.pTGSM = pContext->pScratch[workerId];
-    csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+    csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
 
     state.pfnCsFunc(GetPrivateState(pDC), &csContext);
 
@@ -772,8 +772,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                     psContext.vOneOverW.centroid = psContext.vOneOverW.center;
                 }
 
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 simdmask clipCoverageMask = coverageMask & MASK;
@@ -793,7 +795,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 if(CanEarlyZ(pPSState))
                 {
                     RDTSC_START(BEEarlyDepthTest);
-                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -825,7 +827,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
                 if(!CanEarlyZ(pPSState))
                 {
                     RDTSC_START(BELateDepthTest);
-                    depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                                         psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
                     RDTSC_STOP(BELateDepthTest, 0, 0);
 
@@ -977,8 +979,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
 
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
@@ -1000,7 +1003,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     if (CanEarlyZ(pPSState))
                     {
                         RDTSC_START(BEEarlyDepthTest);
-                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -1033,7 +1036,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
                     if (!CanEarlyZ(pPSState))
                     {
                         RDTSC_START(BELateDepthTest);
-                        depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                        depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                                               psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                         RDTSC_STOP(BELateDepthTest, 0, 0);
 
@@ -1200,8 +1203,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 RDTSC_START(BEBarycentric);
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
 
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 // execute pixel shader
@@ -1263,10 +1267,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                     // calc I & J per sample
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     if (!pPSState->writesODepth)
                     {
                         vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                        vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
                     }
                     
                     ///@todo: perspective correct vs non-perspective correct clipping?
@@ -1292,7 +1297,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
                 // ZTest for this sample
                 RDTSC_START(BEEarlyDepthTest);
                 stencilPassMask[sample] = vCoverageMask[sample];
-                depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
                                         vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
                 RDTSC_STOP(BEEarlyDepthTest, 0, 0);
 
@@ -1308,8 +1313,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
             {
                 RDTSC_START(BEBarycentric);
                 backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
-                // interpolate z
+                // interpolate and quantize z
                 psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+                psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
                 RDTSC_STOP(BEBarycentric, 0, 0);
 
                 // execute pixel shader
@@ -1463,8 +1469,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
 
                     backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
 
-                    // interpolate z
+                    // interpolate and quantize z
                     psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+                    psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
 
                     RDTSC_STOP(BEBarycentric, 0, 0);
 
@@ -1483,7 +1490,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
                     uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
 
                     RDTSC_START(BEEarlyDepthTest);
-                    simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+                    simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
                         psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
                     DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
                         pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 2fa18953cad..d0626b997af 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -32,7 +32,7 @@
 #include "core/context.h"
 #include "core/multisample.h"
 
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
 void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
 void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 3a2a8b35be8..e624fd8f674 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -162,8 +162,8 @@ int ClipTriToPlane( const float *pInPts, int numInPts,
 void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
 {
     // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping
-    OSALIGN(float, 16) tempPts[6 * 4];
-    OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
+    OSALIGNSIMD(float) tempPts[6 * 4];
+    OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
 
     // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
     int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index ba5870a92bb..67a4c4f47bb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -265,8 +265,8 @@ public:
     // clip a single primitive
     int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
     {
-        OSALIGN(float, 16) inVerts[3 * 4];
-        OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
+        OSALIGNSIMD(float) inVerts[3 * 4];
+        OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
 
         // transpose primitive position
         __m128 verts[3];
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 39f23372a18..6464aa20af7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -308,6 +308,8 @@ OSALIGNLINE(struct) API_STATE
         uint32_t depthHottileEnable: 1;
         uint32_t stencilHottileEnable : 1;
     };
+
+    PFN_QUANTIZE_DEPTH      pfnQuantizeDepth;
 };
 
 class MacroTileMgr;
@@ -380,32 +382,29 @@ struct DRAW_STATE
 //    This draw context maintains all of the state needed for the draw operation.
 struct DRAW_CONTEXT
 {
-    SWR_CONTEXT *pContext;
-
-    uint64_t drawId;
-
-    bool isCompute;    // Is this DC a compute context?
-
-    FE_WORK FeWork;
-    volatile OSALIGNLINE(uint32_t) FeLock;
-    volatile OSALIGNLINE(bool) doneFE;    // Is FE work done for this draw?
-    volatile OSALIGNLINE(int64_t) threadsDone;
-
-    uint64_t dependency;
-
-    MacroTileMgr* pTileMgr;
-
-    // The following fields are valid if isCompute is true.
-    DispatchQueue* pDispatch;               // Queue for thread groups. (isCompute)
+    SWR_CONTEXT*    pContext;
+    uint64_t        drawId;
+    union
+    {
+        MacroTileMgr*   pTileMgr;
+        DispatchQueue*  pDispatch;      // Queue for thread groups. (isCompute)
+    };
+    uint64_t        dependency;
+    DRAW_STATE*     pState;
+    CachingArena*   pArena;
 
-    DRAW_STATE* pState;
-    CachingArena* pArena;
+    bool            isCompute;      // Is this DC a compute context?
+    bool            cleanupState;   // True if this is the last draw using an entry in the state ring.
+    volatile bool   doneFE;         // Is FE work done for this draw?
 
-    uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS];  // Scratch space used for spill fills.
+    FE_WORK         FeWork;
 
-    bool  cleanupState; // True if this is the last draw using an entry in the state ring.
+    volatile OSALIGNLINE(uint32_t)   FeLock;
+    volatile int64_t    threadsDone;
 };
 
+static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
+
 INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
 {
     SWR_ASSERT(pDC != nullptr);
@@ -447,6 +446,9 @@ struct SWR_CONTEXT
     DRAW_CONTEXT *pCurDrawContext;    // This points to DC entry in ring for an unsubmitted draw.
     DRAW_CONTEXT *pPrevDrawContext;   // This points to DC entry for the previous context submitted that we can copy state from.
 
+    MacroTileMgr* pMacroTileManagerArray;
+    DispatchQueue* pDispatchQueueArray;
+
     // Draw State Ring
     //  When draw are very large (lots of primitives) then the API thread will break these up.
     //  These split draws all have identical state. So instead of storing the state directly
@@ -457,6 +459,8 @@ struct SWR_CONTEXT
     uint32_t curStateId;               // Current index to the next available entry in the DS ring.
 
     uint32_t NumWorkerThreads;
+    uint32_t NumFEThreads;
+    uint32_t NumBEThreads;
 
     THREAD_POOL threadPool; // Thread pool associated with this context
 
@@ -481,6 +485,7 @@ struct SWR_CONTEXT
     uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
 
     CachingAllocator cachingArenaAllocator;
+    uint32_t frameCount;
 };
 
 void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 2cc9d4054ac..7b55580bf0a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -80,14 +80,52 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
 }
 
 
+template<SWR_FORMAT depthFormatT>
+simdscalar QuantizeDepth(simdscalar depth)
+{
+    SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
+    uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
+
+    if (depthType == SWR_TYPE_FLOAT)
+    {
+        // assume only 32bit float depth supported
+        SWR_ASSERT(depthBpc == 32);
+
+        // matches shader precision, no quantizing needed
+        return depth;
+    }
+
+    // should be unorm depth if not float
+    SWR_ASSERT(depthType == SWR_TYPE_UNORM);
+
+    float quantize = (float)((1 << depthBpc) - 1);
+    simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));
+    result = _simd_add_ps(result, _simd_set1_ps(0.5f));
+    result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
+    
+    if (depthBpc > 16)
+    {
+        result = _simd_div_ps(result, _simd_set1_ps(quantize));
+    }
+    else
+    {
+        result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));
+    }
+
+    return result;
+}
+
 INLINE
-simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
+simdscalar DepthStencilTest(const API_STATE* pState,
                  bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
                  simdscalar* pStencilMask)
 {
     static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
     static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
 
+    const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
+    const SWR_VIEWPORT* pViewport = &pState->vp[0];
+
     simdscalar depthResult = _simd_set1_ps(-1.0f);
     simdscalar zbuf;
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 36721e00beb..93869610ff9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -793,8 +793,14 @@ static void GeometryShaderStage(
             uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
             
             DWORD numAttribs;
-            _BitScanReverse(&numAttribs, state.feAttribMask);
-            numAttribs++;
+            if (_BitScanReverse(&numAttribs, state.feAttribMask))
+            {
+                numAttribs++;
+            }
+            else
+            {
+                numAttribs = 0;
+            }
 
             for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
             {
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index d7feb86273d..55a22a67f4c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -45,14 +45,17 @@
 #define KNOB_ARCH_ISA AVX
 #define KNOB_ARCH_STR "AVX"
 #define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
 #elif (KNOB_ARCH == KNOB_ARCH_AVX2)
 #define KNOB_ARCH_ISA AVX2
 #define KNOB_ARCH_STR "AVX2"
 #define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
 #elif (KNOB_ARCH == KNOB_ARCH_AVX512)
 #define KNOB_ARCH_ISA AVX512F
 #define KNOB_ARCH_STR "AVX512"
 #define KNOB_SIMD_WIDTH 16
+#define KNOB_SIMD_BYTES 64
 #error "AVX512 not yet supported"
 #else
 #error "Unknown architecture"
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index f8f1a33b7e3..17f488538d6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -1017,13 +1017,13 @@ struct PA_TESS : PA_STATE
     {
         SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
 #if KNOB_SIMD_WIDTH == 8
-        static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
+        static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
         {
             -1, -1, -1, -1, -1, -1, -1, -1,
              0,  0,  0,  0,  0,  0,  0,  0
         };
 #elif KNOB_SIMD_WIDTH == 16
-        static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
+        static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
         {
             -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
              0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0
@@ -1167,8 +1167,14 @@ struct PA_FACTORY
         {
             memset(&indexStore, 0, sizeof(indexStore));
             DWORD numAttribs;
-            _BitScanReverse(&numAttribs, state.feAttribMask);
-            numAttribs++;
+            if (_BitScanReverse(&numAttribs, state.feAttribMask))
+            {
+                numAttribs++;
+            }
+            else
+            {
+                numAttribs = 0;
+            }
             new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, 
                 &this->indexStore[0], numVerts, numAttribs, state.topology, false);
             cutPA = true;
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 52fb7c88cdd..3144a901c91 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -383,7 +383,7 @@ __declspec(thread) volatile uint64_t gToss;
 
 static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
 // try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
+static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
 
 INLINE
 void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
@@ -439,7 +439,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     const SWR_RASTSTATE &rastState = state.rastState;
     const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
 
-    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
     triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
 
     __m128 vX, vY, vZ, vRecipW;
@@ -502,7 +502,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
     _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
 
-    OSALIGN(float, 16) oneOverW[4];
+    OSALIGNSIMD(float) oneOverW[4];
     _mm_store_ps(oneOverW, vRecipW);
     triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
     triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
@@ -537,7 +537,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
 
     // compute bary Z
     // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
-    OSALIGN(float, 16) a[4];
+    OSALIGNSIMD(float) a[4];
     _mm_store_ps(a, vZ);
     triDesc.Z[0] = a[0] - a[2];
     triDesc.Z[1] = a[1] - a[2];
@@ -575,7 +575,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     }
 
     // Calc bounding box of triangle
-    OSALIGN(BBOX, 16) bbox;
+    OSALIGNSIMD(BBOX) bbox;
     calcBoundingBoxInt(vXi, vYi, bbox);
 
     // Intersect with scissor/viewport
@@ -594,7 +594,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
     int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
     int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
 
-    OSALIGN(BBOX, 16) intersect;
+    OSALIGNSIMD(BBOX) intersect;
     intersect.left   = std::max(bbox.left, macroBoxLeft);
     intersect.top    = std::max(bbox.top, macroBoxTop);
     intersect.right  = std::min(bbox.right, macroBoxRight);
@@ -1047,7 +1047,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi
         { 50, 51, 54, 55, 58, 59, 62, 63 }
     };
 
-    OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+    OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
 
     // pull point information from triangle buffer
     // @todo use structs for readability
@@ -1286,7 +1286,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
     // make sure this macrotile intersects the triangle
     __m128i vXai = fpToFixedPoint(vXa);
     __m128i vYai = fpToFixedPoint(vYa);
-    OSALIGN(BBOX, 16) bboxA;
+    OSALIGNSIMD(BBOX) bboxA;
     calcBoundingBoxInt(vXai, vYai, bboxA);
 
     if (!(bboxA.left > macroBoxRight ||
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 5752094ca10..50361068025 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -790,6 +790,7 @@ typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
 typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
 typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
+typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar);
 
 //////////////////////////////////////////////////////////////////////////
 /// FRONTEND_STATE
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 07bc94a1a54..4b7a207f366 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -68,7 +68,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
 
 #if defined(_WIN32)
 
-    SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
+    static std::mutex m;
+    std::lock_guard<std::mutex> l(m);
+
+    static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
     DWORD bufSize = sizeof(buffer);
 
     BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
@@ -288,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     {
         // Cleanup memory allocations
         pDC->pArena->Reset(true);
-        pDC->pTileMgr->initialize();
+        if (!pDC->isCompute)
+        {
+            pDC->pTileMgr->initialize();
+        }
         if (pDC->cleanupState)
         {
             pDC->pState->pArena->Reset(true);
@@ -302,10 +308,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
     return result;
 }
 
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
 {
     // increment our current draw id to the first incomplete draw
-    uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+    drawEnqueued = GetEnqueuedDraw(pContext);
     while (curDrawBE < drawEnqueued)
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -313,8 +319,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
         // If its not compute and FE is not done then break out of loop.
         if (!pDC->doneFE && !pDC->isCompute) break;
 
-        bool isWorkComplete = (pDC->isCompute) ?
-            pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+        bool isWorkComplete = pDC->isCompute ?
+            pDC->pDispatch->isWorkComplete() :
+            pDC->pTileMgr->isWorkComplete();
 
         if (isWorkComplete)
         {
@@ -355,7 +362,8 @@ void WorkOnFifoBE(
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
-    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    uint64_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
     {
         return;
     }
@@ -370,7 +378,7 @@ void WorkOnFifoBE(
     //   2. If we're trying to work on draws after curDrawBE, we are restricted to 
     //      working on those macrotiles that are known to be complete in the prior draw to
     //      maintain order. The locked tiles provides the history to ensures this.
-    for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
     {
         DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
 
@@ -463,7 +471,7 @@ void WorkOnFifoBE(
     }
 }
 
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
 {
     // Try to grab the next DC from the ring
     uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -516,38 +524,44 @@ void WorkOnCompute(
     uint32_t workerId,
     uint64_t& curDrawBE)
 {
-    if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+    uint64_t drawEnqueued = 0;
+    if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
     {
         return;
     }
 
     uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
 
-    DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
-    if (pDC->isCompute == false) return;
-
-    // check dependencies
-    if (CheckDependency(pContext, pDC, lastRetiredDraw))
+    for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)   // bound on i (as WorkOnFifoBE does); curDrawBE never changes in this loop
     {
-        return;
-    }
+        DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+        if (pDC->isCompute == false) return;
 
-    SWR_ASSERT(pDC->pDispatch != nullptr);
-    DispatchQueue& queue = *pDC->pDispatch;
+        // check dependencies
+        if (CheckDependency(pContext, pDC, lastRetiredDraw))
+        {
+            return;
+        }
 
-    // Is there any work remaining?
-    if (queue.getNumQueued() > 0)
-    {
-        uint32_t threadGroupId = 0;
-        while (queue.getWork(threadGroupId))
+        SWR_ASSERT(pDC->pDispatch != nullptr);
+        DispatchQueue& queue = *pDC->pDispatch;
+
+        // Is there any work remaining?
+        if (queue.getNumQueued() > 0)
         {
-            ProcessComputeBE(pDC, workerId, threadGroupId);
+            void* pSpillFillBuffer = nullptr;
+            uint32_t threadGroupId = 0;
+            while (queue.getWork(threadGroupId))
+            {
+                ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
 
-            queue.finishedWork();
+                queue.finishedWork();
+            }
         }
     }
 }
 
+template<bool IsFEThread, bool IsBEThread>
 DWORD workerThreadMain(LPVOID pData)
 {
     THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
@@ -631,25 +645,38 @@ DWORD workerThreadMain(LPVOID pData)
             }
         }
 
-        RDTSC_START(WorkerWorkOnFifoBE);
-        WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
-        RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+        if (IsBEThread)
+        {
+            RDTSC_START(WorkerWorkOnFifoBE);
+            WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+            RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
 
-        WorkOnCompute(pContext, workerId, curDrawBE);
+            WorkOnCompute(pContext, workerId, curDrawBE);
+        }
 
-        WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
+        if (IsFEThread)
+        {
+            WorkOnFifoFE(pContext, workerId, curDrawFE);
+
+            if (!IsBEThread)
+            {
+                curDrawBE = curDrawFE;
+            }
+        }
     }
 
     return 0;
 }
+template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
 
+template <bool IsFEThread, bool IsBEThread>
 DWORD workerThreadInit(LPVOID pData)
 {
 #if defined(_WIN32)
     __try
 #endif // _WIN32
     {
-        return workerThreadMain(pData);
+        return workerThreadMain<IsFEThread, IsBEThread>(pData);
     }
 
 #if defined(_WIN32)
@@ -661,6 +688,7 @@ DWORD workerThreadInit(LPVOID pData)
 
     return 1;
 }
+template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
 
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
 {
@@ -678,6 +706,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
     uint32_t numCoresPerNode    = numHWCoresPerNode;
     uint32_t numHyperThreads    = numHWHyperThreads;
 
+    if (KNOB_MAX_WORKER_THREADS)
+    {
+        SET_KNOB(HYPERTHREADED_FE, false);
+    }
+
+    if (KNOB_HYPERTHREADED_FE)
+    {
+        SET_KNOB(MAX_THREADS_PER_CORE, 0);
+    }
+
     if (KNOB_MAX_NUMA_NODES)
     {
         numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
@@ -693,6 +731,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
     }
 
+    if (numHyperThreads < 2)
+    {
+        SET_KNOB(HYPERTHREADED_FE, false);
+    }
+
     // Calculate numThreads
     uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
 
@@ -767,9 +810,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
             pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
             pPool->pThreadData[workerId].threadId = 0;
             pPool->pThreadData[workerId].numaId = 0;
+            pPool->pThreadData[workerId].coreId = 0;
+            pPool->pThreadData[workerId].htId = 0;
             pPool->pThreadData[workerId].pContext = pContext;
             pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
-            pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+            pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+
+            pContext->NumBEThreads++;
+            pContext->NumFEThreads++;
         }
     }
     else
@@ -780,6 +828,10 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
         for (uint32_t n = 0; n < numNodes; ++n)
         {
             auto& node = nodes[n];
+            if (node.cores.size() == 0)
+            {
+               continue;
+            }
 
             uint32_t numCores = numCoresPerNode;
             for (uint32_t c = 0; c < numCores; ++c)
@@ -797,8 +849,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
                     pPool->pThreadData[workerId].procGroupId = core.procGroup;
                     pPool->pThreadData[workerId].threadId = core.threadIds[t];
                     pPool->pThreadData[workerId].numaId = n;
+                    pPool->pThreadData[workerId].coreId = c;
+                    pPool->pThreadData[workerId].htId = t;
                     pPool->pThreadData[workerId].pContext = pContext;
-                    pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+
+                    if (KNOB_HYPERTHREADED_FE)
+                    {
+                        if (t == 0)
+                        {
+                            pContext->NumBEThreads++;
+                            pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
+                        }
+                        else
+                        {
+                            pContext->NumFEThreads++;
+                            pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
+                        }
+                    }
+                    else
+                    {
+                        pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+                        pContext->NumBEThreads++;
+                        pContext->NumFEThreads++;
+                    }
 
                     ++workerId;
                 }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 821d7dcb16e..3aba6323a95 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -41,6 +41,8 @@ struct THREAD_DATA
     uint32_t procGroupId;   // Will always be 0 for non-Windows OS
     uint32_t threadId;      // within the procGroup for Windows
     uint32_t numaId;        // NUMA node id
+    uint32_t coreId;        // Core id
+    uint32_t htId;          // Hyperthread id
     uint32_t workerId;
     SWR_CONTEXT *pContext;
     bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
@@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
 void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
 int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
 \ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 794577270cf..87d9f42c032 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -35,27 +35,6 @@
 
 #define TILE_ID(x,y) ((x << 16 | y))
 
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void MacroTileMgr::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
-void* DispatchQueue::operator new(size_t size)
-{
-    return _aligned_malloc(size, 64);
-}
-
-void DispatchQueue::operator delete(void *p)
-{
-    _aligned_free(p);
-}
-
 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
@@ -304,7 +283,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
 void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
 {
     const API_STATE& state = GetApiState(pDC);
-    HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
 
     uint32_t x, y;
     MacroTileMgr::getTileIndices(macroID, x, y);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index aa561badc1c..82a15e16a33 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -140,9 +140,6 @@ public:
         x = (tileID >> 16) & 0xffff;
     }
 
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
 private:
     CachingArena& mArena;
     std::unordered_map<uint32_t, MacroTileQueue> mTiles;
@@ -229,9 +226,6 @@ public:
         return mpTaskData;
     }
 
-    void *operator new(size_t size);
-    void operator delete (void *p);
-
     void* mpTaskData{ nullptr };        // The API thread will set this up and the callback task function will interpet this.
 
     OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
@@ -272,7 +266,7 @@ class HotTileMgr
 public:
     HotTileMgr()
     {
-        memset(&mHotTiles[0][0], 0, sizeof(mHotTiles));
+        memset(mHotTiles, 0, sizeof(mHotTiles));
 
         // cache hottile size
         for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 0f3ded68544..3832b91d93e 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -30,6 +30,18 @@ KNOBS = [
         'category'  : 'debug',
     }],
 
+    ['HYPERTHREADED_FE', {
+        'type'      : 'bool',
+        'default'   : 'false',
+        'desc'      : ['EXPERIMENTAL!!',
+                       'If enabled will attempt to use secondary threads per core to perform',
+                       'front-end (VS/GS) work.',
+                       '',
+                       'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
+        'category'  : 'perf',
+        'advanced'  : 'true',
+    }],
+
     ['DUMP_SHADER_IR', {
         'type'      : 'bool',
         'default'   : 'false',
@@ -166,6 +178,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_FETCH', {
@@ -175,6 +188,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_IA', {
@@ -184,6 +198,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_VS', {
@@ -193,6 +208,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_SETUP_TRIS', {
@@ -202,6 +218,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_BIN_TRIS', {
@@ -211,6 +228,7 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],
 
     ['TOSS_RS', {
@@ -220,4 +238,5 @@ KNOBS = [
                        '',
                        'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
         'category'  : 'perf',
+        'advanced'  : 'true',
     }],]
author	Jason Ekstrand <[email protected]>	2016-04-13 20:25:39 -0700
committer	Jason Ekstrand <[email protected]>	2016-04-13 20:25:39 -0700
commit	12f88ba32a14ea79134f4e995a55149f078a2f27 (patch)
tree	9070861dced23d0ad7dbec598bfd96b686eb7bf1 /src/gallium/drivers/swr/rasterizer
parent	79fbec30fc16399ede9385ef52cb62cefbb388f4 (diff)
parent	171a570f388b2895d14f6d5418c99573cffd6369 (diff)