5 files changed, 110 insertions, 10 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index c3c603d294c..453d0295b54 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -189,7 +189,7 @@ void QueueWork(SWR_CONTEXT *pContext)
 
         if (IsDraw)
         {
-            std::unordered_set<uint32_t> lockedTiles;
+            static TileSet lockedTiles;
             uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
             WorkOnFifoFE(pContext, 0, curDraw[0], 0);
             WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index b6b4d829576..4cdb728e1ef 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -33,6 +33,9 @@
 #pragma once
 
 #include <mutex>
+#include <algorithm>
+#include <atomic>
+#include "core/utils.h"
 
 class DefaultAllocator
 {
@@ -48,7 +51,7 @@ public:
     }
 };
 
-template<typename T = DefaultAllocator>
+template<typename MutexT = std::mutex, typename T = DefaultAllocator>
 class TArena
 {
 public:
@@ -79,7 +82,7 @@ public:
         }
 
         static const size_t ArenaBlockSize = 1024 * 1024;
-        size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
+        size_t blockSize = std::max<size_t>(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
 
         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
         blockSize = AlignUp(blockSize + BLOCK_ALIGN, BLOCK_ALIGN);
@@ -111,8 +114,9 @@ public:
     {
         void* pAlloc = nullptr;
 
-        std::unique_lock<std::mutex> l(m_mutex);
+        m_mutex.lock();
         pAlloc = AllocAligned(size, align);
+        m_mutex.unlock();
 
         return pAlloc;
     }
@@ -121,8 +125,9 @@ public:
     {
         void* pAlloc = nullptr;
 
-        std::unique_lock<std::mutex> l(m_mutex);
+        m_mutex.lock();
         pAlloc = Alloc(size);
+        m_mutex.unlock();
 
         return pAlloc;
     }
@@ -175,7 +180,96 @@ private:
     size_t          m_size      = 0;
 
     /// @note Mutex is only used by sync allocation functions.
-    std::mutex      m_mutex;
+    MutexT          m_mutex;
 };
 
 typedef TArena<> Arena;
+
+// No-op stand-in for TArena's MutexT: lock() / unlock() exist but do nothing.
+struct NullMutex
+{
+    void lock() {} void unlock() {}
+};
+
+// Ref counted Arena for ArenaAllocator. NOT THREAD SAFE!! (plain, non-atomic counters)
+// A new RefArena holds zero references; Release() deletes it when the count returns to zero.
+struct RefArena : TArena<NullMutex>
+{
+    uint32_t AddRef() { return ++m_refCount; }
+    uint32_t Release()
+    {
+        SWR_ASSERT(m_refCount > 0);     // an unbalanced Release() would wrap the counter
+        if (--m_refCount) { return m_refCount; }
+        delete this; return 0;          // last reference dropped
+    }
+
+    void* allocate(std::size_t n) { ++m_numAllocations; return Alloc(n); }
+    void deallocate(void* /*p*/) { SWR_ASSERT(m_numAllocations > 0); --m_numAllocations; }
+    void clear() { SWR_ASSERT(0 == m_numAllocations); Reset(); }
+
+private:
+    uint32_t m_refCount = 0;        // outstanding AddRef() calls
+    uint32_t m_numAllocations = 0;  // allocate() calls not yet matched by deallocate()
+};
+
+#if 0 // THIS DOESN'T WORK!!!
+// Arena based replacement for std::allocator.
+// Every copy (including rebound copies for another value type) shares the source's
+// ref-counted RefArena; a default-constructed allocator creates a fresh arena of its own.
+template <typename T>
+struct ArenaAllocator
+{
+    typedef T value_type;
+    ArenaAllocator() : m_pArena(new RefArena()) { m_pArena->AddRef(); }
+    ~ArenaAllocator() { m_pArena->Release(); m_pArena = nullptr; }
+    ArenaAllocator(const ArenaAllocator& copy) : m_pArena(copy.m_pArena) { m_pArena->AddRef(); }
+    template <class U>
+    ArenaAllocator(const ArenaAllocator<U>& copy) : m_pArena(copy.m_pArena) { m_pArena->AddRef(); }
+
+    // The implicit copy assignment would copy the pointer without touching either ref count.
+    ArenaAllocator& operator=(const ArenaAllocator& rhs)
+    {
+        rhs.m_pArena->AddRef();     // take the new reference first: safe for self-assignment
+        m_pArena->Release();
+        m_pArena = rhs.m_pArena;
+        return *this;
+    }
+
+    // Returns storage for n objects of type T taken from the shared arena.
+    T* allocate(std::size_t n)
+    {
+#if defined(_DEBUG)
+        char buf[32];
+        sprintf_s(buf, "Alloc: %llu\n", static_cast<unsigned long long>(n));
+        OutputDebugStringA(buf);
+#endif
+        return static_cast<T*>(m_pArena->allocate(n * sizeof(T)));
+    }
+
+    // Only updates the arena's outstanding-allocation count.
+    void deallocate(T* p, std::size_t n)
+    {
+#if defined(_DEBUG)
+        char buf[32];
+        sprintf_s(buf, "Dealloc: %llu\n", static_cast<unsigned long long>(n));
+        OutputDebugStringA(buf);
+#endif
+        m_pArena->deallocate(p);
+    }
+    void clear() { m_pArena->clear(); }
+
+    RefArena* m_pArena = nullptr;
+};
+
+// Allocators are interchangeable only when they draw from the same arena.
+template <class T, class U>
+bool operator== (const ArenaAllocator<T>& lhs, const ArenaAllocator<U>& rhs)
+{
+    return lhs.m_pArena == rhs.m_pArena;
+}
+template <class T, class U>
+bool operator!= (const ArenaAllocator<T>& lhs, const ArenaAllocator<U>& rhs)
+{
+    return !(lhs == rhs);
+}
+#endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 57408049d03..ff25e82f0fe 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -24,7 +24,6 @@
 #include <stdio.h>
 #include <thread>
 #include <algorithm>
-#include <unordered_set>
 #include <float.h>
 #include <vector>
 #include <utility>
@@ -345,7 +344,7 @@ void WorkOnFifoBE(
     SWR_CONTEXT *pContext,
     uint32_t workerId,
     uint64_t &curDrawBE,
-    std::unordered_set<uint32_t>& lockedTiles)
+    TileSet& lockedTiles)
 {
     // Find the first incomplete draw that has pending work. If no such draw is found then
     // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -550,7 +549,7 @@ DWORD workerThreadMain(LPVOID pData)
 
     // Track tiles locked by other threads. If we try to lock a macrotile and find its already
     // locked then we'll add it to this list so that we don't try and lock it again.
-    std::unordered_set<uint32_t> lockedTiles;
+    TileSet lockedTiles;
 
     // each worker has the ability to work on any of the queued draws as long as certain
     // conditions are met. the data associated
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index ec0b735a4ec..6b37e3ac179 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -54,10 +54,12 @@ struct THREAD_POOL
     THREAD_DATA *pThreadData;
 };
 
+using TileSet = std::unordered_set<uint32_t>;   // tile-tracking set handed to WorkOnFifoBE
+
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 
 // Expose FE and BE worker functions to the API thread if single threaded
 void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
 void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
index f36452f2cec..a1d665e77cc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
@@ -27,6 +27,11 @@
 ******************************************************************************/
 #if defined(_WIN32)
 
+#if defined(NOMINMAX)
+// The GDI+ headers use the min / max macros from <Windows.h>, which NOMINMAX suppresses :(
+#undef NOMINMAX
+#endif
+
 #include<Windows.h>
 #include <Gdiplus.h>
 #include <Gdiplusheaders.h>