swr/rast: Thread locked tiles improvement

- Change the tilemgr TILE_ID encoding to use Morton order (Z-order).
- Change the locked-tiles set to a bitset, which makes clear, set, and get much faster.

Reviewed-by: Bruce Cherniak <[email protected]>
author: George Kyriazis <[email protected]> 2018-05-01 19:33:38 -0500
committer: George Kyriazis <[email protected]> 2018-05-11 11:26:35 -0500
commit: 4e52cb51b56eaae7153394ed712f49ce0ba63bcc (patch)
tree: 1c28698af74a76fb9d5d9d097d01dbae0653de88 /src
parent: 8238c791dcd244c5d242b0e61cbc744ed64e5e23 (diff)
7 files changed, 152 insertions, 24 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 3458793fd8d..47f3633d54b 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -42,6 +42,7 @@
 #include "core/tilemgr.h"
 #include "core/clip.h"
 #include "core/utils.h"
+#include "core/tileset.h"
 
 #include "common/os.h"
 
@@ -139,6 +140,11 @@ HANDLE SwrCreateContext(
         BindApiThread(pContext, 0);
     }
 
+    if (pContext->threadInfo.SINGLE_THREADED)
+    {
+        pContext->pSingleThreadLockedTiles = new TileSet();
+    }
+
     pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
     pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
 
@@ -245,7 +251,7 @@ void QueueWork(SWR_CONTEXT *pContext)
         {
             uint32_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
             WorkOnFifoFE(pContext, 0, curDraw[0]);
-            WorkOnFifoBE(pContext, 0, curDraw[1], pContext->singleThreadLockedTiles, 0, 0);
+            WorkOnFifoBE(pContext, 0, curDraw[1], *pContext->pSingleThreadLockedTiles, 0, 0);
         }
         else
         {
@@ -427,7 +433,8 @@ void SwrDestroyContext(HANDLE hContext)
     delete[] pContext->ppScratch;
     AlignedFree(pContext->pStats);
 
-    delete(pContext->pHotTileMgr);
+    delete pContext->pHotTileMgr;
+    delete pContext->pSingleThreadLockedTiles;
 
     pContext->~SWR_CONTEXT();
     AlignedFree(GetContext(hContext));
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index af8f4b8db4f..2cd61e4abbb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -516,7 +516,7 @@ struct SWR_CONTEXT
 
     uint32_t lastFrameChecked;
     uint64_t lastDrawChecked;
-    TileSet singleThreadLockedTiles;
+    TileSet* pSingleThreadLockedTiles;
 
     // ArchRast thread contexts.
     HANDLE* pArContext;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 9e16246c3f4..f77ae22a80a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -49,6 +49,7 @@
 #include "rasterizer.h"
 #include "rdtsc_core.h"
 #include "tilemgr.h"
+#include "tileset.h"
 
 
 
@@ -587,7 +588,7 @@ bool WorkOnFifoBE(
             }
 
             // can only work on this draw if it's not in use by other threads
-            if (lockedTiles.find(tileID) != lockedTiles.end())
+            if (lockedTiles.get(tileID))
             {
                 continue;
             }
@@ -645,7 +646,7 @@ bool WorkOnFifoBE(
             else
             {
                 // This tile is already locked. So let's add it to our locked tiles set. This way we don't try locking this one again.
-                lockedTiles.insert(tileID);
+                lockedTiles.set(tileID);
             }
         }
     }
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index cb918ddb60d..0489a3cc6cf 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -62,7 +62,7 @@ struct THREAD_POOL
     THREAD_DATA *pApiThreadData;
 };
 
-typedef std::unordered_set<uint32_t> TileSet;
+struct TileSet;
 
 void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
 void StartThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 28fa7877114..1bdef4bd7dd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -33,8 +33,6 @@
 #include "core/multisample.h"
 #include "rdtsc_core.h"
 
-#define TILE_ID(x,y) ((x << 16 | y))
-
 MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
 {
 }
@@ -50,26 +48,35 @@ void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
         return;
     }
 
-    uint32_t id = TILE_ID(x, y);
+    uint32_t id = getTileId(x, y);
+
+    if (id >= mTiles.size())
+    {
+        mTiles.resize((16 + id) * 2);
+    }
 
-    MacroTileQueue &tile = mTiles[id];
-    tile.mWorkItemsFE++;
-    tile.mId = id;
+    MacroTileQueue *pTile = mTiles[id];
+    if (!pTile)
+    {
+        pTile = mTiles[id] = new MacroTileQueue();
+    }
+    pTile->mWorkItemsFE++;
+    pTile->mId = id;
 
-    if (tile.mWorkItemsFE == 1)
+    if (pTile->mWorkItemsFE == 1)
     {
-        tile.clear(mArena);
-        mDirtyTiles.push_back(&tile);
+        pTile->clear(mArena);
+        mDirtyTiles.push_back(pTile);
     }
 
     mWorkItemsProduced++;
-    tile.enqueue_try_nosync(mArena, pWork);
+    pTile->enqueue_try_nosync(mArena, pWork);
 }
 
 void MacroTileMgr::markTileComplete(uint32_t id)
 {
-    SWR_ASSERT(mTiles.find(id) != mTiles.end());
-    MacroTileQueue &tile = mTiles[id];
+    SWR_ASSERT(mTiles.size() > id);
+    MacroTileQueue &tile = *mTiles[id];
     uint32_t numTiles = tile.mWorkItemsFE;
     InterlockedExchangeAdd(&mWorkItemsConsumed, numTiles);
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 2831010b12f..8392db1b05f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -31,6 +31,7 @@
 #include <set>
 #include <unordered_map>
 #include "common/formats.h"
+#include "common/intrin.h"
 #include "fifo.hpp"
 #include "context.h"
 #include "format_traits.h"
@@ -41,7 +42,7 @@
 struct MacroTileQueue
 {
     MacroTileQueue() { }
-    ~MacroTileQueue() { }
+    ~MacroTileQueue() { destroy(); }
 
     //////////////////////////////////////////////////////////////////////////
     /// @brief Returns number of work items queued for this tile.
@@ -110,9 +111,9 @@ public:
     MacroTileMgr(CachingArena& arena);
     ~MacroTileMgr()
     {
-        for (auto &tile : mTiles)
+        for (auto *pTile : mTiles)
         {
-            tile.second.destroy();
+            delete pTile; // runs ~MacroTileQueue(); never-used slots are null, which delete ignores
         }
     }
 
@@ -136,13 +137,20 @@ public:
 
     static INLINE void getTileIndices(uint32_t tileID, uint32_t &x, uint32_t &y)
     {
-        y = tileID & 0xffff;
-        x = (tileID >> 16) & 0xffff;
+        // Morton / Z order: x = even bits of tileID, y = odd bits (assuming pext_u32 is a BMI2-style bit extract)
+        x = pext_u32(tileID, 0x55555555);
+        y = pext_u32(tileID, 0xAAAAAAAA);
+    }
+
+    static INLINE uint32_t getTileId(uint32_t x, uint32_t y)
+    {
+        // Morton / Z order: x goes to the even bits, y to the odd bits (assuming pdep_u32 is a BMI2-style bit deposit)
+        return pdep_u32(x, 0x55555555) | pdep_u32(y, 0xAAAAAAAA);
     }
 
 private:
     CachingArena& mArena;
-    std::unordered_map<uint32_t, MacroTileQueue> mTiles;
+    std::vector<MacroTileQueue*> mTiles;
 
     // Any tile that has work queued to it is a dirty tile.
     std::vector<MacroTileQueue*> mDirtyTiles;
diff --git a/src/gallium/drivers/swr/rasterizer/core/tileset.h b/src/gallium/drivers/swr/rasterizer/core/tileset.h
new file mode 100644
index 00000000000..3eb4c5d1f00
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/tileset.h
@@ -0,0 +1,144 @@
+/****************************************************************************
+* Copyright (C) 2018 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file tileset.h
+*
+* @brief Custom bitset class for managing locked tiles
+*
+******************************************************************************/
+#pragma once
+
+#include <algorithm>
+#include <cstddef>
+#include <cstring>
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Growable bitset used to track locked tiles, indexed by tile ID.
+///        Storage is an aligned array of size_t words obtained from
+///        AlignedMalloc; it grows on set() and is never shrunk.
+/// @note  AlignedMalloc, AlignedFree and INLINE are not declared here and
+///        must already be visible where this header is included.
+struct TileSet
+{
+    TileSet() = default;
+
+    // The set owns m_bits through a raw pointer, so a member-wise copy would
+    // free the same buffer twice. Copying is disabled.
+    TileSet(const TileSet&) = delete;
+    TileSet& operator=(const TileSet&) = delete;
+
+    ~TileSet()
+    {
+        if (m_bits)
+        {
+            AlignedFree(m_bits);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Sets the bit for idx, growing the storage if needed.
+    /// @param idx - bit index to set.
+    INLINE void set(size_t idx)
+    {
+        _grow(idx);
+        size_t& word = _get_word(idx);
+        word |= (size_t(1) << (idx & BITS_OFFSET));
+        // Remember the highest bit ever set so clear() only touches used words.
+        m_maxSet = std::max(m_maxSet, idx + 1);
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Tests the bit for idx.
+    /// @param idx - bit index to test.
+    /// @return true if the bit is set; false if it is clear or idx lies
+    ///         beyond the current storage.
+    INLINE bool get(size_t idx) const
+    {
+        if (idx >= m_size)
+        {
+            return false;
+        }
+        const size_t word = m_bits[idx / BITS_PER_WORD];
+        return 0 != (word & (size_t(1) << (idx & BITS_OFFSET)));
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Clears every bit; the allocated storage is kept.
+    INLINE void clear()
+    {
+        if (m_maxSet)
+        {
+            // Only the words up to the highest bit set since the last clear
+            // can be non-zero.
+            size_t num_words = (m_maxSet + BITS_OFFSET) / BITS_PER_WORD;
+            std::memset(m_bits, 0, sizeof(size_t) * num_words);
+            m_maxSet = 0;
+        }
+    }
+
+private:
+    static const size_t BITS_PER_WORD = sizeof(size_t) * 8;
+    static const size_t BITS_OFFSET = BITS_PER_WORD - 1;   ///< mask for the bit position within a word
+
+    size_t              m_size = 0;        ///< capacity in bits (multiple of BITS_PER_WORD)
+    size_t              m_maxSet = 0;      ///< 1 + highest index set since the last clear(), 0 if none
+    size_t*             m_bits = nullptr;  ///< word storage, nullptr until the first set()
+
+    /// @brief Returns the storage word that holds bit idx; idx must be < m_size.
+    INLINE size_t& _get_word(size_t idx)
+    {
+        return m_bits[idx / BITS_PER_WORD];
+    }
+
+    //////////////////////////////////////////////////////////////////////////
+    /// @brief Ensures the storage covers bit idx. Existing bits are kept and
+    ///        newly added words are zeroed.
+    /// @param idx - bit index that must become addressable.
+    void _grow(size_t idx)
+    {
+        if (idx < m_size)
+        {
+            return;
+        }
+
+        // Round the required bit count up to a whole number of words.
+        size_t new_size = (1 + idx + BITS_OFFSET) & ~BITS_OFFSET;
+        size_t num_words = new_size / BITS_PER_WORD;
+        size_t* newBits = (size_t*)AlignedMalloc(sizeof(size_t) * num_words, 64);
+        size_t copy_words = 0;
+
+        if (m_bits)
+        {
+            copy_words = (m_size + BITS_OFFSET) / BITS_PER_WORD;
+            num_words -= copy_words;
+            std::memcpy(newBits, m_bits, copy_words * sizeof(size_t));
+
+            AlignedFree(m_bits);
+        }
+
+        m_bits = newBits;
+        m_size = new_size;
+
+        // num_words now counts only the words beyond the copied prefix.
+        std::memset(&m_bits[copy_words], 0, sizeof(size_t) * num_words);
+    }
+};
author	George Kyriazis <[email protected]>	2018-05-01 19:33:38 -0500
committer	George Kyriazis <[email protected]>	2018-05-11 11:26:35 -0500
commit	4e52cb51b56eaae7153394ed712f49ce0ba63bcc (patch)
tree	1c28698af74a76fb9d5d9d097d01dbae0653de88 /src
parent	8238c791dcd244c5d242b0e61cbc744ed64e5e23 (diff)