From ee6be9e92dbdc3dbeb26e0f873c1784d563bf641 Mon Sep 17 00:00:00 2001
From: Tim Rowley
Date: Mon, 21 Mar 2016 17:30:03 -0600
Subject: swr: [rasterizer core] CachedArena optimizations

Reduce list traversal during Alloc and Free.

Add ability to have multiple lists based on alloc size (not used for now)
---
 src/gallium/drivers/swr/rasterizer/common/os.h    |   2 +
 src/gallium/drivers/swr/rasterizer/core/arena.h   | 256 ++++++++++++++--------
 src/gallium/drivers/swr/rasterizer/core/context.h | 113 ----------
 3 files changed, 161 insertions(+), 210 deletions(-)
(limited to 'src/gallium')

diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index d4bec908bb4..5794f3f625a 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -54,9 +54,11 @@
 #if defined(_WIN32)
 #if defined(_WIN64)
+#define BitScanReverseSizeT BitScanReverse64
 #define BitScanForwardSizeT BitScanForward64
 #define _mm_popcount_sizeT _mm_popcnt_u64
 #else
+#define BitScanReverseSizeT BitScanReverse
 #define BitScanForwardSizeT BitScanForward
 #define _mm_popcount_sizeT _mm_popcnt_u32
 #endif

diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 71fb258f4d4..a2db7b38208 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -51,7 +51,10 @@ public:
     }
 };

+static const size_t ARENA_BLOCK_SHIFT = 5;
 static const size_t ARENA_BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+static_assert((1U << ARENA_BLOCK_SHIFT) == ARENA_BLOCK_ALIGN,
+    "Invalid value for ARENA_BLOCK_ALIGN/SHIFT");

 struct ArenaBlock
 {
@@ -59,9 +62,158 @@ struct ArenaBlock
     size_t      blockSize = 0;
     ArenaBlock* pNext = nullptr;
 };
-static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, "Increase BLOCK_ALIGN size");
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
+    "Increase BLOCK_ALIGN size");

-template<typename MutexT, typename T = DefaultAllocator>
+// Caching Allocator for Arena
+template<uint32_t NumBucketsT, uint32_t StartBucketBitT>
+struct CachingAllocatorT : DefaultAllocator
+{
+    static uint32_t GetBucketId(size_t blockSize)
+    {
+        uint32_t bucketId = 0;
+
+#if defined(BitScanReverseSizeT)
+        BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+        bucketId = std::min(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
+
+        return bucketId;
+    }
+
+    void* AllocateAligned(size_t size, size_t align)
+    {
+        SWR_ASSERT(size >= sizeof(ArenaBlock));
+        SWR_ASSERT(size <= uint32_t(-1));
+
+        size_t blockSize = size - ARENA_BLOCK_ALIGN;
+
+        {
+            // search cached blocks
+            std::lock_guard<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
+            ArenaBlock* pBlock = pPrevBlock->pNext;
+            ArenaBlock* pPotentialBlock = nullptr;
+            ArenaBlock* pPotentialPrev = nullptr;
+
+            while (pBlock)
+            {
+                if (pBlock->blockSize >= blockSize)
+                {
+                    if (pBlock == AlignUp(pBlock, align))
+                    {
+                        if (pBlock->blockSize == blockSize)
+                        {
+                            // Won't find a better match
+                            break;
+                        }
+
+                        // We could use this as it is larger than we wanted, but
+                        // continue to search for a better match
+                        pPotentialBlock = pBlock;
+                        pPotentialPrev = pPrevBlock;
+                    }
+                }
+                else
+                {
+                    // Blocks are sorted by size (biggest first)
+                    // So, if we get here, there are no blocks
+                    // large enough, fall through to allocation.
+                    pBlock = nullptr;
+                    break;
+                }
+
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            if (!pBlock)
+            {
+                // Couldn't find an exact match, use next biggest size
+                pBlock = pPotentialBlock;
+                pPrevBlock = pPotentialPrev;
+            }
+
+            if (pBlock)
+            {
+                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+                pPrevBlock->pNext = pBlock->pNext;
+                pBlock->pNext = nullptr;
+
+                return pBlock;
+            }
+
+            m_totalAllocated += size;
+
+#if 0
+            {
+                static uint32_t count = 0;
+                char buf[128];
+                sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
+                OutputDebugStringA(buf);
+            }
+#endif
+        }
+
+        return this->DefaultAllocator::AllocateAligned(size, align);
+    }
+
+    void Free(void* pMem)
+    {
+        if (pMem)
+        {
+            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+            SWR_ASSERT(pNewBlock->blockSize >= 0 && pNewBlock->pMem != nullptr);
+
+            std::unique_lock<std::mutex> l(m_mutex);
+            ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
+            ArenaBlock* pBlock = pPrevBlock->pNext;
+
+            while (pBlock)
+            {
+                if (pNewBlock->blockSize >= pBlock->blockSize)
+                {
+                    // Insert here
+                    break;
+                }
+                pPrevBlock = pBlock;
+                pBlock = pBlock->pNext;
+            }
+
+            // Insert into list
+            SWR_ASSERT(pPrevBlock);
+            pPrevBlock->pNext = pNewBlock;
+            pNewBlock->pNext = pBlock;
+        }
+    }
+
+    ~CachingAllocatorT()
+    {
+        // Free all cached blocks
+        for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+        {
+            ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
+            while (pBlock)
+            {
+                ArenaBlock* pNext = pBlock->pNext;
+                this->DefaultAllocator::Free(pBlock);
+                pBlock = pNext;
+            }
+        }
+    }
+
+    // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
+    static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
+    static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
+
+    ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
+    std::mutex m_mutex;
+
+    size_t m_totalAllocated = 0;
+};
+typedef CachingAllocatorT<> CachingAllocator;
+
+template<typename T = DefaultAllocator>
 class TArena
 {
 public:
@@ -91,8 +243,8 @@ public:
             // a new block
         }

-        static const size_t ArenaBlockSize = 1024 * 1024;
-        size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
+        static const size_t ArenaBlockSize = 1024 * 1024 - ARENA_BLOCK_ALIGN;
+        size_t blockSize = std::max(size, ArenaBlockSize);

         // Add in one BLOCK_ALIGN unit to store ArenaBlock in.
         blockSize = AlignUp(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN);
@@ -177,101 +329,11 @@ private:
     size_t m_size = 0;

     /// @note Mutex is only used by sync allocation functions.
-    MutexT m_mutex;
+    std::mutex m_mutex;

     DefaultAllocator m_defAllocator;
     T& m_allocator;
 };

-template<typename T = DefaultAllocator>
-using Arena = TArena<std::mutex, T>;
-using StdArena = Arena<>;
-
-struct NullMutex
-{
-    void lock() {}
-    void unlock() {}
-};
-
-// Ref counted Arena for ArenaAllocator
-// NOT THREAD SAFE!!
-struct RefArena : TArena<NullMutex>
-{
-    uint32_t AddRef() { return ++m_refCount; }
-    uint32_t Release() { if (--m_refCount) { return m_refCount; } delete this; return 0; }
-
-    void* allocate(std::size_t n)
-    {
-        ++m_numAllocations;
-        return Alloc(n);
-    }
-
-    void deallocate(void* p) { --m_numAllocations; }
-    void clear() { SWR_ASSERT(0 == m_numAllocations); Reset(); }
-
-private:
-    uint32_t m_refCount = 0;
-    uint32_t m_numAllocations = 0;
-};
-
-#if 0 // THIS DOESN'T WORK!!!
-// Arena based replacement for std::allocator
-template<typename T>
-struct ArenaAllocator
-{
-    typedef T value_type;
-    ArenaAllocator()
-    {
-        m_pArena = new RefArena();
-        m_pArena->AddRef();
-    }
-    ~ArenaAllocator()
-    {
-        m_pArena->Release(); m_pArena = nullptr;
-    }
-    ArenaAllocator(const ArenaAllocator& copy)
-    {
-        m_pArena = const_cast<RefArena*>(copy.m_pArena); m_pArena->AddRef();
-    }
-
-
-    template<typename U> ArenaAllocator(const ArenaAllocator<U>& copy)
-    {
-        m_pArena = const_cast<RefArena*>(copy.m_pArena); m_pArena->AddRef();
-    }
-    T* allocate(std::size_t n)
-    {
-#if defined(_DEBUG)
-        char buf[32];
-        sprintf_s(buf, "Alloc: %lld\n", n);
-        OutputDebugStringA(buf);
-#endif
-        void* p = m_pArena->allocate(n * sizeof(T));
-        return static_cast<T*>(p);
-    }
-    void deallocate(T* p, std::size_t n)
-    {
-#if defined(_DEBUG)
-        char buf[32];
-        sprintf_s(buf, "Dealloc: %lld\n", n);
-        OutputDebugStringA(buf);
-#endif
-        m_pArena->deallocate(p);
-    }
-    void clear() { m_pArena->clear(); }
-
-    RefArena* m_pArena = nullptr;
-};
-
-template<typename T, typename U>
-bool operator== (const ArenaAllocator<T>&, const ArenaAllocator<U>&)
-{
-    return true;
-}
-
-template<typename T, typename U>
-bool operator!= (const ArenaAllocator<T>&, const ArenaAllocator<U>&)
-{
-    return false;
-}
-#endif
+using StdArena = TArena<DefaultAllocator>;
+using CachingArena = TArena<CachingAllocator>;

diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 6240b2e08d3..b8f15cae4a3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -360,119 +360,6 @@ struct BACKEND_FUNCS
     PFN_OUTPUT_MERGER pfnOutputMerger;
 };

-// Caching Allocator for Arena
-struct CachingAllocator : DefaultAllocator
-{
-    void* AllocateAligned(size_t size, size_t align)
-    {
-        SWR_ASSERT(size >= sizeof(ArenaBlock));
-
-        {
-            // search cached blocks
-            std::lock_guard<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks;
-            ArenaBlock* pBlock = m_cachedBlocks.pNext;
-            ArenaBlock* pPotentialBlock = nullptr;
-            ArenaBlock* pPotentialPrev = nullptr;
-
-            while (pBlock)
-            {
-                if (pBlock->blockSize >= (size - ARENA_BLOCK_ALIGN))
-                {
-                    if (pBlock == AlignUp(pBlock, align))
-                    {
-                        if (pBlock->blockSize == size)
-                        {
-                            // Won't find a better match
-                            break;
-                        }
-
-                        // We could use this as it is larger than we wanted, but
-                        // continue to search for a better match
-                        pPotentialBlock = pBlock;
-                        pPotentialPrev = pPrevBlock;
-                    }
-                }
-                else
-                {
-                    // Blocks are sorted by size (biggest first)
-                    // So, if we get here, there are no blocks
-                    // large enough, fall through to allocation.
-                    pBlock = nullptr;
-                    break;
-                }
-
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
-            }
-
-            if (!pBlock)
-            {
-                // Couldn't find an exact match, use next biggest size
-                pBlock = pPotentialBlock;
-                pPrevBlock = pPotentialPrev;
-            }
-
-            if (pBlock)
-            {
-                SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
-                pPrevBlock->pNext = pBlock->pNext;
-                pBlock->pNext = nullptr;
-
-                return pBlock;
-            }
-        }
-
-        return this->DefaultAllocator::AllocateAligned(size, align);
-    }
-
-    void Free(void* pMem)
-    {
-        if (pMem)
-        {
-            ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
-            SWR_ASSERT(pNewBlock->blockSize >= 0 && pNewBlock->pMem != nullptr);
-
-            std::unique_lock<std::mutex> l(m_mutex);
-            ArenaBlock* pPrevBlock = &m_cachedBlocks;
-            ArenaBlock* pBlock = m_cachedBlocks.pNext;
-
-            while (pBlock)
-            {
-                if (pNewBlock->blockSize >= pBlock->blockSize)
-                {
-                    // Insert here
-                    break;
-                }
-                pPrevBlock = pBlock;
-                pBlock = pBlock->pNext;
-            }
-
-            // Insert into list
-            SWR_ASSERT(pPrevBlock);
-            pPrevBlock->pNext = pNewBlock;
-            pNewBlock->pNext = pBlock;
-        }
-    }
-
-    ~CachingAllocator()
-    {
-        // Free all cached blocks
-        ArenaBlock* pBlock = m_cachedBlocks.pNext;
-        while (pBlock)
-        {
-            ArenaBlock* pNext = pBlock->pNext;
-            this->DefaultAllocator::Free(pBlock);
-            pBlock = pNext;
-        }
-    }
-
-    ArenaBlock m_cachedBlocks;
-    std::mutex m_mutex;
-
-};
-
-using CachingArena = Arena<CachingAllocator>;

 // Draw State
 struct DRAW_STATE
--
cgit v1.2.3
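
A rough standalone sketch of the bucket-selection scheme the patch introduces, for readers who want to see the idea in isolation. It is not driver code: CACHE_START_BUCKET_BIT and CACHE_NUM_BUCKETS below use assumed example values standing in for the StartBucketBitT/NumBucketsT template parameters (whose default values are not shown in the patch), and the portable shift loop stands in for BitScanReverseSizeT, which is Windows-only.

// bucket_sketch.cpp -- illustrative only, not part of the patch
#include <algorithm>
#include <cstdint>
#include <cstdio>

static const uint32_t CACHE_START_BUCKET_BIT = 20;  // assumed example value
static const uint32_t CACHE_NUM_BUCKETS      = 4;   // assumed example value

// Bucket 0 holds blocks smaller than (1 << (CACHE_START_BUCKET_BIT + 1));
// each following bucket covers the next power-of-two size range, and the
// last bucket absorbs everything larger.
static uint32_t GetBucketId(size_t blockSize)
{
    uint32_t bucketId = 0;
    size_t shifted = blockSize >> CACHE_START_BUCKET_BIT;
    while (shifted > 1)   // portable stand-in for BitScanReverseSizeT
    {
        shifted >>= 1;
        ++bucketId;
    }
    return std::min(bucketId, CACHE_NUM_BUCKETS - 1);
}

int main()
{
    // Requests of very different sizes land in different buckets, so
    // AllocateAligned only walks the list that can satisfy the request
    // and Free only scans blocks of a similar size class.
    std::printf("%u %u %u\n",
                GetBucketId(size_t(1) << 20),   // -> bucket 0
                GetBucketId(size_t(1) << 22),   // -> bucket 2
                GetBucketId(size_t(1) << 30));  // -> clamped to bucket 3
    return 0;
}

Keeping one size-sorted free list per power-of-two size class is what reduces the list traversal: Alloc can stop as soon as it reaches a cached block that is too small, and Free only scans blocks in the same size class to find its insertion point.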