diff options
author | Jason Ekstrand <[email protected]> | 2016-04-13 20:25:39 -0700 |
---|---|---|
committer | Jason Ekstrand <[email protected]> | 2016-04-13 20:25:39 -0700 |
commit | 12f88ba32a14ea79134f4e995a55149f078a2f27 (patch) | |
tree | 9070861dced23d0ad7dbec598bfd96b686eb7bf1 /src/gallium/drivers/swr/rasterizer | |
parent | 79fbec30fc16399ede9385ef52cb62cefbb388f4 (diff) | |
parent | 171a570f388b2895d14f6d5418c99573cffd6369 (diff) |
Merge remote-tracking branch 'public/master' into vulkan
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer')
19 files changed, 506 insertions, 210 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 5794f3f625a..180a0560822 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -30,10 +30,6 @@ #define SWR_API __cdecl -#ifndef _CRT_SECURE_NO_WARNINGS -#define _CRT_SECURE_NO_WARNINGS -#endif - #ifndef NOMINMAX #define NOMINMAX #endif @@ -52,7 +48,6 @@ #define PRAGMA_WARNING_POP() __pragma(warning(pop)) -#if defined(_WIN32) #if defined(_WIN64) #define BitScanReverseSizeT BitScanReverse64 #define BitScanForwardSizeT BitScanForward64 @@ -62,7 +57,6 @@ #define BitScanForwardSizeT BitScanForward #define _mm_popcount_sizeT _mm_popcnt_u32 #endif -#endif #elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__) @@ -199,9 +193,7 @@ typedef KILOBYTE MEGABYTE[1024]; typedef MEGABYTE GIGABYTE[1024]; #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) -#if KNOB_SIMD_WIDTH == 8 -#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32) -#endif +#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES) #include "common/swr_assert.h" diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index f0f7956b590..ca9cfdb629e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -29,10 +29,12 @@ #include <cfloat> #include <cmath> #include <cstdio> +#include <new> #include "core/api.h" #include "core/backend.h" #include "core/context.h" +#include "core/depthstencil.h" #include "core/frontend.h" #include "core/rasterizer.h" #include "core/rdtsc_core.h" @@ -64,11 +66,14 @@ HANDLE SwrCreateContext( pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64); + for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); - pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); - pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. + new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena); + new (&pContext->pDispatchQueueArray[dc]) DispatchQueue(); pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator); } @@ -86,15 +91,26 @@ HANDLE SwrCreateContext( // Calling createThreadPool() above can set SINGLE_THREADED if (KNOB_SINGLE_THREADED) { + SET_KNOB(HYPERTHREADED_FE, false); pContext->NumWorkerThreads = 1; + pContext->NumFEThreads = 1; + pContext->NumBEThreads = 1; } // Allocate scratch space for workers. ///@note We could lazily allocate this but its rather small amount of memory. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { - ///@todo Use numa API for allocations using numa information from thread data (if exists). - pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); +#if defined(_WIN32) + uint32_t numaNode = pContext->threadPool.pThreadData ? + pContext->threadPool.pThreadData[i].numaId : 0; + pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma( + GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE), + MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE, + numaNode); +#else + pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4); +#endif } // State setup AFTER context is fully initialized @@ -131,14 +147,21 @@ void SwrDestroyContext(HANDLE hContext) { delete pContext->dcRing[i].pArena; delete pContext->dsRing[i].pArena; - delete(pContext->dcRing[i].pTileMgr); - delete(pContext->dcRing[i].pDispatch); + pContext->pMacroTileManagerArray[i].~MacroTileMgr(); + pContext->pDispatchQueueArray[i].~DispatchQueue(); } + _aligned_free(pContext->pDispatchQueueArray); + _aligned_free(pContext->pMacroTileManagerArray); + // Free scratch space. for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i) { +#if defined(_WIN32) + VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE); +#else _aligned_free(pContext->pScratch[i]); +#endif } delete(pContext->pHotTileMgr); @@ -160,12 +183,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext) template<bool IsDraw> void QueueWork(SWR_CONTEXT *pContext) { + DRAW_CONTEXT* pDC = pContext->pCurDrawContext; + uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; + + if (IsDraw) + { + pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex]; + pDC->pTileMgr->initialize(); + } + // Each worker thread looks at a DC for both FE and BE work at different times and so we // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and // then moved on if all work is done.) - pContext->pCurDrawContext->threadsDone = - pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2; + pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads; _ReadWriteBarrier(); { @@ -183,7 +214,7 @@ void QueueWork(SWR_CONTEXT *pContext) { static TileSet lockedTiles; uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoFE(pContext, 0, curDraw[0]); WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0); } else @@ -232,7 +263,20 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) _mm_pause(); } - uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT; + uint64_t curDraw = pContext->dcRing.GetHead(); + uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT; + + static uint64_t lastDrawChecked; + static uint32_t lastFrameChecked; + if ((pContext->frameCount - lastFrameChecked) > 2 || + (curDraw - lastDrawChecked) > 0x10000) + { + // Take this opportunity to clean-up old arena allocations + pContext->cachingArenaAllocator.FreeOldBlocks(); + + lastFrameChecked = pContext->frameCount; + lastDrawChecked = curDraw; + } DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; pContext->pCurDrawContext = pCurDrawContext; @@ -284,8 +328,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pCurDrawContext->FeLock = 0; pCurDrawContext->threadsDone = 0; - pCurDrawContext->pTileMgr->initialize(); - // Assign unique drawId for this DC pCurDrawContext->drawId = pContext->dcRing.GetHead(); @@ -872,6 +914,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC) !pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0; } } + + // Setup depth quantization function + if (pState->state.depthHottileEnable) + { + switch (pState->state.rastState.depthFormat) + { + case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT_X8X24_TYPELESS > ; break; + case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; break; + case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth < R24_UNORM_X8_TYPELESS > ; break; + case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth < R16_UNORM > ; break; + default: SWR_ASSERT(false, "Unsupported depth format for depth quantiztion."); + pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; + } + } + else + { + // set up pass-through quantize if depth isn't enabled + pState->state.pfnQuantizeDepth = QuantizeDepth < R32_FLOAT > ; + } } ////////////////////////////////////////////////////////////////////////// @@ -1029,9 +1090,9 @@ void DrawInstanced( SWR_CONTEXT *pContext = GetContext(hContext); DRAW_CONTEXT* pDC = GetDrawContext(pContext); - int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); + uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw); - int32_t remainingVerts = numVertices; + uint32_t remainingVerts = numVertices; API_STATE *pState = &pDC->pState->state; pState->topology = topology; @@ -1149,9 +1210,9 @@ void DrawIndexedInstance( DRAW_CONTEXT* pDC = GetDrawContext(pContext); API_STATE* pState = &pDC->pState->state; - int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); + uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology); uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw); - int32_t remainingIndices = numIndices; + uint32_t remainingIndices = numIndices; uint32_t indexSize = 0; switch (pState->indexBuffer.format) @@ -1334,9 +1395,6 @@ void SwrDispatch( pDC->isCompute = true; // This is a compute context. - // Ensure spill fill pointers are initialized to nullptr. - memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill)); - COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64); pTaskData->threadGroupCountX = threadGroupCountX; @@ -1344,6 +1402,8 @@ void SwrDispatch( pTaskData->threadGroupCountZ = threadGroupCountZ; uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ; + uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT; + pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex]; pDC->pDispatch->initialize(totalThreadGroups, pTaskData); QueueDispatch(pContext); @@ -1497,4 +1557,6 @@ void SWR_API SwrEndFrame( HANDLE hContext) { RDTSC_ENDFRAME(); + SWR_CONTEXT *pContext = GetContext(hContext); + pContext->frameCount++; } diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h index 67d81a44347..64184e16865 100644 --- a/src/gallium/drivers/swr/rasterizer/core/arena.h +++ b/src/gallium/drivers/swr/rasterizer/core/arena.h @@ -65,69 +65,41 @@ static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN, template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16> struct CachingAllocatorT : DefaultAllocator { - static uint32_t GetBucketId(size_t blockSize) - { - uint32_t bucketId = 0; - -#if defined(BitScanReverseSizeT) - BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); - bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); -#endif - - return bucketId; - } - void* AllocateAligned(size_t size, size_t align) { SWR_ASSERT(size >= sizeof(ArenaBlock)); SWR_ASSERT(size <= uint32_t(-1)); size_t blockSize = size - ARENA_BLOCK_ALIGN; + uint32_t bucket = GetBucketId(blockSize); { // search cached blocks std::lock_guard<std::mutex> l(m_mutex); - ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)]; - ArenaBlock* pBlock = pPrevBlock->pNext; - ArenaBlock* pPotentialBlock = nullptr; - ArenaBlock* pPotentialPrev = nullptr; + ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket]; + ArenaBlock* pBlock = SearchBlocks(pPrevBlock, blockSize, align); - while (pBlock) + if (pBlock) { - if (pBlock->blockSize >= blockSize) - { - if (pBlock == AlignUp(pBlock, align)) - { - if (pBlock->blockSize == blockSize) - { - // Won't find a better match - break; - } - - // We could use this as it is larger than we wanted, but - // continue to search for a better match - pPotentialBlock = pBlock; - pPotentialPrev = pPrevBlock; - } - } - else + m_cachedSize -= pBlock->blockSize; + if (pBlock == m_pLastCachedBlocks[bucket]) { - // Blocks are sorted by size (biggest first) - // So, if we get here, there are no blocks - // large enough, fall through to allocation. - pBlock = nullptr; - break; + m_pLastCachedBlocks[bucket] = pPrevBlock; } - - pPrevBlock = pBlock; - pBlock = pBlock->pNext; } - - if (!pBlock) + else { - // Couldn't find an exact match, use next biggest size - pBlock = pPotentialBlock; - pPrevBlock = pPotentialPrev; + pPrevBlock = &m_oldCachedBlocks[GetBucketId(blockSize)]; + pBlock = SearchBlocks(pPrevBlock, blockSize, align); + + if (pBlock) + { + m_oldCachedSize -= pBlock->blockSize; + if (pBlock == m_pOldLastCachedBlocks[bucket]) + { + m_pLastCachedBlocks[bucket] = pPrevBlock; + } + } } if (pBlock) @@ -154,7 +126,7 @@ struct CachingAllocatorT : DefaultAllocator return this->DefaultAllocator::AllocateAligned(size, align); } - void Free(void* pMem) + void Free(void* pMem) { if (pMem) { @@ -162,24 +134,57 @@ struct CachingAllocatorT : DefaultAllocator SWR_ASSERT(pNewBlock->blockSize >= 0); std::unique_lock<std::mutex> l(m_mutex); - ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)]; - ArenaBlock* pBlock = pPrevBlock->pNext; + InsertCachedBlock(GetBucketId(pNewBlock->blockSize), pNewBlock); + } + } - while (pBlock) + void FreeOldBlocks() + { + if (!m_cachedSize) { return; } + std::lock_guard<std::mutex> l(m_mutex); + + bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE); + + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + if (doFree) { - if (pNewBlock->blockSize >= pBlock->blockSize) + ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext; + while (pBlock) { - // Insert here - break; + ArenaBlock* pNext = pBlock->pNext; + m_oldCachedSize -= pBlock->blockSize; + m_totalAllocated -= (pBlock->blockSize + ARENA_BLOCK_ALIGN); + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; } - pPrevBlock = pBlock; - pBlock = pBlock->pNext; + m_oldCachedBlocks[i].pNext = nullptr; + m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; } - // Insert into list - SWR_ASSERT(pPrevBlock); - pPrevBlock->pNext = pNewBlock; - pNewBlock->pNext = pBlock; + if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i]) + { + m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext; + m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext; + m_cachedBlocks[i].pNext = nullptr; + if (m_pOldLastCachedBlocks[i]->pNext) + { + m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i]; + } + m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; + } + } + + m_oldCachedSize += m_cachedSize; + m_cachedSize = 0; + } + + CachingAllocatorT() + { + for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i) + { + m_pLastCachedBlocks[i] = &m_cachedBlocks[i]; + m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i]; } } @@ -195,21 +200,126 @@ struct CachingAllocatorT : DefaultAllocator this->DefaultAllocator::Free(pBlock); pBlock = pNext; } + pBlock = m_oldCachedBlocks[i].pNext; + while (pBlock) + { + ArenaBlock* pNext = pBlock->pNext; + this->DefaultAllocator::Free(pBlock); + pBlock = pNext; + } } } +private: + static uint32_t GetBucketId(size_t blockSize) + { + uint32_t bucketId = 0; + +#if defined(BitScanReverseSizeT) + BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT); + bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1); +#endif + + return bucketId; + } + + void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock) + { + SWR_ASSERT(bucketId < CACHE_NUM_BUCKETS); + + ArenaBlock* pPrevBlock = &m_cachedBlocks[bucketId]; + ArenaBlock* pBlock = pPrevBlock->pNext; + + while (pBlock) + { + if (pNewBlock->blockSize >= pBlock->blockSize) + { + // Insert here + break; + } + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + // Insert into list + SWR_ASSERT(pPrevBlock); + pPrevBlock->pNext = pNewBlock; + pNewBlock->pNext = pBlock; + + if (m_pLastCachedBlocks[bucketId] == pPrevBlock) + { + m_pLastCachedBlocks[bucketId] = pNewBlock; + } + + m_cachedSize += pNewBlock->blockSize; + } + + static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align) + { + ArenaBlock* pBlock = pPrevBlock->pNext; + ArenaBlock* pPotentialBlock = nullptr; + ArenaBlock* pPotentialPrev = nullptr; + + while (pBlock) + { + if (pBlock->blockSize >= blockSize) + { + if (pBlock == AlignUp(pBlock, align)) + { + if (pBlock->blockSize == blockSize) + { + // Won't find a better match + break; + } + + // We could use this as it is larger than we wanted, but + // continue to search for a better match + pPotentialBlock = pBlock; + pPotentialPrev = pPrevBlock; + } + } + else + { + // Blocks are sorted by size (biggest first) + // So, if we get here, there are no blocks + // large enough, fall through to allocation. + pBlock = nullptr; + break; + } + + pPrevBlock = pBlock; + pBlock = pBlock->pNext; + } + + if (!pBlock) + { + // Couldn't find an exact match, use next biggest size + pBlock = pPotentialBlock; + pPrevBlock = pPotentialPrev; + } + + return pBlock; + } + // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ... static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT; static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT; + static const size_t MAX_UNUSED_SIZE = 20 * sizeof(MEGABYTE); ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS]; + ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS]; std::mutex m_mutex; size_t m_totalAllocated = 0; + + size_t m_cachedSize = 0; + size_t m_oldCachedSize = 0; }; typedef CachingAllocatorT<> CachingAllocator; -template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)> +template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)> class TArena { public: diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index 7fb83edf169..b2d3d9ef4f4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; /// @param pDC - pointer to draw context (dispatch). /// @param workerId - The unique worker ID that is assigned to this thread. /// @param threadGroupId - the linear index for the thread group within the dispatch. -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId) +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer) { RDTSC_START(BEDispatch); @@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup SWR_ASSERT(pTaskData != nullptr); // Ensure spill fill memory has been allocated. - if (pDC->pSpillFill[workerId] == nullptr) + if (pSpillFillBuffer == nullptr) { ///@todo Add state which indicates the spill fill size. - pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8); + pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8); } const API_STATE& state = GetApiState(pDC); @@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup csContext.dispatchDims[1] = pTaskData->threadGroupCountY; csContext.dispatchDims[2] = pTaskData->threadGroupCountZ; csContext.pTGSM = pContext->pScratch[workerId]; - csContext.pSpillFillBuffer = pDC->pSpillFill[workerId]; + csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer; state.pfnCsFunc(GetPrivateState(pDC), &csContext); @@ -772,8 +772,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 psContext.vOneOverW.centroid = psContext.vOneOverW.center; } - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); + RDTSC_STOP(BEBarycentric, 0, 0); simdmask clipCoverageMask = coverageMask & MASK; @@ -793,7 +795,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(CanEarlyZ(pPSState)) { RDTSC_START(BEEarlyDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); @@ -825,7 +827,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 if(!CanEarlyZ(pPSState)) { RDTSC_START(BELateDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask); RDTSC_STOP(BELateDepthTest, 0, 0); @@ -977,8 +979,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); @@ -1000,7 +1003,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (CanEarlyZ(pPSState)) { RDTSC_START(BEEarlyDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); RDTSC_STOP(BEEarlyDepthTest, 0, 0); @@ -1033,7 +1036,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ if (!CanEarlyZ(pPSState)) { RDTSC_START(BELateDepthTest); - depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); RDTSC_STOP(BELateDepthTest, 0, 0); @@ -1200,8 +1203,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); // execute pixel shader @@ -1263,10 +1267,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // calc I & J per sample backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z if (!pPSState->writesODepth) { vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + vZ[sample] = state.pfnQuantizeDepth(vZ[sample]); } ///@todo: perspective correct vs non-perspective correct clipping? @@ -1292,7 +1297,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // ZTest for this sample RDTSC_START(BEEarlyDepthTest); stencilPassMask[sample] = vCoverageMask[sample]; - depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing, vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]); RDTSC_STOP(BEEarlyDepthTest, 0, 0); @@ -1308,8 +1313,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t { RDTSC_START(BEBarycentric); backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); // execute pixel shader @@ -1463,8 +1469,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext); - // interpolate z + // interpolate and quantize z psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample); + psContext.vZ = state.pfnQuantizeDepth(psContext.vZ); RDTSC_STOP(BEBarycentric, 0, 0); @@ -1483,7 +1490,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample); RDTSC_START(BEEarlyDepthTest); - simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, + simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing, psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask); DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ, pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 2fa18953cad..d0626b997af 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -32,7 +32,7 @@ #include "core/context.h" #include "core/multisample.h" -void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId); +void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer); void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp index 3a2a8b35be8..e624fd8f674 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp @@ -162,8 +162,8 @@ int ClipTriToPlane( const float *pInPts, int numInPts, void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs) { // temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping - OSALIGN(float, 16) tempPts[6 * 4]; - OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; + OSALIGNSIMD(float) tempPts[6 * 4]; + OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4]; // we opt to clip to viewport frustum to produce smaller triangles for rasterization precision int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs); diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h index ba5870a92bb..67a4c4f47bb 100644 --- a/src/gallium/drivers/swr/rasterizer/core/clip.h +++ b/src/gallium/drivers/swr/rasterizer/core/clip.h @@ -265,8 +265,8 @@ public: // clip a single primitive int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs) { - OSALIGN(float, 16) inVerts[3 * 4]; - OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; + OSALIGNSIMD(float) inVerts[3 * 4]; + OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4]; // transpose primitive position __m128 verts[3]; diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 39f23372a18..6464aa20af7 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -308,6 +308,8 @@ OSALIGNLINE(struct) API_STATE uint32_t depthHottileEnable: 1; uint32_t stencilHottileEnable : 1; }; + + PFN_QUANTIZE_DEPTH pfnQuantizeDepth; }; class MacroTileMgr; @@ -380,32 +382,29 @@ struct DRAW_STATE // This draw context maintains all of the state needed for the draw operation. struct DRAW_CONTEXT { - SWR_CONTEXT *pContext; - - uint64_t drawId; - - bool isCompute; // Is this DC a compute context? - - FE_WORK FeWork; - volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - volatile OSALIGNLINE(int64_t) threadsDone; - - uint64_t dependency; - - MacroTileMgr* pTileMgr; - - // The following fields are valid if isCompute is true. - DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + SWR_CONTEXT* pContext; + uint64_t drawId; + union + { + MacroTileMgr* pTileMgr; + DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) + }; + uint64_t dependency; + DRAW_STATE* pState; + CachingArena* pArena; - DRAW_STATE* pState; - CachingArena* pArena; + bool isCompute; // Is this DC a compute context? + bool cleanupState; // True if this is the last draw using an entry in the state ring. + volatile bool doneFE; // Is FE work done for this draw? - uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills. + FE_WORK FeWork; - bool cleanupState; // True if this is the last draw using an entry in the state ring. + volatile OSALIGNLINE(uint32_t) FeLock; + volatile int64_t threadsDone; }; +static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT"); + INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC) { SWR_ASSERT(pDC != nullptr); @@ -447,6 +446,9 @@ struct SWR_CONTEXT DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. + MacroTileMgr* pMacroTileManagerArray; + DispatchQueue* pDispatchQueueArray; + // Draw State Ring // When draw are very large (lots of primitives) then the API thread will break these up. // These split draws all have identical state. So instead of storing the state directly @@ -457,6 +459,8 @@ struct SWR_CONTEXT uint32_t curStateId; // Current index to the next available entry in the DS ring. uint32_t NumWorkerThreads; + uint32_t NumFEThreads; + uint32_t NumBEThreads; THREAD_POOL threadPool; // Thread pool associated with this context @@ -481,6 +485,7 @@ struct SWR_CONTEXT uint8_t* pScratch[KNOB_MAX_NUM_THREADS]; CachingAllocator cachingArenaAllocator; + uint32_t frameCount; }; void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId); diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h index 2cc9d4054ac..7b55580bf0a 100644 --- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h +++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h @@ -80,14 +80,52 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds } +template<SWR_FORMAT depthFormatT> +simdscalar QuantizeDepth(simdscalar depth) +{ + SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0); + uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0); + + if (depthType == SWR_TYPE_FLOAT) + { + // assume only 32bit float depth supported + SWR_ASSERT(depthBpc == 32); + + // matches shader precision, no quantizing needed + return depth; + } + + // should be unorm depth if not float + SWR_ASSERT(depthType == SWR_TYPE_UNORM); + + float quantize = (float)((1 << depthBpc) - 1); + simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize)); + result = _simd_add_ps(result, _simd_set1_ps(0.5f)); + result = _simd_round_ps(result, _MM_FROUND_TO_ZERO); + + if (depthBpc > 16) + { + result = _simd_div_ps(result, _simd_set1_ps(quantize)); + } + else + { + result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize)); + } + + return result; +} + INLINE -simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState, +simdscalar DepthStencilTest(const API_STATE* pState, bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase, simdscalar* pStencilMask) { static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format"); static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format"); + const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState; + const SWR_VIEWPORT* pViewport = &pState->vp[0]; + simdscalar depthResult = _simd_set1_ps(-1.0f); simdscalar zbuf; diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index 36721e00beb..93869610ff9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -793,8 +793,14 @@ static void GeometryShaderStage( uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride; DWORD numAttribs; - _BitScanReverse(&numAttribs, state.feAttribMask); - numAttribs++; + if (_BitScanReverse(&numAttribs, state.feAttribMask)) + { + numAttribs++; + } + else + { + numAttribs = 0; + } for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream) { diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index d7feb86273d..55a22a67f4c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -45,14 +45,17 @@ #define KNOB_ARCH_ISA AVX #define KNOB_ARCH_STR "AVX" #define KNOB_SIMD_WIDTH 8 +#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX2) #define KNOB_ARCH_ISA AVX2 #define KNOB_ARCH_STR "AVX2" #define KNOB_SIMD_WIDTH 8 +#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX512) #define KNOB_ARCH_ISA AVX512F #define KNOB_ARCH_STR "AVX512" #define KNOB_SIMD_WIDTH 16 +#define KNOB_SIMD_BYTES 64 #error "AVX512 not yet supported" #else #error "Unknown architecture" diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index f8f1a33b7e3..17f488538d6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -1017,13 +1017,13 @@ struct PA_TESS : PA_STATE { SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH); #if KNOB_SIMD_WIDTH == 8 - static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] = + static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; #elif KNOB_SIMD_WIDTH == 16 - static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] = + static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 @@ -1167,8 +1167,14 @@ struct PA_FACTORY { memset(&indexStore, 0, sizeof(indexStore)); DWORD numAttribs; - _BitScanReverse(&numAttribs, state.feAttribMask); - numAttribs++; + if (_BitScanReverse(&numAttribs, state.feAttribMask)) + { + numAttribs++; + } + else + { + numAttribs = 0; + } new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH, &this->indexStore[0], numVerts, numAttribs, state.topology, false); cutPA = true; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index 52fb7c88cdd..3144a901c91 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -383,7 +383,7 @@ __declspec(thread) volatile uint64_t gToss; static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4; // try to avoid _chkstk insertions; make this thread local -static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; +static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib]; INLINE void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge) @@ -439,7 +439,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, const SWR_RASTSTATE &rastState = state.rastState; const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs; - OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; triDesc.pUserClipBuffer = workDesc.pUserClipBuffer; __m128 vX, vY, vZ, vRecipW; @@ -502,7 +502,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, _MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2); _MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2); - OSALIGN(float, 16) oneOverW[4]; + OSALIGNSIMD(float) oneOverW[4]; _mm_store_ps(oneOverW, vRecipW); triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2]; triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2]; @@ -537,7 +537,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, // compute bary Z // zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0) - OSALIGN(float, 16) a[4]; + OSALIGNSIMD(float) a[4]; _mm_store_ps(a, vZ); triDesc.Z[0] = a[0] - a[2]; triDesc.Z[1] = a[1] - a[2]; @@ -575,7 +575,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, } // Calc bounding box of triangle - OSALIGN(BBOX, 16) bbox; + OSALIGNSIMD(BBOX) bbox; calcBoundingBoxInt(vXi, vYi, bbox); // Intersect with scissor/viewport @@ -594,7 +594,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED; int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1; - OSALIGN(BBOX, 16) intersect; + OSALIGNSIMD(BBOX) intersect; intersect.left = std::max(bbox.left, macroBoxLeft); intersect.top = std::max(bbox.top, macroBoxTop); intersect.right = std::min(bbox.right, macroBoxRight); @@ -1047,7 +1047,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi { 50, 51, 54, 55, 58, 59, 62, 63 } }; - OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc; + OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc; // pull point information from triangle buffer // @todo use structs for readability @@ -1286,7 +1286,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi // make sure this macrotile intersects the triangle __m128i vXai = fpToFixedPoint(vXa); __m128i vYai = fpToFixedPoint(vYa); - OSALIGN(BBOX, 16) bboxA; + OSALIGNSIMD(BBOX) bboxA; calcBoundingBoxInt(vXai, vYai, bboxA); if (!(bboxA.left > macroBoxRight || diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 5752094ca10..50361068025 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -790,6 +790,7 @@ typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext); typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext); typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*); +typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar); ////////////////////////////////////////////////////////////////////////// /// FRONTEND_STATE diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 07bc94a1a54..4b7a207f366 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -68,7 +68,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread #if defined(_WIN32) - SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; + static std::mutex m; + std::lock_guard<std::mutex> l(m); + + static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS]; DWORD bufSize = sizeof(buffer); BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize); @@ -288,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { // Cleanup memory allocations pDC->pArena->Reset(true); - pDC->pTileMgr->initialize(); + if (!pDC->isCompute) + { + pDC->pTileMgr->initialize(); + } if (pDC->cleanupState) { pDC->pState->pArena->Reset(true); @@ -302,10 +308,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) return result; } -INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) +INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued) { // increment our current draw id to the first incomplete draw - uint64_t drawEnqueued = GetEnqueuedDraw(pContext); + drawEnqueued = GetEnqueuedDraw(pContext); while (curDrawBE < drawEnqueued) { DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -313,8 +319,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) // If its not compute and FE is not done then break out of loop. if (!pDC->doneFE && !pDC->isCompute) break; - bool isWorkComplete = (pDC->isCompute) ? - pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete(); + bool isWorkComplete = pDC->isCompute ? + pDC->pDispatch->isWorkComplete() : + pDC->pTileMgr->isWorkComplete(); if (isWorkComplete) { @@ -355,7 +362,8 @@ void WorkOnFifoBE( { // Find the first incomplete draw that has pending work. If no such draw is found then // return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE. - if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) + uint64_t drawEnqueued = 0; + if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } @@ -370,7 +378,7 @@ void WorkOnFifoBE( // 2. If we're trying to work on draws after curDrawBE, we are restricted to // working on those macrotiles that are known to be complete in the prior draw to // maintain order. The locked tiles provides the history to ensures this. - for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i) + for (uint64_t i = curDrawBE; i < drawEnqueued; ++i) { DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; @@ -463,7 +471,7 @@ void WorkOnFifoBE( } } -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode) +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE) { // Try to grab the next DC from the ring uint64_t drawEnqueued = GetEnqueuedDraw(pContext); @@ -516,38 +524,44 @@ void WorkOnCompute( uint32_t workerId, uint64_t& curDrawBE) { - if (FindFirstIncompleteDraw(pContext, curDrawBE) == false) + uint64_t drawEnqueued = 0; + if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false) { return; } uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1; - DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT]; - if (pDC->isCompute == false) return; - - // check dependencies - if (CheckDependency(pContext, pDC, lastRetiredDraw)) + for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i) { - return; - } + DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT]; + if (pDC->isCompute == false) return; - SWR_ASSERT(pDC->pDispatch != nullptr); - DispatchQueue& queue = *pDC->pDispatch; + // check dependencies + if (CheckDependency(pContext, pDC, lastRetiredDraw)) + { + return; + } - // Is there any work remaining? - if (queue.getNumQueued() > 0) - { - uint32_t threadGroupId = 0; - while (queue.getWork(threadGroupId)) + SWR_ASSERT(pDC->pDispatch != nullptr); + DispatchQueue& queue = *pDC->pDispatch; + + // Is there any work remaining? + if (queue.getNumQueued() > 0) { - ProcessComputeBE(pDC, workerId, threadGroupId); + void* pSpillFillBuffer = nullptr; + uint32_t threadGroupId = 0; + while (queue.getWork(threadGroupId)) + { + ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer); - queue.finishedWork(); + queue.finishedWork(); + } } } } +template<bool IsFEThread, bool IsBEThread> DWORD workerThreadMain(LPVOID pData) { THREAD_DATA *pThreadData = (THREAD_DATA*)pData; @@ -631,25 +645,38 @@ DWORD workerThreadMain(LPVOID pData) } } - RDTSC_START(WorkerWorkOnFifoBE); - WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); - RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); + if (IsBEThread) + { + RDTSC_START(WorkerWorkOnFifoBE); + WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask); + RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0); - WorkOnCompute(pContext, workerId, curDrawBE); + WorkOnCompute(pContext, workerId, curDrawBE); + } - WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode); + if (IsFEThread) + { + WorkOnFifoFE(pContext, workerId, curDrawFE); + + if (!IsBEThread) + { + curDrawBE = curDrawFE; + } + } } return 0; } +template<> DWORD workerThreadMain<false, false>(LPVOID) = delete; +template <bool IsFEThread, bool IsBEThread> DWORD workerThreadInit(LPVOID pData) { #if defined(_WIN32) __try #endif // _WIN32 { - return workerThreadMain(pData); + return workerThreadMain<IsFEThread, IsBEThread>(pData); } #if defined(_WIN32) @@ -661,6 +688,7 @@ DWORD workerThreadInit(LPVOID pData) return 1; } +template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete; void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) { @@ -678,6 +706,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) uint32_t numCoresPerNode = numHWCoresPerNode; uint32_t numHyperThreads = numHWHyperThreads; + if (KNOB_MAX_WORKER_THREADS) + { + SET_KNOB(HYPERTHREADED_FE, false); + } + + if (KNOB_HYPERTHREADED_FE) + { + SET_KNOB(MAX_THREADS_PER_CORE, 0); + } + if (KNOB_MAX_NUMA_NODES) { numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES); @@ -693,6 +731,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE); } + if (numHyperThreads < 2) + { + SET_KNOB(HYPERTHREADED_FE, false); + } + // Calculate numThreads uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads; @@ -767,9 +810,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups; pPool->pThreadData[workerId].threadId = 0; pPool->pThreadData[workerId].numaId = 0; + pPool->pThreadData[workerId].coreId = 0; + pPool->pThreadData[workerId].htId = 0; pPool->pThreadData[workerId].pContext = pContext; pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); + + pContext->NumBEThreads++; + pContext->NumFEThreads++; } } else @@ -780,6 +828,10 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) for (uint32_t n = 0; n < numNodes; ++n) { auto& node = nodes[n]; + if (node.cores.size() == 0) + { + continue; + } uint32_t numCores = numCoresPerNode; for (uint32_t c = 0; c < numCores; ++c) @@ -797,8 +849,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool) pPool->pThreadData[workerId].procGroupId = core.procGroup; pPool->pThreadData[workerId].threadId = core.threadIds[t]; pPool->pThreadData[workerId].numaId = n; + pPool->pThreadData[workerId].coreId = c; + pPool->pThreadData[workerId].htId = t; pPool->pThreadData[workerId].pContext = pContext; - pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]); + + if (KNOB_HYPERTHREADED_FE) + { + if (t == 0) + { + pContext->NumBEThreads++; + pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]); + } + else + { + pContext->NumFEThreads++; + pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]); + } + } + else + { + pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]); + pContext->NumBEThreads++; + pContext->NumFEThreads++; + } ++workerId; } diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h index 821d7dcb16e..3aba6323a95 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.h +++ b/src/gallium/drivers/swr/rasterizer/core/threads.h @@ -41,6 +41,8 @@ struct THREAD_DATA uint32_t procGroupId; // Will always be 0 for non-Windows OS uint32_t threadId; // within the procGroup for Windows uint32_t numaId; // NUMA node id + uint32_t coreId; // Core id + uint32_t htId; // Hyperthread id uint32_t workerId; SWR_CONTEXT *pContext; bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set. @@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool); // Expose FE and BE worker functions to the API thread if single threaded -void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode); +void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE); void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask); void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE); int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp index 794577270cf..87d9f42c032 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp @@ -35,27 +35,6 @@ #define TILE_ID(x,y) ((x << 16 | y)) -// override new/delete for alignment -void *MacroTileMgr::operator new(size_t size) -{ - return _aligned_malloc(size, 64); -} - -void MacroTileMgr::operator delete(void *p) -{ - _aligned_free(p); -} - -void* DispatchQueue::operator new(size_t size) -{ - return _aligned_malloc(size, 64); -} - -void DispatchQueue::operator delete(void *p) -{ - _aligned_free(p); -} - MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena) { } @@ -304,7 +283,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile) void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID) { const API_STATE& state = GetApiState(pDC); - HotTileMgr *pHotTileMgr = pContext->pHotTileMgr; uint32_t x, y; MacroTileMgr::getTileIndices(macroID, x, y); diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h index aa561badc1c..82a15e16a33 100644 --- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h +++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h @@ -140,9 +140,6 @@ public: x = (tileID >> 16) & 0xffff; } - void *operator new(size_t size); - void operator delete (void *p); - private: CachingArena& mArena; std::unordered_map<uint32_t, MacroTileQueue> mTiles; @@ -229,9 +226,6 @@ public: return mpTaskData; } - void *operator new(size_t size); - void operator delete (void *p); - void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpet this. OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 }; @@ -272,7 +266,7 @@ class HotTileMgr public: HotTileMgr() { - memset(&mHotTiles[0][0], 0, sizeof(mHotTiles)); + memset(mHotTiles, 0, sizeof(mHotTiles)); // cache hottile size for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i) diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py index 0f3ded68544..3832b91d93e 100644 --- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py +++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py @@ -30,6 +30,18 @@ KNOBS = [ 'category' : 'debug', }], + ['HYPERTHREADED_FE', { + 'type' : 'bool', + 'default' : 'false', + 'desc' : ['EXPERIMENTAL!!', + 'If enabled will attempt to use secondary threads per core to perform', + 'front-end (VS/GS) work.', + '', + 'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'], + 'category' : 'perf', + 'advanced' : 'true', + }], + ['DUMP_SHADER_IR', { 'type' : 'bool', 'default' : 'false', @@ -166,6 +178,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_FETCH', { @@ -175,6 +188,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_IA', { @@ -184,6 +198,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_VS', { @@ -193,6 +208,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_SETUP_TRIS', { @@ -202,6 +218,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_BIN_TRIS', { @@ -211,6 +228,7 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }], ['TOSS_RS', { @@ -220,4 +238,5 @@ KNOBS = [ '', 'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'], 'category' : 'perf', + 'advanced' : 'true', }],] |