diff options
-rw-r--r-- | src/gallium/drivers/swr/Makefile.sources-arch | 1 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/common/os.h | 1 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/api.cpp | 153 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/context.h | 19 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/ringbuffer.h | 102 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/threads.cpp | 40 |
6 files changed, 180 insertions, 136 deletions
diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch index 6c105f46199..7544f8efccc 100644 --- a/src/gallium/drivers/swr/Makefile.sources-arch +++ b/src/gallium/drivers/swr/Makefile.sources-arch @@ -83,6 +83,7 @@ CORE_CXX_SOURCES := \ rasterizer/core/rasterizer.h \ rasterizer/core/rdtsc_core.cpp \ rasterizer/core/rdtsc_core.h \ + rasterizer/core/ringbuffer.h \ rasterizer/core/state.h \ rasterizer/core/threads.cpp \ rasterizer/core/threads.h \ diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index 522ae0dd65f..265b879e1cb 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -192,6 +192,7 @@ unsigned int _mm_popcnt_u32(unsigned int v) #define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange) #define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value) #define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1) +#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1) #define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1) #define _ReadWriteBarrier() asm volatile("" ::: "memory") #define __stdcall diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index c70b4fafedd..e18f9e7a811 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -61,11 +61,8 @@ HANDLE SwrCreateContext( pContext->driverType = pCreateInfo->driver; pContext->privateStateSize = pCreateInfo->privateStateSize; - pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT); - - pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64); - 
memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); + pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT); pContext->numSubContexts = pCreateInfo->maxSubContexts; if (pContext->numSubContexts > 1) @@ -77,7 +74,6 @@ HANDLE SwrCreateContext( for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) { pContext->dcRing[dc].pArena = new Arena(); - pContext->dcRing[dc].inUse = false; pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena)); pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen. @@ -108,9 +104,6 @@ HANDLE SwrCreateContext( pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4); } - pContext->nextDrawId = 1; - pContext->DrawEnqueued = 1; - // State setup AFTER context is fully initialized SetupDefaultState(pContext); @@ -148,8 +141,6 @@ void SwrDestroyContext(HANDLE hContext) _aligned_free(pContext->pScratch[i]); } - _aligned_free(pContext->dcRing); - _aligned_free(pContext->dsRing); _aligned_free(pContext->subCtxSave); delete(pContext->pHotTileMgr); @@ -168,49 +159,28 @@ void WakeAllThreads(SWR_CONTEXT *pContext) pContext->FifosNotEmpty.notify_all(); } -bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC) +template<bool IsDraw> +void QueueWork(SWR_CONTEXT *pContext) { - // For single thread nothing should still be drawing. - if (KNOB_SINGLE_THREADED) { return false; } - - if (pDC->isCompute) + if (IsDraw) { - if (pDC->doneCompute) - { - pDC->inUse = false; - return false; - } + // Each worker thread looks at a DC for both FE and BE work at different times and so we + // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers + // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and + // then moved on if all work is done.) + pContext->pCurDrawContext->threadsDone = + pContext->NumWorkerThreads ? 
pContext->NumWorkerThreads * 2 : 2; } - - // Check if backend work is done. First make sure all triangles have been binned. - if (pDC->doneFE == true) + else { - // ensure workers have all moved passed this draw - if (pDC->threadsDoneFE != pContext->NumWorkerThreads) - { - return true; - } - - if (pDC->threadsDoneBE != pContext->NumWorkerThreads) - { - return true; - } - - pDC->inUse = false; // all work is done. + pContext->pCurDrawContext->threadsDone = + pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1; } - return pDC->inUse; -} - -void QueueDraw(SWR_CONTEXT *pContext) -{ - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; - _ReadWriteBarrier(); { std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; + pContext->dcRing.Enqueue(); } if (KNOB_SINGLE_THREADED) @@ -219,10 +189,24 @@ void QueueDraw(SWR_CONTEXT *pContext) uint32_t mxcsr = _mm_getcsr(); _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - std::unordered_set<uint32_t> lockedTiles; - uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; - WorkOnFifoFE(pContext, 0, curDraw[0], 0); - WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + if (IsDraw) + { + std::unordered_set<uint32_t> lockedTiles; + uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId }; + WorkOnFifoFE(pContext, 0, curDraw[0], 0); + WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles); + } + else + { + uint64_t curDispatch = pContext->pCurDrawContext->drawId; + WorkOnCompute(pContext, 0, curDispatch); + } + + // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers). 
+ if (!pContext->dcRing.IsEmpty()) + { + pContext->dcRing.Dequeue(); + } // restore csr _mm_setcsr(mxcsr); @@ -239,40 +223,14 @@ void QueueDraw(SWR_CONTEXT *pContext) pContext->pCurDrawContext = nullptr; } -///@todo Combine this with QueueDraw -void QueueDispatch(SWR_CONTEXT *pContext) +INLINE void QueueDraw(SWR_CONTEXT* pContext) { - SWR_ASSERT(pContext->pCurDrawContext->inUse == false); - pContext->pCurDrawContext->inUse = true; - - _ReadWriteBarrier(); - { - std::unique_lock<std::mutex> lock(pContext->WaitLock); - pContext->DrawEnqueued++; - } - - if (KNOB_SINGLE_THREADED) - { - // flush denormals to 0 - uint32_t mxcsr = _mm_getcsr(); - _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON); - - uint64_t curDispatch = pContext->pCurDrawContext->drawId; - WorkOnCompute(pContext, 0, curDispatch); - - // restore csr - _mm_setcsr(mxcsr); - } - else - { - RDTSC_START(APIDrawWakeAllThreads); - WakeAllThreads(pContext); - RDTSC_STOP(APIDrawWakeAllThreads, 1, 0); - } + QueueWork<true>(pContext); +} - // Set current draw context to NULL so that next state call forces a new draw context to be created and populated. - pContext->pPrevDrawContext = pContext->pCurDrawContext; - pContext->pCurDrawContext = nullptr; +INLINE void QueueDispatch(SWR_CONTEXT* pContext) +{ + QueueWork<false>(pContext); } DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) @@ -281,17 +239,17 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) // If current draw context is null then need to obtain a new draw context to use from ring. if (pContext->pCurDrawContext == nullptr) { - uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT; - - DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; - pContext->pCurDrawContext = pCurDrawContext; - - // Need to wait until this draw context is available to use. - while (StillDrawing(pContext, pCurDrawContext)) + // Need to wait for a free entry. 
+ while (pContext->dcRing.IsFull()) { _mm_pause(); } + uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT; + + DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex]; + pContext->pCurDrawContext = pCurDrawContext; + // Assign next available entry in DS ring to this DC. uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT; pCurDrawContext->pState = &pContext->dsRing[dsIndex]; @@ -332,18 +290,15 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false) pCurDrawContext->pArena->Reset(); pCurDrawContext->pContext = pContext; pCurDrawContext->isCompute = false; // Dispatch has to set this to true. - pCurDrawContext->inUse = false; - pCurDrawContext->doneCompute = false; pCurDrawContext->doneFE = false; pCurDrawContext->FeLock = 0; - pCurDrawContext->threadsDoneFE = 0; - pCurDrawContext->threadsDoneBE = 0; + pCurDrawContext->threadsDone = 0; pCurDrawContext->pTileMgr->initialize(); // Assign unique drawId for this DC - pCurDrawContext->drawId = pContext->nextDrawId++; + pCurDrawContext->drawId = pContext->dcRing.GetHead(); } else { @@ -431,16 +386,12 @@ void SwrWaitForIdle(HANDLE hContext) SWR_CONTEXT *pContext = GetContext(hContext); RDTSC_START(APIWaitForIdle); - // Wait for all work to complete. 
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc) - { - DRAW_CONTEXT *pDC = &pContext->dcRing[dc]; - while (StillDrawing(pContext, pDC)) - { - _mm_pause(); - } + while (!pContext->dcRing.IsEmpty()) + { + _mm_pause(); } + RDTSC_STOP(APIWaitForIdle, 1, 0); } diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 4a214aff1c8..d75d9754e57 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -41,6 +41,7 @@ #include "core/knobs.h" #include "common/simdintrin.h" #include "core/threads.h" +#include "ringbuffer.h" // x.8 fixed point precision values #define FIXED_POINT_SHIFT 8 @@ -381,19 +382,14 @@ struct DRAW_CONTEXT FE_WORK FeWork; volatile OSALIGNLINE(uint32_t) FeLock; - volatile OSALIGNLINE(bool) inUse; volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw? - - // Have all worker threads moved past draw in DC ring? - volatile OSALIGNLINE(uint32_t) threadsDoneFE; - volatile OSALIGNLINE(uint32_t) threadsDoneBE; + volatile OSALIGNLINE(int64_t) threadsDone; uint64_t dependency; MacroTileMgr* pTileMgr; // The following fields are valid if isCompute is true. - volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute) DispatchQueue* pDispatch; // Queue for thread groups. (isCompute) DRAW_STATE* pState; @@ -438,7 +434,7 @@ struct SWR_CONTEXT // 3. State - When an applications sets state after draw // a. Same as step 1. // b. State is copied from prev draw context to current. - DRAW_CONTEXT* dcRing; + RingBuffer<DRAW_CONTEXT> dcRing; DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw. DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from. @@ -448,7 +444,7 @@ struct SWR_CONTEXT // These split draws all have identical state. 
So instead of storing the state directly // in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs // to reference a single entry in the DS ring. - DRAW_STATE* dsRing; + RingBuffer<DRAW_STATE> dsRing; uint32_t curStateId; // Current index to the next available entry in the DS ring. @@ -463,13 +459,6 @@ struct SWR_CONTEXT std::condition_variable FifosNotEmpty; std::mutex WaitLock; - // Draw Contexts will get a unique drawId generated from this - uint64_t nextDrawId; - - // most recent draw id enqueued by the API thread - // written by api thread, read by multiple workers - OSALIGNLINE(volatile uint64_t) DrawEnqueued; - DRIVER_TYPE driverType; uint32_t privateStateSize; diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h new file mode 100644 index 00000000000..e323136bc41 --- /dev/null +++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h @@ -0,0 +1,102 @@ +/**************************************************************************** +* Copyright (C) 2016 Intel Corporation. All Rights Reserved. +* +* Permission is hereby granted, free of charge, to any person obtaining a +* copy of this software and associated documentation files (the "Software"), +* to deal in the Software without restriction, including without limitation +* the rights to use, copy, modify, merge, publish, distribute, sublicense, +* and/or sell copies of the Software, and to permit persons to whom the +* Software is furnished to do so, subject to the following conditions: +* +* The above copyright notice and this permission notice (including the next +* paragraph) shall be included in all copies or substantial portions of the +* Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL +* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING +* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS +* IN THE SOFTWARE. +* +* @file ringbuffer.h +* +* @brief RingBuffer +* The RingBuffer class manages all aspects of the ring buffer including +* the head/tail indices, etc. +* +******************************************************************************/ +#pragma once + +template<typename T> +class RingBuffer +{ +public: + RingBuffer() + : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0) + { + } + + ~RingBuffer() + { + Destroy(); + } + + void Init(uint32_t numEntries) + { + SWR_ASSERT(numEntries > 0); + mNumEntries = numEntries; + mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64); + SWR_ASSERT(mpRingBuffer != nullptr); + memset(mpRingBuffer, 0, sizeof(T)*numEntries); + } + + void Destroy() + { + _aligned_free(mpRingBuffer); + mpRingBuffer = nullptr; + } + + T& operator[](const uint32_t index) + { + SWR_ASSERT(index < mNumEntries); + return mpRingBuffer[index]; + } + + INLINE void Enqueue() + { + mRingHead++; // There's only one producer. + } + + INLINE void Dequeue() + { + InterlockedIncrement(&mRingTail); // There are multiple consumers. + } + + INLINE bool IsEmpty() + { + return (GetHead() == GetTail()); + } + + INLINE bool IsFull() + { + ///@note We don't handle wrap case due to using 64-bit indices. + /// It would take 11 million years to wrap at 50,000 DCs per sec. + /// If we used 32-bit indices then it's about 23 hours to wrap. 
+ uint64_t numEnqueued = GetHead() - GetTail(); + SWR_ASSERT(numEnqueued <= mNumEntries); + + return (numEnqueued == mNumEntries); + } + + INLINE volatile uint64_t GetTail() { return mRingTail; } + INLINE volatile uint64_t GetHead() { return mRingHead; } + +private: + T* mpRingBuffer; + uint32_t mNumEntries; + + OSALIGNLINE(volatile uint64_t) mRingHead; // Producer Counter + OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer Counter +}; diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 24c5588bfec..8f0d9249ae0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -265,9 +265,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup= INLINE uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext) { - //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0); - //return result; - return pContext->DrawEnqueued; + return pContext->dcRing.GetHead(); } INLINE @@ -449,6 +447,18 @@ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macro } } +INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) +{ + int64_t result = InterlockedDecrement64(&pDC->threadsDone); + + if (result == 0) + { + _ReadWriteBarrier(); + + pContext->dcRing.Dequeue(); // Remove from tail + } +} + INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) { // increment our current draw id to the first incomplete draw @@ -466,7 +476,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE) if (isWorkComplete) { curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); + CompleteDrawContext(pContext, pDC); } else { @@ -579,7 +589,7 @@ void WorkOnFifoBE( { // We can increment the current BE and safely move to next draw since we know this draw is complete. 
curDrawBE++; - InterlockedIncrement(&pDC->threadsDoneBE); + CompleteDrawContext(pContext, pDC); lastRetiredDraw++; @@ -608,8 +618,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot]; if (pDC->isCompute || pDC->doneFE || pDC->FeLock) { + CompleteDrawContext(pContext, pDC); curDrawFE++; - InterlockedIncrement(&pDC->threadsDoneFE); } else { @@ -673,22 +683,12 @@ void WorkOnCompute( // Is there any work remaining? if (queue.getNumQueued() > 0) { - bool lastToComplete = false; - uint32_t threadGroupId = 0; while (queue.getWork(threadGroupId)) { ProcessComputeBE(pDC, workerId, threadGroupId); - lastToComplete = queue.finishedWork(); - } - - _ReadWriteBarrier(); - - if (lastToComplete) - { - SWR_ASSERT(queue.isWorkComplete() == true); - pDC->doneCompute = true; + queue.finishedWork(); } } } @@ -732,10 +732,10 @@ DWORD workerThreadMain(LPVOID pData) // the worker can safely increment its oldestDraw counter and move on to the next draw. std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock); - auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; }; + auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); }; - uint64_t curDrawBE = 1; - uint64_t curDrawFE = 1; + uint64_t curDrawBE = 0; + uint64_t curDrawFE = 0; while (pContext->threadPool.inThreadShutdown == false) { |