summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/drivers/swr/Makefile.sources-arch1
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/os.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp153
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h19
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/ringbuffer.h102
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.cpp40
6 files changed, 180 insertions, 136 deletions
diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 6c105f46199..7544f8efccc 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -83,6 +83,7 @@ CORE_CXX_SOURCES := \
rasterizer/core/rasterizer.h \
rasterizer/core/rdtsc_core.cpp \
rasterizer/core/rdtsc_core.h \
+ rasterizer/core/ringbuffer.h \
rasterizer/core/state.h \
rasterizer/core/threads.cpp \
rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 522ae0dd65f..265b879e1cb 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -192,6 +192,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
#define __stdcall
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index c70b4fafedd..e18f9e7a811 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -61,11 +61,8 @@ HANDLE SwrCreateContext(
pContext->driverType = pCreateInfo->driver;
pContext->privateStateSize = pCreateInfo->privateStateSize;
- pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
- pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
pContext->numSubContexts = pCreateInfo->maxSubContexts;
if (pContext->numSubContexts > 1)
@@ -77,7 +74,6 @@ HANDLE SwrCreateContext(
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
{
pContext->dcRing[dc].pArena = new Arena();
- pContext->dcRing[dc].inUse = false;
pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
@@ -108,9 +104,6 @@ HANDLE SwrCreateContext(
pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
}
- pContext->nextDrawId = 1;
- pContext->DrawEnqueued = 1;
-
// State setup AFTER context is fully initialized
SetupDefaultState(pContext);
@@ -148,8 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
_aligned_free(pContext->pScratch[i]);
}
- _aligned_free(pContext->dcRing);
- _aligned_free(pContext->dsRing);
_aligned_free(pContext->subCtxSave);
delete(pContext->pHotTileMgr);
@@ -168,49 +159,28 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
pContext->FifosNotEmpty.notify_all();
}
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
{
- // For single thread nothing should still be drawing.
- if (KNOB_SINGLE_THREADED) { return false; }
-
- if (pDC->isCompute)
+ if (IsDraw)
{
- if (pDC->doneCompute)
- {
- pDC->inUse = false;
- return false;
- }
+ // Each worker thread looks at a DC for both FE and BE work at different times and so we
+ // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
+ // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+ // then moved on if all work is done.)
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
}
-
- // Check if backend work is done. First make sure all triangles have been binned.
- if (pDC->doneFE == true)
+ else
{
- // ensure workers have all moved passed this draw
- if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- pDC->inUse = false; // all work is done.
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
}
- return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
-{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
_ReadWriteBarrier();
{
std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
+ pContext->dcRing.Enqueue();
}
if (KNOB_SINGLE_THREADED)
@@ -219,10 +189,24 @@ void QueueDraw(SWR_CONTEXT *pContext)
uint32_t mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
- std::unordered_set<uint32_t> lockedTiles;
- uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
- WorkOnFifoFE(pContext, 0, curDraw[0], 0);
- WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ if (IsDraw)
+ {
+ std::unordered_set<uint32_t> lockedTiles;
+ uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+ WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ }
+ else
+ {
+ uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+ WorkOnCompute(pContext, 0, curDispatch);
+ }
+
+ // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+ if (!pContext->dcRing.IsEmpty())
+ {
+ pContext->dcRing.Dequeue();
+ }
// restore csr
_mm_setcsr(mxcsr);
@@ -239,40 +223,14 @@ void QueueDraw(SWR_CONTEXT *pContext)
pContext->pCurDrawContext = nullptr;
}
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
- _ReadWriteBarrier();
- {
- std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
- }
-
- if (KNOB_SINGLE_THREADED)
- {
- // flush denormals to 0
- uint32_t mxcsr = _mm_getcsr();
- _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
- uint64_t curDispatch = pContext->pCurDrawContext->drawId;
- WorkOnCompute(pContext, 0, curDispatch);
-
- // restore csr
- _mm_setcsr(mxcsr);
- }
- else
- {
- RDTSC_START(APIDrawWakeAllThreads);
- WakeAllThreads(pContext);
- RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
- }
+ QueueWork<true>(pContext);
+}
- // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
- pContext->pPrevDrawContext = pContext->pCurDrawContext;
- pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+ QueueWork<false>(pContext);
}
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
@@ -281,17 +239,17 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
// If current draw context is null then need to obtain a new draw context to use from ring.
if (pContext->pCurDrawContext == nullptr)
{
- uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
- DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
- pContext->pCurDrawContext = pCurDrawContext;
-
- // Need to wait until this draw context is available to use.
- while (StillDrawing(pContext, pCurDrawContext))
+ // Need to wait for a free entry.
+ while (pContext->dcRing.IsFull())
{
_mm_pause();
}
+ uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+ pContext->pCurDrawContext = pCurDrawContext;
+
// Assign next available entry in DS ring to this DC.
uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
pCurDrawContext->pState = &pContext->dsRing[dsIndex];
@@ -332,18 +290,15 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
pCurDrawContext->pArena->Reset();
pCurDrawContext->pContext = pContext;
pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
- pCurDrawContext->inUse = false;
- pCurDrawContext->doneCompute = false;
pCurDrawContext->doneFE = false;
pCurDrawContext->FeLock = 0;
- pCurDrawContext->threadsDoneFE = 0;
- pCurDrawContext->threadsDoneBE = 0;
+ pCurDrawContext->threadsDone = 0;
pCurDrawContext->pTileMgr->initialize();
// Assign unique drawId for this DC
- pCurDrawContext->drawId = pContext->nextDrawId++;
+ pCurDrawContext->drawId = pContext->dcRing.GetHead();
}
else
{
@@ -431,16 +386,12 @@ void SwrWaitForIdle(HANDLE hContext)
SWR_CONTEXT *pContext = GetContext(hContext);
RDTSC_START(APIWaitForIdle);
- // Wait for all work to complete.
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
- while (StillDrawing(pContext, pDC))
- {
- _mm_pause();
- }
+ while (!pContext->dcRing.IsEmpty())
+ {
+ _mm_pause();
}
+
RDTSC_STOP(APIWaitForIdle, 1, 0);
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 4a214aff1c8..d75d9754e57 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -41,6 +41,7 @@
#include "core/knobs.h"
#include "common/simdintrin.h"
#include "core/threads.h"
+#include "ringbuffer.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
@@ -381,19 +382,14 @@ struct DRAW_CONTEXT
FE_WORK FeWork;
volatile OSALIGNLINE(uint32_t) FeLock;
- volatile OSALIGNLINE(bool) inUse;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
-
- // Have all worker threads moved past draw in DC ring?
- volatile OSALIGNLINE(uint32_t) threadsDoneFE;
- volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+ volatile OSALIGNLINE(int64_t) threadsDone;
uint64_t dependency;
MacroTileMgr* pTileMgr;
// The following fields are valid if isCompute is true.
- volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute)
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
DRAW_STATE* pState;
@@ -438,7 +434,7 @@ struct SWR_CONTEXT
// 3. State - When an applications sets state after draw
// a. Same as step 1.
// b. State is copied from prev draw context to current.
- DRAW_CONTEXT* dcRing;
+ RingBuffer<DRAW_CONTEXT> dcRing;
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
@@ -448,7 +444,7 @@ struct SWR_CONTEXT
// These split draws all have identical state. So instead of storing the state directly
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
- DRAW_STATE* dsRing;
+ RingBuffer<DRAW_STATE> dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
@@ -463,13 +459,6 @@ struct SWR_CONTEXT
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
- // Draw Contexts will get a unique drawId generated from this
- uint64_t nextDrawId;
-
- // most recent draw id enqueued by the API thread
- // written by api thread, read by multiple workers
- OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
DRIVER_TYPE driverType;
uint32_t privateStateSize;
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
new file mode 100644
index 00000000000..e323136bc41
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file arena.h
+*
+* @brief RingBuffer
+* The RingBuffer class manages all aspects of the ring buffer including
+* the head/tail indices, etc.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer
+{
+public:
+ RingBuffer()
+ : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+ {
+ }
+
+ ~RingBuffer()
+ {
+ Destroy();
+ }
+
+ void Init(uint32_t numEntries)
+ {
+ SWR_ASSERT(numEntries > 0);
+ mNumEntries = numEntries;
+ mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64);
+ SWR_ASSERT(mpRingBuffer != nullptr);
+ memset(mpRingBuffer, 0, sizeof(T)*numEntries);
+ }
+
+ void Destroy()
+ {
+ _aligned_free(mpRingBuffer);
+ mpRingBuffer = nullptr;
+ }
+
+ T& operator[](const uint32_t index)
+ {
+ SWR_ASSERT(index < mNumEntries);
+ return mpRingBuffer[index];
+ }
+
+ INLINE void Enqueue()
+ {
+ mRingHead++; // There's only one producer.
+ }
+
+ INLINE void Dequeue()
+ {
+ InterlockedIncrement(&mRingTail); // There are multiple consumers.
+ }
+
+ INLINE bool IsEmpty()
+ {
+ return (GetHead() == GetTail());
+ }
+
+ INLINE bool IsFull()
+ {
+ ///@note We don't handle wrap case due to using 64-bit indices.
+ /// It would take 11 million years to wrap at 50,000 DCs per sec.
+ /// If we used 32-bit indices then its about 23 hours to wrap.
+ uint64_t numEnqueued = GetHead() - GetTail();
+ SWR_ASSERT(numEnqueued <= mNumEntries);
+
+ return (numEnqueued == mNumEntries);
+ }
+
+ INLINE volatile uint64_t GetTail() { return mRingTail; }
+ INLINE volatile uint64_t GetHead() { return mRingHead; }
+
+private:
+ T* mpRingBuffer;
+ uint32_t mNumEntries;
+
+ OSALIGNLINE(volatile uint64_t) mRingHead; // Consumer Counter
+ OSALIGNLINE(volatile uint64_t) mRingTail; // Producer Counter
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 24c5588bfec..8f0d9249ae0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -265,9 +265,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
INLINE
uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
- //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
- //return result;
- return pContext->DrawEnqueued;
+ return pContext->dcRing.GetHead();
}
INLINE
@@ -449,6 +447,18 @@ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macro
}
}
+INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
+{
+ int64_t result = InterlockedDecrement64(&pDC->threadsDone);
+
+ if (result == 0)
+ {
+ _ReadWriteBarrier();
+
+ pContext->dcRing.Dequeue(); // Remove from tail
+ }
+}
+
INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
{
// increment our current draw id to the first incomplete draw
@@ -466,7 +476,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
if (isWorkComplete)
{
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
}
else
{
@@ -579,7 +589,7 @@ void WorkOnFifoBE(
{
// We can increment the current BE and safely move to next draw since we know this draw is complete.
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
lastRetiredDraw++;
@@ -608,8 +618,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE,
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
{
+ CompleteDrawContext(pContext, pDC);
curDrawFE++;
- InterlockedIncrement(&pDC->threadsDoneFE);
}
else
{
@@ -673,22 +683,12 @@ void WorkOnCompute(
// Is there any work remaining?
if (queue.getNumQueued() > 0)
{
- bool lastToComplete = false;
-
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
{
ProcessComputeBE(pDC, workerId, threadGroupId);
- lastToComplete = queue.finishedWork();
- }
-
- _ReadWriteBarrier();
-
- if (lastToComplete)
- {
- SWR_ASSERT(queue.isWorkComplete() == true);
- pDC->doneCompute = true;
+ queue.finishedWork();
}
}
}
@@ -732,10 +732,10 @@ DWORD workerThreadMain(LPVOID pData)
// the worker can safely increment its oldestDraw counter and move on to the next draw.
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
- auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+ auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
- uint64_t curDrawBE = 1;
- uint64_t curDrawFE = 1;
+ uint64_t curDrawBE = 0;
+ uint64_t curDrawFE = 0;
while (pContext->threadPool.inThreadShutdown == false)
{