about summary refs log tree commit diff stats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2016-03-30 14:59:40 -0600
committerTim Rowley <[email protected]>2016-04-12 11:52:05 -0500
commita939a58881063c092a95bd7f1426b8fae1d8a44d (patch)
tree2c4b3ae2e959760badcca6d9464f836c45b1e2b5 /src/gallium/drivers
parent9a8146d0ff623ee26f17b9292176ab0a79ead374 (diff)
swr: [rasterizer core] Add experimental support for hyper-threaded front-end
Acked-by: Brian Paul <[email protected]>
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp8
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h38
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.cpp126
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.h4
-rw-r--r--src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py19
5 files changed, 139 insertions, 56 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 9e13ee142a5..665b6c0453f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -87,7 +87,10 @@ HANDLE SwrCreateContext(
// Calling createThreadPool() above can set SINGLE_THREADED
if (KNOB_SINGLE_THREADED)
{
+ SET_KNOB(HYPERTHREADED_FE, false);
pContext->NumWorkerThreads = 1;
+ pContext->NumFEThreads = 1;
+ pContext->NumBEThreads = 1;
}
// Allocate scratch space for workers.
@@ -177,8 +180,7 @@ void QueueWork(SWR_CONTEXT *pContext)
// multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
// have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
// then moved on if all work is done.)
- pContext->pCurDrawContext->threadsDone =
- pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
+ pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
_ReadWriteBarrier();
{
@@ -196,7 +198,7 @@ void QueueWork(SWR_CONTEXT *pContext)
{
static TileSet lockedTiles;
uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
- WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoFE(pContext, 0, curDraw[0]);
WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
}
else
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 27abe437718..2c28286b5ad 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -382,32 +382,28 @@ struct DRAW_STATE
// This draw context maintains all of the state needed for the draw operation.
struct DRAW_CONTEXT
{
- SWR_CONTEXT *pContext;
+ SWR_CONTEXT* pContext;
+ uint64_t drawId;
+ MacroTileMgr* pTileMgr;
+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ uint64_t dependency;
+ DRAW_STATE* pState;
+ CachingArena* pArena;
- uint64_t drawId;
+ bool isCompute; // Is this DC a compute context?
+ bool cleanupState; // True if this is the last draw using an entry in the state ring.
+ volatile bool doneFE; // Is FE work done for this draw?
- bool isCompute; // Is this DC a compute context?
+ volatile OSALIGNLINE(uint32_t) FeLock;
+ volatile int64_t threadsDone;
- FE_WORK FeWork;
- volatile OSALIGNLINE(uint32_t) FeLock;
- volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
- volatile OSALIGNLINE(int64_t) threadsDone;
+ OSALIGNLINE(FE_WORK) FeWork;
+ uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
- uint64_t dependency;
-
- MacroTileMgr* pTileMgr;
-
- // The following fields are valid if isCompute is true.
- DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
-
- DRAW_STATE* pState;
- CachingArena* pArena;
-
- uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
-
- bool cleanupState; // True if this is the last draw using an entry in the state ring.
};
+static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
+
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
@@ -459,6 +455,8 @@ struct SWR_CONTEXT
uint32_t curStateId; // Current index to the next available entry in the DS ring.
uint32_t NumWorkerThreads;
+ uint32_t NumFEThreads;
+ uint32_t NumBEThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 056003e467c..bee1e138002 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -305,10 +305,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
return result;
}
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
{
// increment our current draw id to the first incomplete draw
- uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+ drawEnqueued = GetEnqueuedDraw(pContext);
while (curDrawBE < drawEnqueued)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -316,8 +316,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
// If its not compute and FE is not done then break out of loop.
if (!pDC->doneFE && !pDC->isCompute) break;
- bool isWorkComplete = (pDC->isCompute) ?
- pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+ bool isWorkComplete = pDC->isCompute ?
+ pDC->pDispatch->isWorkComplete() :
+ pDC->pTileMgr->isWorkComplete();
if (isWorkComplete)
{
@@ -358,7 +359,8 @@ void WorkOnFifoBE(
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
- if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+ uint64_t drawEnqueued = 0;
+ if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
{
return;
}
@@ -373,7 +375,7 @@ void WorkOnFifoBE(
// 2. If we're trying to work on draws after curDrawBE, we are restricted to
// working on those macrotiles that are known to be complete in the prior draw to
// maintain order. The locked tiles provides the history to ensures this.
- for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+ for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -466,7 +468,7 @@ void WorkOnFifoBE(
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -519,38 +521,43 @@ void WorkOnCompute(
uint32_t workerId,
uint64_t& curDrawBE)
{
- if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+ uint64_t drawEnqueued = 0;
+ if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
{
return;
}
uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
- DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
- if (pDC->isCompute == false) return;
-
- // check dependencies
- if (CheckDependency(pContext, pDC, lastRetiredDraw))
+ for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i)
{
- return;
- }
+ DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+ if (pDC->isCompute == false) return;
+
+ // check dependencies
+ if (CheckDependency(pContext, pDC, lastRetiredDraw))
+ {
+ return;
+ }
- SWR_ASSERT(pDC->pDispatch != nullptr);
- DispatchQueue& queue = *pDC->pDispatch;
+ SWR_ASSERT(pDC->pDispatch != nullptr);
+ DispatchQueue& queue = *pDC->pDispatch;
- // Is there any work remaining?
- if (queue.getNumQueued() > 0)
- {
- uint32_t threadGroupId = 0;
- while (queue.getWork(threadGroupId))
+ // Is there any work remaining?
+ if (queue.getNumQueued() > 0)
{
- ProcessComputeBE(pDC, workerId, threadGroupId);
+ uint32_t threadGroupId = 0;
+ while (queue.getWork(threadGroupId))
+ {
+ ProcessComputeBE(pDC, workerId, threadGroupId);
- queue.finishedWork();
+ queue.finishedWork();
+ }
}
}
}
+template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
@@ -634,25 +641,38 @@ DWORD workerThreadMain(LPVOID pData)
}
}
- RDTSC_START(WorkerWorkOnFifoBE);
- WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
- RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+ if (IsBEThread)
+ {
+ RDTSC_START(WorkerWorkOnFifoBE);
+ WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+ RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
- WorkOnCompute(pContext, workerId, curDrawBE);
+ WorkOnCompute(pContext, workerId, curDrawBE);
+ }
+
+ if (IsFEThread)
+ {
+ WorkOnFifoFE(pContext, workerId, curDrawFE);
- WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
+ if (!IsBEThread)
+ {
+ curDrawBE = curDrawFE;
+ }
+ }
}
return 0;
}
+template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
+template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_WIN32)
__try
#endif // _WIN32
{
- return workerThreadMain(pData);
+ return workerThreadMain<IsFEThread, IsBEThread>(pData);
}
#if defined(_WIN32)
@@ -664,6 +684,7 @@ DWORD workerThreadInit(LPVOID pData)
return 1;
}
+template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
@@ -681,6 +702,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
+ if (KNOB_MAX_WORKER_THREADS)
+ {
+ SET_KNOB(HYPERTHREADED_FE, false);
+ }
+
+ if (KNOB_HYPERTHREADED_FE)
+ {
+ SET_KNOB(MAX_THREADS_PER_CORE, 0);
+ }
+
if (KNOB_MAX_NUMA_NODES)
{
numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
@@ -696,6 +727,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
}
+ if (numHyperThreads < 2)
+ {
+ SET_KNOB(HYPERTHREADED_FE, false);
+ }
+
// Calculate numThreads
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
@@ -770,9 +806,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
pPool->pThreadData[workerId].threadId = 0;
pPool->pThreadData[workerId].numaId = 0;
+ pPool->pThreadData[workerId].coreId = 0;
+ pPool->pThreadData[workerId].htId = 0;
pPool->pThreadData[workerId].pContext = pContext;
pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
- pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+ pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+
+ pContext->NumBEThreads++;
+ pContext->NumFEThreads++;
}
}
else
@@ -804,8 +845,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
pPool->pThreadData[workerId].procGroupId = core.procGroup;
pPool->pThreadData[workerId].threadId = core.threadIds[t];
pPool->pThreadData[workerId].numaId = n;
+ pPool->pThreadData[workerId].coreId = c;
+ pPool->pThreadData[workerId].htId = t;
pPool->pThreadData[workerId].pContext = pContext;
- pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+
+ if (KNOB_HYPERTHREADED_FE)
+ {
+ if (t == 0)
+ {
+ pContext->NumBEThreads++;
+ pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
+ }
+ else
+ {
+ pContext->NumFEThreads++;
+ pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
+ }
+ }
+ else
+ {
+ pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+ pContext->NumBEThreads++;
+ pContext->NumFEThreads++;
+ }
++workerId;
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 821d7dcb16e..3aba6323a95 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -41,6 +41,8 @@ struct THREAD_DATA
uint32_t procGroupId; // Will always be 0 for non-Windows OS
uint32_t threadId; // within the procGroup for Windows
uint32_t numaId; // NUMA node id
+ uint32_t coreId; // Core id
+ uint32_t htId; // Hyperthread id
uint32_t workerId;
SWR_CONTEXT *pContext;
bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
@@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC); \ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 0f3ded68544..3832b91d93e 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -30,6 +30,18 @@ KNOBS = [
'category' : 'debug',
}],
+ ['HYPERTHREADED_FE', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['EXPERIMENTAL!!',
+ 'If enabled will attempt to use secondary threads per core to perform',
+ 'front-end (VS/GS) work.',
+ '',
+ 'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
+ 'category' : 'perf',
+ 'advanced' : 'true',
+ }],
+
['DUMP_SHADER_IR', {
'type' : 'bool',
'default' : 'false',
@@ -166,6 +178,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_FETCH', {
@@ -175,6 +188,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_IA', {
@@ -184,6 +198,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_VS', {
@@ -193,6 +208,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_SETUP_TRIS', {
@@ -202,6 +218,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_BIN_TRIS', {
@@ -211,6 +228,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_RS', {
@@ -220,4 +238,5 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],]