summaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2017-12-11 17:45:58 -0600
committerTim Rowley <[email protected]>2017-12-15 10:56:46 -0600
commit20f9006603139a479b756c593c04a540041e3471 (patch)
tree2aa8830b47bd0727f28124f98fc6fc3683d3c682 /src/gallium/drivers
parent182cc51a50492926ebf72d4cd38f1e574c768e72 (diff)
swr/rast: Rework thread binding parameters for machine partitioning
Add BASE_NUMA_NODE, BASE_CORE, BASE_THREAD parameters to SwrCreateContext.

Add optional SWR_API_THREADING_INFO parameter to SwrCreateContext to control reservation of API threads.

Add SwrBindApiThread() function to allow binding of API threads to reserved HW threads.

Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r--src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py29
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.cpp40
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/api.h33
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.cpp299
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.h4
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp4
7 files changed, 322 insertions, 88 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
index 09e31246021..30803927e3c 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/knob_defs.py
@@ -62,15 +62,33 @@ KNOBS = [
'category' : 'perf',
}],
- ['MAX_NUMA_NODES', {
+ ['BASE_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
+ 'desc' : ['Starting NUMA node index to use when allocating compute resources.',
+ 'Setting this to a non-zero value will reduce the maximum # of NUMA nodes used.'],
+ 'category' : 'perf',
+ 'advanced' : True,
+ }],
+
+ ['MAX_NUMA_NODES', {
+ 'type' : 'uint32_t',
+ 'default' : '1' if sys.platform == 'win32' else '0',
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
'category' : 'perf',
}],
+ ['BASE_CORE', {
+ 'type' : 'uint32_t',
+ 'default' : '0',
+ 'desc' : ['Starting core index to use when allocating compute resources.',
+ 'Setting this to a non-zero value will reduce the maximum # of cores used.'],
+ 'category' : 'perf',
+ 'advanced' : True,
+ }],
+
['MAX_CORES_PER_NUMA_NODE', {
'type' : 'uint32_t',
'default' : '0',
@@ -80,6 +98,15 @@ KNOBS = [
'category' : 'perf',
}],
+ ['BASE_THREAD', {
+ 'type' : 'uint32_t',
+ 'default' : '0',
+ 'desc' : ['Starting thread index to use when allocating compute resources.',
+ 'Setting this to a non-zero value will reduce the maximum # of threads used.'],
+ 'category' : 'perf',
+ 'advanced' : True,
+ }],
+
['MAX_THREADS_PER_CORE', {
'type' : 'uint32_t',
'default' : '1',
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index 9265440904f..25a3f348411 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -95,16 +95,32 @@ HANDLE SwrCreateContext(
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
}
- pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
- pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
- pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
- pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
- pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
-
if (pCreateInfo->pThreadInfo)
{
pContext->threadInfo = *pCreateInfo->pThreadInfo;
}
+ else
+ {
+ pContext->threadInfo.MAX_WORKER_THREADS = KNOB_MAX_WORKER_THREADS;
+ pContext->threadInfo.BASE_NUMA_NODE = KNOB_BASE_NUMA_NODE;
+ pContext->threadInfo.BASE_CORE = KNOB_BASE_CORE;
+ pContext->threadInfo.BASE_THREAD = KNOB_BASE_THREAD;
+ pContext->threadInfo.MAX_NUMA_NODES = KNOB_MAX_NUMA_NODES;
+ pContext->threadInfo.MAX_CORES_PER_NUMA_NODE = KNOB_MAX_CORES_PER_NUMA_NODE;
+ pContext->threadInfo.MAX_THREADS_PER_CORE = KNOB_MAX_THREADS_PER_CORE;
+ pContext->threadInfo.SINGLE_THREADED = KNOB_SINGLE_THREADED;
+ }
+
+ if (pCreateInfo->pApiThreadInfo)
+ {
+ pContext->apiThreadInfo = *pCreateInfo->pApiThreadInfo;
+ }
+ else
+ {
+ pContext->apiThreadInfo.bindAPIThread0 = true;
+ pContext->apiThreadInfo.numAPIReservedThreads = 1;
+ pContext->apiThreadInfo.numAPIThreadsPerCore = 1;
+ }
memset(&pContext->WaitLock, 0, sizeof(pContext->WaitLock));
memset(&pContext->FifosNotEmpty, 0, sizeof(pContext->FifosNotEmpty));
@@ -113,6 +129,11 @@ HANDLE SwrCreateContext(
CreateThreadPool(pContext, &pContext->threadPool);
+ if (pContext->apiThreadInfo.bindAPIThread0)
+ {
+ BindApiThread(pContext, 0);
+ }
+
pContext->ppScratch = new uint8_t*[pContext->NumWorkerThreads];
pContext->pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * pContext->NumWorkerThreads, 64);
@@ -407,6 +428,12 @@ void SwrDestroyContext(HANDLE hContext)
AlignedFree(GetContext(hContext));
}
+void SwrBindApiThread(HANDLE hContext, uint32_t apiThreadId)
+{
+ SWR_CONTEXT *pContext = GetContext(hContext);
+ BindApiThread(pContext, apiThreadId);
+}
+
void SWR_API SwrSaveState(
HANDLE hContext,
void* pOutputStateBlock,
@@ -1688,6 +1715,7 @@ void SwrGetInterface(SWR_INTERFACE &out_funcs)
{
out_funcs.pfnSwrCreateContext = SwrCreateContext;
out_funcs.pfnSwrDestroyContext = SwrDestroyContext;
+ out_funcs.pfnSwrBindApiThread = SwrBindApiThread;
out_funcs.pfnSwrSaveState = SwrSaveState;
out_funcs.pfnSwrRestoreState = SwrRestoreState;
out_funcs.pfnSwrSync = SwrSync;
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index c032b0bb103..7247fa4215f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -181,6 +181,9 @@ class BucketManager;
/////////////////////////////////////////////////////////////////////////
struct SWR_THREADING_INFO
{
+ uint32_t BASE_NUMA_NODE;
+ uint32_t BASE_CORE;
+ uint32_t BASE_THREAD;
uint32_t MAX_WORKER_THREADS;
uint32_t MAX_NUMA_NODES;
uint32_t MAX_CORES_PER_NUMA_NODE;
@@ -189,6 +192,24 @@ struct SWR_THREADING_INFO
};
//////////////////////////////////////////////////////////////////////////
+/// SWR_API_THREADING_INFO
+/// Data used to reserve HW threads for API use
+/// API Threads are reserved from numa nodes / cores used for
+/// SWR Worker threads. Specifying reserved threads here can reduce
+/// the total number of SWR worker threads.
+/////////////////////////////////////////////////////////////////////////
+struct SWR_API_THREADING_INFO
+{
+ uint32_t numAPIReservedThreads; // Default is 1 if SWR_API_THREADING_INFO is not sent
+ uint32_t bindAPIThread0; // Default is true if numAPIReservedThreads is > 0,
+ // binds thread used in SwrCreateContext to API Reserved
+ // thread 0
+ uint32_t numAPIThreadsPerCore; // 0 - means use all threads per core, else clamp to this number.
+ // Independent of KNOB_MAX_THREADS_PER_CORE.
+};
+
+
+//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
struct SWR_CREATECONTEXT_INFO
@@ -219,6 +240,9 @@ struct SWR_CREATECONTEXT_INFO
// Input (optional): Threading info that overrides any set KNOB values.
SWR_THREADING_INFO* pThreadInfo;
+ // Input (optional): Info for reserving API threads
+ SWR_API_THREADING_INFO* pApiThreadInfo;
+
// Input: if set to non-zero value, overrides KNOB value for maximum
// number of draws in flight
uint32_t MAX_DRAWS_IN_FLIGHT;
@@ -237,6 +261,14 @@ SWR_FUNC(void, SwrDestroyContext,
HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
+/// @brief Bind current thread to an API reserved HW thread
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param apiThreadId - index of reserved HW thread to bind to.
+SWR_FUNC(void, SwrBindApiThread,
+ HANDLE hContext,
+ uint32_t apiThreadId);
+
+//////////////////////////////////////////////////////////////////////////
/// @brief Saves API state associated with hContext
/// @param hContext - Handle passed back from SwrCreateContext
/// @param pOutputStateBlock - Memory block to receive API state data
@@ -720,6 +752,7 @@ struct SWR_INTERFACE
{
PFNSwrCreateContext pfnSwrCreateContext;
PFNSwrDestroyContext pfnSwrDestroyContext;
+ PFNSwrBindApiThread pfnSwrBindApiThread;
PFNSwrSaveState pfnSwrSaveState;
PFNSwrRestoreState pfnSwrRestoreState;
PFNSwrSync pfnSwrSync;
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index cba8de999be..6a63838eb5d 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -480,6 +480,7 @@ struct SWR_CONTEXT
THREAD_POOL threadPool; // Thread pool associated with this context
SWR_THREADING_INFO threadInfo;
+ SWR_API_THREADING_INFO apiThreadInfo;
uint32_t MAX_DRAWS_IN_FLIGHT;
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 6242cb3fc7c..d684ffe7278 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -284,13 +284,20 @@ void bindThread(SWR_CONTEXT* pContext, uint32_t threadId, uint32_t procGroupId =
{
// If MAX_WORKER_THREADS is set, only bind to the proc group,
// Not the individual HW thread.
- if (!pContext->threadInfo.MAX_WORKER_THREADS)
+ if (!bindProcGroup && !pContext->threadInfo.MAX_WORKER_THREADS)
{
affinity.Mask = KAFFINITY(1) << threadId;
}
+ else
+ {
+ affinity.Mask = KAFFINITY(0);
+ }
}
- SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr);
+ if (!SetThreadGroupAffinity(GetCurrentThread(), &affinity, nullptr))
+ {
+ SWR_INVALID("Failed to set Thread Affinity");
+ }
#elif defined(__linux__) || defined(__gnu_linux__)
@@ -727,6 +734,29 @@ void WorkOnCompute(
}
}
+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId)
+{
+ if (nullptr == pContext)
+ {
+ return;
+ }
+
+ if (apiThreadId >= pContext->threadPool.numReservedThreads)
+ {
+ if (pContext->threadPool.numReservedThreads)
+ {
+ const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[0];
+ // Just bind to the process group used for API thread 0
+ bindThread(pContext, 0, threadData.procGroupId, true);
+ }
+ return;
+ }
+
+ const THREAD_DATA &threadData = pContext->threadPool.pApiThreadData[apiThreadId];
+
+ bindThread(pContext, threadData.threadId, threadData.procGroupId, threadData.forceBindProcGroup);
+}
+
template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
@@ -752,7 +782,8 @@ DWORD workerThreadMain(LPVOID pData)
RDTSC_INIT(threadId);
- uint32_t numaNode = pThreadData->numaId;
+ // The NUMA index only needs to be relative to BASE_NUMA_NODE for the numaMask masking to be correct
+ uint32_t numaNode = pThreadData->numaId - pContext->threadInfo.BASE_NUMA_NODE;
uint32_t numaMask = pContext->threadPool.numaMask;
// flush denormals to 0
@@ -861,28 +892,50 @@ DWORD workerThreadInit(LPVOID pData)
}
template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
+static void InitPerThreadStats(SWR_CONTEXT* pContext, uint32_t numThreads)
+{
+ // Initialize DRAW_CONTEXT's per-thread stats
+ for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
+ {
+ pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
+ memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
+ }
+}
+
//////////////////////////////////////////////////////////////////////////
/// @brief Creates thread pool info but doesn't launch threads.
/// @param pContext - pointer to context
/// @param pPool - pointer to thread pool object.
void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
{
- bindThread(pContext, 0);
-
CPUNumaNodes nodes;
uint32_t numThreadsPerProcGroup = 0;
CalculateProcessorTopology(nodes, numThreadsPerProcGroup);
+ // Assumption: for asymmetric topologies, multi-threaded cores appear
+ // in the list before single-threaded cores. This appears to be true for
+ // Windows when the total number of HW threads is limited to 64.
uint32_t numHWNodes = (uint32_t)nodes.size();
uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
+#if defined(_WIN32) && !defined(_WIN64)
+ if (!pContext->threadInfo.MAX_WORKER_THREADS)
+ {
+ // Limit 32-bit windows to bindable HW threads only
+ if ((numHWCoresPerNode * numHWHyperThreads) > 32)
+ {
+ numHWCoresPerNode = 32 / numHWHyperThreads;
+ }
+ }
+#endif
+
// Calculate num HW threads. Due to asymmetric topologies, this is not
// a trivial multiplication.
uint32_t numHWThreads = 0;
- for (auto& node : nodes)
+ for (auto const& node : nodes)
{
- for (auto& core : node.cores)
+ for (auto const& core : node.cores)
{
numHWThreads += (uint32_t)core.threadIds.size();
}
@@ -892,14 +945,19 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
- if (pContext->threadInfo.MAX_NUMA_NODES)
+ // Calc used threads per-core
+ if (numHyperThreads > pContext->threadInfo.BASE_THREAD)
{
- numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
+ numHyperThreads -= pContext->threadInfo.BASE_THREAD;
}
-
- if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
+ else
{
- numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
+ SWR_ASSERT(
+ false,
+ "Cannot use BASE_THREAD value: %d, maxThreads: %d, reverting BASE_THREAD to 0",
+ pContext->threadInfo.BASE_THREAD,
+ numHyperThreads);
+ pContext->threadInfo.BASE_THREAD = 0;
}
if (pContext->threadInfo.MAX_THREADS_PER_CORE)
@@ -907,93 +965,139 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
numHyperThreads = std::min(numHyperThreads, pContext->threadInfo.MAX_THREADS_PER_CORE);
}
-#if defined(_WIN32) && !defined(_WIN64)
- if (!pContext->threadInfo.MAX_WORKER_THREADS)
+ // Prune any cores that don't support the number of threads
+ if (numHyperThreads > 1)
{
- // Limit 32-bit windows to bindable HW threads only
- if ((numCoresPerNode * numHWHyperThreads) > 32)
+ for (auto& node : nodes)
{
- numCoresPerNode = 32 / numHWHyperThreads;
+ uint32_t numUsableCores = 0;
+ for (auto& core : node.cores)
+ {
+ numUsableCores += (core.threadIds.size() >= numHyperThreads);
+ }
+ numCoresPerNode = std::min(numCoresPerNode, numUsableCores);
}
}
-#endif
-
- // Calculate numThreads
- uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
- numThreads = std::min(numThreads, numHWThreads);
- if (pContext->threadInfo.MAX_WORKER_THREADS)
+ // Calc used cores per NUMA node
+ if (numCoresPerNode > pContext->threadInfo.BASE_CORE)
{
- uint32_t maxHWThreads = numHWNodes * numHWCoresPerNode * numHWHyperThreads;
- numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, maxHWThreads);
+ numCoresPerNode -= pContext->threadInfo.BASE_CORE;
+ }
+ else
+ {
+ SWR_ASSERT(
+ false,
+ "Cannot use BASE_CORE value: %d, maxCores: %d, reverting BASE_CORE to 0",
+ pContext->threadInfo.BASE_CORE,
+ numCoresPerNode);
+ pContext->threadInfo.BASE_CORE = 0;
}
- uint32_t numAPIReservedThreads = 1;
-
+ if (pContext->threadInfo.MAX_CORES_PER_NUMA_NODE)
+ {
+ numCoresPerNode = std::min(numCoresPerNode, pContext->threadInfo.MAX_CORES_PER_NUMA_NODE);
+ }
- if (numThreads == 1)
+ // Calc used NUMA nodes
+ if (numNodes > pContext->threadInfo.BASE_NUMA_NODE)
{
- // If only 1 worker threads, try to move it to an available
- // HW thread. If that fails, use the API thread.
- if (numCoresPerNode < numHWCoresPerNode)
- {
- numCoresPerNode++;
- }
- else if (numHyperThreads < numHWHyperThreads)
- {
- numHyperThreads++;
- }
- else if (numNodes < numHWNodes)
- {
- numNodes++;
- }
- else
- {
- pContext->threadInfo.SINGLE_THREADED = true;
- }
+ numNodes -= pContext->threadInfo.BASE_NUMA_NODE;
}
else
{
- // Save HW threads for the API if we can
- if (numThreads > numAPIReservedThreads)
- {
- numThreads -= numAPIReservedThreads;
- }
- else
- {
- numAPIReservedThreads = 0;
- }
+ SWR_ASSERT(
+ false,
+ "Cannot use BASE_NUMA_NODE value: %d, maxNodes: %d, reverting BASE_NUMA_NODE to 0",
+ pContext->threadInfo.BASE_NUMA_NODE,
+ numNodes);
+ pContext->threadInfo.BASE_NUMA_NODE = 0;
}
- if (pContext->threadInfo.SINGLE_THREADED)
+ if (pContext->threadInfo.MAX_NUMA_NODES)
{
- numThreads = 1;
+ numNodes = std::min(numNodes, pContext->threadInfo.MAX_NUMA_NODES);
}
- // Initialize DRAW_CONTEXT's per-thread stats
- for (uint32_t dc = 0; dc < pContext->MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- pContext->dcRing[dc].dynState.pStats = (SWR_STATS*)AlignedMalloc(sizeof(SWR_STATS) * numThreads, 64);
- memset(pContext->dcRing[dc].dynState.pStats, 0, sizeof(SWR_STATS) * numThreads);
- }
+ // Calculate numThreads - at this point everything should be symmetric
+ uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+ SWR_REL_ASSERT(numThreads <= numHWThreads);
+
+ uint32_t& numAPIReservedThreads = pContext->apiThreadInfo.numAPIReservedThreads;
+ uint32_t& numAPIThreadsPerCore = pContext->apiThreadInfo.numAPIThreadsPerCore;
+ uint32_t numRemovedThreads = 0;
if (pContext->threadInfo.SINGLE_THREADED)
{
+ numAPIReservedThreads = 0;
+ numThreads = 1;
pContext->NumWorkerThreads = 1;
pContext->NumFEThreads = 1;
pContext->NumBEThreads = 1;
pPool->numThreads = 0;
+ }
+ else if (pContext->threadInfo.MAX_WORKER_THREADS)
+ {
+ numThreads = std::min(pContext->threadInfo.MAX_WORKER_THREADS, numHWThreads);
+ pContext->threadInfo.BASE_NUMA_NODE = 0;
+ pContext->threadInfo.BASE_CORE = 0;
+ pContext->threadInfo.BASE_THREAD = 0;
+ numAPIReservedThreads = 0;
+ }
+ else
+ {
+ if (numAPIReservedThreads >= numThreads)
+ {
+ numAPIReservedThreads = 0;
+ }
+ else if (numAPIReservedThreads)
+ {
+ numAPIThreadsPerCore = std::min(numAPIThreadsPerCore, numHWHyperThreads);
+
+ if (0 == numAPIThreadsPerCore)
+ {
+ numAPIThreadsPerCore = numHWHyperThreads;
+ }
+
+ numRemovedThreads = numAPIReservedThreads;
+ if (numAPIThreadsPerCore == 2 && numHyperThreads == 1)
+ {
+ // Adjust removed threads to make logic below work
+ numRemovedThreads = std::max(1U, (numRemovedThreads + numAPIThreadsPerCore - 1) / 2);
+ }
+
+ numThreads -= numRemovedThreads;
+ }
+ }
+ InitPerThreadStats(pContext, numThreads);
+
+ if (pContext->threadInfo.SINGLE_THREADED)
+ {
return;
}
+ if (numAPIReservedThreads)
+ {
+ pPool->pApiThreadData = new (std::nothrow) THREAD_DATA[numAPIReservedThreads];
+ SWR_ASSERT(pPool->pApiThreadData);
+ if (!pPool->pApiThreadData)
+ {
+ numAPIReservedThreads = 0;
+ }
+ }
+ pPool->numReservedThreads = numAPIReservedThreads;
+
pPool->numThreads = numThreads;
pContext->NumWorkerThreads = pPool->numThreads;
- pPool->pThreadData = (THREAD_DATA *)malloc(pPool->numThreads * sizeof(THREAD_DATA));
+ pPool->pThreadData = new (std::nothrow) THREAD_DATA[pPool->numThreads];
+ SWR_ASSERT(pPool->pThreadData);
pPool->numaMask = 0;
- pPool->pThreads = new THREAD_PTR[pPool->numThreads];
+
+ pPool->pThreads = new (std::nothrow) THREAD_PTR[pPool->numThreads];
+ SWR_ASSERT(pPool->pThreads);
if (pContext->threadInfo.MAX_WORKER_THREADS)
{
@@ -1021,37 +1125,72 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
// numa distribution assumes workers on all nodes
bool useNuma = true;
if (numCoresPerNode * numHyperThreads == 1)
+ {
useNuma = false;
+ }
- if (useNuma) {
+ if (useNuma)
+ {
pPool->numaMask = numNodes - 1; // Only works for 2**n numa nodes (1, 2, 4, etc.)
- } else {
+ }
+ else
+ {
pPool->numaMask = 0;
}
uint32_t workerId = 0;
+ uint32_t numReservedThreads = numAPIReservedThreads;
for (uint32_t n = 0; n < numNodes; ++n)
{
- auto& node = nodes[n];
+ if ((n + pContext->threadInfo.BASE_NUMA_NODE) >= nodes.size())
+ {
+ break;
+ }
+ auto& node = nodes[n + pContext->threadInfo.BASE_NUMA_NODE];
uint32_t numCores = numCoresPerNode;
for (uint32_t c = 0; c < numCores; ++c)
{
- if (c >= node.cores.size())
+ if ((c + pContext->threadInfo.BASE_CORE) >= node.cores.size())
{
break;
}
- auto& core = node.cores[c];
+ auto& core = node.cores[c + pContext->threadInfo.BASE_CORE];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
- if (t >= core.threadIds.size())
+ if ((t + pContext->threadInfo.BASE_THREAD) >= core.threadIds.size())
{
break;
}
- if (numAPIReservedThreads)
+ if (numRemovedThreads)
{
- --numAPIReservedThreads;
+ --numRemovedThreads;
+ SWR_REL_ASSERT(numReservedThreads);
+ --numReservedThreads;
+ pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+ pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
+ pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t];
+ pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+ pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
+ pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
+ pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+ pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
+
+
+ if (numAPIThreadsPerCore > numHyperThreads && numReservedThreads)
+ {
+ --numReservedThreads;
+ pPool->pApiThreadData[numReservedThreads].workerId = 0xFFFFFFFFU;
+ pPool->pApiThreadData[numReservedThreads].procGroupId = core.procGroup;
+ pPool->pApiThreadData[numReservedThreads].threadId = core.threadIds[t + 1];
+ pPool->pApiThreadData[numReservedThreads].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+ pPool->pApiThreadData[numReservedThreads].coreId = c + pContext->threadInfo.BASE_CORE;
+ pPool->pApiThreadData[numReservedThreads].htId = t + pContext->threadInfo.BASE_THREAD;
+ pPool->pApiThreadData[numReservedThreads].pContext = pContext;
+ pPool->pApiThreadData[numReservedThreads].forceBindProcGroup = false;
+ }
+
continue;
}
@@ -1059,11 +1198,12 @@ void CreateThreadPool(SWR_CONTEXT* pContext, THREAD_POOL* pPool)
pPool->pThreadData[workerId].workerId = workerId;
pPool->pThreadData[workerId].procGroupId = core.procGroup;
- pPool->pThreadData[workerId].threadId = core.threadIds[t];
- pPool->pThreadData[workerId].numaId = useNuma ? n : 0;
- pPool->pThreadData[workerId].coreId = c;
- pPool->pThreadData[workerId].htId = t;
+ pPool->pThreadData[workerId].threadId = core.threadIds[t + pContext->threadInfo.BASE_THREAD];
+ pPool->pThreadData[workerId].numaId = useNuma ? (n + pContext->threadInfo.BASE_NUMA_NODE) : 0;
+ pPool->pThreadData[workerId].coreId = c + pContext->threadInfo.BASE_CORE;
+ pPool->pThreadData[workerId].htId = t + pContext->threadInfo.BASE_THREAD;
pPool->pThreadData[workerId].pContext = pContext;
+ pPool->pThreadData[workerId].forceBindProcGroup = false;
pContext->NumBEThreads++;
pContext->NumFEThreads++;
@@ -1113,9 +1253,10 @@ void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
delete(pPool->pThreads[t]);
}
- delete [] pPool->pThreads;
+ delete[] pPool->pThreads;
// Clean up data used by threads
- free(pPool->pThreadData);
+ delete[] pPool->pThreadData;
+ delete[] pPool->pApiThreadData;
}
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index dac8f86c1df..2e53265f424 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -55,6 +55,8 @@ struct THREAD_POOL
uint32_t numThreads;
uint32_t numaMask;
THREAD_DATA *pThreadData;
+ uint32_t numReservedThreads; // Number of threads reserved for API use
+ THREAD_DATA *pApiThreadData;
};
typedef std::unordered_set<uint32_t> TileSet;
@@ -68,3 +70,5 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawFE)
bool WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint32_t &curDrawBE);
int32_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
+
+void BindApiThread(SWR_CONTEXT *pContext, uint32_t apiThreadId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index a6c54ab86e8..3ade6e4333e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -100,7 +100,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
{
uint32_t size = numSamples * mHotTileSize[attachment];
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
- hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
+ hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
@@ -124,7 +124,7 @@ HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32
uint32_t size = numSamples * mHotTileSize[attachment];
uint32_t numaNode = ((x ^ y) & pContext->threadPool.numaMask);
- hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode);
+ hotTile.pBuffer = (uint8_t*)AllocHotTileMem(size, 64, numaNode + pContext->threadInfo.BASE_NUMA_NODE);
hotTile.state = HOTTILE_INVALID;
hotTile.numSamples = numSamples;
}