author    Tim Rowley <[email protected]>    2016-04-27 16:45:13 -0600
committer Tim Rowley <[email protected]>    2016-05-05 14:49:11 -0500
commit    6e8922705435a1649e51ea404b89aaeba37d555a (patch)
tree      6d3a82f4be3612831ae3e4333655fb735c1302b1 /src/gallium/drivers/swr/rasterizer
parent    c2f5d2daa897c11e4d8f5804217e7e35d3092313 (diff)
swr: [rasterizer core] Fix thread allocation
Fix Windows in 32-bit mode when hyperthreading is disabled on Xeons.

Some support for asymmetric processor topologies.

Reviewed-by: Bruce Cherniak <[email protected]>
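To illustrate the 32-bit issue this patch works around (a minimal sketch, not the threads.cpp code): KAFFINITY is a ULONG_PTR, so in a 32-bit build it holds only 32 bits, and a processor group with more than 32 hardware threads can report the same bit more than once; the fix keeps one accumulated mask per processor group and skips bits it has already seen. The affinity_t alias and the sample (group, threadId) values below are made up for the example.

// Minimal sketch of the per-group mask dedup; affinity_t stands in for KAFFINITY.
#include <cstdint>
#include <cstdio>
#include <vector>

#if defined(_WIN64)
using affinity_t = uint64_t;   // KAFFINITY is 64 bits wide in a 64-bit build
#else
using affinity_t = uint32_t;   // ...but only 32 bits wide in a 32-bit build
#endif

int main()
{
    std::vector<affinity_t> maskPerGroup;   // one accumulated mask per processor group

    // Hypothetical (group, threadId) pairs; in a 32-bit process a thread index
    // past 31 shows up as a repeat of an already-seen bit.
    const struct { uint32_t group, threadId; } samples[] = { {0, 0}, {0, 1}, {0, 1} };

    for (const auto& s : samples)
    {
        affinity_t bit = affinity_t(1) << s.threadId;

        if (s.group >= maskPerGroup.size())
        {
            maskPerGroup.resize(s.group + 1);
        }

        if (maskPerGroup[s.group] & bit)
        {
            // Duplicate bit: this hardware thread cannot be addressed distinctly
            // from 32-bit code, so skip it instead of double-counting it.
            std::printf("group %u: skipping duplicate bit %u\n", s.group, s.threadId);
            continue;
        }

        maskPerGroup[s.group] |= bit;
        std::printf("group %u: accepted thread bit %u\n", s.group, s.threadId);
    }
    return 0;
}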
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer')
-rw-r--r--    src/gallium/drivers/swr/rasterizer/core/threads.cpp    64
1 file changed, 47 insertions(+), 17 deletions(-)
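Likewise, a rough sketch of the asymmetric-topology handling added to CreateThreadPool below: rather than trusting numNodes * numCoresPerNode * numHyperThreads (which is derived from node 0 and core 0 alone), the worker count is capped by the actual sum of thread IDs over every core. The Core/NumaNode shapes and the example topology here are simplified stand-ins for the structures in threads.cpp.

// Simplified stand-in for the topology structures in threads.cpp; the point
// is capping numThreads by a per-core sum rather than a product.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <vector>

struct Core { std::vector<uint32_t> threadIds; };
struct NumaNode { std::vector<Core> cores; };

int main()
{
    // Hypothetical asymmetric topology: node 0 has 2 cores x 2 threads,
    // node 1 has a single core with a single usable thread.
    std::vector<NumaNode> nodes = {
        { { Core{{0, 1}}, Core{{2, 3}} } },
        { { Core{{0}} } }
    };

    uint32_t numNodes        = (uint32_t)nodes.size();
    uint32_t numCoresPerNode = (uint32_t)nodes[0].cores.size();
    uint32_t numHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();

    // The naive product overcounts on asymmetric topologies (2 * 2 * 2 = 8 here).
    uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;

    // The actual hardware thread count is the sum over every core (5 here).
    uint32_t numHWThreads = 0;
    for (auto& node : nodes)
    {
        for (auto& core : node.cores)
        {
            numHWThreads += (uint32_t)core.threadIds.size();
        }
    }

    numThreads = std::min(numThreads, numHWThreads);
    std::printf("workers: %u (product said %u)\n",
                numThreads, numNodes * numCoresPerNode * numHyperThreads);
    return 0;
}

The same cap shows up in the diff as numThreads = std::min(numThreads, numHWThreads), with per-core and per-thread bounds checks in the worker assignment loops.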
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 47629e53ada..0b57a3fc836 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -68,6 +68,8 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
#if defined(_WIN32)
+ std::vector<KAFFINITY> threadMaskPerProcGroup;
+
static std::mutex m;
std::lock_guard<std::mutex> l(m);
@@ -96,14 +98,33 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
while (BitScanForwardSizeT((unsigned long*)&threadId, gmask.Mask))
{
// clear mask
- gmask.Mask &= ~(KAFFINITY(1) << threadId);
+ KAFFINITY threadMask = KAFFINITY(1) << threadId;
+ gmask.Mask &= ~threadMask;
+
+ if (procGroup >= threadMaskPerProcGroup.size())
+ {
+ threadMaskPerProcGroup.resize(procGroup + 1);
+ }
+
+ if (threadMaskPerProcGroup[procGroup] & threadMask)
+ {
+ // Already seen this mask. This means that we are in 32-bit mode and
+ // have seen more than 32 HW threads for this procGroup
+ // Don't use it
+#if defined(_WIN64)
+ SWR_ASSERT(false, "Shouldn't get here in 64-bit mode");
+#endif
+ continue;
+ }
+
+ threadMaskPerProcGroup[procGroup] |= (KAFFINITY(1) << threadId);
// Find Numa Node
+ uint32_t numaId = 0;
PROCESSOR_NUMBER procNum = {};
procNum.Group = WORD(procGroup);
procNum.Number = UCHAR(threadId);
- uint32_t numaId = 0;
ret = GetNumaProcessorNodeEx(&procNum, (PUSHORT)&numaId);
SWR_ASSERT(ret);
@@ -118,16 +139,6 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
numaNode.cores.push_back(Core());
pCore = &numaNode.cores.back();
pCore->procGroup = procGroup;
-#if !defined(_WIN64)
- coreId = (uint32_t)numaNode.cores.size();
- if ((coreId * numThreads) > 32)
- {
- // Windows doesn't return threadIds >= 32 for a processor group correctly
- // when running a 32-bit application.
- // Just save -1 as the threadId
- threadId = uint32_t(-1);
- }
-#endif
}
pCore->threadIds.push_back(threadId);
if (procGroup == 0)
@@ -712,6 +723,17 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
uint32_t numHWCoresPerNode = (uint32_t)nodes[0].cores.size();
uint32_t numHWHyperThreads = (uint32_t)nodes[0].cores[0].threadIds.size();
+ // Calculate num HW threads. Due to asymmetric topologies, this is not
+ // a trivial multiplication.
+ uint32_t numHWThreads = 0;
+ for (auto& node : nodes)
+ {
+ for (auto& core : node.cores)
+ {
+ numHWThreads += (uint32_t)core.threadIds.size();
+ }
+ }
+
uint32_t numNodes = numHWNodes;
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
@@ -759,6 +781,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
// Calculate numThreads
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
+ numThreads = std::min(numThreads, numHWThreads);
if (KNOB_MAX_WORKER_THREADS)
{
@@ -849,23 +872,30 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
for (uint32_t n = 0; n < numNodes; ++n)
{
auto& node = nodes[n];
- if (node.cores.size() == 0)
- {
- continue;
- }
-
uint32_t numCores = numCoresPerNode;
for (uint32_t c = 0; c < numCores; ++c)
{
+ if (c >= node.cores.size())
+ {
+ break;
+ }
+
auto& core = node.cores[c];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
+ if (t >= core.threadIds.size())
+ {
+ break;
+ }
+
if (numAPIReservedThreads)
{
--numAPIReservedThreads;
continue;
}
+ SWR_ASSERT(workerId < numThreads);
+
pPool->pThreadData[workerId].workerId = workerId;
pPool->pThreadData[workerId].procGroupId = core.procGroup;
pPool->pThreadData[workerId].threadId = core.threadIds[t];