path: root/src/gallium/drivers/swr/rasterizer
author     Jason Ekstrand <[email protected]>  2016-04-13 20:25:39 -0700
committer  Jason Ekstrand <[email protected]>  2016-04-13 20:25:39 -0700
commit     12f88ba32a14ea79134f4e995a55149f078a2f27 (patch)
tree       9070861dced23d0ad7dbec598bfd96b686eb7bf1 /src/gallium/drivers/swr/rasterizer
parent     79fbec30fc16399ede9385ef52cb62cefbb388f4 (diff)
parent     171a570f388b2895d14f6d5418c99573cffd6369 (diff)
Merge remote-tracking branch 'public/master' into vulkan
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer')
-rw-r--r--  src/gallium/drivers/swr/rasterizer/common/os.h           |  10
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/api.cpp          | 100
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/arena.h          | 232
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/backend.cpp      |  39
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/backend.h        |   2
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/clip.cpp         |   4
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/clip.h           |   4
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/context.h        |  47
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/depthstencil.h   |  40
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/frontend.cpp     |  10
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/knobs.h          |   3
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/pa.h             |  14
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp   |  16
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/state.h          |   1
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/threads.cpp      | 141
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/threads.h        |   4
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp      |  22
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/tilemgr.h        |   8
-rw-r--r--  src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py  |  19
19 files changed, 506 insertions(+), 210 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 5794f3f625a..180a0560822 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -30,10 +30,6 @@
#define SWR_API __cdecl
-#ifndef _CRT_SECURE_NO_WARNINGS
-#define _CRT_SECURE_NO_WARNINGS
-#endif
-
#ifndef NOMINMAX
#define NOMINMAX
#endif
@@ -52,7 +48,6 @@
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
-#if defined(_WIN32)
#if defined(_WIN64)
#define BitScanReverseSizeT BitScanReverse64
#define BitScanForwardSizeT BitScanForward64
@@ -62,7 +57,6 @@
#define BitScanForwardSizeT BitScanForward
#define _mm_popcount_sizeT _mm_popcnt_u32
#endif
-#endif
#elif defined(FORCE_LINUX) || defined(__linux__) || defined(__gnu_linux__)
@@ -199,9 +193,7 @@ typedef KILOBYTE MEGABYTE[1024];
typedef MEGABYTE GIGABYTE[1024];
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
-#if KNOB_SIMD_WIDTH == 8
-#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, 32)
-#endif
+#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
#include "common/swr_assert.h"
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index f0f7956b590..ca9cfdb629e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -29,10 +29,12 @@
#include <cfloat>
#include <cmath>
#include <cstdio>
+#include <new>
#include "core/api.h"
#include "core/backend.h"
#include "core/context.h"
+#include "core/depthstencil.h"
#include "core/frontend.h"
#include "core/rasterizer.h"
#include "core/rdtsc_core.h"
@@ -64,11 +66,14 @@ HANDLE SwrCreateContext(
pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->pMacroTileManagerArray = (MacroTileMgr*)_aligned_malloc(sizeof(MacroTileMgr) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+ pContext->pDispatchQueueArray = (DispatchQueue*)_aligned_malloc(sizeof(DispatchQueue) * KNOB_MAX_DRAWS_IN_FLIGHT, 64);
+
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
{
pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
- pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
- pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
+ new (&pContext->pMacroTileManagerArray[dc]) MacroTileMgr(*pContext->dcRing[dc].pArena);
+ new (&pContext->pDispatchQueueArray[dc]) DispatchQueue();
pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
}
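Replacing per-object new with placement new into one aligned slab drops two heap allocations per in-flight draw and lets tilemgr.cpp retire its custom operator new/delete (below). A self-contained sketch of the pattern with hypothetical names, using C++17 std::aligned_alloc where the driver uses the Windows-style _aligned_malloc:

    #include <cstdlib>
    #include <new>

    struct Mgr
    {
        explicit Mgr(int id) : id(id) {}
        int id;
    };

    void Example()
    {
        constexpr std::size_t kCount = 16;  // stand-in for KNOB_MAX_DRAWS_IN_FLIGHT
        // One 64-byte-aligned slab instead of kCount separate heap objects.
        Mgr* pArray = static_cast<Mgr*>(std::aligned_alloc(64, sizeof(Mgr) * kCount));
        for (std::size_t i = 0; i < kCount; ++i)
            new (&pArray[i]) Mgr(static_cast<int>(i));  // construct in place

        // Teardown mirrors SwrDestroyContext: destroy explicitly, then free the slab.
        for (std::size_t i = 0; i < kCount; ++i)
            pArray[i].~Mgr();
        std::free(pArray);
    }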
@@ -86,15 +91,26 @@ HANDLE SwrCreateContext(
// Calling CreateThreadPool() above can set SINGLE_THREADED
if (KNOB_SINGLE_THREADED)
{
+ SET_KNOB(HYPERTHREADED_FE, false);
pContext->NumWorkerThreads = 1;
+ pContext->NumFEThreads = 1;
+ pContext->NumBEThreads = 1;
}
// Allocate scratch space for workers.
///@note We could lazily allocate this, but it's a rather small amount of memory.
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
- ///@todo Use numa API for allocations using numa information from thread data (if exists).
- pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
+#if defined(_WIN32)
+ uint32_t numaNode = pContext->threadPool.pThreadData ?
+ pContext->threadPool.pThreadData[i].numaId : 0;
+ pContext->pScratch[i] = (uint8_t*)VirtualAllocExNuma(
+ GetCurrentProcess(), nullptr, 32 * sizeof(KILOBYTE),
+ MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE,
+ numaNode);
+#else
+ pContext->pScratch[i] = (uint8_t*)_aligned_malloc(32 * sizeof(KILOBYTE), KNOB_SIMD_WIDTH * 4);
+#endif
}
// State setup AFTER context is fully initialized
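A note on the sizes: KILOBYTE is a 1024-byte array type (see the typedefs in the os.h hunk above), so 32 * sizeof(KILOBYTE) keeps the scratch at the same 32 KiB as before; the functional change is that Windows builds now place those pages on the worker's own NUMA node via VirtualAllocExNuma, resolving the old @todo.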
@@ -131,14 +147,21 @@ void SwrDestroyContext(HANDLE hContext)
{
delete pContext->dcRing[i].pArena;
delete pContext->dsRing[i].pArena;
- delete(pContext->dcRing[i].pTileMgr);
- delete(pContext->dcRing[i].pDispatch);
+ pContext->pMacroTileManagerArray[i].~MacroTileMgr();
+ pContext->pDispatchQueueArray[i].~DispatchQueue();
}
+ _aligned_free(pContext->pDispatchQueueArray);
+ _aligned_free(pContext->pMacroTileManagerArray);
+
// Free scratch space.
for (uint32_t i = 0; i < pContext->NumWorkerThreads; ++i)
{
+#if defined(_WIN32)
+ VirtualFree(pContext->pScratch[i], 0, MEM_RELEASE);
+#else
_aligned_free(pContext->pScratch[i]);
+#endif
}
delete(pContext->pHotTileMgr);
@@ -160,12 +183,20 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
template<bool IsDraw>
void QueueWork(SWR_CONTEXT *pContext)
{
+ DRAW_CONTEXT* pDC = pContext->pCurDrawContext;
+ uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ if (IsDraw)
+ {
+ pDC->pTileMgr = &pContext->pMacroTileManagerArray[dcIndex];
+ pDC->pTileMgr->initialize();
+ }
+
// Each worker thread looks at a DC for both FE and BE work at different times and so we
// multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
// have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
// then moved on if all work is done.)
- pContext->pCurDrawContext->threadsDone =
- pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
+ pContext->pCurDrawContext->threadsDone = pContext->NumFEThreads + pContext->NumBEThreads;
_ReadWriteBarrier();
{
@@ -183,7 +214,7 @@ void QueueWork(SWR_CONTEXT *pContext)
{
static TileSet lockedTiles;
uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
- WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoFE(pContext, 0, curDraw[0]);
WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles, 0, 0);
}
else
@@ -232,7 +263,20 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
_mm_pause();
}
- uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+ uint64_t curDraw = pContext->dcRing.GetHead();
+ uint32_t dcIndex = curDraw % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ static uint64_t lastDrawChecked;
+ static uint32_t lastFrameChecked;
+ if ((pContext->frameCount - lastFrameChecked) > 2 ||
+ (curDraw - lastDrawChecked) > 0x10000)
+ {
+ // Take this opportunity to clean-up old arena allocations
+ pContext->cachingArenaAllocator.FreeOldBlocks();
+
+ lastFrameChecked = pContext->frameCount;
+ lastDrawChecked = curDraw;
+ }
DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
pContext->pCurDrawContext = pCurDrawContext;
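This throttle bounds how much idle memory the allocator retains: at most once every few frames (or every 64K draws), FreeOldBlocks (added in the arena.h diff below) frees the "old" generation of cached blocks once it exceeds MAX_UNUSED_SIZE, then demotes the recently cached blocks into that generation.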
@@ -284,8 +328,6 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
pCurDrawContext->FeLock = 0;
pCurDrawContext->threadsDone = 0;
- pCurDrawContext->pTileMgr->initialize();
-
// Assign unique drawId for this DC
pCurDrawContext->drawId = pContext->dcRing.GetHead();
@@ -872,6 +914,25 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
!pState->state.blendState.renderTarget[rt].writeDisableBlue) ? (1 << rt) : 0;
}
}
+
+ // Setup depth quantization function
+ if (pState->state.depthHottileEnable)
+ {
+ switch (pState->state.rastState.depthFormat)
+ {
+ case R32_FLOAT_X8X24_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT_X8X24_TYPELESS>; break;
+ case R32_FLOAT: pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>; break;
+ case R24_UNORM_X8_TYPELESS: pState->state.pfnQuantizeDepth = QuantizeDepth<R24_UNORM_X8_TYPELESS>; break;
+ case R16_UNORM: pState->state.pfnQuantizeDepth = QuantizeDepth<R16_UNORM>; break;
+ default: SWR_ASSERT(false, "Unsupported depth format for depth quantization.");
+ pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
+ }
+ }
+ else
+ {
+ // set up pass-through quantize if depth isn't enabled
+ pState->state.pfnQuantizeDepth = QuantizeDepth<R32_FLOAT>;
+ }
}
//////////////////////////////////////////////////////////////////////////
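Each supported format gets its own QuantizeDepth<Format> instantiation (see the depthstencil.h diff below), and this switch runs once per pipeline setup, so the per-pixel cost is a single indirect call. A minimal sketch of the dispatch pattern, with hypothetical names:

    #include <cmath>

    enum class Fmt { R16_UNORM, R32_FLOAT };

    template <Fmt F>
    float Quantize(float z);                 // one instantiation per depth format

    template <>
    float Quantize<Fmt::R16_UNORM>(float z)
    {
        const float scale = 65535.0f;        // (1 << 16) - 1
        return std::trunc(z * scale + 0.5f) / scale;
    }

    template <>
    float Quantize<Fmt::R32_FLOAT>(float z) { return z; }   // pass-through

    using PfnQuantize = float (*)(float);

    PfnQuantize SelectQuantize(Fmt f)        // done once at state-setup time
    {
        switch (f)
        {
        case Fmt::R16_UNORM: return Quantize<Fmt::R16_UNORM>;
        default:             return Quantize<Fmt::R32_FLOAT>;
        }
    }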
@@ -1029,9 +1090,9 @@ void DrawInstanced(
SWR_CONTEXT *pContext = GetContext(hContext);
DRAW_CONTEXT* pDC = GetDrawContext(pContext);
- int32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
+ uint32_t maxVertsPerDraw = MaxVertsPerDraw(pDC, numVertices, topology);
uint32_t primsPerDraw = GetNumPrims(topology, maxVertsPerDraw);
- int32_t remainingVerts = numVertices;
+ uint32_t remainingVerts = numVertices;
API_STATE *pState = &pDC->pState->state;
pState->topology = topology;
@@ -1149,9 +1210,9 @@ void DrawIndexedInstance(
DRAW_CONTEXT* pDC = GetDrawContext(pContext);
API_STATE* pState = &pDC->pState->state;
- int32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
+ uint32_t maxIndicesPerDraw = MaxVertsPerDraw(pDC, numIndices, topology);
uint32_t primsPerDraw = GetNumPrims(topology, maxIndicesPerDraw);
- int32_t remainingIndices = numIndices;
+ uint32_t remainingIndices = numIndices;
uint32_t indexSize = 0;
switch (pState->indexBuffer.format)
@@ -1334,9 +1395,6 @@ void SwrDispatch(
pDC->isCompute = true; // This is a compute context.
- // Ensure spill fill pointers are initialized to nullptr.
- memset(pDC->pSpillFill, 0, sizeof(pDC->pSpillFill));
-
COMPUTE_DESC* pTaskData = (COMPUTE_DESC*)pDC->pArena->AllocAligned(sizeof(COMPUTE_DESC), 64);
pTaskData->threadGroupCountX = threadGroupCountX;
@@ -1344,6 +1402,8 @@ void SwrDispatch(
pTaskData->threadGroupCountZ = threadGroupCountZ;
uint32_t totalThreadGroups = threadGroupCountX * threadGroupCountY * threadGroupCountZ;
+ uint32_t dcIndex = pDC->drawId % KNOB_MAX_DRAWS_IN_FLIGHT;
+ pDC->pDispatch = &pContext->pDispatchQueueArray[dcIndex];
pDC->pDispatch->initialize(totalThreadGroups, pTaskData);
QueueDispatch(pContext);
@@ -1497,4 +1557,6 @@ void SWR_API SwrEndFrame(
HANDLE hContext)
{
RDTSC_ENDFRAME();
+ SWR_CONTEXT *pContext = GetContext(hContext);
+ pContext->frameCount++;
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 67d81a44347..64184e16865 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -65,69 +65,41 @@ static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
template<uint32_t NumBucketsT = 4, uint32_t StartBucketBitT = 16>
struct CachingAllocatorT : DefaultAllocator
{
- static uint32_t GetBucketId(size_t blockSize)
- {
- uint32_t bucketId = 0;
-
-#if defined(BitScanReverseSizeT)
- BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
- bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
-#endif
-
- return bucketId;
- }
-
void* AllocateAligned(size_t size, size_t align)
{
SWR_ASSERT(size >= sizeof(ArenaBlock));
SWR_ASSERT(size <= uint32_t(-1));
size_t blockSize = size - ARENA_BLOCK_ALIGN;
+ uint32_t bucket = GetBucketId(blockSize);
{
// search cached blocks
std::lock_guard<std::mutex> l(m_mutex);
- ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
- ArenaBlock* pBlock = pPrevBlock->pNext;
- ArenaBlock* pPotentialBlock = nullptr;
- ArenaBlock* pPotentialPrev = nullptr;
+ ArenaBlock* pPrevBlock = &m_cachedBlocks[bucket];
+ ArenaBlock* pBlock = SearchBlocks(pPrevBlock, blockSize, align);
- while (pBlock)
+ if (pBlock)
{
- if (pBlock->blockSize >= blockSize)
- {
- if (pBlock == AlignUp(pBlock, align))
- {
- if (pBlock->blockSize == blockSize)
- {
- // Won't find a better match
- break;
- }
-
- // We could use this as it is larger than we wanted, but
- // continue to search for a better match
- pPotentialBlock = pBlock;
- pPotentialPrev = pPrevBlock;
- }
- }
- else
+ m_cachedSize -= pBlock->blockSize;
+ if (pBlock == m_pLastCachedBlocks[bucket])
{
- // Blocks are sorted by size (biggest first)
- // So, if we get here, there are no blocks
- // large enough, fall through to allocation.
- pBlock = nullptr;
- break;
+ m_pLastCachedBlocks[bucket] = pPrevBlock;
}
-
- pPrevBlock = pBlock;
- pBlock = pBlock->pNext;
}
-
- if (!pBlock)
+ else
{
- // Couldn't find an exact match, use next biggest size
- pBlock = pPotentialBlock;
- pPrevBlock = pPotentialPrev;
+ pPrevBlock = &m_oldCachedBlocks[GetBucketId(blockSize)];
+ pBlock = SearchBlocks(pPrevBlock, blockSize, align);
+
+ if (pBlock)
+ {
+ m_oldCachedSize -= pBlock->blockSize;
+ if (pBlock == m_pOldLastCachedBlocks[bucket])
+ {
+ m_pLastCachedBlocks[bucket] = pPrevBlock;
+ }
+ }
}
if (pBlock)
@@ -154,7 +126,7 @@ struct CachingAllocatorT : DefaultAllocator
return this->DefaultAllocator::AllocateAligned(size, align);
}
- void Free(void* pMem)
+ void Free(void* pMem)
{
if (pMem)
{
@@ -162,24 +134,57 @@ struct CachingAllocatorT : DefaultAllocator
SWR_ASSERT(pNewBlock->blockSize >= 0);
std::unique_lock<std::mutex> l(m_mutex);
- ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
- ArenaBlock* pBlock = pPrevBlock->pNext;
+ InsertCachedBlock(GetBucketId(pNewBlock->blockSize), pNewBlock);
+ }
+ }
- while (pBlock)
+ void FreeOldBlocks()
+ {
+ if (!m_cachedSize) { return; }
+ std::lock_guard<std::mutex> l(m_mutex);
+
+ bool doFree = (m_oldCachedSize > MAX_UNUSED_SIZE);
+
+ for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+ {
+ if (doFree)
{
- if (pNewBlock->blockSize >= pBlock->blockSize)
+ ArenaBlock* pBlock = m_oldCachedBlocks[i].pNext;
+ while (pBlock)
{
- // Insert here
- break;
+ ArenaBlock* pNext = pBlock->pNext;
+ m_oldCachedSize -= pBlock->blockSize;
+ m_totalAllocated -= (pBlock->blockSize + ARENA_BLOCK_ALIGN);
+ this->DefaultAllocator::Free(pBlock);
+ pBlock = pNext;
}
- pPrevBlock = pBlock;
- pBlock = pBlock->pNext;
+ m_oldCachedBlocks[i].pNext = nullptr;
+ m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
}
- // Insert into list
- SWR_ASSERT(pPrevBlock);
- pPrevBlock->pNext = pNewBlock;
- pNewBlock->pNext = pBlock;
+ if (m_pLastCachedBlocks[i] != &m_cachedBlocks[i])
+ {
+ m_pLastCachedBlocks[i]->pNext = m_oldCachedBlocks[i].pNext;
+ m_oldCachedBlocks[i].pNext = m_cachedBlocks[i].pNext;
+ m_cachedBlocks[i].pNext = nullptr;
+ if (m_pOldLastCachedBlocks[i]->pNext)
+ {
+ m_pOldLastCachedBlocks[i] = m_pLastCachedBlocks[i];
+ }
+ m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
+ }
+ }
+
+ m_oldCachedSize += m_cachedSize;
+ m_cachedSize = 0;
+ }
+
+ CachingAllocatorT()
+ {
+ for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+ {
+ m_pLastCachedBlocks[i] = &m_cachedBlocks[i];
+ m_pOldLastCachedBlocks[i] = &m_oldCachedBlocks[i];
}
}
@@ -195,21 +200,126 @@ struct CachingAllocatorT : DefaultAllocator
this->DefaultAllocator::Free(pBlock);
pBlock = pNext;
}
+ pBlock = m_oldCachedBlocks[i].pNext;
+ while (pBlock)
+ {
+ ArenaBlock* pNext = pBlock->pNext;
+ this->DefaultAllocator::Free(pBlock);
+ pBlock = pNext;
+ }
}
}
+private:
+ static uint32_t GetBucketId(size_t blockSize)
+ {
+ uint32_t bucketId = 0;
+
+#if defined(BitScanReverseSizeT)
+ BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+ bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
+
+ return bucketId;
+ }
+
+ void InsertCachedBlock(uint32_t bucketId, ArenaBlock* pNewBlock)
+ {
+ SWR_ASSERT(bucketId < CACHE_NUM_BUCKETS);
+
+ ArenaBlock* pPrevBlock = &m_cachedBlocks[bucketId];
+ ArenaBlock* pBlock = pPrevBlock->pNext;
+
+ while (pBlock)
+ {
+ if (pNewBlock->blockSize >= pBlock->blockSize)
+ {
+ // Insert here
+ break;
+ }
+ pPrevBlock = pBlock;
+ pBlock = pBlock->pNext;
+ }
+
+ // Insert into list
+ SWR_ASSERT(pPrevBlock);
+ pPrevBlock->pNext = pNewBlock;
+ pNewBlock->pNext = pBlock;
+
+ if (m_pLastCachedBlocks[bucketId] == pPrevBlock)
+ {
+ m_pLastCachedBlocks[bucketId] = pNewBlock;
+ }
+
+ m_cachedSize += pNewBlock->blockSize;
+ }
+
+ static ArenaBlock* SearchBlocks(ArenaBlock*& pPrevBlock, size_t blockSize, size_t align)
+ {
+ ArenaBlock* pBlock = pPrevBlock->pNext;
+ ArenaBlock* pPotentialBlock = nullptr;
+ ArenaBlock* pPotentialPrev = nullptr;
+
+ while (pBlock)
+ {
+ if (pBlock->blockSize >= blockSize)
+ {
+ if (pBlock == AlignUp(pBlock, align))
+ {
+ if (pBlock->blockSize == blockSize)
+ {
+ // Won't find a better match
+ break;
+ }
+
+ // We could use this as it is larger than we wanted, but
+ // continue to search for a better match
+ pPotentialBlock = pBlock;
+ pPotentialPrev = pPrevBlock;
+ }
+ }
+ else
+ {
+ // Blocks are sorted by size (biggest first)
+ // So, if we get here, there are no blocks
+ // large enough, fall through to allocation.
+ pBlock = nullptr;
+ break;
+ }
+
+ pPrevBlock = pBlock;
+ pBlock = pBlock->pNext;
+ }
+
+ if (!pBlock)
+ {
+ // Couldn't find an exact match, use next biggest size
+ pBlock = pPotentialBlock;
+ pPrevBlock = pPotentialPrev;
+ }
+
+ return pBlock;
+ }
+
// buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
+ static const size_t MAX_UNUSED_SIZE = 20 * sizeof(MEGABYTE);
ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
+ ArenaBlock* m_pLastCachedBlocks[CACHE_NUM_BUCKETS];
+ ArenaBlock m_oldCachedBlocks[CACHE_NUM_BUCKETS];
+ ArenaBlock* m_pOldLastCachedBlocks[CACHE_NUM_BUCKETS];
std::mutex m_mutex;
size_t m_totalAllocated = 0;
+
+ size_t m_cachedSize = 0;
+ size_t m_oldCachedSize = 0;
};
typedef CachingAllocatorT<> CachingAllocator;
-template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
+template<typename T = DefaultAllocator, size_t BlockSizeT = 128 * sizeof(KILOBYTE)>
class TArena
{
public:
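With the default parameters (4 buckets, start bit 16), the buckets cover block sizes below 128 KiB, below 256 KiB, below 512 KiB, and everything larger. Two worked examples of GetBucketId: a 200 KiB block shifts to 204800 >> 16 = 3, whose highest set bit is 1, so it lands in bucket 1; a 1 MiB block shifts to 16, BitScanReverse yields 4, and the std::min clamp places it in the last bucket, 3.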
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 7fb83edf169..b2d3d9ef4f4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -70,7 +70,7 @@ static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS];
/// @param pDC - pointer to draw context (dispatch).
/// @param workerId - The unique worker ID that is assigned to this thread.
/// @param threadGroupId - the linear index for the thread group within the dispatch.
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId)
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer)
{
RDTSC_START(BEDispatch);
@@ -80,10 +80,10 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
SWR_ASSERT(pTaskData != nullptr);
// Ensure spill fill memory has been allocated.
- if (pDC->pSpillFill[workerId] == nullptr)
+ if (pSpillFillBuffer == nullptr)
{
///@todo Add state which indicates the spill fill size.
- pDC->pSpillFill[workerId] = (uint8_t*)pDC->pArena->AllocAlignedSync(4096 * 1024, sizeof(float) * 8);
+ pSpillFillBuffer = pDC->pArena->AllocAlignedSync(4 * sizeof(MEGABYTE), sizeof(float) * 8);
}
const API_STATE& state = GetApiState(pDC);
@@ -94,7 +94,7 @@ void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroup
csContext.dispatchDims[1] = pTaskData->threadGroupCountY;
csContext.dispatchDims[2] = pTaskData->threadGroupCountZ;
csContext.pTGSM = pContext->pScratch[workerId];
- csContext.pSpillFillBuffer = pDC->pSpillFill[workerId];
+ csContext.pSpillFillBuffer = (uint8_t*)pSpillFillBuffer;
state.pfnCsFunc(GetPrivateState(pDC), &csContext);
@@ -772,8 +772,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
psContext.vOneOverW.centroid = psContext.vOneOverW.center;
}
- // interpolate z
+ // interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
+
RDTSC_STOP(BEBarycentric, 0, 0);
simdmask clipCoverageMask = coverageMask & MASK;
@@ -793,7 +795,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
if(CanEarlyZ(pPSState))
{
RDTSC_START(BEEarlyDepthTest);
- depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);
@@ -825,7 +827,7 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
if(!CanEarlyZ(pPSState))
{
RDTSC_START(BELateDepthTest);
- depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
psContext.vZ, pDepthBase, vCoverageMask, pStencilBase, &stencilPassMask);
RDTSC_STOP(BELateDepthTest, 0, 0);
@@ -977,8 +979,9 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
- // interpolate z
+ // interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_STOP(BEBarycentric, 0, 0);
@@ -1000,7 +1003,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
if (CanEarlyZ(pPSState))
{
RDTSC_START(BEEarlyDepthTest);
- depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);
@@ -1033,7 +1036,7 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
if (!CanEarlyZ(pPSState))
{
RDTSC_START(BELateDepthTest);
- depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
RDTSC_STOP(BELateDepthTest, 0, 0);
@@ -1200,8 +1203,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
RDTSC_START(BEBarycentric);
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
- // interpolate z
+ // interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_STOP(BEBarycentric, 0, 0);
// execute pixel shader
@@ -1263,10 +1267,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
// calc I & J per sample
backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
- // interpolate z
+ // interpolate and quantize z
if (!pPSState->writesODepth)
{
vZ[sample] = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ vZ[sample] = state.pfnQuantizeDepth(vZ[sample]);
}
///@todo: perspective correct vs non-perspective correct clipping?
@@ -1292,7 +1297,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
// ZTest for this sample
RDTSC_START(BEEarlyDepthTest);
stencilPassMask[sample] = vCoverageMask[sample];
- depthPassMask[sample] = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ depthPassMask[sample] = DepthStencilTest(&state, work.triFlags.frontFacing,
vZ[sample], pDepthSample, vCoverageMask[sample], pStencilSample, &stencilPassMask[sample]);
RDTSC_STOP(BEEarlyDepthTest, 0, 0);
@@ -1308,8 +1313,9 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
{
RDTSC_START(BEBarycentric);
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
- // interpolate z
+ // interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.center, psContext.vJ.center);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_STOP(BEBarycentric, 0, 0);
// execute pixel shader
@@ -1463,8 +1469,9 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
backendFuncs.pfnCalcSampleBarycentrics(coeffs, psContext);
- // interpolate z
+ // interpolate and quantize z
psContext.vZ = vplaneps(coeffs.vZa, coeffs.vZb, coeffs.vZc, psContext.vI.sample, psContext.vJ.sample);
+ psContext.vZ = state.pfnQuantizeDepth(psContext.vZ);
RDTSC_STOP(BEBarycentric, 0, 0);
@@ -1483,7 +1490,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
uint8_t *pStencilSample = pStencilBase + MultisampleTraits<sampleCount>::RasterTileStencilOffset(sample);
RDTSC_START(BEEarlyDepthTest);
- simdscalar depthPassMask = DepthStencilTest(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing,
+ simdscalar depthPassMask = DepthStencilTest(&state, work.triFlags.frontFacing,
psContext.vZ, pDepthSample, vCoverageMask, pStencilSample, &stencilPassMask);
DepthStencilWrite(&state.vp[0], &state.depthStencilState, work.triFlags.frontFacing, psContext.vZ,
pDepthSample, depthPassMask, vCoverageMask, pStencilSample, stencilPassMask);
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 2fa18953cad..d0626b997af 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -32,7 +32,7 @@
#include "core/context.h"
#include "core/multisample.h"
-void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
+void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId, void*& pSpillFillBuffer);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index 3a2a8b35be8..e624fd8f674 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -162,8 +162,8 @@ int ClipTriToPlane( const float *pInPts, int numInPts,
void Clip(const float *pTriangle, const float *pAttribs, int numAttribs, float *pOutTriangles, int *numVerts, float *pOutAttribs)
{
// temp storage to hold at least 6 sets of vertices, the max number that can be created during clipping
- OSALIGN(float, 16) tempPts[6 * 4];
- OSALIGN(float, 16) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
+ OSALIGNSIMD(float) tempPts[6 * 4];
+ OSALIGNSIMD(float) tempAttribs[6 * KNOB_NUM_ATTRIBUTES * 4];
// we opt to clip to viewport frustum to produce smaller triangles for rasterization precision
int NumOutPts = ClipTriToPlane<FRUSTUM_NEAR>(pTriangle, 3, pAttribs, numAttribs, tempPts, tempAttribs);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index ba5870a92bb..67a4c4f47bb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -265,8 +265,8 @@ public:
// clip a single primitive
int ClipScalar(PA_STATE& pa, uint32_t primIndex, float* pOutPos, float* pOutAttribs)
{
- OSALIGN(float, 16) inVerts[3 * 4];
- OSALIGN(float, 16) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
+ OSALIGNSIMD(float) inVerts[3 * 4];
+ OSALIGNSIMD(float) inAttribs[3 * KNOB_NUM_ATTRIBUTES * 4];
// transpose primitive position
__m128 verts[3];
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 39f23372a18..6464aa20af7 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -308,6 +308,8 @@ OSALIGNLINE(struct) API_STATE
uint32_t depthHottileEnable: 1;
uint32_t stencilHottileEnable : 1;
};
+
+ PFN_QUANTIZE_DEPTH pfnQuantizeDepth;
};
class MacroTileMgr;
@@ -380,32 +382,29 @@ struct DRAW_STATE
// This draw context maintains all of the state needed for the draw operation.
struct DRAW_CONTEXT
{
- SWR_CONTEXT *pContext;
-
- uint64_t drawId;
-
- bool isCompute; // Is this DC a compute context?
-
- FE_WORK FeWork;
- volatile OSALIGNLINE(uint32_t) FeLock;
- volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
- volatile OSALIGNLINE(int64_t) threadsDone;
-
- uint64_t dependency;
-
- MacroTileMgr* pTileMgr;
-
- // The following fields are valid if isCompute is true.
- DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ SWR_CONTEXT* pContext;
+ uint64_t drawId;
+ union
+ {
+ MacroTileMgr* pTileMgr;
+ DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
+ };
+ uint64_t dependency;
+ DRAW_STATE* pState;
+ CachingArena* pArena;
- DRAW_STATE* pState;
- CachingArena* pArena;
+ bool isCompute; // Is this DC a compute context?
+ bool cleanupState; // True if this is the last draw using an entry in the state ring.
+ volatile bool doneFE; // Is FE work done for this draw?
- uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
+ FE_WORK FeWork;
- bool cleanupState; // True if this is the last draw using an entry in the state ring.
+ volatile OSALIGNLINE(uint32_t) FeLock;
+ volatile int64_t threadsDone;
};
+static_assert((sizeof(DRAW_CONTEXT) & 63) == 0, "Invalid size for DRAW_CONTEXT");
+
INLINE const API_STATE& GetApiState(const DRAW_CONTEXT* pDC)
{
SWR_ASSERT(pDC != nullptr);
@@ -447,6 +446,9 @@ struct SWR_CONTEXT
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
+ MacroTileMgr* pMacroTileManagerArray;
+ DispatchQueue* pDispatchQueueArray;
+
// Draw State Ring
// When draw are very large (lots of primitives) then the API thread will break these up.
// These split draws all have identical state. So instead of storing the state directly
@@ -457,6 +459,8 @@ struct SWR_CONTEXT
uint32_t curStateId; // Current index to the next available entry in the DS ring.
uint32_t NumWorkerThreads;
+ uint32_t NumFEThreads;
+ uint32_t NumBEThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
@@ -481,6 +485,7 @@ struct SWR_CONTEXT
uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
CachingAllocator cachingArenaAllocator;
+ uint32_t frameCount;
};
void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 2cc9d4054ac..7b55580bf0a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -80,14 +80,52 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
}
+template<SWR_FORMAT depthFormatT>
+simdscalar QuantizeDepth(simdscalar depth)
+{
+ SWR_TYPE depthType = FormatTraits<depthFormatT>::GetType(0);
+ uint32_t depthBpc = FormatTraits<depthFormatT>::GetBPC(0);
+
+ if (depthType == SWR_TYPE_FLOAT)
+ {
+ // assume only 32bit float depth supported
+ SWR_ASSERT(depthBpc == 32);
+
+ // matches shader precision, no quantizing needed
+ return depth;
+ }
+
+ // should be unorm depth if not float
+ SWR_ASSERT(depthType == SWR_TYPE_UNORM);
+
+ float quantize = (float)((1 << depthBpc) - 1);
+ simdscalar result = _simd_mul_ps(depth, _simd_set1_ps(quantize));
+ result = _simd_add_ps(result, _simd_set1_ps(0.5f));
+ result = _simd_round_ps(result, _MM_FROUND_TO_ZERO);
+
+ if (depthBpc > 16)
+ {
+ result = _simd_div_ps(result, _simd_set1_ps(quantize));
+ }
+ else
+ {
+ result = _simd_mul_ps(result, _simd_set1_ps(1.0f / quantize));
+ }
+
+ return result;
+}
+
INLINE
-simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
+simdscalar DepthStencilTest(const API_STATE* pState,
bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
static_assert(KNOB_STENCIL_HOT_TILE_FORMAT == R8_UINT, "Unsupported stencil hot tile format");
+ const SWR_DEPTH_STENCIL_STATE* pDSState = &pState->depthStencilState;
+ const SWR_VIEWPORT* pViewport = &pState->vp[0];
+
simdscalar depthResult = _simd_set1_ps(-1.0f);
simdscalar zbuf;
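The reason for quantizing at all: the interpolated Z fed to the early/late depth tests has full shader precision, while the hot tile stores whatever the depth format can hold, and the two must agree for re-tested pixels to give consistent results. A worked example for R16_UNORM with z = 0.25: 0.25 * 65535 = 16383.75; adding 0.5 gives 16384.25; rounding toward zero gives 16384; dividing by 65535 yields about 0.2500038, the value a 16-bit depth buffer actually holds.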
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 36721e00beb..93869610ff9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -793,8 +793,14 @@ static void GeometryShaderStage(
uint8_t* pCutBase = pCutBufferBase + instance * cutInstanceStride;
DWORD numAttribs;
- _BitScanReverse(&numAttribs, state.feAttribMask);
- numAttribs++;
+ if (_BitScanReverse(&numAttribs, state.feAttribMask))
+ {
+ numAttribs++;
+ }
+ else
+ {
+ numAttribs = 0;
+ }
for (uint32_t stream = 0; stream < MAX_SO_STREAMS; ++stream)
{
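_BitScanReverse returns zero and leaves its output undefined when the mask is empty, so the old unconditional numAttribs++ could produce garbage when no attributes are enabled (pa.h gets the same fix below). A small sketch of the guarded count, assuming the non-Windows shim in os.h mirrors the MSVC intrinsic's return convention:

    #include <cstdint>
    #include <intrin.h>   // MSVC; os.h is assumed to provide a shim elsewhere

    static uint32_t CountAttribSlots(uint32_t attribMask)
    {
        unsigned long highestBit;
        if (_BitScanReverse(&highestBit, attribMask))
            return static_cast<uint32_t>(highestBit) + 1;  // slots 0..highestBit used
        return 0;                                          // empty mask: none in use
    }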
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index d7feb86273d..55a22a67f4c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -45,14 +45,17 @@
#define KNOB_ARCH_ISA AVX
#define KNOB_ARCH_STR "AVX"
#define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
#define KNOB_ARCH_ISA AVX2
#define KNOB_ARCH_STR "AVX2"
#define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
#define KNOB_ARCH_ISA AVX512F
#define KNOB_ARCH_STR "AVX512"
#define KNOB_SIMD_WIDTH 16
+#define KNOB_SIMD_BYTES 64
#error "AVX512 not yet supported"
#else
#error "Unknown architecture"
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index f8f1a33b7e3..17f488538d6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -1017,13 +1017,13 @@ struct PA_TESS : PA_STATE
{
SWR_ASSERT(numPrims <= KNOB_SIMD_WIDTH);
#if KNOB_SIMD_WIDTH == 8
- static const OSALIGN(int32_t, 64) maskGen[KNOB_SIMD_WIDTH * 2] =
+ static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
{
-1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0
};
#elif KNOB_SIMD_WIDTH == 16
- static const OSALIGN(int32_t, 128) maskGen[KNOB_SIMD_WIDTH * 2] =
+ static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] =
{
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
@@ -1167,8 +1167,14 @@ struct PA_FACTORY
{
memset(&indexStore, 0, sizeof(indexStore));
DWORD numAttribs;
- _BitScanReverse(&numAttribs, state.feAttribMask);
- numAttribs++;
+ if (_BitScanReverse(&numAttribs, state.feAttribMask))
+ {
+ numAttribs++;
+ }
+ else
+ {
+ numAttribs = 0;
+ }
new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * KNOB_SIMD_WIDTH,
&this->indexStore[0], numVerts, numAttribs, state.topology, false);
cutPA = true;
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 52fb7c88cdd..3144a901c91 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -383,7 +383,7 @@ __declspec(thread) volatile uint64_t gToss;
static const uint32_t vertsPerTri = 3, componentsPerAttrib = 4;
// try to avoid _chkstk insertions; make this thread local
-static THREAD OSALIGN(float, 16) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
+static THREAD OSALIGNLINE(float) perspAttribsTLS[vertsPerTri * KNOB_NUM_ATTRIBUTES * componentsPerAttrib];
INLINE
void ComputeEdgeData(int32_t a, int32_t b, EDGE& edge)
@@ -439,7 +439,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
const SWR_RASTSTATE &rastState = state.rastState;
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
- OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+ OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
triDesc.pUserClipBuffer = workDesc.pUserClipBuffer;
__m128 vX, vY, vZ, vRecipW;
@@ -502,7 +502,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
_MM_EXTRACT_FLOAT(triDesc.J[1], vB, 2);
_MM_EXTRACT_FLOAT(triDesc.J[2], vC, 2);
- OSALIGN(float, 16) oneOverW[4];
+ OSALIGNSIMD(float) oneOverW[4];
_mm_store_ps(oneOverW, vRecipW);
triDesc.OneOverW[0] = oneOverW[0] - oneOverW[2];
triDesc.OneOverW[1] = oneOverW[1] - oneOverW[2];
@@ -537,7 +537,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// compute bary Z
// zInterp = zVert0 + i(zVert1-zVert0) + j (zVert2 - zVert0)
- OSALIGN(float, 16) a[4];
+ OSALIGNSIMD(float) a[4];
_mm_store_ps(a, vZ);
triDesc.Z[0] = a[0] - a[2];
triDesc.Z[1] = a[1] - a[2];
@@ -575,7 +575,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
}
// Calc bounding box of triangle
- OSALIGN(BBOX, 16) bbox;
+ OSALIGNSIMD(BBOX) bbox;
calcBoundingBoxInt(vXi, vYi, bbox);
// Intersect with scissor/viewport
@@ -594,7 +594,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
int32_t macroBoxTop = macroY * KNOB_MACROTILE_Y_DIM_FIXED;
int32_t macroBoxBottom = macroBoxTop + KNOB_MACROTILE_Y_DIM_FIXED - 1;
- OSALIGN(BBOX, 16) intersect;
+ OSALIGNSIMD(BBOX) intersect;
intersect.left = std::max(bbox.left, macroBoxLeft);
intersect.top = std::max(bbox.top, macroBoxTop);
intersect.right = std::min(bbox.right, macroBoxRight);
@@ -1047,7 +1047,7 @@ void RasterizeSimplePoint(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTi
{ 50, 51, 54, 55, 58, 59, 62, 63 }
};
- OSALIGN(SWR_TRIANGLE_DESC, 16) triDesc;
+ OSALIGNSIMD(SWR_TRIANGLE_DESC) triDesc;
// pull point information from triangle buffer
// @todo use structs for readability
@@ -1286,7 +1286,7 @@ void RasterizeLine(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, voi
// make sure this macrotile intersects the triangle
__m128i vXai = fpToFixedPoint(vXa);
__m128i vYai = fpToFixedPoint(vYa);
- OSALIGN(BBOX, 16) bboxA;
+ OSALIGNSIMD(BBOX) bboxA;
calcBoundingBoxInt(vXai, vYai, bboxA);
if (!(bboxA.left > macroBoxRight ||
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 5752094ca10..50361068025 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -790,6 +790,7 @@ typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
+typedef simdscalar(*PFN_QUANTIZE_DEPTH)(simdscalar);
//////////////////////////////////////////////////////////////////////////
/// FRONTEND_STATE
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 07bc94a1a54..4b7a207f366 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -68,7 +68,10 @@ void CalculateProcessorTopology(CPUNumaNodes& out_nodes, uint32_t& out_numThread
#if defined(_WIN32)
- SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
+ static std::mutex m;
+ std::lock_guard<std::mutex> l(m);
+
+ static SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX buffer[KNOB_MAX_NUM_THREADS];
DWORD bufSize = sizeof(buffer);
BOOL ret = GetLogicalProcessorInformationEx(RelationProcessorCore, buffer, &bufSize);
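Making the buffer static presumably keeps KNOB_MAX_NUM_THREADS topology entries off the worker stack; the function-local mutex then serializes callers so the now-shared buffer cannot be clobbered by concurrent context creation.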
@@ -288,7 +291,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
// Cleanup memory allocations
pDC->pArena->Reset(true);
- pDC->pTileMgr->initialize();
+ if (!pDC->isCompute)
+ {
+ pDC->pTileMgr->initialize();
+ }
if (pDC->cleanupState)
{
pDC->pState->pArena->Reset(true);
@@ -302,10 +308,10 @@ INLINE int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
return result;
}
-INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
+INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE, uint64_t& drawEnqueued)
{
// increment our current draw id to the first incomplete draw
- uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
+ drawEnqueued = GetEnqueuedDraw(pContext);
while (curDrawBE < drawEnqueued)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -313,8 +319,9 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
// If it's not compute and FE is not done then break out of loop.
if (!pDC->doneFE && !pDC->isCompute) break;
- bool isWorkComplete = (pDC->isCompute) ?
- pDC->pDispatch->isWorkComplete() : pDC->pTileMgr->isWorkComplete();
+ bool isWorkComplete = pDC->isCompute ?
+ pDC->pDispatch->isWorkComplete() :
+ pDC->pTileMgr->isWorkComplete();
if (isWorkComplete)
{
@@ -355,7 +362,8 @@ void WorkOnFifoBE(
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
- if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+ uint64_t drawEnqueued = 0;
+ if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
{
return;
}
@@ -370,7 +378,7 @@ void WorkOnFifoBE(
// 2. If we're trying to work on draws after curDrawBE, we are restricted to
// working on those macrotiles that are known to be complete in the prior draw to
// maintain order. The locked tiles provide the history that ensures this.
- for (uint64_t i = curDrawBE; i < GetEnqueuedDraw(pContext); ++i)
+ for (uint64_t i = curDrawBE; i < drawEnqueued; ++i)
{
DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
@@ -463,7 +471,7 @@ void WorkOnFifoBE(
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -516,38 +524,44 @@ void WorkOnCompute(
uint32_t workerId,
uint64_t& curDrawBE)
{
- if (FindFirstIncompleteDraw(pContext, curDrawBE) == false)
+ uint64_t drawEnqueued = 0;
+ if (FindFirstIncompleteDraw(pContext, curDrawBE, drawEnqueued) == false)
{
return;
}
uint64_t lastRetiredDraw = pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT].drawId - 1;
- DRAW_CONTEXT *pDC = &pContext->dcRing[curDrawBE % KNOB_MAX_DRAWS_IN_FLIGHT];
- if (pDC->isCompute == false) return;
-
- // check dependencies
- if (CheckDependency(pContext, pDC, lastRetiredDraw))
+ for (uint64_t i = curDrawBE; curDrawBE < drawEnqueued; ++i)
{
- return;
- }
+ DRAW_CONTEXT *pDC = &pContext->dcRing[i % KNOB_MAX_DRAWS_IN_FLIGHT];
+ if (pDC->isCompute == false) return;
- SWR_ASSERT(pDC->pDispatch != nullptr);
- DispatchQueue& queue = *pDC->pDispatch;
+ // check dependencies
+ if (CheckDependency(pContext, pDC, lastRetiredDraw))
+ {
+ return;
+ }
- // Is there any work remaining?
- if (queue.getNumQueued() > 0)
- {
- uint32_t threadGroupId = 0;
- while (queue.getWork(threadGroupId))
+ SWR_ASSERT(pDC->pDispatch != nullptr);
+ DispatchQueue& queue = *pDC->pDispatch;
+
+ // Is there any work remaining?
+ if (queue.getNumQueued() > 0)
{
- ProcessComputeBE(pDC, workerId, threadGroupId);
+ void* pSpillFillBuffer = nullptr;
+ uint32_t threadGroupId = 0;
+ while (queue.getWork(threadGroupId))
+ {
+ ProcessComputeBE(pDC, workerId, threadGroupId, pSpillFillBuffer);
- queue.finishedWork();
+ queue.finishedWork();
+ }
}
}
}
+template<bool IsFEThread, bool IsBEThread>
DWORD workerThreadMain(LPVOID pData)
{
THREAD_DATA *pThreadData = (THREAD_DATA*)pData;
@@ -631,25 +645,38 @@ DWORD workerThreadMain(LPVOID pData)
}
}
- RDTSC_START(WorkerWorkOnFifoBE);
- WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
- RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
+ if (IsBEThread)
+ {
+ RDTSC_START(WorkerWorkOnFifoBE);
+ WorkOnFifoBE(pContext, workerId, curDrawBE, lockedTiles, numaNode, numaMask);
+ RDTSC_STOP(WorkerWorkOnFifoBE, 0, 0);
- WorkOnCompute(pContext, workerId, curDrawBE);
+ WorkOnCompute(pContext, workerId, curDrawBE);
+ }
- WorkOnFifoFE(pContext, workerId, curDrawFE, numaNode);
+ if (IsFEThread)
+ {
+ WorkOnFifoFE(pContext, workerId, curDrawFE);
+
+ if (!IsBEThread)
+ {
+ curDrawBE = curDrawFE;
+ }
+ }
}
return 0;
}
+template<> DWORD workerThreadMain<false, false>(LPVOID) = delete;
+template <bool IsFEThread, bool IsBEThread>
DWORD workerThreadInit(LPVOID pData)
{
#if defined(_WIN32)
__try
#endif // _WIN32
{
- return workerThreadMain(pData);
+ return workerThreadMain<IsFEThread, IsBEThread>(pData);
}
#if defined(_WIN32)
@@ -661,6 +688,7 @@ DWORD workerThreadInit(LPVOID pData)
return 1;
}
+template<> DWORD workerThreadInit<false, false>(LPVOID pData) = delete;
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
{
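Worker entry points are now stamped out as three instantiations: FE-only <true, false>, BE-only <false, true>, and combined <true, true>, with the meaningless <false, false> case deleted so it fails at compile time instead of spinning uselessly. A minimal sketch of the pattern, with hypothetical names:

    template <bool IsFE, bool IsBE>
    int WorkerMain(void* /*pData*/)
    {
        // Branches on template parameters fold at compile time, so each
        // instantiation carries only the loop bodies it actually runs.
        if (IsBE) { /* drain back-end and compute work */ }
        if (IsFE) { /* pull front-end work */ }
        return 0;
    }

    // A worker that is neither FE nor BE is a bug; reject it during compilation.
    template <>
    int WorkerMain<false, false>(void*) = delete;

In CreateThreadPool below, the hyperthreaded-FE path binds the BE-only flavor to thread 0 of each core and the FE-only flavor to the remaining hyperthreads; otherwise every worker gets the combined <true, true> instantiation.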
@@ -678,6 +706,16 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
uint32_t numCoresPerNode = numHWCoresPerNode;
uint32_t numHyperThreads = numHWHyperThreads;
+ if (KNOB_MAX_WORKER_THREADS)
+ {
+ SET_KNOB(HYPERTHREADED_FE, false);
+ }
+
+ if (KNOB_HYPERTHREADED_FE)
+ {
+ SET_KNOB(MAX_THREADS_PER_CORE, 0);
+ }
+
if (KNOB_MAX_NUMA_NODES)
{
numNodes = std::min(numNodes, KNOB_MAX_NUMA_NODES);
@@ -693,6 +731,11 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
numHyperThreads = std::min(numHyperThreads, KNOB_MAX_THREADS_PER_CORE);
}
+ if (numHyperThreads < 2)
+ {
+ SET_KNOB(HYPERTHREADED_FE, false);
+ }
+
// Calculate numThreads
uint32_t numThreads = numNodes * numCoresPerNode * numHyperThreads;
@@ -767,9 +810,14 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
pPool->pThreadData[workerId].procGroupId = workerId % numProcGroups;
pPool->pThreadData[workerId].threadId = 0;
pPool->pThreadData[workerId].numaId = 0;
+ pPool->pThreadData[workerId].coreId = 0;
+ pPool->pThreadData[workerId].htId = 0;
pPool->pThreadData[workerId].pContext = pContext;
pPool->pThreadData[workerId].forceBindProcGroup = bForceBindProcGroup;
- pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+ pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+
+ pContext->NumBEThreads++;
+ pContext->NumFEThreads++;
}
}
else
@@ -780,6 +828,10 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
for (uint32_t n = 0; n < numNodes; ++n)
{
auto& node = nodes[n];
+ if (node.cores.size() == 0)
+ {
+ continue;
+ }
uint32_t numCores = numCoresPerNode;
for (uint32_t c = 0; c < numCores; ++c)
@@ -797,8 +849,29 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
pPool->pThreadData[workerId].procGroupId = core.procGroup;
pPool->pThreadData[workerId].threadId = core.threadIds[t];
pPool->pThreadData[workerId].numaId = n;
+ pPool->pThreadData[workerId].coreId = c;
+ pPool->pThreadData[workerId].htId = t;
pPool->pThreadData[workerId].pContext = pContext;
- pPool->threads[workerId] = new std::thread(workerThreadInit, &pPool->pThreadData[workerId]);
+
+ if (KNOB_HYPERTHREADED_FE)
+ {
+ if (t == 0)
+ {
+ pContext->NumBEThreads++;
+ pPool->threads[workerId] = new std::thread(workerThreadInit<false, true>, &pPool->pThreadData[workerId]);
+ }
+ else
+ {
+ pContext->NumFEThreads++;
+ pPool->threads[workerId] = new std::thread(workerThreadInit<true, false>, &pPool->pThreadData[workerId]);
+ }
+ }
+ else
+ {
+ pPool->threads[workerId] = new std::thread(workerThreadInit<true, true>, &pPool->pThreadData[workerId]);
+ pContext->NumBEThreads++;
+ pContext->NumFEThreads++;
+ }
++workerId;
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 821d7dcb16e..3aba6323a95 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -41,6 +41,8 @@ struct THREAD_DATA
uint32_t procGroupId; // Will always be 0 for non-Windows OS
uint32_t threadId; // within the procGroup for Windows
uint32_t numaId; // NUMA node id
+ uint32_t coreId; // Core id
+ uint32_t htId; // Hyperthread id
uint32_t workerId;
SWR_CONTEXT *pContext;
bool forceBindProcGroup; // Only useful when KNOB_MAX_WORKER_THREADS is set.
@@ -62,7 +64,7 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, uint32_t numaNode);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE);
void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles, uint32_t numaNode, uint32_t numaMask);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
int64_t CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC);
\ No newline at end of file
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 794577270cf..87d9f42c032 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -35,27 +35,6 @@
#define TILE_ID(x,y) ((x << 16 | y))
-// override new/delete for alignment
-void *MacroTileMgr::operator new(size_t size)
-{
- return _aligned_malloc(size, 64);
-}
-
-void MacroTileMgr::operator delete(void *p)
-{
- _aligned_free(p);
-}
-
-void* DispatchQueue::operator new(size_t size)
-{
- return _aligned_malloc(size, 64);
-}
-
-void DispatchQueue::operator delete(void *p)
-{
- _aligned_free(p);
-}
-
MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
{
}
@@ -304,7 +283,6 @@ void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
{
const API_STATE& state = GetApiState(pDC);
- HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
uint32_t x, y;
MacroTileMgr::getTileIndices(macroID, x, y);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index aa561badc1c..82a15e16a33 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -140,9 +140,6 @@ public:
x = (tileID >> 16) & 0xffff;
}
- void *operator new(size_t size);
- void operator delete (void *p);
-
private:
CachingArena& mArena;
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
@@ -229,9 +226,6 @@ public:
return mpTaskData;
}
- void *operator new(size_t size);
- void operator delete (void *p);
-
void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpret this.
OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
@@ -272,7 +266,7 @@ class HotTileMgr
public:
HotTileMgr()
{
- memset(&mHotTiles[0][0], 0, sizeof(mHotTiles));
+ memset(mHotTiles, 0, sizeof(mHotTiles));
// cache hottile size
for (uint32_t i = SWR_ATTACHMENT_COLOR0; i <= SWR_ATTACHMENT_COLOR7; ++i)
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 0f3ded68544..3832b91d93e 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -30,6 +30,18 @@ KNOBS = [
'category' : 'debug',
}],
+ ['HYPERTHREADED_FE', {
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['EXPERIMENTAL!!',
+ 'If enabled will attempt to use secondary threads per core to perform',
+ 'front-end (VS/GS) work.',
+ '',
+ 'Note: Setting this will cause KNOB_MAX_THREADS_PER_CORE to be ignored.'],
+ 'category' : 'perf',
+ 'advanced' : 'true',
+ }],
+
['DUMP_SHADER_IR', {
'type' : 'bool',
'default' : 'false',
@@ -166,6 +178,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_FETCH', {
@@ -175,6 +188,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_IA', {
@@ -184,6 +198,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_VS', {
@@ -193,6 +208,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_SETUP_TRIS', {
@@ -202,6 +218,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_BIN_TRIS', {
@@ -211,6 +228,7 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],
['TOSS_RS', {
@@ -220,4 +238,5 @@ KNOBS = [
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
'category' : 'perf',
+ 'advanced' : 'true',
}],]