diff options
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/frontend.cpp | 57 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/pa.h | 13 |
2 files changed, 54 insertions, 16 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index b09a7985ccc..3886c64ccf6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -551,6 +551,7 @@ static void StreamOut( _mm_store_ps((float*)pPrimDataAttrib, attrib[v]); } + soMask &= ~(1 << slot); } @@ -1345,8 +1346,6 @@ static void TessellationStages( const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; - const uint32_t primMask = GenMask(numPrims); - const simd16scalari primID = _simd16_set1_epi32(dsContext.PrimitiveID); const simdscalari primID_lo = _simd16_extract_si(primID, 0); const simdscalari primID_hi = _simd16_extract_si(primID, 1); @@ -1390,9 +1389,9 @@ static void TessellationStages( if (HasRastT::value) { #if USE_SIMD16_FRONTEND - simd16vector prim_simd16[3]; + simd16vector prim_simd16[3]; // Only deal with triangles, lines, or points #else - simdvector prim[3]; // Only deal with triangles, lines, or points + simdvector prim[3]; // Only deal with triangles, lines, or points #endif AR_BEGIN(FEPAAssemble, pDC->drawId); bool assemble = @@ -1407,7 +1406,7 @@ static void TessellationStages( SWR_ASSERT(pfnClipFunc); #if USE_SIMD16_FRONTEND tessPa.useAlternateOffset = false; - pfnClipFunc(pDC, tessPa, workerId, prim_simd16, primMask, primID, _simd16_set1_epi32(0)); + pfnClipFunc(pDC, tessPa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_set1_epi32(0)); #else pfnClipFunc(pDC, tessPa, workerId, prim, GenMask(tessPa.NumPrims()), _simd_set1_epi32(dsContext.PrimitiveID), _simd_set1_epi32(0)); @@ -1420,9 +1419,21 @@ static void TessellationStages( } // while (tessPa.HasWork()) } // for (uint32_t p = 0; p < numPrims; ++p) +#if USE_SIMD16_FRONTEND + if (gt_pTessellationThreadData->pDSOutput != nullptr) + { + AlignedFree(gt_pTessellationThreadData->pDSOutput); + gt_pTessellationThreadData->pDSOutput = nullptr; + } + gt_pTessellationThreadData->numDSOutputVectors = 0; + +#endif TSDestroyCtx(tsCtx); } +THREAD PA_STATE::SIMDVERTEX *pVertexStore = nullptr; +THREAD uint32_t gVertexStoreSize = 0; + ////////////////////////////////////////////////////////////////////////// /// @brief FE handler for SwrDraw. /// @tparam IsIndexedT - Is indexed drawing enabled @@ -1530,8 +1541,36 @@ void ProcessDraw( pSoPrimData = (uint32_t*)pDC->pArena->AllocAligned(4096, 16); } + const uint32_t vertexCount = NumVertsPerPrim(state.topology, state.gsState.gsEnable); + + SWR_ASSERT(vertexCount <= MAX_NUM_VERTS_PER_PRIM); + + // grow the vertex store for the PA as necessary + if (gVertexStoreSize < vertexCount) + { + if (pVertexStore != nullptr) + { + AlignedFree(pVertexStore); + } + + while (gVertexStoreSize < vertexCount) + { +#if USE_SIMD16_FRONTEND + gVertexStoreSize += 4; // grow in chunks of 4 simd16vertex +#else + gVertexStoreSize += 8; // grow in chunks of 8 simdvertex +#endif + } + + SWR_ASSERT(gVertexStoreSize <= MAX_NUM_VERTS_PER_PRIM); + + pVertexStore = reinterpret_cast<PA_STATE::SIMDVERTEX *>(AlignedMalloc(gVertexStoreSize * sizeof(pVertexStore[0]), 64)); + + SWR_ASSERT(pVertexStore != nullptr); + } + // choose primitive assembler - PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts); + PA_FACTORY<IsIndexedT, IsCutIndexEnabledT> paFactory(pDC, state.topology, work.numVerts, pVertexStore, gVertexStoreSize); PA_STATE& pa = paFactory.GetPA(); #if USE_SIMD16_FRONTEND @@ -1689,8 +1728,6 @@ void ProcessDraw( const uint32_t numPrims_lo = std::min<uint32_t>(numPrims, KNOB_SIMD_WIDTH); const uint32_t numPrims_hi = std::max<uint32_t>(numPrims, KNOB_SIMD_WIDTH) - KNOB_SIMD_WIDTH; - const uint32_t primMask = GenMask(numPrims); - const simd16scalari primID = pa.GetPrimID(work.startPrimID); const simdscalari primID_lo = _simd16_extract_si(primID, 0); const simdscalari primID_hi = _simd16_extract_si(primID, 1); @@ -1732,7 +1769,7 @@ void ProcessDraw( StreamOut(pDC, pa, workerId, pSoPrimData, numPrims_hi, 0); } #else - pa.useAlternateOffset = false; // StreamOut() is SIMD16-compatible.. + pa.useAlternateOffset = false; StreamOut(pDC, pa, workerId, pSoPrimData, 0); #endif } @@ -1742,7 +1779,7 @@ void ProcessDraw( SWR_ASSERT(pDC->pState->pfnProcessPrims_simd16); pa.useAlternateOffset = false; - pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, primMask, primID, _simd16_setzero_si()); + pDC->pState->pfnProcessPrims_simd16(pDC, pa, workerId, prim_simd16, GenMask(numPrims), primID, _simd16_setzero_si()); } } } diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 10570f43f0f..403efe057d0 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -136,9 +136,9 @@ struct PA_STATE_OPT : public PA_STATE uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2}) SIMDSCALARI primID; - typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); + typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); #if ENABLE_AVX512_SIMD16 - typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]); + typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); #endif typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); @@ -691,6 +691,7 @@ PRAGMA_WARNING_PUSH_DISABLE(4789) pBase += SIMD_WIDTH; } } + return true; } PRAGMA_WARNING_POP() @@ -1392,7 +1393,7 @@ private: template <typename IsIndexedT, typename IsCutIndexEnabledT> struct PA_FACTORY { - PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts) : topo(in_topo) + PA_FACTORY(DRAW_CONTEXT* pDC, PRIMITIVE_TOPOLOGY in_topo, uint32_t numVerts, PA_STATE::SIMDVERTEX *pVertexStore, uint32_t vertexStoreSize) : topo(in_topo) { #if KNOB_ENABLE_CUT_AWARE_PA == TRUE const API_STATE& state = GetApiState(pDC); @@ -1408,7 +1409,7 @@ struct PA_FACTORY memset(&indexStore, 0, sizeof(indexStore)); uint32_t numAttribs = state.feNumAttributes; - new (&this->paCut) PA_STATE_CUT(pDC, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, + new (&this->paCut) PA_STATE_CUT(pDC, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, &this->indexStore[0], numVerts, numAttribs, state.topology, false); cutPA = true; } @@ -1416,7 +1417,7 @@ struct PA_FACTORY #endif { uint32_t numPrims = GetNumPrims(in_topo, numVerts); - new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, (uint8_t*)&this->vertexStore[0], MAX_NUM_VERTS_PER_PRIM * PA_STATE::SIMD_WIDTH, false); + new (&this->paOpt) PA_STATE_OPT(pDC, numPrims, reinterpret_cast<uint8_t *>(pVertexStore), vertexStoreSize * PA_STATE::SIMD_WIDTH, false); cutPA = false; } @@ -1438,10 +1439,10 @@ struct PA_FACTORY PA_STATE_OPT paOpt; PA_STATE_CUT paCut; + bool cutPA{ false }; PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN }; - PA_STATE::SIMDVERTEX vertexStore[MAX_NUM_VERTS_PER_PRIM]; PA_STATE::SIMDMASK indexStore[MAX_NUM_VERTS_PER_PRIM]; }; |