From feecd7dcf5e63c1ae9e8d2f74cc70371818958b8 Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Wed, 18 Jan 2017 18:08:40 -0600 Subject: swr: [rasterizer core] Frontend SIMD16 WIP SIMD16 Primitive Assembly (PA) only supports TriList and RectList. CUT_AWARE_PA, TESS, GS, and SO disabled in the SIMD16 front end. Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/core/frontend.cpp | 299 ++++++++++++++++++--- src/gallium/drivers/swr/rasterizer/core/frontend.h | 4 +- src/gallium/drivers/swr/rasterizer/core/knobs.h | 1 + src/gallium/drivers/swr/rasterizer/core/pa.h | 268 +++++++++++++++++- src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 284 ++++++++++++++++++- 5 files changed, 813 insertions(+), 43 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp index c8dce10c9de..b005ead0d15 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp @@ -1027,7 +1027,7 @@ static void TessellationStages( SWR_TS_TESSELLATED_DATA tsData = { 0 }; AR_BEGIN(FETessellation, pDC->drawId); TSTessellate(tsCtx, hsContext.pCPout[p].tessFactors, tsData); - AR_EVENT(TessPrimCount(1)); + AR_EVENT(TessPrimCount(1)); AR_END(FETessellation, 0); if (tsData.NumPrimitives == 0) @@ -1161,12 +1161,9 @@ void ProcessDraw( DRAW_WORK& work = *(DRAW_WORK*)pUserData; const API_STATE& state = GetApiState(pDC); - __m256i vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); - SWR_VS_CONTEXT vsContext; - simdvertex vin; - int indexSize = 0; - uint32_t endVertex = work.numVerts; + uint32_t indexSize = 0; + uint32_t endVertex = work.numVerts; const int32_t* pLastRequestedIndex = nullptr; if (IsIndexedT::value) @@ -1197,30 +1194,6 @@ void ProcessDraw( endVertex = GetNumVerts(state.topology, GetNumPrims(state.topology, work.numVerts)); } - SWR_FETCH_CONTEXT fetchInfo = { 0 }; - fetchInfo.pStreams = &state.vertexBuffers[0]; - fetchInfo.StartInstance = work.startInstance; - fetchInfo.StartVertex = 0; - - vsContext.pVin = &vin; - - if (IsIndexedT::value) - { - fetchInfo.BaseVertex = work.baseVertex; - - // if the entire index buffer isn't being consumed, set the last index - // so that fetches < a SIMD wide will be masked off - fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); - if (pLastRequestedIndex < fetchInfo.pLastIndex) - { - fetchInfo.pLastIndex = pLastRequestedIndex; - } - } - else - { - fetchInfo.StartVertex = work.startVertex; - } - #if defined(KNOB_ENABLE_RDTSC) || defined(KNOB_ENABLE_AR) uint32_t numPrims = GetNumPrims(state.topology, work.numVerts); #endif @@ -1259,6 +1232,267 @@ void ProcessDraw( PA_FACTORY paFactory(pDC, state.topology, work.numVerts); PA_STATE& pa = paFactory.GetPA(); +#if USE_SIMD16_FRONTEND + simdvertex vin_lo; + simdvertex vin_hi; + SWR_VS_CONTEXT vsContext_lo; + SWR_VS_CONTEXT vsContext_hi; + + vsContext_lo.pVin = &vin_lo; + vsContext_hi.pVin = &vin_hi; + + SWR_FETCH_CONTEXT fetchInfo_lo = { 0 }; + + fetchInfo_lo.pStreams = &state.vertexBuffers[0]; + fetchInfo_lo.StartInstance = work.startInstance; + fetchInfo_lo.StartVertex = 0; + + if (IsIndexedT::value) + { + fetchInfo_lo.BaseVertex = work.baseVertex; + + // if the entire index buffer isn't being consumed, set the last index + // so that fetches < a SIMD wide will be masked off + fetchInfo_lo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); + if (pLastRequestedIndex < fetchInfo_lo.pLastIndex) + { + fetchInfo_lo.pLastIndex = pLastRequestedIndex; + } + } + else + { + fetchInfo_lo.StartVertex = work.startVertex; + } + + SWR_FETCH_CONTEXT fetchInfo_hi = fetchInfo_lo; + + const simd16scalari vScale = _simd16_set_epi32(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0); + + for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) + { + uint32_t i = 0; + + simd16scalari vIndex; + + if (IsIndexedT::value) + { + fetchInfo_lo.pIndices = work.pIB; + fetchInfo_hi.pIndices = (int32_t *)((uint8_t *)fetchInfo_lo.pIndices + KNOB_SIMD_WIDTH * indexSize); // 1/2 of KNOB_SIMD16_WIDTH + } + else + { + vIndex = _simd16_add_epi32(_simd16_set1_epi32(work.startVertexID), vScale); + + fetchInfo_lo.pIndices = (const int32_t *)&vIndex.lo; + fetchInfo_hi.pIndices = (const int32_t *)&vIndex.hi; + } + + fetchInfo_lo.CurInstance = instanceNum; + fetchInfo_hi.CurInstance = instanceNum; + + vsContext_lo.InstanceID = instanceNum; + vsContext_hi.InstanceID = instanceNum; + + while (pa.HasWork()) + { + // PaGetNextVsOutput currently has the side effect of updating some PA state machine state. + // So we need to keep this outside of (i < endVertex) check. + + simdmask *pvCutIndices_lo = nullptr; + simdmask *pvCutIndices_hi = nullptr; + + if (IsIndexedT::value) + { + pvCutIndices_lo = &pa.GetNextVsIndices(); + pvCutIndices_hi = &pa.GetNextVsIndices(); + } + + simdvertex &vout_lo = pa.GetNextVsOutput_simd16_lo(); + simdvertex &vout_hi = pa.GetNextVsOutput_simd16_hi(); + + vsContext_lo.pVout = &vout_lo; + vsContext_hi.pVout = &vout_hi; + + if (i < endVertex) + { + // 1. Execute FS/VS for a single SIMD. + AR_BEGIN(FEFetchShader, pDC->drawId); + state.pfnFetchFunc(fetchInfo_lo, vin_lo); + if ((i + KNOB_SIMD_WIDTH) < endVertex) + { + state.pfnFetchFunc(fetchInfo_hi, vin_hi); + } + AR_END(FEFetchShader, 0); + + // forward fetch generated vertex IDs to the vertex shader + vsContext_lo.VertexID = fetchInfo_lo.VertexID; + vsContext_hi.VertexID = fetchInfo_hi.VertexID; + + // Setup active mask for vertex shader. + vsContext_lo.mask = GenerateMask(endVertex - i); + vsContext_hi.mask = GenerateMask(endVertex - (i + KNOB_SIMD_WIDTH)); + + // forward cut mask to the PA + if (IsIndexedT::value) + { + *pvCutIndices_lo = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_lo.CutMask)); + *pvCutIndices_hi = _simd_movemask_ps(_simd_castsi_ps(fetchInfo_hi.CutMask)); + } + + UPDATE_STAT_FE(IaVertices, GetNumInvocations(i, endVertex)); + +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_FETCH) +#endif + { + AR_BEGIN(FEVertexShader, pDC->drawId); + state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_lo); + if ((i + KNOB_SIMD_WIDTH) < endVertex) + { + state.pfnVertexFunc(GetPrivateState(pDC), &vsContext_hi); + } + AR_END(FEVertexShader, 0); + + UPDATE_STAT_FE(VsInvocations, GetNumInvocations(i, endVertex)); + } + } + + // 2. Assemble primitives given the last two SIMD. + do + { + simd16vector prim_simd16[MAX_NUM_VERTS_PER_PRIM]; + + RDTSC_START(FEPAAssemble); + bool assemble = pa.Assemble_simd16(VERTEX_POSITION_SLOT, prim_simd16); + RDTSC_STOP(FEPAAssemble, 1, 0); + +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_FETCH) +#endif + { +#if KNOB_ENABLE_TOSS_POINTS + if (!KNOB_TOSS_VS) +#endif + { + if (assemble) + { + UPDATE_STAT_FE(IaPrimitives, pa.NumPrims()); + +#if 0 + if (HasTessellationT::value) + { + TessellationStages( + pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); + } + else if (HasGeometryShaderT::value) + { + GeometryShaderStage( + pDC, workerId, pa, pGsOut, pCutBuffer, pStreamCutBuffer, pSoPrimData, pa.GetPrimID(work.startPrimID)); + } + else +#endif + { +#if 0 + // If streamout is enabled then stream vertices out to memory. + if (HasStreamOutT::value) + { + StreamOut(pDC, pa, workerId, pSoPrimData, 0); + } + +#endif + if (HasRastT::value) + { + SWR_ASSERT(pDC->pState->pfnProcessPrims); + + uint32_t genMask = GenMask(pa.NumPrims_simd16()); + uint32_t genMask_lo = genMask & 255; + uint32_t genMask_hi = (genMask >> 8) & 255; + + simdscalari getPrimId_lo = pa.GetPrimID_simd16_lo(work.startPrimID); + simdscalari getPrimId_hi = pa.GetPrimID_simd16_hi(work.startPrimID); + + simdvector prim[MAX_NUM_VERTS_PER_PRIM]; + + for (uint32_t i = 0; i < 3; i += 1) + { + for (uint32_t j = 0; j < 4; j += 1) + { + prim[i][j] = prim_simd16[i][j].lo; + } + } + + pa.useAlternateOffset = false; + pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, + genMask_lo, getPrimId_lo, _simd_set1_epi32(0)); + + if (genMask_hi) + { + for (uint32_t i = 0; i < 3; i += 1) + { + for (uint32_t j = 0; j < 4; j += 1) + { + prim[i][j] = prim_simd16[i][j].hi; + } + } + + pa.useAlternateOffset = true; + pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, + genMask_hi, getPrimId_hi, _simd_set1_epi32(0)); + } + } + } + } + } + } + } while (pa.NextPrim()); + + if (IsIndexedT::value) + { + fetchInfo_lo.pIndices = (int32_t *)((uint8_t*)fetchInfo_lo.pIndices + KNOB_SIMD16_WIDTH * indexSize); + fetchInfo_hi.pIndices = (int32_t *)((uint8_t*)fetchInfo_hi.pIndices + KNOB_SIMD16_WIDTH * indexSize); + } + else + { + vIndex = _simd16_add_epi32(vIndex, _simd16_set1_epi32(KNOB_SIMD16_WIDTH)); + } + + i += KNOB_SIMD16_WIDTH; + } + + pa.Reset(); + } + +#else + simdvertex vin; + SWR_VS_CONTEXT vsContext; + + vsContext.pVin = &vin; + + SWR_FETCH_CONTEXT fetchInfo = { 0 }; + + fetchInfo.pStreams = &state.vertexBuffers[0]; + fetchInfo.StartInstance = work.startInstance; + fetchInfo.StartVertex = 0; + + if (IsIndexedT::value) + { + fetchInfo.BaseVertex = work.baseVertex; + + // if the entire index buffer isn't being consumed, set the last index + // so that fetches < a SIMD wide will be masked off + fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size); + if (pLastRequestedIndex < fetchInfo.pLastIndex) + { + fetchInfo.pLastIndex = pLastRequestedIndex; + } + } + else + { + fetchInfo.StartVertex = work.startVertex; + } + + const simdscalari vScale = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0); + /// @todo: temporarily move instance loop in the FE to ensure SO ordering for (uint32_t instanceNum = 0; instanceNum < work.numInstances; instanceNum++) { @@ -1367,6 +1601,7 @@ void ProcessDraw( if (HasRastT::value) { SWR_ASSERT(pDC->pState->pfnProcessPrims); + pDC->pState->pfnProcessPrims(pDC, pa, workerId, prim, GenMask(pa.NumPrims()), pa.GetPrimID(work.startPrimID), _simd_set1_epi32(0)); } @@ -1376,7 +1611,6 @@ void ProcessDraw( } } while (pa.NextPrim()); - i += KNOB_SIMD_WIDTH; if (IsIndexedT::value) { fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize); @@ -1385,10 +1619,13 @@ void ProcessDraw( { vIndex = _simd_add_epi32(vIndex, _simd_set1_epi32(KNOB_SIMD_WIDTH)); } + + i += KNOB_SIMD_WIDTH; } pa.Reset(); } +#endif AR_END(FEProcessDraw, numPrims * work.numInstances); } diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h index 46924947a73..6d5f6a31b8e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/frontend.h +++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h @@ -170,8 +170,8 @@ void calcDeterminantIntVertical(const simdscalari vA[3], const simdscalari vB[3] simdscalari detHi = _simd_sub_epi64(vA1B2Hi, vA2B1Hi); // shuffle 0 1 4 5 -> 0 1 2 3 - simdscalari vResultLo = _mm256_permute2f128_si256(detLo, detHi, 0x20); - simdscalari vResultHi = _mm256_permute2f128_si256(detLo, detHi, 0x31); + simdscalari vResultLo = _simd_permute2f128_si(detLo, detHi, 0x20); + simdscalari vResultHi = _simd_permute2f128_si(detLo, detHi, 0x31); pvDet[0] = vResultLo; pvDet[1] = vResultHi; diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index bbe15c1e48f..8e54f90526b 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -40,6 +40,7 @@ #define ENABLE_AVX512_SIMD16 0 #define USE_8x2_TILE_BACKEND 0 +#define USE_SIMD16_FRONTEND 0 /////////////////////////////////////////////////////////////////////////////// // Architecture validation diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 2b8110f4c70..826032ad54e 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -41,6 +41,10 @@ struct PA_STATE // The topology the binner will use. In some cases the FE changes the topology from the api state. PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN }; +#if ENABLE_AVX512_SIMD16 + bool useAlternateOffset{ false }; + +#endif PA_STATE() {} PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) : pDC(in_pDC), pStreamBase(in_pStreamBase), streamSizeInVerts(in_streamSizeInVerts) {} @@ -48,14 +52,28 @@ struct PA_STATE virtual bool HasWork() = 0; virtual simdvector& GetSimdVector(uint32_t index, uint32_t slot) = 0; virtual bool Assemble(uint32_t slot, simdvector verts[]) = 0; +#if ENABLE_AVX512_SIMD16 + virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0; +#endif virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0; virtual bool NextPrim() = 0; virtual simdvertex& GetNextVsOutput() = 0; +#if ENABLE_AVX512_SIMD16 + virtual simdvertex& GetNextVsOutput_simd16_lo() = 0; + virtual simdvertex& GetNextVsOutput_simd16_hi() = 0; +#endif virtual bool GetNextStreamOutput() = 0; virtual simdmask& GetNextVsIndices() = 0; virtual uint32_t NumPrims() = 0; +#if ENABLE_AVX512_SIMD16 + virtual uint32_t NumPrims_simd16() = 0; +#endif virtual void Reset() = 0; virtual simdscalari GetPrimID(uint32_t startID) = 0; +#if ENABLE_AVX512_SIMD16 + virtual simdscalari GetPrimID_simd16_lo(uint32_t startID) = 0; + virtual simdscalari GetPrimID_simd16_hi(uint32_t startID) = 0; +#endif }; // The Optimized PA is a state machine that assembles triangles from vertex shader simd @@ -94,13 +112,23 @@ struct PA_STATE_OPT : public PA_STATE typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]); typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); +#if ENABLE_AVX512_SIMD16 + typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& state, uint32_t slot, simd16vector verts[]); +#endif PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles. PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle. PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset +#if ENABLE_AVX512_SIMD16 + PFN_PA_FUNC_SIMD16 pfnPaFunc_simd16{ nullptr }; // PA state machine function for assembling 16 triangles + PFN_PA_FUNC_SIMD16 pfnPaFuncReset_simd16{ nullptr }; // initial state to set on reset +#endif // state used to advance the PA when Next is called PFN_PA_FUNC pfnPaNextFunc{ nullptr }; +#if ENABLE_AVX512_SIMD16 + PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16{ nullptr }; +#endif uint32_t nextNumSimdPrims{ 0 }; uint32_t nextNumPrimsIncrement{ 0 }; bool nextReset{ false }; @@ -130,6 +158,13 @@ struct PA_STATE_OPT : public PA_STATE return this->pfnPaFunc(*this, slot, verts); } +#if ENABLE_AVX512_SIMD16 + bool Assemble_simd16(uint32_t slot, simd16vector verts[]) + { + return this->pfnPaFunc_simd16(*this, slot, verts); + } + +#endif // Assembles 1 primitive. Each simdscalar is a vertex (xyzw). void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) { @@ -139,6 +174,9 @@ struct PA_STATE_OPT : public PA_STATE bool NextPrim() { this->pfnPaFunc = this->pfnPaNextFunc; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = this->pfnPaNextFunc_simd16; +#endif this->numSimdPrims = this->nextNumSimdPrims; this->numPrimsComplete += this->nextNumPrimsIncrement; this->reset = this->nextReset; @@ -181,7 +219,33 @@ struct PA_STATE_OPT : public PA_STATE simdvertex* pVertex = (simdvertex*)pStreamBase; return pVertex[this->cur]; } - + +#if ENABLE_AVX512_SIMD16 + simdvertex& GetNextVsOutput_simd16_lo() + { + // increment cur and prev indices + const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH; + this->prev = this->cur; // prev is undefined for first state. + this->cur = this->counter % numSimdVerts; + + simdvertex* pVertex = (simdvertex*)pStreamBase; + return pVertex[this->cur * 2]; + } + + simdvertex& GetNextVsOutput_simd16_hi() + { + // increment cur and prev indices + const uint32_t numSimdVerts = this->streamSizeInVerts / KNOB_SIMD16_WIDTH; +#if 1 + this->prev = this->cur; // prev is undefined for first state. + this->cur = this->counter % numSimdVerts; +#endif + + simdvertex* pVertex = (simdvertex*)pStreamBase; + return pVertex[this->cur * 2 + 1]; + } + +#endif simdmask& GetNextVsIndices() { // unused in optimized PA, pass tmp buffer back @@ -202,6 +266,14 @@ struct PA_STATE_OPT : public PA_STATE (KNOB_SIMD_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD_WIDTH; } +#if ENABLE_AVX512_SIMD16 + uint32_t NumPrims_simd16() + { + return (this->numPrimsComplete + this->nextNumPrimsIncrement > this->numPrims) ? + (KNOB_SIMD16_WIDTH - (this->numPrimsComplete + this->nextNumPrimsIncrement - this->numPrims)) : KNOB_SIMD16_WIDTH; + } + +#endif void SetNextState(PA_STATE_OPT::PFN_PA_FUNC pfnPaNextFunc, PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, uint32_t numSimdPrims = 0, @@ -216,8 +288,28 @@ struct PA_STATE_OPT : public PA_STATE this->pfnPaSingleFunc = pfnPaNextSingleFunc; } +#if ENABLE_AVX512_SIMD16 + void SetNextState_simd16(PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) + { + this->pfnPaNextFunc_simd16 = pfnPaNextFunc_simd16; + this->nextNumSimdPrims = numSimdPrims; + this->nextNumPrimsIncrement = numPrimsIncrement; + this->nextReset = reset; + + this->pfnPaSingleFunc = pfnPaNextSingleFunc; + } + +#endif void Reset() { +#if ENABLE_AVX512_SIMD16 + useAlternateOffset = false; + +#endif this->pfnPaFunc = this->pfnPaFuncReset; this->numPrimsComplete = 0; this->numSimdPrims = 0; @@ -233,6 +325,28 @@ struct PA_STATE_OPT : public PA_STATE return _simd_add_epi32(this->primID, _simd_set1_epi32(startID + this->primIDIncr * (this->numPrimsComplete / KNOB_SIMD_WIDTH))); } +#if ENABLE_AVX512_SIMD16 + + simdscalari GetPrimID_simd16_lo(uint32_t startID) + { +#if 1 + return _simd_add_epi32(this->primID, + _simd_set1_epi32(startID + (this->primIDIncr / 2) * (this->numPrimsComplete / KNOB_SIMD_WIDTH) * 2)); +#else + return _simd_set1_epi32(0); +#endif + } + + simdscalari GetPrimID_simd16_hi(uint32_t startID) + { +#if 1 + return _simd_add_epi32(this->primID, + _simd_set1_epi32(startID + (this->primIDIncr / 2) * ((this->numPrimsComplete / KNOB_SIMD_WIDTH) * 2 + 1))); +#else + return _simd_set1_epi32(0); +#endif + } +#endif }; // helper C wrappers to avoid having to rewrite all the PA topology state functions @@ -244,6 +358,18 @@ INLINE void SetNextPaState(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC pfnPaNext { return pa.SetNextState(pfnPaNextFunc, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); } + +#if ENABLE_AVX512_SIMD16 +INLINE void SetNextPaState_simd16(PA_STATE_OPT& pa, PA_STATE_OPT::PFN_PA_FUNC_SIMD16 pfnPaNextFunc_simd16, + PA_STATE_OPT::PFN_PA_SINGLE_FUNC pfnPaNextSingleFunc, + uint32_t numSimdPrims = 0, + uint32_t numPrimsIncrement = 0, + bool reset = false) +{ + return pa.SetNextState_simd16(pfnPaNextFunc_simd16, pfnPaNextSingleFunc, numSimdPrims, numPrimsIncrement, reset); +} + +#endif INLINE simdvector& PaGetSimdVector(PA_STATE& pa, uint32_t index, uint32_t slot) { return pa.GetSimdVector(index, slot); @@ -418,6 +544,24 @@ struct PA_STATE_CUT : public PA_STATE return ((simdvertex*)pStreamBase)[vertexIndex]; } +#if ENABLE_AVX512_SIMD16 + simdvertex& GetNextVsOutput_simd16_lo() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH; + this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts; + this->needOffsets = true; + return ((simdvertex*)pStreamBase)[vertexIndex * 2]; + } + + simdvertex& GetNextVsOutput_simd16_hi() + { + uint32_t vertexIndex = this->headVertex / KNOB_SIMD16_WIDTH; + this->headVertex = (this->headVertex + KNOB_SIMD16_WIDTH) % this->numVerts; + this->needOffsets = true; + return ((simdvertex*)pStreamBase)[vertexIndex * 2 + 1]; + } + +#endif simdmask& GetNextVsIndices() { uint32_t vertexIndex = this->headVertex / KNOB_SIMD_WIDTH; @@ -444,8 +588,24 @@ struct PA_STATE_CUT : public PA_STATE return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); } +#if ENABLE_AVX512_SIMD16 + simdscalari GetPrimID_simd16_lo(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID), this->vPrimId); + } + + simdscalari GetPrimID_simd16_hi(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID + KNOB_SIMD_WIDTH), this->vPrimId); + } + +#endif void Reset() { +#if ENABLE_AVX512_SIMD16 + useAlternateOffset = false; + +#endif this->numRemainingVerts = this->numVertsToAssemble; this->numPrimsAssembled = 0; this->curIndex = 0; @@ -597,6 +757,14 @@ struct PA_STATE_CUT : public PA_STATE return true; } +#if ENABLE_AVX512_SIMD16 + bool Assemble_simd16(uint32_t slot, simd16vector verts[]) + { + SWR_ASSERT(false); + return false; + } + +#endif void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3]) { // move to slot @@ -620,6 +788,13 @@ struct PA_STATE_CUT : public PA_STATE return this->numPrimsAssembled; } +#if ENABLE_AVX512_SIMD16 + uint32_t NumPrims_simd16() + { + return this->numPrimsAssembled; + } + +#endif // Per-topology functions void ProcessVertTriStrip(uint32_t index, bool finish) { @@ -1025,12 +1200,6 @@ struct PA_TESS : PA_STATE -1, -1, -1, -1, -1, -1, -1, -1, 0, 0, 0, 0, 0, 0, 0, 0 }; -#elif KNOB_SIMD_WIDTH == 16 - static const OSALIGNLINE(int32_t) maskGen[KNOB_SIMD_WIDTH * 2] = - { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 - }; #else #error "Help, help, I can't get up!" #endif @@ -1038,6 +1207,21 @@ struct PA_TESS : PA_STATE return _simd_loadu_si((const simdscalari*)&maskGen[KNOB_SIMD_WIDTH - numPrims]); } +#if ENABLE_AVX512_SIMD16 + static simd16scalari GenPrimMask_simd16(uint32_t numPrims) + { + SWR_ASSERT(numPrims <= KNOB_SIMD16_WIDTH); + + static const OSALIGNSIMD16(int32_t) maskGen_16[KNOB_SIMD16_WIDTH * 2] = + { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 + }; + + return _simd16_loadu_si((const simd16scalari*)&maskGen_16[KNOB_SIMD16_WIDTH - numPrims]); + } + +#endif bool Assemble(uint32_t slot, simdvector verts[]) { static_assert(KNOB_SIMD_WIDTH == 8, "Need to revisit this when AVX512 is implemented"); @@ -1072,6 +1256,41 @@ struct PA_TESS : PA_STATE return true; } +#if ENABLE_AVX512_SIMD16 + bool Assemble_simd16(uint32_t slot, simd16vector verts[]) + { + SWR_ASSERT(slot < m_numAttributes); + + uint32_t numPrimsToAssemble = PA_TESS::NumPrims_simd16(); + if (0 == numPrimsToAssemble) + { + return false; + } + + simd16scalari mask = GenPrimMask_simd16(numPrimsToAssemble); + + const float* pBaseAttrib = (const float*)&m_pVertexData[slot * m_attributeStrideInVectors * 4]; + for (uint32_t i = 0; i < m_numVertsPerPrim; ++i) + { + simd16scalari indices = _simd16_load_si((const simd16scalari*)m_ppIndices[i]); + + const float* pBase = pBaseAttrib; + for (uint32_t c = 0; c < 4; ++c) + { + verts[i].v[c] = _simd16_mask_i32gather_ps( + _simd16_setzero_ps(), + pBase, + indices, + mask, + 4 /* gcc doesn't like sizeof(float) */); + pBase += m_attributeStrideInVectors * KNOB_SIMD16_WIDTH; + } + } + + return true; + } + +#endif void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) { SWR_ASSERT(slot < m_numAttributes); @@ -1110,6 +1329,22 @@ struct PA_TESS : PA_STATE return junk; } +#if ENABLE_AVX512_SIMD16 + simdvertex& GetNextVsOutput_simd16_lo() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + static simdvertex junk; + return junk; + } + + simdvertex& GetNextVsOutput_simd16_hi() + { + SWR_ASSERT(0, "%s", __FUNCTION__); + static simdvertex junk; + return junk; + } + +#endif bool GetNextStreamOutput() { SWR_ASSERT(0, "%s", __FUNCTION__); @@ -1128,6 +1363,13 @@ struct PA_TESS : PA_STATE return std::min(m_numPrims, KNOB_SIMD_WIDTH); } +#if ENABLE_AVX512_SIMD16 + uint32_t NumPrims_simd16() + { + return std::min(m_numPrims, KNOB_SIMD16_WIDTH); + } + +#endif void Reset() { SWR_ASSERT(0); }; simdscalari GetPrimID(uint32_t startID) @@ -1135,6 +1377,18 @@ struct PA_TESS : PA_STATE return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); } +#if ENABLE_AVX512_SIMD16 + simdscalari GetPrimID_simd16_lo(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID), m_vPrimId); + } + + simdscalari GetPrimID_simd16_hi(uint32_t startID) + { + return _simd_add_epi32(_simd_set1_epi32(startID + KNOB_SIMD_WIDTH), m_vPrimId); + } + +#endif private: const simdscalar* m_pVertexData = nullptr; uint32_t m_attributeStrideInVectors = 0; diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index a95bbbfbd63..e2ae962b122 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -37,6 +37,11 @@ bool PaTriList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); bool PaTriList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +#if ENABLE_AVX512_SIMD16 +bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); +bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); +bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); +#endif void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); @@ -68,6 +73,11 @@ void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); bool PaRectList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]); +#if ENABLE_AVX512_SIMD16 +bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); +bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); +bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]); +#endif void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]); template @@ -235,9 +245,9 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) #elif KNOB_ARCH >= KNOB_ARCH_AVX2 - simdvector &a = PaGetSimdVector(pa, 0, slot); - simdvector &b = PaGetSimdVector(pa, 1, slot); - simdvector &c = PaGetSimdVector(pa, 2, slot); + const simdvector &a = PaGetSimdVector(pa, 0, slot); + const simdvector &b = PaGetSimdVector(pa, 1, slot); + const simdvector &c = PaGetSimdVector(pa, 2, slot); // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 @@ -251,6 +261,7 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) simdvector &v1 = verts[1]; simdvector &v2 = verts[2]; + // for simd x, y, z, and w for (int i = 0; i < 4; ++i) { v0[i] = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); @@ -269,15 +280,156 @@ bool PaTriList2(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]) return true; } +#if ENABLE_AVX512_SIMD16 +bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) +{ + SetNextPaState_simd16(pa, PaTriList1_simd16, PaTriListSingle0); + return false; // Not enough vertices to assemble 16 triangles +} + +bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) +{ + SetNextPaState_simd16(pa, PaTriList2_simd16, PaTriListSingle0); + return false; // Not enough vertices to assemble 16 triangles +} + +bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) +{ +#if 0 + const simdscalari perm0 = _simd_set_epi32(5, 2, 7, 4, 1, 6, 3, 0); + const simdscalari perm1 = _simd_set_epi32(6, 3, 0, 5, 2, 7, 4, 1); + const simdscalari perm2 = _simd_set_epi32(7, 4, 1, 6, 3, 0, 5, 2); + + simd16vector &v0 = verts[0]; + simd16vector &v1 = verts[1]; + simd16vector &v2 = verts[2]; + + { + const simdvector &a = PaGetSimdVector(pa, 0, slot); + const simdvector &b = PaGetSimdVector(pa, 1, slot); + const simdvector &c = PaGetSimdVector(pa, 2, slot); + + // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 + // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 + // v2 -> a2 a5 b0 b3 b6 c1 c4 c7 + + // for simd x, y, z, and w + for (int i = 0; i < 4; i += 1) + { + v0[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); + v0[i].lo = _mm256_permutevar8x32_ps(v0[i].lo, perm0); + + v1[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49); + v1[i].lo = _mm256_permutevar8x32_ps(v1[i].lo, perm1); + + v2[i].lo = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92); + v2[i].lo = _mm256_permutevar8x32_ps(v2[i].lo, perm2); + } + } + + { + const simdvector &a = PaGetSimdVector(pa, 3, slot); + const simdvector &b = PaGetSimdVector(pa, 4, slot); + const simdvector &c = PaGetSimdVector(pa, 5, slot); + + // v0 -> a0 a3 a6 b1 b4 b7 c2 c5 + // v1 -> a1 a4 a7 b2 b5 c0 c3 c6 + // v2 -> a2 a5 b0 b3 b6 c1 c4 c7 + + // for simd x, y, z, and w + for (int i = 0; i < 4; i += 1) + { + v0[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x92), c[i], 0x24); + v0[i].hi = _mm256_permutevar8x32_ps(v0[i].hi, perm0); + + v1[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x24), c[i], 0x49); + v1[i].hi = _mm256_permutevar8x32_ps(v1[i].hi, perm1); + + v2[i].hi = _simd_blend_ps(_simd_blend_ps(a[i], b[i], 0x49), c[i], 0x92); + v2[i].hi = _mm256_permutevar8x32_ps(v2[i].hi, perm2); + } + } + +#else +#if 1 + const simdvector &a_lo = reinterpret_cast(PaGetSimdVector(pa, 0, slot)); + const simdvector &a_hi = reinterpret_cast(PaGetSimdVector(pa, 1, slot)); + const simdvector &b_lo = reinterpret_cast(PaGetSimdVector(pa, 2, slot)); + const simdvector &b_hi = reinterpret_cast(PaGetSimdVector(pa, 3, slot)); + const simdvector &c_lo = reinterpret_cast(PaGetSimdVector(pa, 4, slot)); + const simdvector &c_hi = reinterpret_cast(PaGetSimdVector(pa, 5, slot)); + + simd16vector a; + simd16vector b; + simd16vector c; + + for (uint32_t i = 0; i < 4; i += 1) + { + a[i].lo = a_lo[i]; + a[i].hi = a_hi[i]; + b[i].lo = b_lo[i]; + b[i].hi = b_hi[i]; + c[i].lo = c_lo[i]; + c[i].hi = c_hi[i]; + } + +#else + const simd16vector &a = reinterpret_cast(PaGetSimdVector(pa, 0 * 2, slot)); + const simd16vector &b = reinterpret_cast(PaGetSimdVector(pa, 1 * 2, slot)); + const simd16vector &c = reinterpret_cast(PaGetSimdVector(pa, 2 * 2, slot)); + +#endif + const simd16scalari perm0 = _simd16_set_epi32(13, 10, 7, 4, 1, 14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0); + const simd16scalari perm1 = _simd16_set_epi32(14, 11, 8, 5, 2, 15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1); + const simd16scalari perm2 = _simd16_set_epi32(15, 12, 9, 6, 3, 0, 13, 10, 7, 4, 1, 14, 11, 8, 5, 2); + + simd16vector &v0 = verts[0]; + simd16vector &v1 = verts[1]; + simd16vector &v2 = verts[2]; + + // v0 -> a0 a3 a6 a9 aC aF b2 b5 b8 bB bE c1 c4 c7 cA cD + // v1 -> a1 a4 a7 aA aD b0 b3 b6 b9 bC bF c2 c5 c8 cB cE + // v2 -> a2 a5 b8 aB aE b1 b4 b7 bA bD c0 c3 c6 c9 cC cF + + // for simd16 x, y, z, and w + for (int i = 0; i < 4; i += 1) + { + v0[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x4924), c[i], 0x2492); + v0[i] = _simd16_permute_ps(v0[i], perm0); + + v1[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x9249), c[i], 0x4924); + v1[i] = _simd16_permute_ps(v1[i], perm1); + + v2[i] = _simd16_blend_ps(_simd16_blend_ps(a[i], b[i], 0x2492), c[i], 0x9249); + v2[i] = _simd16_permute_ps(v2[i], perm2); + } + +#endif + SetNextPaState_simd16(pa, PaTriList0_simd16, PaTriListSingle0, 0, KNOB_SIMD16_WIDTH, true); + return true; +} + +#endif void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]) { // We have 12 simdscalars contained within 3 simdvectors which // hold at least 8 triangles worth of data. We want to assemble a single // triangle with data in horizontal form. +#if ENABLE_AVX512_SIMD16 + const uint32_t i0 = pa.useAlternateOffset ? 3 : 0; + const uint32_t i1 = pa.useAlternateOffset ? 4 : 1; + const uint32_t i2 = pa.useAlternateOffset ? 5 : 2; + + simdvector& a = PaGetSimdVector(pa, i0, slot); + simdvector& b = PaGetSimdVector(pa, i1, slot); + simdvector& c = PaGetSimdVector(pa, i2, slot); + +#else simdvector& a = PaGetSimdVector(pa, 0, slot); simdvector& b = PaGetSimdVector(pa, 1, slot); simdvector& c = PaGetSimdVector(pa, 2, slot); +#endif // Convert from vertical to horizontal. // Tri Pattern - provoking vertex is always v0 // v0 -> 0 3 6 9 12 15 18 21 @@ -940,6 +1092,112 @@ bool PaRectList2( return true; } +#if ENABLE_AVX512_SIMD16 +////////////////////////////////////////////////////////////////////////// +/// @brief State 1 for RECT_LIST topology. +/// There is not enough to assemble 8 triangles. +bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) +{ + SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0); + return false; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief State 1 for RECT_LIST topology. +/// Rect lists has the following format. +/// w x y z +/// v2 o---o v5 o---o v8 o---o v11 o---o +/// | \ | | \ | | \ | | \ | +/// v1 o---o v4 o---o v7 o---o v10 o---o +/// v0 v3 v6 v9 +/// +/// Only 3 vertices of the rectangle are supplied. The 4th vertex is implied. +/// +/// tri0 = { v0, v1, v2 } tri1 = { v0, v2, w } <-- w = v0 - v1 + v2 +/// tri2 = { v3, v4, v5 } tri3 = { v3, v5, x } <-- x = v3 - v4 + v5 +/// etc. +/// +/// PA outputs 3 simdvectors for each of the triangle vertices v0, v1, v2 +/// where v0 contains all the first vertices for 8 triangles. +/// +/// Result: +/// verts[0] = { v0, v0, v3, v3, v6, v6, v9, v9 } +/// verts[1] = { v1, v2, v4, v5, v7, v8, v10, v11 } +/// verts[2] = { v2, w, v5, x, v8, y, v11, z } +/// +/// @param pa - State for PA state machine. +/// @param slot - Index into VS output which is either a position (slot 0) or attribute. +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. +bool PaRectList1_simd16( + PA_STATE_OPT& pa, + uint32_t slot, + simd16vector verts[]) +{ + // SIMD vectors a and b are the last two vertical outputs from the vertex shader. + simdvector& a = PaGetSimdVector(pa, 0, slot); // a[] = { v0, v1, v2, v3, v4, v5, v6, v7 } + simdvector& b = PaGetSimdVector(pa, 1, slot); // b[] = { v8, v9, v10, v11, v12, v13, v14, v15 } + + __m256 tmp0, tmp1, tmp2; + + // Loop over each component in the simdvector. + for (int i = 0; i < 4; i += 1) + { + simd16vector& v0 = verts[0]; // verts[0] needs to be { v0, v0, v3, v3, v6, v6, v9, v9 } + tmp0 = _mm256_permute2f128_ps(b[i], b[i], 0x01); // tmp0 = { v12, v13, v14, v15, v8, v9, v10, v11 } + v0[i].lo = _mm256_blend_ps(a[i], tmp0, 0x20); // v0 = { v0, *, *, v3, *, v9, v6, * } where * is don't care. + tmp1 = _mm256_permute_ps(v0[i].lo, 0xF0); // tmp1 = { v0, v0, v3, v3, *, *, *, * } + v0[i].lo = _mm256_permute_ps(v0[i].lo, 0x5A); // v0 = { *, *, *, *, v6, v6, v9, v9 } + v0[i].lo = _mm256_blend_ps(tmp1, v0[i].lo, 0xF0); // v0 = { v0, v0, v3, v3, v6, v6, v9, v9 } + + /// NOTE This is a bit expensive due to conflicts between vertices in 'a' and 'b'. + /// AVX2 should make this much cheaper. + simd16vector& v1 = verts[1]; // verts[1] needs to be { v1, v2, v4, v5, v7, v8, v10, v11 } + v1[i].lo = _mm256_permute_ps(a[i], 0x09); // v1 = { v1, v2, *, *, *, *, *, * } + tmp1 = _mm256_permute_ps(a[i], 0x43); // tmp1 = { *, *, *, *, v7, *, v4, v5 } + tmp2 = _mm256_blend_ps(v1[i].lo, tmp1, 0xF0); // tmp2 = { v1, v2, *, *, v7, *, v4, v5 } + tmp1 = _mm256_permute2f128_ps(tmp2, tmp2, 0x1); // tmp1 = { v7, *, v4, v5, * *, *, * } + v1[i].lo = _mm256_permute_ps(tmp0, 0xE0); // v1 = { *, *, *, *, *, v8, v10, v11 } + v1[i].lo = _mm256_blend_ps(tmp2, v1[i].lo, 0xE0); // v1 = { v1, v2, *, *, v7, v8, v10, v11 } + v1[i].lo = _mm256_blend_ps(v1[i].lo, tmp1, 0x0C); // v1 = { v1, v2, v4, v5, v7, v8, v10, v11 } + + // verts[2] = { v2, w, v5, x, v8, y, v11, z } + simd16vector& v2 = verts[2]; // verts[2] needs to be { v2, w, v5, x, v8, y, v11, z } + v2[i].lo = _mm256_permute_ps(tmp0, 0x30); // v2 = { *, *, *, *, v8, *, v11, * } + tmp1 = _mm256_permute_ps(tmp2, 0x31); // tmp1 = { v2, *, v5, *, *, *, *, * } + v2[i].lo = _mm256_blend_ps(tmp1, v2[i].lo, 0xF0); + + // Need to compute 4th implied vertex for the rectangle. + tmp2 = _mm256_sub_ps(v0[i].lo, v1[i].lo); + tmp2 = _mm256_add_ps(tmp2, v2[i].lo); // tmp2 = { w, *, x, *, y, *, z, * } + tmp2 = _mm256_permute_ps(tmp2, 0xA0); // tmp2 = { *, w, *, x, *, y, *, z } + v2[i].lo = _mm256_blend_ps(v2[i].lo, tmp2, 0xAA); // v2 = { v2, w, v5, x, v8, y, v11, z } + + v0[i].hi = _simd_setzero_ps(); + v1[i].hi = _simd_setzero_ps(); + v2[i].hi = _simd_setzero_ps(); + } + + SetNextPaState_simd16(pa, PaRectList1_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true); + return true; +} + +////////////////////////////////////////////////////////////////////////// +/// @brief State 2 for RECT_LIST topology. +/// Not implemented unless there is a use case for more then 8 rects. +/// @param pa - State for PA state machine. +/// @param slot - Index into VS output which is either a position (slot 0) or attribute. +/// @param verts - triangle output for binner. SOA - Array of v0 for 8 triangles, followed by v1, etc. +bool PaRectList2_simd16( + PA_STATE_OPT& pa, + uint32_t slot, + simd16vector verts[]) +{ + SWR_ASSERT(0); // Is rect list used for anything other then clears? + SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true); + return true; +} + +#endif ////////////////////////////////////////////////////////////////////////// /// @brief This procedure is called by the Binner to assemble the attributes. /// Unlike position, which is stored vertically, the attributes are @@ -959,8 +1217,15 @@ void PaRectListSingle0( // We have 12 simdscalars contained within 3 simdvectors which // hold at least 8 triangles worth of data. We want to assemble a single // triangle with data in horizontal form. +#if ENABLE_AVX512_SIMD16 + const uint32_t i0 = pa.useAlternateOffset ? 3 : 0; + + simdvector& a = PaGetSimdVector(pa, i0, slot); + +#else simdvector& a = PaGetSimdVector(pa, 0, slot); +#endif // Convert from vertical to horizontal. switch(primIndex) { @@ -993,10 +1258,17 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* this->binTopology = topo == TOP_UNKNOWN ? state.topology : topo; +#if ENABLE_AVX512_SIMD16 + pfnPaFunc_simd16 = nullptr; + +#endif switch (this->binTopology) { case TOP_TRIANGLE_LIST: this->pfnPaFunc = PaTriList0; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaTriList0_simd16; +#endif break; case TOP_TRIANGLE_STRIP: this->pfnPaFunc = PaTriStrip0; @@ -1032,6 +1304,9 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* break; case TOP_RECT_LIST: this->pfnPaFunc = PaRectList0; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFunc_simd16 = PaRectList0_simd16; +#endif this->numPrims = in_numPrims * 2; break; @@ -1138,6 +1413,9 @@ PA_STATE_OPT::PA_STATE_OPT(DRAW_CONTEXT *in_pDC, uint32_t in_numPrims, uint8_t* }; this->pfnPaFuncReset = this->pfnPaFunc; +#if ENABLE_AVX512_SIMD16 + this->pfnPaFuncReset_simd16 = this->pfnPaFunc_simd16; +#endif // simdscalari id8 = _mm256_set_epi32(0, 1, 2, 3, 4, 5, 6, 7); // simdscalari id4 = _mm256_set_epi32(0, 0, 1, 1, 2, 2, 3, 3); -- cgit v1.2.3