From f445b6de9cc416ba3f3a900e98baa57e090c39ed Mon Sep 17 00:00:00 2001 From: Tim Rowley Date: Thu, 16 Feb 2017 10:53:01 -0800 Subject: swr: [rasterizer] Backend code adjustments Reviewed-by: Bruce Cherniak --- .../drivers/swr/rasterizer/common/simdintrin.h | 9 ++ src/gallium/drivers/swr/rasterizer/core/api.cpp | 5 +- .../drivers/swr/rasterizer/core/backend.cpp | 1 + src/gallium/drivers/swr/rasterizer/core/backend.h | 96 ++++++++++++---------- src/gallium/drivers/swr/rasterizer/core/state.h | 4 +- 5 files changed, 70 insertions(+), 45 deletions(-) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index ea79902a002..562408db8db 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -659,6 +659,15 @@ simdscalar vMask(int32_t mask) return _simd_castsi_ps(vec); } +INLINE +simdscalari vMaski(int32_t mask) +{ + __m256i vec = _mm256_set1_epi32(mask); + const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = _simd_and_si(vec, bit); + return _simd_cmplt_epi32(_mm256_setzero_si256(), vec); +} + INLINE void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane) { diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 90503ba6de4..f622e94d91f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -806,7 +806,6 @@ void SetupPipeline(DRAW_CONTEXT *pDC) const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; - SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; // select backend function @@ -817,7 +816,9 @@ void SetupPipeline(DRAW_CONTEXT *pDC) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage] + [centroid][forcedSampleCount][canEarlyZ] + ; } else { diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index b1bcdb0b393..b915e327426 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -39,6 +39,7 @@ typedef void(*PFN_CLEAR_TILES)(DRAW_CONTEXT*, SWR_RENDERTARGET_ATTACHMENT rt, uint32_t, uint32_t, DWORD[4], const SWR_RECT& rect); static PFN_CLEAR_TILES sClearTilesTable[NUM_SWR_FORMATS]; + ////////////////////////////////////////////////////////////////////////// /// @brief Process compute work. /// @param pDC - pointer to draw context (dispatch). diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index c3585cc930c..f022990bf26 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -605,8 +605,10 @@ struct PixelRateZTestLoop if(psState.writesODepth) { - // broadcast and test oDepth(psContext.vZ) written from the PS for each sample - vZ[sample] = psContext.vZ; + { + // broadcast and test oDepth(psContext.vZ) written from the PS for each sample + vZ[sample] = psContext.vZ; + } } else { @@ -713,23 +715,26 @@ INLINE void OutputMerger4x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW uint8_t *pColorSample = pColorBase[rt] + rasterTileColorOffset; const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; - // Blend outputs and update coverage mask for alpha test - if(pfnBlendFunc[rt] != nullptr) { - pfnBlendFunc[rt]( - pBlendState, - psContext.shaded[rt], - psContext.shaded[1], - psContext.shaded[0].w, - sample, - pColorSample, - blendOut, - &psContext.oMask, - (simdscalari*)&coverageMask); + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + psContext.shaded[0].w, + sample, + pColorSample, + blendOut, + &psContext.oMask, + (simdscalari*)&coverageMask); + } } // final write mask @@ -782,9 +787,6 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW simdscalar *pColorSample = reinterpret_cast(pColorBase[rt] + rasterTileColorOffset); const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt]; - // pfnBlendFunc may not update all channels. Initialize with PS output. - /// TODO: move this into the blend JIT. - blendOut = psContext.shaded[rt]; if (colorBufferBit & colorBufferEnableMask) { @@ -794,19 +796,25 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW blendSrc[3] = pColorSample[6]; } - // Blend outputs and update coverage mask for alpha test - if (pfnBlendFunc[rt] != nullptr) { - pfnBlendFunc[rt]( - pBlendState, - psContext.shaded[rt], - psContext.shaded[1], - psContext.shaded[0].w, - sample, - reinterpret_cast(&blendSrc), - blendOut, - &psContext.oMask, - reinterpret_cast(&coverageMask)); + // pfnBlendFunc may not update all channels. Initialize with PS output. + /// TODO: move this into the blend JIT. + blendOut = psContext.shaded[rt]; + + // Blend outputs and update coverage mask for alpha test + if(pfnBlendFunc[rt] != nullptr) + { + pfnBlendFunc[rt]( + pBlendState, + psContext.shaded[rt], + psContext.shaded[1], + psContext.shaded[0].w, + sample, + reinterpret_cast(&blendSrc), + blendOut, + &psContext.oMask, + reinterpret_cast(&coverageMask)); + } } // final write mask @@ -840,6 +848,9 @@ INLINE void OutputMerger8x2(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SW template void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers) { + ///@todo: Need to move locals off stack to prevent __chkstk's from being generated for the backend + + SWR_CONTEXT *pContext = pDC->pContext; AR_BEGIN(BEPixelRateBackend, pDC->drawId); @@ -850,12 +861,12 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t BarycentricCoeffs coeffs; SetupBarycentricCoeffs(&coeffs, work); - uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], *pDepthBuffer, *pStencilBuffer; - SetupRenderBuffers(pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); - SWR_PS_CONTEXT psContext; SetupPixelShaderContext(&psContext, work); + uint8_t *pDepthBuffer, *pStencilBuffer; + SetupRenderBuffers(psContext.pColorBuffer, &pDepthBuffer, &pStencilBuffer, state.psState.numRenderTargets, renderBuffers); + AR_END(BESetup, 0); PixelRateZTestLoop PixelRateZTest(pDC, workerId, work, coeffs, state, pDepthBuffer, pStencilBuffer, state.rastState.clipDistanceMask); @@ -975,10 +986,10 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t // broadcast the results of the PS to all passing pixels #if USE_8x2_TILE_BACKEND - OutputMerger8x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); -#else - OutputMerger4x2(psContext, pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); -#endif + OutputMerger8x2(psContext, psContext.pColorBuffer, sample, &state.blendState,state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets, state.colorHottileEnable, useAlternateOffset); +#else // USE_8x2_TILE_BACKEND + OutputMerger4x2(psContext, psContext.pColorBuffer, sample, &state.blendState, state.pfnBlendFunc, coverageMask, depthMask, state.psState.numRenderTargets); +#endif // USE_8x2_TILE_BACKEND if(!state.psState.forceEarlyZ && !T::bForcedSampleCount) { @@ -1009,13 +1020,13 @@ Endtile: { for (uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) { - pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + psContext.pColorBuffer[rt] += (2 * KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } } #else for(uint32_t rt = 0; rt < state.psState.numRenderTargets; ++rt) { - pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; + psContext.pColorBuffer[rt] += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; } pDepthBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; pStencilBuffer += (KNOB_SIMD_WIDTH * FormatTraits::bpp) / 8; @@ -1035,7 +1046,8 @@ Endtile: } template + uint32_t coverage = 0, uint32_t centroid = 0, uint32_t forced = 0, uint32_t canEarlyZ = 0 + > struct SwrBackendTraits { static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN); diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 05347dce986..bb1336c429f 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -330,6 +330,8 @@ struct SWR_PS_CONTEXT uint32_t rasterizerSampleCount; // IN: sample count used by the rasterizer + uint8_t* pColorBuffer[SWR_NUM_RENDERTARGETS]; + // IN: Pointers to render target hottiles }; ////////////////////////////////////////////////////////////////////////// @@ -511,6 +513,7 @@ struct SWR_SURFACE_STATE uint8_t *pAuxBaseAddress; // Used for compression, append/consume counter, etc. SWR_AUX_MODE auxMode; // @llvm_enum + bool bInterleavedSamples; // are MSAA samples stored interleaved or planar }; @@ -1087,7 +1090,6 @@ struct SWR_PS_STATE uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with uint32_t usesUAV : 1; // pixel shader accesses UAV uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test - }; // depth bounds state -- cgit v1.2.3