diff options
author | Tim Rowley <[email protected]> | 2016-07-28 16:25:09 -0600 |
---|---|---|
committer | Tim Rowley <[email protected]> | 2016-08-04 14:38:35 -0500 |
commit | 68dc54487903024b90177f9e9e372bcdfa1c6a1a (patch) | |
tree | 3072a63cfadd91f694d148a8a08e461ac21d9646 /src/gallium/drivers/swr | |
parent | 4034f48833a0e4a00636b24e4d4e7980202ef4aa (diff) |
swr: [rasterizer core] implement InnerConservative input coverage
Signed-off-by: Tim Rowley <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/api.cpp | 13 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/backend.cpp | 70 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/backend.h | 260 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/conservativeRast.h | 12 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/context.h | 3 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp | 181 |
6 files changed, 357 insertions, 182 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp index 21b9e3f8c7d..00352580ab2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/api.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp @@ -736,9 +736,9 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC) // templated backend function tables extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; -extern PFN_BACKEND_FUNC gBackendSingleSample[2][2][2]; -extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][2][2][2][2]; -extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][2][2][2]; +extern PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT][2][2]; +extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2]; +extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]; void SetupPipeline(DRAW_CONTEXT *pDC) { DRAW_STATE* pState = pDC->pState; @@ -757,7 +757,6 @@ void SetupPipeline(DRAW_CONTEXT *pDC) const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.forcedSampleCount) ? 1 : 0; const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0; const uint32_t canEarlyZ = (psState.forceEarlyZ || (!psState.writesODepth && !psState.usesSourceDepth && !psState.usesUAV)) ? 1 : 0; - const uint32_t inputCoverage = (psState.inputCoverage != SWR_INPUT_COVERAGE_NONE) ? 1 : 0; SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask; @@ -769,20 +768,20 @@ void SetupPipeline(DRAW_CONTEXT *pDC) { // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][inputCoverage][centroid][forcedSampleCount][canEarlyZ]; + backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount][canEarlyZ]; } else { // always need to generate I & J per pixel for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK); - backendFuncs.pfnBackend = gBackendSingleSample[inputCoverage][centroid][canEarlyZ]; + backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid][canEarlyZ]; } break; case SWR_SHADING_RATE_SAMPLE: SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN); // always need to generate I & J per sample for Z interpolation barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK); - backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][inputCoverage][centroid][canEarlyZ]; + backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid][canEarlyZ]; break; default: SWR_ASSERT(0 && "Invalid shading rate"); diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp index b1e6c918715..92634b12f63 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp @@ -492,9 +492,11 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 // pixel center psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx)); - if(T::bInputCoverage) + if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { - generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : + &work.coverageMask[0]; + generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask); } RDTSC_START(BEBarycentric); @@ -593,6 +595,10 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3 Endtile: RDTSC_START(BEEndTile); coverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; @@ -678,9 +684,11 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ CalcPixelBarycentrics(coeffs, psContext); RDTSC_STOP(BEBarycentric, 0, 0); - if(T::bInputCoverage) + if(T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { - generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : + &work.coverageMask[0]; + generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask); } if(T::bCentroidPos) @@ -808,6 +816,10 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_ work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } RDTSC_START(BEEndTile); + if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; @@ -896,9 +908,11 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t CalcPixelBarycentrics(coeffs, psContext); RDTSC_STOP(BEBarycentric, 0, 0); - if (T::bInputCoverage) + if (T::InputCoverage != SWR_INPUT_COVERAGE_NONE) { - generateInputCoverage<T>(&work.coverageMask[0], psContext.inputMask, pBlendState->sampleMask); + const uint64_t* pCoverageMask = (T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) ? &work.innerCoverageMask : + &work.coverageMask[0]; + generateInputCoverage<T, T::InputCoverage>(pCoverageMask, psContext.inputMask, pBlendState->sampleMask); } if(T::bCentroidPos) @@ -1018,6 +1032,10 @@ Endtile: work.coverageMask[sample] >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); } + if(T::InputCoverage == SWR_INPUT_COVERAGE_INNER_CONSERVATIVE) + { + work.innerCoverageMask >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); + } work.anyCoveredSamples >>= (SIMD_TILE_Y_DIM * SIMD_TILE_X_DIM); pDepthBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_DEPTH_HOT_TILE_FORMAT>::bpp) / 8; pStencilBase += (KNOB_SIMD_WIDTH * FormatTraits<KNOB_STENCIL_HOT_TILE_FORMAT>::bpp) / 8; @@ -1143,19 +1161,19 @@ void InitClearTilesTable() } PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_COUNT]; -PFN_BACKEND_FUNC gBackendSingleSample[2] // input coverage +PFN_BACKEND_FUNC gBackendSingleSample[SWR_INPUT_COVERAGE_COUNT] [2] // centroid [2] // canEarlyZ = {}; PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_COUNT] [SWR_MSAA_SAMPLE_PATTERN_COUNT] - [2] // input coverage + [SWR_INPUT_COVERAGE_COUNT] [2] // centroid [2] // forcedSampleCount [2] // canEarlyZ = {}; PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_COUNT] - [2] // input coverage + [SWR_INPUT_COVERAGE_COUNT] [2] // centroid [2] // canEarlyZ = {}; @@ -1197,6 +1215,22 @@ struct BEChooser // Recursively parse args template <typename... TArgsT> + static PFN_BACKEND_FUNC GetFunc(SWR_INPUT_COVERAGE tArg, TArgsT... remainingArgs) + { + switch(tArg) + { + case SWR_INPUT_COVERAGE_NONE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_NORMAL: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NORMAL>::GetFunc(remainingArgs...); break; + case SWR_INPUT_COVERAGE_INNER_CONSERVATIVE: return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>::GetFunc(remainingArgs...); break; + default: + SWR_ASSERT(0 && "Invalid sample pattern\n"); + return BEChooser<ArgsT..., SWR_INPUT_COVERAGE_NONE>::GetFunc(remainingArgs...); + break; + } + } + + // Recursively parse args + template <typename... TArgsT> static PFN_BACKEND_FUNC GetFunc(SWR_MULTISAMPLE_COUNT tArg, TArgsT... remainingArgs) { switch(tArg) @@ -1226,29 +1260,29 @@ struct BEChooser } }; -void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[2][2][2]) +void InitBackendSingleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_INPUT_COVERAGE_COUNT][2][2]) { - for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++) + for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) { for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[inputCoverage][isCentroid][canEarlyZ] = - BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0), + BEChooser<>::GetFunc(SWR_MULTISAMPLE_1X, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage, (isCentroid > 0), false, (canEarlyZ > 0), SWR_BACKEND_SINGLE_SAMPLE); } } } } -void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][2][2][2][2]) +void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_MSAA_SAMPLE_PATTERN_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2][2]) { for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) { for(uint32_t samplePattern = SWR_MSAA_CENTER_PATTERN; samplePattern < SWR_MSAA_SAMPLE_PATTERN_COUNT; samplePattern++) { - for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++) + for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) { for(uint32_t isCentroid = 0; isCentroid < 2; isCentroid++) { @@ -1257,7 +1291,7 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_CO for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[sampleCount][samplePattern][inputCoverage][isCentroid][forcedSampleCount][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (inputCoverage > 0), + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, (SWR_MSAA_SAMPLE_PATTERN)samplePattern, (SWR_INPUT_COVERAGE)inputCoverage, (isCentroid > 0), (forcedSampleCount > 0), (canEarlyZ > 0), SWR_BACKEND_MSAA_PIXEL_RATE); } } @@ -1267,18 +1301,18 @@ void InitBackendPixelFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_CO } } -void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][2][2][2]) +void InitBackendSampleFuncTable(PFN_BACKEND_FUNC (&table)[SWR_MULTISAMPLE_TYPE_COUNT][SWR_INPUT_COVERAGE_COUNT][2][2]) { for(uint32_t sampleCount = SWR_MULTISAMPLE_1X; sampleCount < SWR_MULTISAMPLE_TYPE_COUNT; sampleCount++) { - for(uint32_t inputCoverage = 0; inputCoverage < 2; inputCoverage++) + for(uint32_t inputCoverage = 0; inputCoverage < SWR_INPUT_COVERAGE_COUNT; inputCoverage++) { for(uint32_t centroid = 0; centroid < 2; centroid++) { for(uint32_t canEarlyZ = 0; canEarlyZ < 2; canEarlyZ++) { table[sampleCount][inputCoverage][centroid][canEarlyZ] = - BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (inputCoverage > 0), + BEChooser<>::GetFunc((SWR_MULTISAMPLE_COUNT)sampleCount, SWR_MSAA_STANDARD_PATTERN, (SWR_INPUT_COVERAGE)inputCoverage, (centroid > 0), false, (canEarlyZ > 0), (SWR_BACKEND_FUNCS)SWR_BACKEND_MSAA_SAMPLE_RATE); } } diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 8a289c70265..c8824792891 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -134,154 +134,184 @@ INLINE static uint32_t RasterTileStencilOffset(uint32_t sampleNum) return RasterTileStencilOffsets[sampleNum]; } -template<typename T> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) +template<typename T, uint32_t InputCoverage> +struct generateInputCoverage { - - // will need to update for avx512 - assert(KNOB_SIMD_WIDTH == 8); - - __m256i mask[2]; - __m256i sampleCoverage[2]; - if(T::bIsStandardPattern) + INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) { - __m256i src = _mm256_set1_epi32(0); - __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); - if(T::MultisampleT::numSamples == 1) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); - } - else if(T::MultisampleT::numSamples == 2) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); - } - else if(T::MultisampleT::numSamples == 4) - { - mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); - } - else if(T::MultisampleT::numSamples == 8) + __m256i mask[2]; + __m256i sampleCoverage[2]; + if(T::bIsStandardPattern) { - mask[0] = _mm256_set1_epi32(-1); + __m256i src = _mm256_set1_epi32(0); + __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1; + + if(T::MultisampleT::numSamples == 1) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1); + } + else if(T::MultisampleT::numSamples == 2) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1); + } + else if(T::MultisampleT::numSamples == 4) + { + mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1); + } + else if(T::MultisampleT::numSamples == 8) + { + mask[0] = _mm256_set1_epi32(-1); + } + else if(T::MultisampleT::numSamples == 16) + { + mask[0] = _mm256_set1_epi32(-1); + mask[1] = _mm256_set1_epi32(-1); + index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + } + + // gather coverage for samples 0-7 + sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + if(T::MultisampleT::numSamples > 8) + { + // gather coverage for samples 8-15 + sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + } } - else if(T::MultisampleT::numSamples == 16) + else { - mask[0] = _mm256_set1_epi32(-1); - mask[1] = _mm256_set1_epi32(-1); - index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8); + // center coverage is the same for all samples; just broadcast to the sample slots + uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); + if(T::MultisampleT::numSamples == 1) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + } + else if(T::MultisampleT::numSamples == 2) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + } + else if(T::MultisampleT::numSamples == 4) + { + sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + } + else if(T::MultisampleT::numSamples == 8) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + } + else if(T::MultisampleT::numSamples == 16) + { + sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); + } } - // gather coverage for samples 0-7 - sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8)); + mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); + // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane + __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + + __m256i packedCoverage1; if(T::MultisampleT::numSamples > 8) { - // gather coverage for samples 8-15 - sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8)); + // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane + packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); } - } - else - { - // center coverage is the same for all samples; just broadcast to the sample slots - uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK); - if(T::MultisampleT::numSamples == 1) + + #if (KNOB_ARCH == KNOB_ARCH_AVX) + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); + __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); + + __m256i packedSampleCoverage; + if(T::MultisampleT::numSamples > 8) { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); + shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); + shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); + packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); + packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); } - else if(T::MultisampleT::numSamples == 2) + else { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage); + packedSampleCoverage = packedCoverage0; } - else if(T::MultisampleT::numSamples == 4) + #else + __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); + // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane + packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); + + __m256i packedSampleCoverage; + if(T::MultisampleT::numSamples > 8) { - sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage); + permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); + // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane + packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); + + // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane + packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); } - else if(T::MultisampleT::numSamples == 8) + else { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); + packedSampleCoverage = packedCoverage0; } - else if(T::MultisampleT::numSamples == 16) + #endif + + for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) { - sampleCoverage[0] = _mm256_set1_epi32(centerCoverage); - sampleCoverage[1] = _mm256_set1_epi32(centerCoverage); - } - } + // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 + inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0); - // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane - __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]); + if(!T::bForcedSampleCount) + { + // input coverage has to be anded with sample mask if MSAA isn't forced on + inputMask[i] &= sampleMask; + } - __m256i packedCoverage1; - if(T::MultisampleT::numSamples > 8) - { - // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane - packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]); + // shift to the next pixel in the 4x2 + packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); + } } -#if (KNOB_ARCH == KNOB_ARCH_AVX) - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83); - __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE)); - - __m256i packedSampleCoverage; - if(T::MultisampleT::numSamples > 8) - { - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83); - shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1)); - shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE); - packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01))); - packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC)); - } - else + INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) { - packedSampleCoverage = packedCoverage0; + uint32_t inputMask[KNOB_SIMD_WIDTH]; + generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); + inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); } -#else - __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0); - // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane - packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask); - __m256i packedSampleCoverage; - if(T::MultisampleT::numSamples > 8) - { - permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7); - // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane - packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask); +}; - // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane - packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C); - } - else +template<typename T> +struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE> +{ + INLINE generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) { - packedSampleCoverage = packedCoverage0; + // will need to update for avx512 + assert(KNOB_SIMD_WIDTH == 8); + __m256i vec = _mm256_set1_epi32(coverageMask[0]); + const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01); + vec = _simd_and_si(vec, bit); + vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec); + vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec); + inputCoverage = _simd_castsi_ps(vec); } -#endif - for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--) + INLINE generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask) { - // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2 - inputMask[i] = _simd_movemask_epi8(packedSampleCoverage); - - if(!T::bForcedSampleCount) + unsigned long index; + uint32_t simdCoverage = (coverageMask[0] & MASK); + static const uint32_t FullCoverageMask = (1 << T::MultisampleT::numSamples) - 1; + while(_BitScanForward(&index, simdCoverage)) { - // input coverage has to be anded with sample mask if MSAA isn't forced on - inputMask[i] &= sampleMask; + // set all samples to covered + inputMask[index] = FullCoverageMask; } - - // shift to the next pixel in the 4x2 - packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1); } -} - -template<typename T> -INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask) -{ - uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<T>(coverageMask, inputMask, sampleMask); - inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0])); -} +}; //////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////////// // Centroid behaves exactly as follows : @@ -298,7 +328,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const uint64_t *const cov const simdscalar vXSamplePosUL, const simdscalar vYSamplePosUL) { uint32_t inputMask[KNOB_SIMD_WIDTH]; - generateInputCoverage<T>(coverageMask, inputMask, sampleMask); + generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask); // Case (2) - partially covered pixel @@ -592,7 +622,7 @@ template<uint32_t sampleCountT = SWR_MULTISAMPLE_1X, uint32_t samplePattern = SW struct SwrBackendTraits { static const bool bIsStandardPattern = (samplePattern == SWR_MSAA_STANDARD_PATTERN); - static const bool bInputCoverage = (coverage == 1); + static const uint32_t InputCoverage = coverage; static const bool bCentroidPos = (centroid == 1); static const bool bForcedSampleCount = (forced == 1); static const bool bCanEarlyZ = (canEarlyZ == 1); diff --git a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h index 1bc3938595c..1d8546959f5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h +++ b/src/gallium/drivers/swr/rasterizer/core/conservativeRast.h @@ -131,9 +131,10 @@ typedef ConservativeRastFETraits<ConservativeRastT> FEConservativeRastT; /// default to standard rasterization behavior /// @tparam ConservativeT: type of conservative rasterization /// @tparam InputCoverageT: type of input coverage requested, if any -template <typename ConservativeT, typename InputCoverageT> +template <typename ConservativeT, typename _InputCoverageT> struct ConservativeRastBETraits { typedef std::false_type IsConservativeT; + typedef _InputCoverageT InputCoverageT; typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT; typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; @@ -141,10 +142,11 @@ struct ConservativeRastBETraits { ////////////////////////////////////////////////////////////////////////// /// @brief StandardRastT specialization of ConservativeRastBETraits -template <typename InputCoverageT> -struct ConservativeRastBETraits<StandardRastT, InputCoverageT> +template <typename _InputCoverageT> +struct ConservativeRastBETraits<StandardRastT, _InputCoverageT> { typedef std::false_type IsConservativeT; + typedef _InputCoverageT InputCoverageT; typedef FixedPointTraits<Fixed_16_8> ConservativePrecisionT; typedef std::integral_constant<int32_t, 0> ConservativeEdgeOffsetT; typedef std::integral_constant<int32_t, 0> InnerConservativeEdgeOffsetT; @@ -206,8 +208,8 @@ struct ConservativeRastBETraits<ConservativeRastT, InnerConservativeCoverageT> /// intersects a pixel typedef std::integral_constant<int32_t, (ConservativePrecisionT::ScaleT::value/2) + 1> ConservativeEdgeOffsetT; - /// offset edge towards from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision + /// undo the outer conservative offset and offset edge towards from pixel center by 1/2 pixel + 1/512, in Fixed 16.9 precision /// this allows the rasterizer to do the 3 edge coverage tests against a single point, instead of /// of having to compare individual edges to pixel corners to check if a pixel is fully covered by a triangle - typedef std::integral_constant<int32_t, static_cast<int32_t>(-((ConservativePrecisionT::ScaleT::value/2) + 1))> InnerConservativeEdgeOffsetT; + typedef std::integral_constant<int32_t, static_cast<int32_t>(-((ConservativePrecisionT::ScaleT::value/2) + 1) - ConservativeEdgeOffsetT::value)> InnerConservativeEdgeOffsetT; };
\ No newline at end of file diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index 70472b4bf98..56f97975764 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -83,8 +83,7 @@ struct SWR_TRIANGLE_DESC float *pUserClipBuffer; uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES]; - uint64_t conservativeCoverageMask; - uint64_t innerConservativeCoverageMask; + uint64_t innerCoverageMask; // Conservative rasterization inner coverage: marked covered if entire pixel is covered uint64_t anyCoveredSamples; TRI_FLAGS triFlags; diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp index c5ef072de39..3c5d73466e2 100644 --- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp @@ -291,20 +291,10 @@ constexpr int64_t ManhToEdgePrecisionAdjust() /// the adjustEdgeConservative function. This struct should never /// be instantiated. /// @tparam RT: rasterizer traits -/// @tparam IsConservativeT: is conservative rast enabled? -template <typename RT, typename IsConservativeT> +/// @tparam ConservativeEdgeOffsetT: does the edge need offsetting? +template <typename RT, typename ConservativeEdgeOffsetT> struct adjustEdgeConservative { - INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) = delete; -}; - -////////////////////////////////////////////////////////////////////////// -/// @brief adjustEdgeConservative<RT, std::true_type> specialization -/// of adjustEdgeConservative. Used for conservative rasterization specific -/// edge adjustments -template <typename RT> -struct adjustEdgeConservative<RT, std::true_type> -{ ////////////////////////////////////////////////////////////////////////// /// @brief Performs calculations to adjust each edge of a triangle away /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y @@ -327,12 +317,12 @@ struct adjustEdgeConservative<RT, std::true_type> // 'fixed point' multiply (in double to be avx1 friendly) // need doubles to hold result of a fixed multiply: 16.8 * 16.9 = 32.17, for example __m256d vAai = _mm256_cvtepi32_pd(_mm_abs_epi32(vAi)), vBai = _mm256_cvtepi32_pd(_mm_abs_epi32(vBi)); - __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value)), - _mm256_mul_pd(vBai, _mm256_set1_pd(RT::ConservativeEdgeOffsetT::value))); + __m256d manh = _mm256_add_pd(_mm256_mul_pd(vAai, _mm256_set1_pd(ConservativeEdgeOffsetT::value)), + _mm256_mul_pd(vBai, _mm256_set1_pd(ConservativeEdgeOffsetT::value))); static_assert(RT::PrecisionT::BitsT::value + RT::ConservativePrecisionT::BitsT::value >= RT::EdgePrecisionT::BitsT::value, "Inadequate precision of result of manh calculation "); - + // rasterizer incoming edge precision is x.16, so we need to get our edge offset into the same precision // since we're doing fixed math in double format, multiply by multiples of 1/2 instead of a bit shift right manh = _mm256_mul_pd(manh, _mm256_set1_pd(ManhToEdgePrecisionAdjust<RT>() * 0.5)); @@ -345,14 +335,11 @@ struct adjustEdgeConservative<RT, std::true_type> }; ////////////////////////////////////////////////////////////////////////// -/// @brief adjustEdgeConservative<RT, std::false_type> specialization -/// of adjustEdgeConservative. Allows code to be generically called; when -/// IsConservativeT trait is disabled this inlines an empty function, which -/// should get optimized out. +/// @brief adjustEdgeConservative specialization where no edge offset is needed template <typename RT> -struct adjustEdgeConservative<RT, std::false_type> +struct adjustEdgeConservative<RT, std::integral_constant<int32_t, 0>> { - INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge){}; + INLINE adjustEdgeConservative(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) {}; }; ////////////////////////////////////////////////////////////////////////// @@ -369,7 +356,7 @@ constexpr int64_t ConservativeScissorOffset() } ////////////////////////////////////////////////////////////////////////// -/// @brief Performs calculations to adjust each a scalar edge out +/// @brief Performs calculations to adjust each a vector of evaluated edges out /// from the pixel center by 1/2 pixel + uncertainty region in both the x and y /// direction. template <typename RT> @@ -381,17 +368,46 @@ INLINE void adjustScissorEdge(const double a, const double b, __m256d &vEdge) }; ////////////////////////////////////////////////////////////////////////// +/// @brief Performs calculations to adjust each a scalar evaluated edge out +/// from the pixel center by 1/2 pixel + uncertainty region in both the x and y +/// direction. +template <typename RT, typename OffsetT> +INLINE double adjustScalarEdge(const double a, const double b, const double Edge) +{ + int64_t aabs = std::abs(static_cast<int64_t>(a)), babs = std::abs(static_cast<int64_t>(b)); + int64_t manh = ((aabs * OffsetT::value) + (babs * OffsetT::value)) >> ManhToEdgePrecisionAdjust<RT>(); + return (Edge - manh); +}; + +////////////////////////////////////////////////////////////////////////// /// @brief Perform any needed adjustments to evaluated triangle edges -template <typename RT> -INLINE void adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) +template <typename RT, typename EdgeOffsetT> +struct adjustEdgesFix16 { - static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, - "Edge equation expected to be in x.16 fixed point"); - // need to offset the edge before applying the top-left rule - adjustEdgeConservative<RT, typename RT::IsConservativeT>(vAi, vBi, vEdge); + INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + { + static_assert(std::is_same<typename RT::EdgePrecisionT, FixedPointTraits<Fixed_X_16>>::value, + "Edge equation expected to be in x.16 fixed point"); - adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); -} + static_assert(RT::IsConservativeT::value, "Edge offset assumes conservative rasterization is enabled"); + + // need to apply any edge offsets before applying the top-left rule + adjustEdgeConservative<RT, EdgeOffsetT>(vAi, vBi, vEdge); + + adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Perform top left adjustments to evaluated triangle edges +template <typename RT> +struct adjustEdgesFix16<RT, std::integral_constant<int32_t, 0>> +{ + INLINE adjustEdgesFix16(const __m128i &vAi, const __m128i &vBi, __m256d &vEdge) + { + adjustTopLeftRuleIntFix16(vAi, vBi, vEdge); + } +}; // max(abs(dz/dx), abs(dz,dy) INLINE float ComputeMaxDepthSlope(const SWR_TRIANGLE_DESC* pDesc) @@ -533,7 +549,7 @@ void ComputeEdgeData(const POS& p0, const POS& p1, EDGE& edge) /// corner to sample position, and test for coverage /// @tparam sampleCount: multisample count template <typename NumSamplesT> -INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d (&vEdgeFix16)[7], +INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, int32_t &mask0, int32_t &mask1, int32_t &mask2) { __m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2; @@ -550,7 +566,7 @@ INLINE void UpdateEdgeMasks(const __m256d (&vEdgeTileBbox)[3], const __m256d (&v /// @brief UpdateEdgeMasks<SingleSampleT> specialization, instantiated /// when only rasterizing a single coverage test point template <> -INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d (&vEdgeFix16)[7], +INLINE void UpdateEdgeMasks<SingleSampleT>(const __m256d(&)[3], const __m256d* vEdgeFix16, int32_t &mask0, int32_t &mask1, int32_t &mask2) { mask0 = _mm256_movemask_pd(vEdgeFix16[0]); @@ -722,6 +738,86 @@ INLINE bool TrivialAcceptTest<AllEdgesValidT>(const int mask0, const int mask1, return ((mask0 & mask1 & mask2) == 0xf); }; +////////////////////////////////////////////////////////////////////////// +/// @brief Primary function template for GenerateSVInnerCoverage. Results +/// in an empty function call if SVInnerCoverage isn't requested +template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> +struct GenerateSVInnerCoverage +{ + INLINE GenerateSVInnerCoverage(DRAW_CONTEXT*, EDGE*, double*, uint64_t &){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Specialization of GenerateSVInnerCoverage where all edges +/// are non-degenerate and SVInnerCoverage is requested. Offsets the evaluated +/// edge values from OuterConservative to InnerConservative and rasterizes. +template <typename RT> +struct GenerateSVInnerCoverage<RT, AllEdgesValidT, InnerConservativeCoverageT> +{ + INLINE GenerateSVInnerCoverage(DRAW_CONTEXT* pDC, EDGE* pRastEdges, double* pStartQuadEdges, uint64_t &innerCoverageMask) + { + double startQuadEdgesAdj[RT::NumEdgesT::value]; + for(uint32_t e = 0; e < RT::NumEdgesT::value; ++e) + { + startQuadEdgesAdj[e] = adjustScalarEdge<RT, typename RT::InnerConservativeEdgeOffsetT>(pRastEdges[e].a, pRastEdges[e].b, pStartQuadEdges[e]); + } + + // not trivial accept or reject, must rasterize full tile + RDTSC_START(BERasterizePartial); + innerCoverageMask = rasterizePartialTile<RT::NumEdgesT::value, typename RT::ValidEdgeMaskT>(pDC, startQuadEdgesAdj, pRastEdges); + RDTSC_STOP(BERasterizePartial, 0, 0); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Primary function template for UpdateEdgeMasksInnerConservative. Results +/// in an empty function call if SVInnerCoverage isn't requested +template <typename RT, typename ValidEdgeMaskT, typename InputCoverageT> +struct UpdateEdgeMasksInnerConservative +{ + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d*, + const __m128i, const __m128i, int32_t &, int32_t &, int32_t &){}; +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Specialization of UpdateEdgeMasksInnerConservative where all edges +/// are non-degenerate and SVInnerCoverage is requested. Offsets the edges +/// evaluated at raster tile corners to inner conservative position and +/// updates edge masks +template <typename RT> +struct UpdateEdgeMasksInnerConservative<RT, AllEdgesValidT, InnerConservativeCoverageT> +{ + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&vEdgeTileBbox)[3], const __m256d* vEdgeFix16, + const __m128i vAi, const __m128i vBi, int32_t &mask0, int32_t &mask1, int32_t &mask2) + { + __m256d vTempEdge[3]{vEdgeFix16[0], vEdgeFix16[1], vEdgeFix16[2]}; + + // instead of keeping 2 copies of evaluated edges around, just compensate for the outer + // conservative evaluated edge when adjusting the edge in for inner conservative tests + adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[0]); + adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[1]); + adjustEdgeConservative<RT, typename RT::InnerConservativeEdgeOffsetT>(vAi, vBi, vTempEdge[2]); + + UpdateEdgeMasks<typename RT::NumRasterSamplesT>(vEdgeTileBbox, vTempEdge, mask0, mask1, mask2); + } +}; + +////////////////////////////////////////////////////////////////////////// +/// @brief Specialization of UpdateEdgeMasksInnerConservative where SVInnerCoverage +/// is requested but at least one edge is degenerate. Since a degenerate triangle cannot +/// cover an entire raster tile, set mask0 to 0 to force it down the +/// rastierizePartialTile path +template <typename RT, typename ValidEdgeMaskT> +struct UpdateEdgeMasksInnerConservative<RT, ValidEdgeMaskT, InnerConservativeCoverageT> +{ + INLINE UpdateEdgeMasksInnerConservative(const __m256d (&)[3], const __m256d*, + const __m128i, const __m128i, int32_t &mask0, int32_t &, int32_t &) + { + // set one mask to zero to force the triangle down the rastierizePartialTile path + mask0 = 0; + } +}; + template <typename RT> void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, void* pDesc) { @@ -963,8 +1059,8 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m256d vBiDeltaYFix16 = _mm256_mul_pd(vBipd, vDeltaYpd); __m256d vEdge = _mm256_add_pd(vAiDeltaXFix16, vBiDeltaYFix16); - // apply and edge adjustments(top-left, crast, etc) - adjustEdgesFix16<RT>(vAi, vBi, vEdge); + // apply any edge adjustments(top-left, crast, etc) + adjustEdgesFix16<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdge); // broadcast respective edge results to all lanes double* pEdge = (double*)&vEdge; @@ -1016,6 +1112,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8); __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8); vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16); + + // adjust for msaa tile bbox edges outward for conservative rast, if enabled + adjustEdgeConservative<RT, typename RT::ConservativeEdgeOffsetT>(vAi, vBi, vEdgeTileBbox[e]); } } @@ -1056,11 +1155,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, { // trivial accept mask triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL; + + // Update the raster tile edge masks based on inner conservative edge offsets, if enabled + UpdateEdgeMasksInnerConservative<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT> + (vEdgeTileBbox, vEdgeFix16, vAi, vBi, mask0, mask1, mask2); + if (TrivialAcceptTest<typename RT::ValidEdgeMaskT>(mask0, mask1, mask2)) { - triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; // trivial accept, all 4 corners of all 3 edges are negative // i.e. raster tile completely inside triangle + triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum]; + if(std::is_same<typename RT::InputCoverageT, InnerConservativeCoverageT>::value) + { + triDesc.innerCoverageMask = 0xffffffffffffffffULL; + } RDTSC_EVENT(BETrivialAccept, 1, 0); } else @@ -1104,6 +1212,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile, RDTSC_STOP(BERasterizePartial, 0, 0); triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum]; + + // Output SV InnerCoverage, if needed + GenerateSVInnerCoverage<RT, typename RT::ValidEdgeMaskT, typename RT::InputCoverageT>(pDC, rastEdges, startQuadEdges, triDesc.innerCoverageMask); } } else |