diff options
author | Tim Rowley <[email protected]> | 2016-12-07 17:02:29 -0600 |
---|---|---|
committer | Tim Rowley <[email protected]> | 2016-12-09 10:41:31 -0600 |
commit | 7aea08667c673713e1f419539e788eedeea047cb (patch) | |
tree | 1bcf024f5a2c7a678158a99bf29f1b0ce1dddf8d | |
parent | 53e1c970efec039047527cd06aafd09ecb86babe (diff) |
swr: [rasterizer core/memory] Finish R24_UNORM_X8_TYPELESS for AVX512
This one-off specialization was missed.
Reviewed-by: Bruce Cherniak <[email protected]>
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/backend.h | 4 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/memory/StoreTile.h | 46 |
2 files changed, 24 insertions, 26 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h index 706b372898d..48cfce2bb85 100644 --- a/src/gallium/drivers/swr/rasterizer/core/backend.h +++ b/src/gallium/drivers/swr/rasterizer/core/backend.h @@ -463,7 +463,7 @@ inline void SetupBarycentricCoeffs(BarycentricCoeffs *coeffs, const SWR_TRIANGLE inline void SetupRenderBuffers(uint8_t *pColorBuffer[SWR_NUM_RENDERTARGETS], uint8_t **pDepthBuffer, uint8_t **pStencilBuffer, uint32_t colorBufferCount, RenderOutputBuffers &renderBuffers) { - SWR_ASSERT(colorBufferCount <= SWR_NUM_RENDERTARGETS); + assert(colorBufferCount <= SWR_NUM_RENDERTARGETS); if (pColorBuffer) { @@ -754,8 +754,6 @@ INLINE void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_N INLINE void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_RENDERTARGETS], uint32_t sample, const SWR_BLEND_STATE *pBlendState, const PFN_BLEND_JIT_FUNC(&pfnBlendFunc)[SWR_NUM_RENDERTARGETS], simdscalar &coverageMask, simdscalar depthPassMask, const uint32_t NumRT, bool useAlternateOffset) { - assert(sample == 0); // will need up upate Raster Tile Color Offsets to support more than single sample here.. - // type safety guaranteed from template instantiation in BEChooser<>::GetFunc uint32_t rasterTileColorOffset = RasterTileColorOffset(sample); diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index 558f0489d84..4fa6683e42b 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -545,38 +545,38 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { #if USE_8x2_TILE_BACKEND - static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 4; // 16 pixels * 4 bytes per pixel + simd16scalar comp = _simd16_load_ps(reinterpret_cast<const float *>(pSrc)); - OSALIGNSIMD16(uint8_t) soaTile[MAX_RASTER_TILE_BYTES]; - OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + // clamp + const simd16scalar zero = _simd16_setzero_ps(); + const simd16scalar ones = _simd16_set1_ps(1.0f); - // Convert from SrcFormat --> DstFormat - simd16vector src; - LoadSOA<SrcFormat>(pSrc, src); - StoreSOA<DstFormat>(src, soaTile); + comp = _simd16_max_ps(comp, zero); + comp = _simd16_min_ps(comp, ones); - // Convert from SOA --> AOS - FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile); - - // Store data into destination but don't overwrite the X8 bits - // Each 4-pixel row is 16-bytes + // normalize + comp = _simd16_mul_ps(comp, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); - simdscalari loadlo = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile)); - simdscalari loadhi = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile + sizeof(simdscalari))); + simd16scalari temp = _simd16_cvtps_epi32(comp); - simdscalari templo = _simd_unpacklo_epi64(loadlo, loadhi); - simdscalari temphi = _simd_unpackhi_epi64(loadlo, loadhi); + // swizzle + temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); + // merge/store data into destination but don't overwrite the X8 bits simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0])); simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2])); - simdscalari mask = _simd_set1_epi32(0x00FFFFFF); + simd16scalari dest = _simd16_setzero_si(); + + dest = _simd16_insert_si(dest, destlo, 0); + dest = _simd16_insert_si(dest, desthi, 1); + + simd16scalari mask = _simd16_set1_epi32(0x00FFFFFF); - destlo = _simd_or_si(_simd_andnot_si(mask, destlo), _simd_and_si(mask, templo)); - desthi = _simd_or_si(_simd_andnot_si(mask, desthi), _simd_and_si(mask, templo)); + dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp)); - _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo); - _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi); + _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0)); + _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1)); #else static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel @@ -663,7 +663,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa // SOA to AOS conversion - src1 = _simd16_slli_epi32(src1, 8); + src1 = _simd16_slli_epi32(src1, 8); src2 = _simd16_slli_epi32(src2, 16); src3 = _simd16_slli_epi32(src3, 24); @@ -836,7 +836,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8 simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb // SOA to AOS conversion - src1 = _simd16_slli_epi32(src1, 8); + src1 = _simd16_slli_epi32(src1, 8); src2 = _simd16_slli_epi32(src2, 16); simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F |