diff options
12 files changed, 333 insertions, 560 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h index b00beeb36dd..e812da39851 100644 --- a/src/gallium/drivers/swr/rasterizer/common/os.h +++ b/src/gallium/drivers/swr/rasterizer/common/os.h @@ -265,9 +265,7 @@ typedef MEGABYTE GIGABYTE[1024]; #define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64) #define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES) -#if ENABLE_AVX512_SIMD16 #define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES) -#endif #include "common/swr_assert.h" diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index b08fb2eaaea..5964edff4d3 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -24,8 +24,6 @@ #ifndef __SWR_SIMD16INTRIN_H__ #define __SWR_SIMD16INTRIN_H__ -#if ENABLE_AVX512_SIMD16 - #if KNOB_SIMD16_WIDTH == 16 typedef SIMD512 SIMD16; #else @@ -167,6 +165,4 @@ typedef SIMD512 SIMD16; #define _simd16_mask2int(mask) int(mask) #define _simd16_vmask_ps SIMD16::vmask_ps -#endif // ENABLE_AVX512_SIMD16 - #endif //__SWR_SIMD16INTRIN_H_ diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 8ffda3f8458..5eae34ef4e2 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -341,8 +341,6 @@ static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a) return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff))); } -#if ENABLE_AVX512_SIMD16 #include "simd16intrin.h" -#endif // ENABLE_AVX512_SIMD16 #endif //__SWR_SIMDINTRIN_H__ diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index efbddb01e0e..a8182559791 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -230,7 +230,6 @@ typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC, simdscalari const& viewportIdx, simdscalari const& rtIdx); -#if ENABLE_AVX512_SIMD16 // function signature for pipeline stages that execute after primitive assembly typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC, PA_STATE& pa, @@ -241,7 +240,6 @@ typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC, simd16scalari const& viewportIdx, simd16scalari const& rtIdx); -#endif OSALIGNLINE(struct) API_STATE { // Vertex Buffers diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h index 90bf118727e..247ba0b0dcd 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h @@ -33,15 +33,17 @@ /// SOA RGBA32_FLOAT format. /// @param pSrc - source data in SOA form /// @param dst - output data in SOA form -template <SWR_FORMAT SrcFormat> -INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) +template <typename SIMD_T, SWR_FORMAT SrcFormat> +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst) { // fast path for float32 if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32)) { - auto lambda = [&](int comp) { - simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp * sizeof(simdscalar))); + auto lambda = [&](int comp) + { + Float<SIMD_T> vComp = + SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>))); dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; }; @@ -50,9 +52,11 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) return; } - auto lambda = [&](int comp) { + auto lambda = [&](int comp) + { // load SIMD components - simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc); + Float<SIMD_T> vComp; + FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp); // unpack vComp = FormatTraits<SrcFormat>::unpack(comp, vComp); @@ -60,250 +64,119 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst) // convert if (FormatTraits<SrcFormat>::isNormalized(comp)) { - vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp)); - vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp))); + vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp)); + vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp))); } dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; - pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; + // is there a better way to get this from the SIMD traits? + const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float); + + pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8; }; UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda); } +template <SWR_FORMAT SrcFormat> +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst) +{ + LoadSOA<SIMD256, SrcFormat>(pSrc, dst); +} + +template <SWR_FORMAT SrcFormat> +INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst) +{ + LoadSOA<SIMD512, SrcFormat>(pSrc, dst); +} + ////////////////////////////////////////////////////////////////////////// /// @brief Clamps the given component based on the requirements on the /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template <SWR_FORMAT Format> -INLINE simdscalar Clamp(simdscalar const& vC, uint32_t Component) +template <typename SIMD_T, SWR_FORMAT Format> +INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component) { - simdscalar vComp = vC; + Float<SIMD_T> vComp = v; if (FormatTraits<Format>::isNormalized(Component)) { if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM) { - vComp = _simd_max_ps(vComp, _simd_setzero_ps()); + vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps()); } if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM) { - vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f)); + vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f)); } - vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f)); + vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f)); } else if (FormatTraits<Format>::GetBPC(Component) < 32) { if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT) { - int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; - int iMin = 0; - simdscalari vCompi = _simd_castps_si(vComp); - vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin)); - vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax)); - vComp = _simd_castsi_ps(vCompi); + int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; + int iMin = 0; + Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp); + vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin)); + vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax)); + vComp = SIMD_T::castsi_ps(vCompi); } else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT) { - int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; - simdscalari vCompi = _simd_castps_si(vComp); - vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin)); - vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax)); - vComp = _simd_castsi_ps(vCompi); + int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; + int iMin = -1 - iMax; + Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp); + vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin)); + vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax)); + vComp = SIMD_T::castsi_ps(vCompi); } } return vComp; } -////////////////////////////////////////////////////////////////////////// -/// @brief Normalize the given component based on the requirements on the -/// Format template arg -/// @param vComp - SIMD vector of floats -/// @param Component - component template <SWR_FORMAT Format> -INLINE simdscalar Normalize(simdscalar const& vC, uint32_t Component) +INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component) { - simdscalar vComp = vC; - if (FormatTraits<Format>::isNormalized(Component)) - { - vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component))); - vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp)); - } - return vComp; -} - -////////////////////////////////////////////////////////////////////////// -/// @brief Convert and store simdvector of pixels in SOA -/// RGBA32_FLOAT to SOA format -/// @param src - source data in SOA form -/// @param dst - output data in SOA form -template <SWR_FORMAT DstFormat> -INLINE void StoreSOA(const simdvector& src, uint8_t* pDst) -{ - // fast path for float32 - if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && - (FormatTraits<DstFormat>::GetBPC(0) == 32)) - { - for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) - { - simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; - - // Gamma-correct - if (FormatTraits<DstFormat>::isSRGB) - { - if (comp < 3) // Input format is always RGBA32_FLOAT. - { - vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); - } - } - - _simd_store_ps((float*)(pDst + comp * sizeof(simdscalar)), vComp); - } - return; - } - - auto lambda = [&](int comp) { - simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; - - // Gamma-correct - if (FormatTraits<DstFormat>::isSRGB) - { - if (comp < 3) // Input format is always RGBA32_FLOAT. - { - vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp); - } - } - - // clamp - vComp = Clamp<DstFormat>(vComp, comp); - - // normalize - vComp = Normalize<DstFormat>(vComp, comp); - - // pack - vComp = FormatTraits<DstFormat>::pack(comp, vComp); - - // store - FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp); - - pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8; - }; - - UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda); + return Clamp<SIMD256, Format>(v, Component); } -#if ENABLE_AVX512_SIMD16 -////////////////////////////////////////////////////////////////////////// -/// @brief Load SIMD packed pixels in SOA format and converts to -/// SOA RGBA32_FLOAT format. -/// @param pSrc - source data in SOA form -/// @param dst - output data in SOA form -template <SWR_FORMAT SrcFormat> -INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst) +template <SWR_FORMAT Format> +INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component) { - // fast path for float32 - if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && - (FormatTraits<SrcFormat>::GetBPC(0) == 32)) - { - auto lambda = [&](int comp) { - simd16scalar vComp = - _simd16_load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(simd16scalar))); - - dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; - }; - - UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda); - return; - } - - auto lambda = [&](int comp) { - // load SIMD components - simd16scalar vComp = FormatTraits<SrcFormat>::loadSOA_16(comp, pSrc); - - // unpack - vComp = FormatTraits<SrcFormat>::unpack(comp, vComp); - - // convert - if (FormatTraits<SrcFormat>::isNormalized(comp)) - { - vComp = _simd16_cvtepi32_ps(_simd16_castps_si(vComp)); - vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<SrcFormat>::toFloat(comp))); - } - - dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp; - - pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8; - }; - - UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda); + return Clamp<SIMD512, Format>(v, Component); } ////////////////////////////////////////////////////////////////////////// -/// @brief Clamps the given component based on the requirements on the +/// @brief Normalize the given component based on the requirements on the /// Format template arg /// @param vComp - SIMD vector of floats /// @param Component - component -template <SWR_FORMAT Format> -INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component) +template <typename SIMD_T, SWR_FORMAT Format> +INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component) { - simd16scalar vComp = v; + Float<SIMD_T> r = vComp; if (FormatTraits<Format>::isNormalized(Component)) { - if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM) - { - vComp = _simd16_max_ps(vComp, _simd16_setzero_ps()); - } - - if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM) - { - vComp = _simd16_max_ps(vComp, _simd16_set1_ps(-1.0f)); - } - vComp = _simd16_min_ps(vComp, _simd16_set1_ps(1.0f)); - } - else if (FormatTraits<Format>::GetBPC(Component) < 32) - { - if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT) - { - int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1; - int iMin = 0; - simd16scalari vCompi = _simd16_castps_si(vComp); - vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin)); - vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax)); - vComp = _simd16_castsi_ps(vCompi); - } - else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT) - { - int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1; - int iMin = -1 - iMax; - simd16scalari vCompi = _simd16_castps_si(vComp); - vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin)); - vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax)); - vComp = _simd16_castsi_ps(vCompi); - } + r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component))); + r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r)); } + return r; +} - return vComp; +template <SWR_FORMAT Format> +INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component) +{ + return Normalize<SIMD256, Format>(vComp, Component); } -////////////////////////////////////////////////////////////////////////// -/// @brief Normalize the given component based on the requirements on the -/// Format template arg -/// @param vComp - SIMD vector of floats -/// @param Component - component template <SWR_FORMAT Format> INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component) { - simd16scalar r = vComp; - if (FormatTraits<Format>::isNormalized(Component)) - { - r = _simd16_mul_ps(r, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component))); - r = _simd16_castsi_ps(_simd16_cvtps_epi32(r)); - } - return r; + return Normalize<SIMD512, Format>(vComp, Component); } ////////////////////////////////////////////////////////////////////////// @@ -311,8 +184,8 @@ INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Compo /// RGBA32_FLOAT to SOA format /// @param src - source data in SOA form /// @param dst - output data in SOA form -template <SWR_FORMAT DstFormat> -INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) +template <typename SIMD_T, SWR_FORMAT DstFormat> +INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst) { // fast path for float32 if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && @@ -320,7 +193,7 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) { for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp) { - simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; + Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; // Gamma-correct if (FormatTraits<DstFormat>::isSRGB) @@ -331,13 +204,13 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) } } - _simd16_store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp); + SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp); } return; } auto lambda = [&](int comp) { - simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; + Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)]; // Gamma-correct if (FormatTraits<DstFormat>::isSRGB) @@ -349,10 +222,10 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) } // clamp - vComp = Clamp<DstFormat>(vComp, comp); + vComp = Clamp<SIMD_T, DstFormat>(vComp, comp); // normalize - vComp = Normalize<DstFormat>(vComp, comp); + vComp = Normalize<SIMD_T, DstFormat>(vComp, comp); // pack vComp = FormatTraits<DstFormat>::pack(comp, vComp); @@ -360,10 +233,24 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) // store FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp); - pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8; + // is there a better way to get this from the SIMD traits? + const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float); + + pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8; }; UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda); } -#endif +template <SWR_FORMAT DstFormat> +INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst) +{ + StoreSOA<SIMD256, DstFormat>(src, pDst); +} + +template <SWR_FORMAT DstFormat> +INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst) +{ + StoreSOA<SIMD512, DstFormat>(src, pDst); +} + diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h index 518da829d58..7d7dd843349 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -36,17 +36,17 @@ template <uint32_t NumBits, bool Signed = false> struct PackTraits { - static const uint32_t MyNumBits = NumBits; + static const uint32_t MyNumBits = NumBits; + static simdscalar loadSOA(const uint8_t* pSrc) = delete; static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete; static simdscalar unpack(simdscalar& in) = delete; static simdscalar pack(simdscalar& in) = delete; -#if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; + + static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete; static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete; static simd16scalar unpack(simd16scalar& in) = delete; static simd16scalar pack(simd16scalar& in) = delete; -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -61,12 +61,11 @@ struct PackTraits<0, false> static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; } static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); } static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); } -#if ENABLE_AVX512_SIMD16 - static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } + + static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); } static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; } static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); } static simd16scalar pack(simd16scalar& in) { return _simd16_setzero_ps(); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -131,7 +130,6 @@ struct PackTraits<8, false> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -163,40 +161,31 @@ struct PackTraits<8, false> static simd16scalar pack(simd16scalar& in) { + // clang-format off + simd16scalari result = _simd16_setzero_si(); - simdscalari inlo = - _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF + simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) + simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - simdscalari permlo = - _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = - _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) + simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) + simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - simdscalari pack = _simd_packus_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) + simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) const simdscalari zero = _simd_setzero_si(); - permlo = _simd_permute2f128_si( - pack, - zero, - 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si( - pack, - zero, - 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) + permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) + permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - pack = _simd_packus_epi16(permlo, - permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 - // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) + pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) result = _simd16_insert_si(result, pack, 0); return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -262,7 +251,6 @@ struct PackTraits<8, true> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -294,40 +282,31 @@ struct PackTraits<8, true> static simd16scalar pack(simd16scalar& in) { + // clang-format off + simd16scalari result = _simd16_setzero_si(); - simdscalari inlo = - _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) - simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF + simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b) + simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF - simdscalari permlo = - _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) - simdscalari permhi = - _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) + simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b) + simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b) - simdscalari pack = _simd_packs_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) + simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b) const simdscalari zero = _simd_setzero_si(); - permlo = _simd_permute2f128_si( - pack, - zero, - 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) - permhi = _simd_permute2f128_si( - pack, - zero, - 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) + permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b) + permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b) - pack = - _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 - // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) + pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b) result = _simd16_insert_si(result, pack, 0); return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -391,7 +370,6 @@ struct PackTraits<16, false> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -418,24 +396,19 @@ struct PackTraits<16, false> static simd16scalar pack(simd16scalar& in) { + // clang-format off + const simd16scalari zero = _simd16_setzero_si(); - simd16scalari permlo = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 + simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) + simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - simd16scalari result = _simd16_packus_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 - // 00 00 00 00 00 00 00 00 00 (16b) + simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -500,7 +473,6 @@ struct PackTraits<16, true> #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -527,24 +499,19 @@ struct PackTraits<16, true> static simd16scalar pack(simd16scalar& in) { + // clang-format off + const simd16scalari zero = _simd16_setzero_si(); - simd16scalari permlo = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) - simd16scalari permhi = _simd16_permute2f128_si( - _simd16_castps_si(in), - zero, - 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 + simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b) + simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00 - simd16scalari result = _simd16_packs_epi32( - permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 - // 00 00 00 00 00 00 00 00 00 (16b) + simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b) return _simd16_castsi_ps(result); + + // clang-format on } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -562,7 +529,6 @@ struct PackTraits<32, false> } static simdscalar unpack(simdscalar& in) { return in; } static simdscalar pack(simdscalar& in) { return in; } -#if ENABLE_AVX512_SIMD16 static simd16scalar loadSOA_16(const uint8_t* pSrc) { @@ -577,7 +543,6 @@ struct PackTraits<32, false> static simd16scalar unpack(simd16scalar& in) { return in; } static simd16scalar pack(simd16scalar& in) { return in; } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -957,7 +922,6 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src) return Result; } -#if ENABLE_AVX512_SIMD16 template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden> inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value) { @@ -1058,7 +1022,7 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) // only native AVX512 can directly use the computed mask for the blend operation result = _mm512_mask_blend_ps(mask, result2, result); #else - result = _simd16_blendv_ps( + result = _simd16_blendv_ps( result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f))); #endif } @@ -1066,7 +1030,6 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value) return result; } -#endif ////////////////////////////////////////////////////////////////////////// /// TypeTraits - Format type traits specialization for FLOAT16 ////////////////////////////////////////////////////////////////////////// @@ -1202,7 +1165,6 @@ struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> SWR_NOT_IMPL; // @todo return _simd_setzero_ps(); } -#if ENABLE_AVX512_SIMD16 static simd16scalar pack(const simd16scalar& in) { @@ -1235,7 +1197,6 @@ struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> SWR_NOT_IMPL; // @todo return _simd16_setzero_ps(); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -1263,10 +1224,8 @@ struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32> #endif return in; } -#if ENABLE_AVX512_SIMD16 static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -1467,21 +1426,25 @@ struct ComponentTraits return TypeTraits<X, NumBitsX>::fromFloat(); } - INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc) + INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst) { switch (comp) { case 0: - return TypeTraits<X, NumBitsX>::loadSOA(pSrc); + dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc); + return; case 1: - return TypeTraits<Y, NumBitsY>::loadSOA(pSrc); + dst = TypeTraits<Y, NumBitsY>::loadSOA(pSrc); + return; case 2: - return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc); + dst = TypeTraits<Z, NumBitsZ>::loadSOA(pSrc); + return; case 3: - return TypeTraits<W, NumBitsW>::loadSOA(pSrc); + dst = TypeTraits<W, NumBitsW>::loadSOA(pSrc); + return; } SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::loadSOA(pSrc); + dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc); } INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src) @@ -1570,23 +1533,26 @@ struct ComponentTraits SWR_INVALID("Invalid component: %d", comp); return TypeTraits<X, NumBitsX>::convertSrgb(in); } -#if ENABLE_AVX512_SIMD16 - INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc) + INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst) { switch (comp) { case 0: - return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); + dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); + return; case 1: - return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc); + dst = TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc); + return; case 2: - return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc); + dst = TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc); + return; case 3: - return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc); + dst = TypeTraits<W, NumBitsW>::loadSOA_16(pSrc); + return; } SWR_INVALID("Invalid component: %d", comp); - return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); + dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc); } INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src) @@ -1660,5 +1626,4 @@ struct ComponentTraits SWR_INVALID("Invalid component: %d", comp); return TypeTraits<X, NumBitsX>::convertSrgb(in); } -#endif }; diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h index b51755dab50..7c0b62f1910 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h @@ -136,7 +136,6 @@ void vTranspose4x8(simd4scalar (&vDst)[8], vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1); } -#if ENABLE_AVX512_SIMD16 INLINE void vTranspose4x16(simd16scalar (&dst)[4], const simd16scalar& src0, @@ -145,22 +144,9 @@ void vTranspose4x16(simd16scalar (&dst)[4], const simd16scalar& src3) { const simd16scalari perm = - _simd16_set_epi32(15, - 11, - 7, - 3, - 14, - 10, - 6, - 2, - 13, - 9, - 5, - 1, - 12, - 8, - 4, - 0); // pre-permute input to setup the right order after all the unpacking + _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0); + + // pre-permute input to setup the right order after all the unpacking simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g @@ -178,7 +164,6 @@ void vTranspose4x16(simd16scalar (&dst)[4], dst[3] = _simd16_unpackhi_ps(rbhi, gahi); } -#endif INLINE void vTranspose8x8(simdscalar (&vDst)[8], const simdscalar& vMask0, @@ -253,13 +238,11 @@ struct TransposeSingleComponent { memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8); } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8); } -#endif }; ////////////////////////////////////////////////////////////////////////// @@ -315,34 +298,35 @@ struct Transpose8_8_8_8 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simd4scalari src0 = - SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = - SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg - simd4scalari src2 = - SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simd4scalari src3 = - SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg + simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb + simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0); simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1); simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2); simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3); - simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); + simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8); simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16); simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24); simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3)); - _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), - dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba - } + _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -355,10 +339,7 @@ struct Transpose8_8_8 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -383,14 +364,14 @@ struct Transpose8_8 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simd4scalari src0 = - SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simd4scalari src1 = - SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg simdscalari cvt0 = _simd_cvtepu8_epi16(src0); simdscalari cvt1 = _simd_cvtepu8_epi16(src1); @@ -399,10 +380,13 @@ struct Transpose8_8 simdscalari dst = _simd_or_si(cvt0, shl1); - _simd_store_si(reinterpret_cast<simdscalari*>(pDst), - dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg - } + _simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -436,10 +420,12 @@ struct Transpose32_32_32_32 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32); @@ -449,12 +435,16 @@ struct Transpose32_32_32_32 vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]); _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]); _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]); - } + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -487,10 +477,12 @@ struct Transpose32_32_32 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32); @@ -500,12 +492,16 @@ struct Transpose32_32_32 vTranspose4x16(dst, src0, src1, src2, src3); - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]); _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]); _simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]); _simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]); - } + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -540,42 +536,32 @@ struct Transpose32_32 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simd16scalar src0 = - _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr - simd16scalar src1 = - _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg - - simd16scalar tmp0 = - _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD - simd16scalar tmp1 = - _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF - - simd16scalar per0 = _simd16_permute2f128_ps( - tmp0, - tmp1, - 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 - simd16scalar per1 = _simd16_permute2f128_ps( - tmp0, - tmp1, - 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF - - simd16scalar dst0 = _simd16_permute2f128_ps( - per0, - per0, - 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 - simd16scalar dst1 = _simd16_permute2f128_ps( - per1, - per1, - 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF - - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg - } +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr + simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg + + simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD + simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF + + simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7 + simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF + + simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7 + simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF + + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -616,44 +602,42 @@ struct Transpose16_16_16_16 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si( - tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si( - tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si( - tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si( - tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba - } +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -693,43 +677,42 @@ struct Transpose16_16_16 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - simdscalari src2 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb - simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa - - simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB - simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF - - simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 - simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB - simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD - simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF - - simdscalari dst0 = _simd_permute2f128_si( - tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 - simdscalari dst1 = _simd_permute2f128_si( - tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 - simdscalari dst2 = _simd_permute2f128_si( - tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB - simdscalari dst3 = _simd_permute2f128_si( - tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF - - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba - } +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off + + simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg + simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb + simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa + + simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB + simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF + + simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9 + simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB + simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD + simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF + + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3 + simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7 + simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB + simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF + + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -761,27 +744,29 @@ struct Transpose16_16 #error Unsupported vector width #endif } -#if ENABLE_AVX512_SIMD16 - INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) + INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) { - simdscalari src0 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr - simdscalari src1 = - _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg +#if KNOB_SIMD16_WIDTH == 16 + // clang-format off - simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB - simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF + simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr + simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg - simdscalari dst0 = _simd_permute2f128_si( - tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 - simdscalari dst1 = _simd_permute2f128_si( - tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB + simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg - _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg - } + simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7 + simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF + + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg + _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg + + // clang-format on +#else +#error Unsupported vector width #endif + } }; ////////////////////////////////////////////////////////////////////////// @@ -794,10 +779,7 @@ struct Transpose24_8 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -810,10 +792,7 @@ struct Transpose32_8_24 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -826,10 +805,7 @@ struct Transpose4_4_4_4 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -842,10 +818,7 @@ struct Transpose5_6_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -858,10 +831,7 @@ struct Transpose9_9_9_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -874,10 +844,7 @@ struct Transpose5_5_5_1 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -890,6 +857,7 @@ struct Transpose1_5_5_5 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -902,10 +870,7 @@ struct Transpose10_10_10_2 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -918,10 +883,7 @@ struct Transpose11_11_10 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -934,10 +896,7 @@ struct Transpose64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -950,10 +909,7 @@ struct Transpose64_64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -966,10 +922,7 @@ struct Transpose64_64_64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; ////////////////////////////////////////////////////////////////////////// @@ -982,8 +935,5 @@ struct Transpose64_64_64_64 /// @param pSrc - source data in SOA form /// @param pDst - output data in AOS form static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete; -#if ENABLE_AVX512_SIMD16 - - static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete; -#endif + static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete; }; diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 8cccbf416af..92fbf8840e1 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -53,35 +53,22 @@ #if (KNOB_ARCH == KNOB_ARCH_AVX) #define KNOB_ARCH_ISA AVX #define KNOB_ARCH_STR "AVX" -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX2) #define KNOB_ARCH_ISA AVX2 #define KNOB_ARCH_STR "AVX2" -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 #elif (KNOB_ARCH == KNOB_ARCH_AVX512) #define KNOB_ARCH_ISA AVX512F #define KNOB_ARCH_STR "AVX512" -#define KNOB_SIMD_WIDTH 8 -#define KNOB_SIMD_BYTES 32 #else #error "Unknown architecture" #endif -#if ENABLE_AVX512_SIMD16 +#define KNOB_SIMD_WIDTH 8 +#define KNOB_SIMD_BYTES 32 #define KNOB_SIMD16_WIDTH 16 #define KNOB_SIMD16_BYTES 64 -#if (KNOB_ARCH == KNOB_ARCH_AVX512) -#define ENABLE_AVX512_EMULATION 0 -#else -#define ENABLE_AVX512_EMULATION 1 -#endif - -#endif - #define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING") /////////////////////////////////////////////////////////////////////////////// diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h index 04fad69feae..3f8123250c6 100644 --- a/src/gallium/drivers/swr/rasterizer/core/state.h +++ b/src/gallium/drivers/swr/rasterizer/core/state.h @@ -201,14 +201,11 @@ struct simdvertex simdvector attrib[SWR_VTX_NUM_SLOTS]; }; -#if ENABLE_AVX512_SIMD16 struct simd16vertex { simd16vector attrib[SWR_VTX_NUM_SLOTS]; }; -#endif - template <typename SIMD_T> struct SIMDVERTEX_T { @@ -429,11 +426,12 @@ struct SWR_CS_CONTEXT // enums enum SWR_TILE_MODE { - SWR_TILE_NONE = 0x0, // Linear mode (no tiling) - SWR_TILE_MODE_WMAJOR, // W major tiling - SWR_TILE_MODE_XMAJOR, // X major tiling - SWR_TILE_MODE_YMAJOR, // Y major tiling - SWR_TILE_SWRZ, // SWR-Z tiling + SWR_TILE_NONE = 0x0, // Linear mode (no tiling) + SWR_TILE_MODE_WMAJOR, // W major tiling + SWR_TILE_MODE_XMAJOR, // X major tiling + SWR_TILE_MODE_YMAJOR, // Y major tiling + SWR_TILE_SWRZ, // SWR-Z tiling + SWR_TILE_MODE_COUNT }; diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index e008cc8d739..9b483776be9 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -44,7 +44,6 @@ struct simdBBox simdscalari xmax; }; -#if ENABLE_AVX512_SIMD16 struct simd16BBox { simd16scalari ymin; @@ -52,7 +51,6 @@ struct simd16BBox simd16scalari xmin; simd16scalari xmax; }; -#endif template <typename SIMD_T> struct SIMDBBOX_T diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index 407cefae54e..02c6df0e075 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -355,7 +355,7 @@ struct ConvertPixelsSOAtoAOS StoreSOA<DstFormat>(src, soaTile); // Convert from SOA --> AOS - FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile); + FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile); // Store data into destination StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); @@ -382,7 +382,7 @@ struct ConvertPixelsSOAtoAOS<Format, Format> OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; // Convert from SOA --> AOS - FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile); + FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile); // Store data into destination StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts); diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h index abb0c53ec41..cd29550691d 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h +++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h @@ -153,7 +153,6 @@ struct SimdTile <R8_UINT,R8_UINT> } }; -#if ENABLE_AVX512_SIMD16 ////////////////////////////////////////////////////////////////////////// /// SimdTile 8x2 for AVX-512 ////////////////////////////////////////////////////////////////////////// @@ -253,7 +252,6 @@ struct SimdTile_16 <R8_UINT, R8_UINT> } }; -#endif ////////////////////////////////////////////////////////////////////////// /// @brief Computes lod offset for 1D surface at specified lod. /// @param baseWidth - width of basemip (mip 0). |