aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/swr
diff options
context:
space:
mode:
authorAlok Hota <[email protected]>2018-06-19 17:22:32 -0500
committerAlok Hota <[email protected]>2019-04-26 13:00:38 -0500
commit0e49963212fb85e4fb83c3d4003907e232f151bd (patch)
treee07bcf3c8853d01730be680e88d8180d2ca504dc /src/gallium/drivers/swr
parent0bf1df2bb6e2311c532734c4cb6a096389e511bf (diff)
swr/rast: AVX512 support compiled in by default
- Emulation of AVX512 built into SIMDLIB - Remove associated macros - Remove knobs controlling AVX512 and let emulation handle it - Refactor variable names for SIMD16 Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr')
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/os.h2
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simd16intrin.h4
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h2
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h2
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_conversion.h287
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_types.h165
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_utils.h392
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/knobs.h17
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/state.h14
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/utils.h2
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/StoreTile.h4
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h2
12 files changed, 333 insertions, 560 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index b00beeb36dd..e812da39851 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -265,9 +265,7 @@ typedef MEGABYTE GIGABYTE[1024];
#define OSALIGNLINE(RWORD) OSALIGN(RWORD, 64)
#define OSALIGNSIMD(RWORD) OSALIGN(RWORD, KNOB_SIMD_BYTES)
-#if ENABLE_AVX512_SIMD16
#define OSALIGNSIMD16(RWORD) OSALIGN(RWORD, KNOB_SIMD16_BYTES)
-#endif
#include "common/swr_assert.h"
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index b08fb2eaaea..5964edff4d3 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -24,8 +24,6 @@
#ifndef __SWR_SIMD16INTRIN_H__
#define __SWR_SIMD16INTRIN_H__
-#if ENABLE_AVX512_SIMD16
-
#if KNOB_SIMD16_WIDTH == 16
typedef SIMD512 SIMD16;
#else
@@ -167,6 +165,4 @@ typedef SIMD512 SIMD16;
#define _simd16_mask2int(mask) int(mask)
#define _simd16_vmask_ps SIMD16::vmask_ps
-#endif // ENABLE_AVX512_SIMD16
-
#endif //__SWR_SIMD16INTRIN_H_
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 8ffda3f8458..5eae34ef4e2 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -341,8 +341,6 @@ static SIMDINLINE simdscalar _simd_abs_ps(simdscalar const& a)
return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
}
-#if ENABLE_AVX512_SIMD16
#include "simd16intrin.h"
-#endif // ENABLE_AVX512_SIMD16
#endif //__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index efbddb01e0e..a8182559791 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -230,7 +230,6 @@ typedef void (*PFN_PROCESS_PRIMS)(DRAW_CONTEXT* pDC,
simdscalari const& viewportIdx,
simdscalari const& rtIdx);
-#if ENABLE_AVX512_SIMD16
// function signature for pipeline stages that execute after primitive assembly
typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
PA_STATE& pa,
@@ -241,7 +240,6 @@ typedef void(SIMDCALL* PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT* pDC,
simd16scalari const& viewportIdx,
simd16scalari const& rtIdx);
-#endif
OSALIGNLINE(struct) API_STATE
{
// Vertex Buffers
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 90bf118727e..247ba0b0dcd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -33,15 +33,17 @@
/// SOA RGBA32_FLOAT format.
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
-template <SWR_FORMAT SrcFormat>
-INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
+template <typename SIMD_T, SWR_FORMAT SrcFormat>
+INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, Vec4<SIMD_T>& dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
(FormatTraits<SrcFormat>::GetBPC(0) == 32))
{
- auto lambda = [&](int comp) {
- simdscalar vComp = _simd_load_ps((const float*)(pSrc + comp * sizeof(simdscalar)));
+ auto lambda = [&](int comp)
+ {
+ Float<SIMD_T> vComp =
+ SIMD_T::load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(Float<SIMD_T>)));
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
};
@@ -50,9 +52,11 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
return;
}
- auto lambda = [&](int comp) {
+ auto lambda = [&](int comp)
+ {
// load SIMD components
- simdscalar vComp = FormatTraits<SrcFormat>::loadSOA(comp, pSrc);
+ Float<SIMD_T> vComp;
+ FormatTraits<SrcFormat>::loadSOA(comp, pSrc, vComp);
// unpack
vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
@@ -60,250 +64,119 @@ INLINE void LoadSOA(const uint8_t* pSrc, simdvector& dst)
// convert
if (FormatTraits<SrcFormat>::isNormalized(comp))
{
- vComp = _simd_cvtepi32_ps(_simd_castps_si(vComp));
- vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
+ vComp = SIMD_T::cvtepi32_ps(SIMD_T::castps_si(vComp));
+ vComp = SIMD_T::mul_ps(vComp, SIMD_T::set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
}
dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
- pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
+ // is there a better way to get this from the SIMD traits?
+ const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
+
+ pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
}
+template <SWR_FORMAT SrcFormat>
+INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simdvector& dst)
+{
+ LoadSOA<SIMD256, SrcFormat>(pSrc, dst);
+}
+
+template <SWR_FORMAT SrcFormat>
+INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
+{
+ LoadSOA<SIMD512, SrcFormat>(pSrc, dst);
+}
+
//////////////////////////////////////////////////////////////////////////
/// @brief Clamps the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
-template <SWR_FORMAT Format>
-INLINE simdscalar Clamp(simdscalar const& vC, uint32_t Component)
+template <typename SIMD_T, SWR_FORMAT Format>
+INLINE Float<SIMD_T> SIMDCALL Clamp(Float<SIMD_T> const& v, uint32_t Component)
{
- simdscalar vComp = vC;
+ Float<SIMD_T> vComp = v;
if (FormatTraits<Format>::isNormalized(Component))
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
{
- vComp = _simd_max_ps(vComp, _simd_setzero_ps());
+ vComp = SIMD_T::max_ps(vComp, SIMD_T::setzero_ps());
}
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
{
- vComp = _simd_max_ps(vComp, _simd_set1_ps(-1.0f));
+ vComp = SIMD_T::max_ps(vComp, SIMD_T::set1_ps(-1.0f));
}
- vComp = _simd_min_ps(vComp, _simd_set1_ps(1.0f));
+ vComp = SIMD_T::min_ps(vComp, SIMD_T::set1_ps(1.0f));
}
else if (FormatTraits<Format>::GetBPC(Component) < 32)
{
if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
{
- int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
- int iMin = 0;
- simdscalari vCompi = _simd_castps_si(vComp);
- vCompi = _simd_max_epu32(vCompi, _simd_set1_epi32(iMin));
- vCompi = _simd_min_epu32(vCompi, _simd_set1_epi32(iMax));
- vComp = _simd_castsi_ps(vCompi);
+ int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
+ int iMin = 0;
+ Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
+ vCompi = SIMD_T::max_epu32(vCompi, SIMD_T::set1_epi32(iMin));
+ vCompi = SIMD_T::min_epu32(vCompi, SIMD_T::set1_epi32(iMax));
+ vComp = SIMD_T::castsi_ps(vCompi);
}
else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
{
- int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
- int iMin = -1 - iMax;
- simdscalari vCompi = _simd_castps_si(vComp);
- vCompi = _simd_max_epi32(vCompi, _simd_set1_epi32(iMin));
- vCompi = _simd_min_epi32(vCompi, _simd_set1_epi32(iMax));
- vComp = _simd_castsi_ps(vCompi);
+ int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
+ int iMin = -1 - iMax;
+ Integer<SIMD_T> vCompi = SIMD_T::castps_si(vComp);
+ vCompi = SIMD_T::max_epi32(vCompi, SIMD_T::set1_epi32(iMin));
+ vCompi = SIMD_T::min_epi32(vCompi, SIMD_T::set1_epi32(iMax));
+ vComp = SIMD_T::castsi_ps(vCompi);
}
}
return vComp;
}
-//////////////////////////////////////////////////////////////////////////
-/// @brief Normalize the given component based on the requirements on the
-/// Format template arg
-/// @param vComp - SIMD vector of floats
-/// @param Component - component
template <SWR_FORMAT Format>
-INLINE simdscalar Normalize(simdscalar const& vC, uint32_t Component)
+INLINE simdscalar SIMDCALL Clamp(simdscalar const& v, uint32_t Component)
{
- simdscalar vComp = vC;
- if (FormatTraits<Format>::isNormalized(Component))
- {
- vComp = _simd_mul_ps(vComp, _simd_set1_ps(FormatTraits<Format>::fromFloat(Component)));
- vComp = _simd_castsi_ps(_simd_cvtps_epi32(vComp));
- }
- return vComp;
-}
-
-//////////////////////////////////////////////////////////////////////////
-/// @brief Convert and store simdvector of pixels in SOA
-/// RGBA32_FLOAT to SOA format
-/// @param src - source data in SOA form
-/// @param dst - output data in SOA form
-template <SWR_FORMAT DstFormat>
-INLINE void StoreSOA(const simdvector& src, uint8_t* pDst)
-{
- // fast path for float32
- if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
- (FormatTraits<DstFormat>::GetBPC(0) == 32))
- {
- for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
- {
- simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
-
- // Gamma-correct
- if (FormatTraits<DstFormat>::isSRGB)
- {
- if (comp < 3) // Input format is always RGBA32_FLOAT.
- {
- vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
- }
- }
-
- _simd_store_ps((float*)(pDst + comp * sizeof(simdscalar)), vComp);
- }
- return;
- }
-
- auto lambda = [&](int comp) {
- simdscalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
-
- // Gamma-correct
- if (FormatTraits<DstFormat>::isSRGB)
- {
- if (comp < 3) // Input format is always RGBA32_FLOAT.
- {
- vComp = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(comp, vComp);
- }
- }
-
- // clamp
- vComp = Clamp<DstFormat>(vComp, comp);
-
- // normalize
- vComp = Normalize<DstFormat>(vComp, comp);
-
- // pack
- vComp = FormatTraits<DstFormat>::pack(comp, vComp);
-
- // store
- FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
-
- pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD_WIDTH) / 8;
- };
-
- UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
+ return Clamp<SIMD256, Format>(v, Component);
}
-#if ENABLE_AVX512_SIMD16
-//////////////////////////////////////////////////////////////////////////
-/// @brief Load SIMD packed pixels in SOA format and converts to
-/// SOA RGBA32_FLOAT format.
-/// @param pSrc - source data in SOA form
-/// @param dst - output data in SOA form
-template <SWR_FORMAT SrcFormat>
-INLINE void SIMDCALL LoadSOA(const uint8_t* pSrc, simd16vector& dst)
+template <SWR_FORMAT Format>
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
{
- // fast path for float32
- if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
- (FormatTraits<SrcFormat>::GetBPC(0) == 32))
- {
- auto lambda = [&](int comp) {
- simd16scalar vComp =
- _simd16_load_ps(reinterpret_cast<const float*>(pSrc + comp * sizeof(simd16scalar)));
-
- dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
- };
-
- UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
- return;
- }
-
- auto lambda = [&](int comp) {
- // load SIMD components
- simd16scalar vComp = FormatTraits<SrcFormat>::loadSOA_16(comp, pSrc);
-
- // unpack
- vComp = FormatTraits<SrcFormat>::unpack(comp, vComp);
-
- // convert
- if (FormatTraits<SrcFormat>::isNormalized(comp))
- {
- vComp = _simd16_cvtepi32_ps(_simd16_castps_si(vComp));
- vComp = _simd16_mul_ps(vComp, _simd16_set1_ps(FormatTraits<SrcFormat>::toFloat(comp)));
- }
-
- dst.v[FormatTraits<SrcFormat>::swizzle(comp)] = vComp;
-
- pSrc += (FormatTraits<SrcFormat>::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8;
- };
-
- UnrollerL<0, FormatTraits<SrcFormat>::numComps, 1>::step(lambda);
+ return Clamp<SIMD512, Format>(v, Component);
}
//////////////////////////////////////////////////////////////////////////
-/// @brief Clamps the given component based on the requirements on the
+/// @brief Normalize the given component based on the requirements on the
/// Format template arg
/// @param vComp - SIMD vector of floats
/// @param Component - component
-template <SWR_FORMAT Format>
-INLINE simd16scalar SIMDCALL Clamp(simd16scalar const& v, uint32_t Component)
+template <typename SIMD_T, SWR_FORMAT Format>
+INLINE Float<SIMD_T> SIMDCALL Normalize(Float<SIMD_T> const& vComp, uint32_t Component)
{
- simd16scalar vComp = v;
+ Float<SIMD_T> r = vComp;
if (FormatTraits<Format>::isNormalized(Component))
{
- if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UNORM)
- {
- vComp = _simd16_max_ps(vComp, _simd16_setzero_ps());
- }
-
- if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SNORM)
- {
- vComp = _simd16_max_ps(vComp, _simd16_set1_ps(-1.0f));
- }
- vComp = _simd16_min_ps(vComp, _simd16_set1_ps(1.0f));
- }
- else if (FormatTraits<Format>::GetBPC(Component) < 32)
- {
- if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_UINT)
- {
- int iMax = (1 << FormatTraits<Format>::GetBPC(Component)) - 1;
- int iMin = 0;
- simd16scalari vCompi = _simd16_castps_si(vComp);
- vCompi = _simd16_max_epu32(vCompi, _simd16_set1_epi32(iMin));
- vCompi = _simd16_min_epu32(vCompi, _simd16_set1_epi32(iMax));
- vComp = _simd16_castsi_ps(vCompi);
- }
- else if (FormatTraits<Format>::GetType(Component) == SWR_TYPE_SINT)
- {
- int iMax = (1 << (FormatTraits<Format>::GetBPC(Component) - 1)) - 1;
- int iMin = -1 - iMax;
- simd16scalari vCompi = _simd16_castps_si(vComp);
- vCompi = _simd16_max_epi32(vCompi, _simd16_set1_epi32(iMin));
- vCompi = _simd16_min_epi32(vCompi, _simd16_set1_epi32(iMax));
- vComp = _simd16_castsi_ps(vCompi);
- }
+ r = SIMD_T::mul_ps(r, SIMD_T::set1_ps(FormatTraits<Format>::fromFloat(Component)));
+ r = SIMD_T::castsi_ps(SIMD_T::cvtps_epi32(r));
}
+ return r;
+}
- return vComp;
+template <SWR_FORMAT Format>
+INLINE simdscalar SIMDCALL Normalize(simdscalar const& vComp, uint32_t Component)
+{
+ return Normalize<SIMD256, Format>(vComp, Component);
}
-//////////////////////////////////////////////////////////////////////////
-/// @brief Normalize the given component based on the requirements on the
-/// Format template arg
-/// @param vComp - SIMD vector of floats
-/// @param Component - component
template <SWR_FORMAT Format>
INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Component)
{
- simd16scalar r = vComp;
- if (FormatTraits<Format>::isNormalized(Component))
- {
- r = _simd16_mul_ps(r, _simd16_set1_ps(FormatTraits<Format>::fromFloat(Component)));
- r = _simd16_castsi_ps(_simd16_cvtps_epi32(r));
- }
- return r;
+ return Normalize<SIMD512, Format>(vComp, Component);
}
//////////////////////////////////////////////////////////////////////////
@@ -311,8 +184,8 @@ INLINE simd16scalar SIMDCALL Normalize(simd16scalar const& vComp, uint32_t Compo
/// RGBA32_FLOAT to SOA format
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
-template <SWR_FORMAT DstFormat>
-INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
+template <typename SIMD_T, SWR_FORMAT DstFormat>
+INLINE void SIMDCALL StoreSOA(const Vec4<SIMD_T>& src, uint8_t* pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) &&
@@ -320,7 +193,7 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
{
for (uint32_t comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
- simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
+ Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
@@ -331,13 +204,13 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
}
}
- _simd16_store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
+ SIMD_T::store_ps(reinterpret_cast<float*>(pDst + comp * sizeof(simd16scalar)), vComp);
}
return;
}
auto lambda = [&](int comp) {
- simd16scalar vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
+ Float<SIMD_T> vComp = src.v[FormatTraits<DstFormat>::swizzle(comp)];
// Gamma-correct
if (FormatTraits<DstFormat>::isSRGB)
@@ -349,10 +222,10 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
}
// clamp
- vComp = Clamp<DstFormat>(vComp, comp);
+ vComp = Clamp<SIMD_T, DstFormat>(vComp, comp);
// normalize
- vComp = Normalize<DstFormat>(vComp, comp);
+ vComp = Normalize<SIMD_T, DstFormat>(vComp, comp);
// pack
vComp = FormatTraits<DstFormat>::pack(comp, vComp);
@@ -360,10 +233,24 @@ INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
// store
FormatTraits<DstFormat>::storeSOA(comp, pDst, vComp);
- pDst += (FormatTraits<DstFormat>::GetBPC(comp) * KNOB_SIMD16_WIDTH) / 8;
+ // is there a better way to get this from the SIMD traits?
+ const uint32_t SIMD_WIDTH = sizeof(typename SIMD_T::Float) / sizeof(float);
+
+ pDst += (FormatTraits<DstFormat>::GetBPC(comp) * SIMD_WIDTH) / 8;
};
UnrollerL<0, FormatTraits<DstFormat>::numComps, 1>::step(lambda);
}
-#endif
+template <SWR_FORMAT DstFormat>
+INLINE void SIMDCALL StoreSOA(const simdvector& src, uint8_t* pDst)
+{
+ StoreSOA<SIMD256, DstFormat>(src, pDst);
+}
+
+template <SWR_FORMAT DstFormat>
+INLINE void SIMDCALL StoreSOA(const simd16vector& src, uint8_t* pDst)
+{
+ StoreSOA<SIMD512, DstFormat>(src, pDst);
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index 518da829d58..7d7dd843349 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -36,17 +36,17 @@
template <uint32_t NumBits, bool Signed = false>
struct PackTraits
{
- static const uint32_t MyNumBits = NumBits;
+ static const uint32_t MyNumBits = NumBits;
+
static simdscalar loadSOA(const uint8_t* pSrc) = delete;
static void storeSOA(uint8_t* pDst, simdscalar const& src) = delete;
static simdscalar unpack(simdscalar& in) = delete;
static simdscalar pack(simdscalar& in) = delete;
-#if ENABLE_AVX512_SIMD16
- static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete;
+
+ static simd16scalar loadSOA_16(const uint8_t* pSrc) = delete;
static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) = delete;
static simd16scalar unpack(simd16scalar& in) = delete;
static simd16scalar pack(simd16scalar& in) = delete;
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -61,12 +61,11 @@ struct PackTraits<0, false>
static void storeSOA(uint8_t* pDst, simdscalar const& src) { return; }
static simdscalar unpack(simdscalar& in) { return _simd_setzero_ps(); }
static simdscalar pack(simdscalar& in) { return _simd_setzero_ps(); }
-#if ENABLE_AVX512_SIMD16
- static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
+
+ static simd16scalar loadSOA_16(const uint8_t* pSrc) { return _simd16_setzero_ps(); }
static void SIMDCALL storeSOA(uint8_t* pDst, simd16scalar const& src) { return; }
static simd16scalar unpack(simd16scalar& in) { return _simd16_setzero_ps(); }
static simd16scalar pack(simd16scalar& in) { return _simd16_setzero_ps(); }
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -131,7 +130,6 @@ struct PackTraits<8, false>
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
@@ -163,40 +161,31 @@ struct PackTraits<8, false>
static simd16scalar pack(simd16scalar& in)
{
+ // clang-format off
+
simd16scalari result = _simd16_setzero_si();
- simdscalari inlo =
- _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
- simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
+ simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
+ simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
- simdscalari permlo =
- _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
- simdscalari permhi =
- _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
+ simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
+ simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
- simdscalari pack = _simd_packus_epi32(
- permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
+ simdscalari pack = _simd_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
const simdscalari zero = _simd_setzero_si();
- permlo = _simd_permute2f128_si(
- pack,
- zero,
- 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
- permhi = _simd_permute2f128_si(
- pack,
- zero,
- 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
+ permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
+ permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
- pack = _simd_packus_epi16(permlo,
- permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00
- // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
+ pack = _simd_packus_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
result = _simd16_insert_si(result, pack, 0);
return _simd16_castsi_ps(result);
+
+ // clang-format on
}
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -262,7 +251,6 @@ struct PackTraits<8, true>
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
@@ -294,40 +282,31 @@ struct PackTraits<8, true>
static simd16scalar pack(simd16scalar& in)
{
+ // clang-format off
+
simd16scalari result = _simd16_setzero_si();
- simdscalari inlo =
- _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
- simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
+ simdscalari inlo = _simd_castps_si(_simd16_extract_ps(in, 0)); // r0 r1 r2 r3 r4 r5 r6 r7 (32b)
+ simdscalari inhi = _simd_castps_si(_simd16_extract_ps(in, 1)); // r8 r9 rA rB rC rD rE rF
- simdscalari permlo =
- _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
- simdscalari permhi =
- _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
+ simdscalari permlo = _simd_permute2f128_si(inlo, inhi, 0x20); // r0 r1 r2 r3 r8 r9 rA rB (32b)
+ simdscalari permhi = _simd_permute2f128_si(inlo, inhi, 0x31); // r4 r5 r6 r7 rC rD rE rF (32b)
- simdscalari pack = _simd_packs_epi32(
- permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
+ simdscalari pack = _simd_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF (16b)
const simdscalari zero = _simd_setzero_si();
- permlo = _simd_permute2f128_si(
- pack,
- zero,
- 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
- permhi = _simd_permute2f128_si(
- pack,
- zero,
- 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
+ permlo = _simd_permute2f128_si(pack, zero, 0x20); // (2, 0) // r0 r1 r2 r3 r4 r5 r6 r7 00 00 00 00 00 00 00 00 (16b)
+ permhi = _simd_permute2f128_si(pack, zero, 0x31); // (3, 1) // r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 (16b)
- pack =
- _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00
- // 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
+ pack = _simd_packs_epi16(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (8b)
result = _simd16_insert_si(result, pack, 0);
return _simd16_castsi_ps(result);
+
+ // clang-format on
}
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -391,7 +370,6 @@ struct PackTraits<16, false>
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
@@ -418,24 +396,19 @@ struct PackTraits<16, false>
static simd16scalar pack(simd16scalar& in)
{
+ // clang-format off
+
const simd16scalari zero = _simd16_setzero_si();
- simd16scalari permlo = _simd16_permute2f128_si(
- _simd16_castps_si(in),
- zero,
- 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
- simd16scalari permhi = _simd16_permute2f128_si(
- _simd16_castps_si(in),
- zero,
- 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
+ simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
+ simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
- simd16scalari result = _simd16_packus_epi32(
- permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00
- // 00 00 00 00 00 00 00 00 00 (16b)
+ simd16scalari result = _simd16_packus_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
return _simd16_castsi_ps(result);
+
+ // clang-format on
}
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -500,7 +473,6 @@ struct PackTraits<16, true>
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
@@ -527,24 +499,19 @@ struct PackTraits<16, true>
static simd16scalar pack(simd16scalar& in)
{
+ // clang-format off
+
const simd16scalari zero = _simd16_setzero_si();
- simd16scalari permlo = _simd16_permute2f128_si(
- _simd16_castps_si(in),
- zero,
- 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
- simd16scalari permhi = _simd16_permute2f128_si(
- _simd16_castps_si(in),
- zero,
- 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
+ simd16scalari permlo = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x08); // (0, 0, 2, 0) // r0 r1 r2 r3 r8 r9 rA rB 00 00 00 00 00 00 00 00 (32b)
+ simd16scalari permhi = _simd16_permute2f128_si(_simd16_castps_si(in), zero, 0x0D); // (0, 0, 3, 1) // r4 r5 r6 r7 rC rD rE rF 00 00 00 00 00 00 00 00
- simd16scalari result = _simd16_packs_epi32(
- permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00
- // 00 00 00 00 00 00 00 00 00 (16b)
+ simd16scalari result = _simd16_packs_epi32(permlo, permhi); // r0 r1 r2 r3 r4 r5 r6 r7 r8 r9 rA rB rC rD rE rF 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 (16b)
return _simd16_castsi_ps(result);
+
+ // clang-format on
}
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -562,7 +529,6 @@ struct PackTraits<32, false>
}
static simdscalar unpack(simdscalar& in) { return in; }
static simdscalar pack(simdscalar& in) { return in; }
-#if ENABLE_AVX512_SIMD16
static simd16scalar loadSOA_16(const uint8_t* pSrc)
{
@@ -577,7 +543,6 @@ struct PackTraits<32, false>
static simd16scalar unpack(simd16scalar& in) { return in; }
static simd16scalar pack(simd16scalar& in) { return in; }
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -957,7 +922,6 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src)
return Result;
}
-#if ENABLE_AVX512_SIMD16
template <unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden>
inline static simd16scalar SIMDCALL fastpow(simd16scalar const& value)
{
@@ -1058,7 +1022,7 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value)
// only native AVX512 can directly use the computed mask for the blend operation
result = _mm512_mask_blend_ps(mask, result2, result);
#else
- result = _simd16_blendv_ps(
+ result = _simd16_blendv_ps(
result2, result, _simd16_cmplt_ps(value, _simd16_set1_ps(0.0031308f)));
#endif
}
@@ -1066,7 +1030,6 @@ static inline simd16scalar ConvertFloatToSRGB2(const simd16scalar& value)
return result;
}
-#endif
//////////////////////////////////////////////////////////////////////////
/// TypeTraits - Format type traits specialization for FLOAT16
//////////////////////////////////////////////////////////////////////////
@@ -1202,7 +1165,6 @@ struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
SWR_NOT_IMPL; // @todo
return _simd_setzero_ps();
}
-#if ENABLE_AVX512_SIMD16
static simd16scalar pack(const simd16scalar& in)
{
@@ -1235,7 +1197,6 @@ struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
SWR_NOT_IMPL; // @todo
return _simd16_setzero_ps();
}
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -1263,10 +1224,8 @@ struct TypeTraits<SWR_TYPE_FLOAT, 32> : PackTraits<32>
#endif
return in;
}
-#if ENABLE_AVX512_SIMD16
static inline simd16scalar convertSrgb(simd16scalar& in) { return ConvertFloatToSRGB2(in); }
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -1467,21 +1426,25 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::fromFloat();
}
- INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
+ INLINE static void loadSOA(uint32_t comp, const uint8_t* pSrc, simdscalar& dst)
{
switch (comp)
{
case 0:
- return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+ dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+ return;
case 1:
- return TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
+ dst = TypeTraits<Y, NumBitsY>::loadSOA(pSrc);
+ return;
case 2:
- return TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
+ dst = TypeTraits<Z, NumBitsZ>::loadSOA(pSrc);
+ return;
case 3:
- return TypeTraits<W, NumBitsW>::loadSOA(pSrc);
+ dst = TypeTraits<W, NumBitsW>::loadSOA(pSrc);
+ return;
}
SWR_INVALID("Invalid component: %d", comp);
- return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
+ dst = TypeTraits<X, NumBitsX>::loadSOA(pSrc);
}
INLINE static void storeSOA(uint32_t comp, uint8_t* pDst, simdscalar const& src)
@@ -1570,23 +1533,26 @@ struct ComponentTraits
SWR_INVALID("Invalid component: %d", comp);
return TypeTraits<X, NumBitsX>::convertSrgb(in);
}
-#if ENABLE_AVX512_SIMD16
- INLINE static simd16scalar loadSOA_16(uint32_t comp, const uint8_t* pSrc)
+ INLINE static void SIMDCALL loadSOA(uint32_t comp, const uint8_t* pSrc, simd16scalar& dst)
{
switch (comp)
{
case 0:
- return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
+ dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
+ return;
case 1:
- return TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
+ dst = TypeTraits<Y, NumBitsY>::loadSOA_16(pSrc);
+ return;
case 2:
- return TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
+ dst = TypeTraits<Z, NumBitsZ>::loadSOA_16(pSrc);
+ return;
case 3:
- return TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
+ dst = TypeTraits<W, NumBitsW>::loadSOA_16(pSrc);
+ return;
}
SWR_INVALID("Invalid component: %d", comp);
- return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
+ dst = TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
}
INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t* pDst, simd16scalar const& src)
@@ -1660,5 +1626,4 @@ struct ComponentTraits
SWR_INVALID("Invalid component: %d", comp);
return TypeTraits<X, NumBitsX>::convertSrgb(in);
}
-#endif
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
index b51755dab50..7c0b62f1910 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
@@ -136,7 +136,6 @@ void vTranspose4x8(simd4scalar (&vDst)[8],
vDst[7] = _simd_extractf128_ps(r02r1xhihi, 1);
}
-#if ENABLE_AVX512_SIMD16
INLINE
void vTranspose4x16(simd16scalar (&dst)[4],
const simd16scalar& src0,
@@ -145,22 +144,9 @@ void vTranspose4x16(simd16scalar (&dst)[4],
const simd16scalar& src3)
{
const simd16scalari perm =
- _simd16_set_epi32(15,
- 11,
- 7,
- 3,
- 14,
- 10,
- 6,
- 2,
- 13,
- 9,
- 5,
- 1,
- 12,
- 8,
- 4,
- 0); // pre-permute input to setup the right order after all the unpacking
+ _simd16_set_epi32(15, 11, 7, 3, 14, 10, 6, 2, 13, 9, 5, 1, 12, 8, 4, 0);
+
+ // pre-permute input to setup the right order after all the unpacking
simd16scalar pre0 = _simd16_permute_ps(src0, perm); // r
simd16scalar pre1 = _simd16_permute_ps(src1, perm); // g
@@ -178,7 +164,6 @@ void vTranspose4x16(simd16scalar (&dst)[4],
dst[3] = _simd16_unpackhi_ps(rbhi, gahi);
}
-#endif
INLINE
void vTranspose8x8(simdscalar (&vDst)[8],
const simdscalar& vMask0,
@@ -253,13 +238,11 @@ struct TransposeSingleComponent
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD16_WIDTH) / 8);
}
-#endif
};
//////////////////////////////////////////////////////////////////////////
@@ -315,34 +298,35 @@ struct Transpose8_8_8_8
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
- simd4scalari src0 =
- SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
- simd4scalari src1 =
- SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
- simd4scalari src2 =
- SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
- simd4scalari src3 =
- SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
+ simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+ simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
+ simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+ simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
simd16scalari cvt2 = _simd16_cvtepu8_epi32(src2);
simd16scalari cvt3 = _simd16_cvtepu8_epi32(src3);
- simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
+ simd16scalari shl1 = _simd16_slli_epi32(cvt1, 8);
simd16scalari shl2 = _simd16_slli_epi32(cvt2, 16);
simd16scalari shl3 = _simd16_slli_epi32(cvt3, 24);
simd16scalari dst = _simd16_or_si(_simd16_or_si(cvt0, shl1), _simd16_or_si(shl2, shl3));
- _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst),
- dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
- }
+ _simd16_store_si(reinterpret_cast<simd16scalari*>(pDst), dst); // rgbargbargbargbargbargbargbargbargbargbargbargbargbargbargbargba
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -355,10 +339,7 @@ struct Transpose8_8_8
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -383,14 +364,14 @@ struct Transpose8_8
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
- simd4scalari src0 =
- SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
- simd4scalari src1 =
- SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
+ simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+ simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari*>(pSrc) + 1); // gggggggggggggggg
simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
@@ -399,10 +380,13 @@ struct Transpose8_8
simdscalari dst = _simd_or_si(cvt0, shl1);
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst),
- dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
- }
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst), dst); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -436,10 +420,12 @@ struct Transpose32_32_32_32
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
@@ -449,12 +435,16 @@ struct Transpose32_32_32_32
vTranspose4x16(dst, src0, src1, src2, src3);
- _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
+ _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
- }
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -487,10 +477,12 @@ struct Transpose32_32_32
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc));
simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16);
simd16scalar src2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 32);
@@ -500,12 +492,16 @@ struct Transpose32_32_32
vTranspose4x16(dst, src0, src1, src2, src3);
- _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
+ _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst[0]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst[1]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 32, dst[2]);
_simd16_store_ps(reinterpret_cast<float*>(pDst) + 48, dst[3]);
- }
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -540,42 +536,32 @@ struct Transpose32_32
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
- simd16scalar src0 =
- _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
- simd16scalar src1 =
- _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
-
- simd16scalar tmp0 =
- _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
- simd16scalar tmp1 =
- _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
-
- simd16scalar per0 = _simd16_permute2f128_ps(
- tmp0,
- tmp1,
- 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
- simd16scalar per1 = _simd16_permute2f128_ps(
- tmp0,
- tmp1,
- 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
-
- simd16scalar dst0 = _simd16_permute2f128_ps(
- per0,
- per0,
- 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
- simd16scalar dst1 = _simd16_permute2f128_ps(
- per1,
- per1,
- 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
-
- _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
- _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
- }
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
+ simd16scalar src0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc)); // rrrrrrrrrrrrrrrr
+ simd16scalar src1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc) + 16); // gggggggggggggggg
+
+ simd16scalar tmp0 = _simd16_unpacklo_ps(src0, src1); // r0 g0 r1 g1 r4 g4 r5 g5 r8 g8 r9 g9 rC gC rD gD
+ simd16scalar tmp1 = _simd16_unpackhi_ps(src0, src1); // r2 g2 r3 g3 r6 g6 r7 g7 rA gA rB gB rE gE rF gF
+
+ simd16scalar per0 = _simd16_permute2f128_ps(tmp0, tmp1, 0x44); // (1, 0, 1, 0) // r0 g0 r1 g1 r4 g4 r5 g5 r2 g2 r3 g3 r6 g6 r7 g7
+ simd16scalar per1 = _simd16_permute2f128_ps(tmp0, tmp1, 0xEE); // (3, 2, 3, 2) // r8 g8 r9 g9 rC gC rD gD rA gA rB gB rE gE rF gF
+
+ simd16scalar dst0 = _simd16_permute2f128_ps(per0, per0, 0xD8); // (3, 1, 2, 0) // r0 g0 r1 g1 r2 g2 r3 g3 r4 g4 r5 g5 r6 g6 r7 g7
+ simd16scalar dst1 = _simd16_permute2f128_ps(per1, per1, 0xD8); // (3, 1, 2, 0) // r8 g8 r9 g9 rA gA rB gB rC gC rD gD rE gE rF gF
+
+ _simd16_store_ps(reinterpret_cast<float*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
+ _simd16_store_ps(reinterpret_cast<float*>(pDst) + 16, dst1); // rgrgrgrgrgrgrgrg
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -616,44 +602,42 @@ struct Transpose16_16_16_16
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
- simdscalari src0 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
- simdscalari src1 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
- simdscalari src2 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
- simdscalari src3 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
-
- simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
- simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
- simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
- simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
-
- simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
- simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
- simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
- simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
-
- simdscalari dst0 = _simd_permute2f128_si(
- tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
- simdscalari dst1 = _simd_permute2f128_si(
- tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
- simdscalari dst2 = _simd_permute2f128_si(
- tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
- simdscalari dst3 = _simd_permute2f128_si(
- tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
-
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
- }
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
+ simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+ simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
+ simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+ simdscalari src3 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 3); // aaaaaaaaaaaaaaaa
+
+ simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+ simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+ simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
+ simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
+
+ simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
+ simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
+ simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
+ simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
+
+ simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
+ simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
+ simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
+ simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
+
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -693,43 +677,42 @@ struct Transpose16_16_16
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
- simdscalari src0 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
- simdscalari src1 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
- simdscalari src2 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
- simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
-
- simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
- simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
- simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
- simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
-
- simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
- simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
- simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
- simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
-
- simdscalari dst0 = _simd_permute2f128_si(
- tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
- simdscalari dst1 = _simd_permute2f128_si(
- tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
- simdscalari dst2 = _simd_permute2f128_si(
- tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
- simdscalari dst3 = _simd_permute2f128_si(
- tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
-
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
- }
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
+
+ simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+ simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
+ simdscalari src2 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+ simdscalari src3 = _simd_setzero_si(); // aaaaaaaaaaaaaaaa
+
+ simdscalari pre0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+ simdscalari pre1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+ simdscalari pre2 = _simd_unpacklo_epi16(src2, src3); // ba0 ba1 ba3 ba3 ba8 ba9 baA baB
+ simdscalari pre3 = _simd_unpackhi_epi16(src2, src3); // ba4 ba5 ba6 ba7 baC baD baE baF
+
+ simdscalari tmp0 = _simd_unpacklo_epi32(pre0, pre2); // rbga0 rbga1 rbga8 rbga9
+ simdscalari tmp1 = _simd_unpackhi_epi32(pre0, pre2); // rbga2 rbga3 rbgaA rbgaB
+ simdscalari tmp2 = _simd_unpacklo_epi32(pre1, pre3); // rbga4 rbga5 rgbaC rbgaD
+ simdscalari tmp3 = _simd_unpackhi_epi32(pre1, pre3); // rbga6 rbga7 rbgaE rbgaF
+
+ simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rbga0 rbga1 rbga2 rbga3
+ simdscalari dst1 = _simd_permute2f128_si(tmp2, tmp3, 0x20); // (2, 0) // rbga4 rbga5 rbga6 rbga7
+ simdscalari dst2 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rbga8 rbga9 rbgaA rbgaB
+ simdscalari dst3 = _simd_permute2f128_si(tmp2, tmp3, 0x31); // (3, 1) // rbgaC rbgaD rbgaE rbgaF
+
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgbargbargbargba
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgbargbargbargba
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 2, dst2); // rgbargbargbargba
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 3, dst3); // rgbargbargbargba
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -761,27 +744,29 @@ struct Transpose16_16
#error Unsupported vector width
#endif
}
-#if ENABLE_AVX512_SIMD16
- INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
+ INLINE static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst)
{
- simdscalari src0 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
- simdscalari src1 =
- _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
+#if KNOB_SIMD16_WIDTH == 16
+ // clang-format off
- simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
- simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
+ simdscalari src0 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc)); // rrrrrrrrrrrrrrrr
+ simdscalari src1 = _simd_load_si(reinterpret_cast<const simdscalari*>(pSrc) + 1); // gggggggggggggggg
- simdscalari dst0 = _simd_permute2f128_si(
- tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
- simdscalari dst1 = _simd_permute2f128_si(
- tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
+ simdscalari tmp0 = _simd_unpacklo_epi16(src0, src1); // rg0 rg1 rg2 rg3 rg8 rg9 rgA rgB
+ simdscalari tmp1 = _simd_unpackhi_epi16(src0, src1); // rg4 rg5 rg6 rg7 rgC rgD rgE rgF
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
- _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
- }
+ simdscalari dst0 = _simd_permute2f128_si(tmp0, tmp1, 0x20); // (2, 0) // rg0 rg1 rg2 rg3 rg4 rg5 rg6 rg7
+ simdscalari dst1 = _simd_permute2f128_si(tmp0, tmp1, 0x31); // (3, 1) // rg8 rg9 rgA rgB rgC rgD rgE rgF
+
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 0, dst0); // rgrgrgrgrgrgrgrg
+ _simd_store_si(reinterpret_cast<simdscalari*>(pDst) + 1, dst1); // rgrgrgrgrgrgrgrg
+
+ // clang-format on
+#else
+#error Unsupported vector width
#endif
+ }
};
//////////////////////////////////////////////////////////////////////////
@@ -794,10 +779,7 @@ struct Transpose24_8
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -810,10 +792,7 @@ struct Transpose32_8_24
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -826,10 +805,7 @@ struct Transpose4_4_4_4
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -842,10 +818,7 @@ struct Transpose5_6_5
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -858,10 +831,7 @@ struct Transpose9_9_9_5
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -874,10 +844,7 @@ struct Transpose5_5_5_1
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -890,6 +857,7 @@ struct Transpose1_5_5_5
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -902,10 +870,7 @@ struct Transpose10_10_10_2
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -918,10 +883,7 @@ struct Transpose11_11_10
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -934,10 +896,7 @@ struct Transpose64
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -950,10 +909,7 @@ struct Transpose64_64
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -966,10 +922,7 @@ struct Transpose64_64_64
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -982,8 +935,5 @@ struct Transpose64_64_64_64
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#if ENABLE_AVX512_SIMD16
-
- static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) = delete;
-#endif
+ static void Transpose_simd16(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h
index 8cccbf416af..92fbf8840e1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h
@@ -53,35 +53,22 @@
#if (KNOB_ARCH == KNOB_ARCH_AVX)
#define KNOB_ARCH_ISA AVX
#define KNOB_ARCH_STR "AVX"
-#define KNOB_SIMD_WIDTH 8
-#define KNOB_SIMD_BYTES 32
#elif (KNOB_ARCH == KNOB_ARCH_AVX2)
#define KNOB_ARCH_ISA AVX2
#define KNOB_ARCH_STR "AVX2"
-#define KNOB_SIMD_WIDTH 8
-#define KNOB_SIMD_BYTES 32
#elif (KNOB_ARCH == KNOB_ARCH_AVX512)
#define KNOB_ARCH_ISA AVX512F
#define KNOB_ARCH_STR "AVX512"
-#define KNOB_SIMD_WIDTH 8
-#define KNOB_SIMD_BYTES 32
#else
#error "Unknown architecture"
#endif
-#if ENABLE_AVX512_SIMD16
+#define KNOB_SIMD_WIDTH 8
+#define KNOB_SIMD_BYTES 32
#define KNOB_SIMD16_WIDTH 16
#define KNOB_SIMD16_BYTES 64
-#if (KNOB_ARCH == KNOB_ARCH_AVX512)
-#define ENABLE_AVX512_EMULATION 0
-#else
-#define ENABLE_AVX512_EMULATION 1
-#endif
-
-#endif
-
#define MAX_KNOB_ARCH_STR_LEN sizeof("AVX512_PLUS_PADDING")
///////////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 04fad69feae..3f8123250c6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -201,14 +201,11 @@ struct simdvertex
simdvector attrib[SWR_VTX_NUM_SLOTS];
};
-#if ENABLE_AVX512_SIMD16
struct simd16vertex
{
simd16vector attrib[SWR_VTX_NUM_SLOTS];
};
-#endif
-
template <typename SIMD_T>
struct SIMDVERTEX_T
{
@@ -429,11 +426,12 @@ struct SWR_CS_CONTEXT
// enums
enum SWR_TILE_MODE
{
- SWR_TILE_NONE = 0x0, // Linear mode (no tiling)
- SWR_TILE_MODE_WMAJOR, // W major tiling
- SWR_TILE_MODE_XMAJOR, // X major tiling
- SWR_TILE_MODE_YMAJOR, // Y major tiling
- SWR_TILE_SWRZ, // SWR-Z tiling
+ SWR_TILE_NONE = 0x0, // Linear mode (no tiling)
+ SWR_TILE_MODE_WMAJOR, // W major tiling
+ SWR_TILE_MODE_XMAJOR, // X major tiling
+ SWR_TILE_MODE_YMAJOR, // Y major tiling
+ SWR_TILE_SWRZ, // SWR-Z tiling
+
SWR_TILE_MODE_COUNT
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index e008cc8d739..9b483776be9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -44,7 +44,6 @@ struct simdBBox
simdscalari xmax;
};
-#if ENABLE_AVX512_SIMD16
struct simd16BBox
{
simd16scalari ymin;
@@ -52,7 +51,6 @@ struct simd16BBox
simd16scalari xmin;
simd16scalari xmax;
};
-#endif
template <typename SIMD_T>
struct SIMDBBOX_T
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index 407cefae54e..02c6df0e075 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -355,7 +355,7 @@ struct ConvertPixelsSOAtoAOS
StoreSOA<DstFormat>(src, soaTile);
// Convert from SOA --> AOS
- FormatTraits<DstFormat>::TransposeT::Transpose_16(soaTile, aosTile);
+ FormatTraits<DstFormat>::TransposeT::Transpose_simd16(soaTile, aosTile);
// Store data into destination
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
@@ -382,7 +382,7 @@ struct ConvertPixelsSOAtoAOS<Format, Format>
OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
// Convert from SOA --> AOS
- FormatTraits<Format>::TransposeT::Transpose_16(pSrc, aosTile);
+ FormatTraits<Format>::TransposeT::Transpose_simd16(pSrc, aosTile);
// Store data into destination
StorePixels<FormatTraits<Format>::bpp, NumDests>::Store(aosTile, ppDsts);
diff --git a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
index abb0c53ec41..cd29550691d 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/TilingFunctions.h
@@ -153,7 +153,6 @@ struct SimdTile <R8_UINT,R8_UINT>
}
};
-#if ENABLE_AVX512_SIMD16
//////////////////////////////////////////////////////////////////////////
/// SimdTile 8x2 for AVX-512
//////////////////////////////////////////////////////////////////////////
@@ -253,7 +252,6 @@ struct SimdTile_16 <R8_UINT, R8_UINT>
}
};
-#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Computes lod offset for 1D surface at specified lod.
/// @param baseWidth - width of basemip (mip 0).