swr/rast: Switch intrinsic usage to SIMDLib

Switch from a macro-based simd intrinsics layer to a more C++ implementation, which also adds AVX512 optimizations to 128-bit and 256-bit SIMD. Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
author: Tim Rowley <[email protected]> 2017-06-15 15:24:07 -0500
committer: Tim Rowley <[email protected]> 2017-06-30 13:26:19 -0500
commit: fc4f6c44c479a97b9cad5d08f0d9cd71a8e1e5f8 (patch)
tree: a8ea649f549dc856f402b0b5d9323c5cef080e34 /src/gallium/drivers/swr
parent: 8b66d18a3b4f6d6a4f0ea9d71459dac68e5e0295 (diff)
30 files changed, 6222 insertions, 2679 deletions
diff --git a/src/gallium/drivers/swr/Makefile.sources b/src/gallium/drivers/swr/Makefile.sources
index 12a5e7d137f..3c1118b1ad7 100644
--- a/src/gallium/drivers/swr/Makefile.sources
+++ b/src/gallium/drivers/swr/Makefile.sources
@@ -65,6 +65,19 @@ COMMON_CXX_SOURCES := \
 	rasterizer/common/rdtsc_buckets_shared.h \
 	rasterizer/common/simd16intrin.h \
 	rasterizer/common/simdintrin.h \
+	rasterizer/common/simdlib.hpp \
+	rasterizer/common/simdlib_128_avx.inl \
+	rasterizer/common/simdlib_128_avx2.inl \
+	rasterizer/common/simdlib_128_avx512.inl \
+	rasterizer/common/simdlib_256_avx.inl \
+	rasterizer/common/simdlib_256_avx2.inl \
+	rasterizer/common/simdlib_256_avx512.inl \
+	rasterizer/common/simdlib_512_avx512.inl \
+	rasterizer/common/simdlib_512_avx512_masks.inl \
+	rasterizer/common/simdlib_512_emu.inl \
+	rasterizer/common/simdlib_512_emu_masks.inl \
+	rasterizer/common/simdlib_interface.hpp \
+	rasterizer/common/simdlib_types.hpp \
 	rasterizer/common/swr_assert.cpp \
 	rasterizer/common/swr_assert.h
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h
index f45b2e55880..33d37e3cece 100644
--- a/src/gallium/drivers/swr/rasterizer/common/intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/intrin.h
@@ -26,89 +26,37 @@
 
 #include "os.h"
 
-#include <cassert>
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <xmmintrin.h>
+#define SIMD_ARCH KNOB_ARCH
+#include "simdlib_types.hpp"
+
+typedef SIMDImpl::SIMD128Impl::Float                      simd4scalar;
+typedef SIMDImpl::SIMD128Impl::Double                     simd4scalard;
+typedef SIMDImpl::SIMD128Impl::Integer                    simd4scalari;
+typedef SIMDImpl::SIMD128Impl::Vec4                       simd4vector;
+typedef SIMDImpl::SIMD128Impl::Mask                       simd4mask;
+
+typedef SIMDImpl::SIMD256Impl::Float                      simd8scalar;
+typedef SIMDImpl::SIMD256Impl::Double                     simd8scalard;
+typedef SIMDImpl::SIMD256Impl::Integer                    simd8scalari;
+typedef SIMDImpl::SIMD256Impl::Vec4                       simd8vector;
+typedef SIMDImpl::SIMD256Impl::Mask                       simd8mask;
+
+typedef SIMDImpl::SIMD512Impl::Float                      simd16scalar;
+typedef SIMDImpl::SIMD512Impl::Double                     simd16scalard;
+typedef SIMDImpl::SIMD512Impl::Integer                    simd16scalari;
+typedef SIMDImpl::SIMD512Impl::Vec4                       simd16vector;
+typedef SIMDImpl::SIMD512Impl::Mask                       simd16mask;
 
 #if KNOB_SIMD_WIDTH == 8 
-typedef __m256 simdscalar;
-typedef __m256i simdscalari;
-typedef uint8_t simdmask;
-#else
-#error Unsupported vector width
-#endif
-
-// simd vector
-OSALIGNSIMD(union) simdvector
-{
-    simdscalar  v[4];
-    struct
-    {
-        simdscalar x, y, z, w;
-    };
-
-    simdscalar& operator[] (const int i) { return v[i]; }
-    const simdscalar& operator[] (const int i) const { return v[i]; }
-};
-
-#if ENABLE_AVX512_SIMD16
-
-#if KNOB_SIMD16_WIDTH == 16
-
-#if ENABLE_AVX512_EMULATION
-struct simd16scalar
-{
-    __m256  lo;
-    __m256  hi;
-};
-struct simd16scalard
-{
-    __m256d lo;
-    __m256d hi;
-};
-struct simd16scalari
-{
-    __m256i lo;
-    __m256i hi;
-};
-typedef uint16_t simd16mask;
-
-#else
-typedef __m512 simd16scalar;
-typedef __m512d simd16scalard;
-typedef __m512i simd16scalari;
-typedef __mmask16 simd16mask;
-#endif//ENABLE_AVX512_EMULATION
+typedef simd8scalar     simdscalar;
+typedef simd8scalard    simdscalard;
+typedef simd8scalari    simdscalari;
+typedef simd8vector     simdvector;
+typedef simd8mask       simdmask;
 #else
 #error Unsupported vector width
-#endif//KNOB_SIMD16_WIDTH == 16
-
-#define _simd16_masklo(mask) ((mask) & 0xFF)
-#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
-#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
-
-#if defined(_WIN32)
-#define SIMDAPI __vectorcall
-#else
-#define SIMDAPI
 #endif
 
-OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
-{
-    simd16scalar  v[4];
-    struct
-    {
-        simd16scalar x, y, z, w;
-    };
-
-    simd16scalar& operator[] (const int i) { return v[i]; }
-    const simd16scalar& operator[] (const int i) const { return v[i]; }
-};
-
-#endif // ENABLE_AVX512_SIMD16
-
 INLINE
 UINT pdep_u32(UINT a, UINT mask)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index a822420ae37..29151682e07 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -26,1096 +26,141 @@
 
 #if ENABLE_AVX512_SIMD16
 
-#if ENABLE_AVX512_EMULATION
-
-#define SIMD16_EMU_AVX512_0(type, func, intrin) \
-INLINE type SIMDAPI func()\
-{\
-    type result;\
-\
-    result.lo = intrin();\
-    result.hi = intrin();\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_1(type, func, intrin) \
-INLINE type SIMDAPI func(type a)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo);\
-    result.hi = intrin(a.hi);\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_2(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo, b.lo);\
-    result.hi = intrin(a.hi, b.hi);\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_3(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b, type c)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo, b.lo, c.lo);\
-    result.hi = intrin(a.hi, b.hi, c.hi);\
-\
-    return result;\
-}
-
-SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
-SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
-
-INLINE simd16scalar SIMDAPI _simd16_set1_ps(float a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set1_ps(a);
-    result.hi = _mm256_set1_ps(a);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi8(char a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set1_epi8(a);
-    result.hi = _mm256_set1_epi8(a);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi32(int a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set1_epi32(a);
-    result.hi = _mm256_set1_epi32(a);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load_ps(float const *m)
-{
-    simd16scalar result;
-
-    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
-    result.lo = _mm256_load_ps(m);
-    result.hi = _mm256_load_ps(n);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_loadu_ps(float const *m)
-{
-    simd16scalar result;
-
-    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
-    result.lo = _mm256_loadu_ps(m);
-    result.hi = _mm256_loadu_ps(n);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load1_ps(float const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ss(m);
-    result.hi = _mm256_broadcast_ss(m);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_load_si(simd16scalari const *m)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_load_si256(&m[0].lo);
-    result.hi = _mm256_load_si256(&m[0].hi);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_loadu_si(simd16scalari const *m)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_loadu_si256(&m[0].lo);
-    result.hi = _mm256_loadu_si256(&m[0].hi);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ss(float const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ss(m);
-    result.hi = _mm256_broadcast_ss(m);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ps(__m128 const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ps(m);
-    result.hi = _mm256_broadcast_ps(m);
-
-    return result;
-}
-
-INLINE void SIMDAPI _simd16_store_ps(float *m, simd16scalar a)
-{
-    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
-    _mm256_store_ps(m, a.lo);
-    _mm256_store_ps(n, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
-    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
-    _mm256_maskstore_ps(m, mask.lo, a.lo);
-    _mm256_maskstore_ps(n, mask.hi, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_store_si(simd16scalari *m, simd16scalari a)
-{
-    _mm256_store_si256(&m[0].lo, a.lo);
-    _mm256_store_si256(&m[0].hi, a.hi);
-}
-
-INLINE simdscalar SIMDAPI _simd16_extract_ps(simd16scalar a, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        return a.lo;
-    case 1:
-        return a.hi;
-    }
-    return _simd_set1_ps(0.0f);
-}
-
-INLINE simdscalari SIMDAPI _simd16_extract_si(simd16scalari a, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        return a.lo;
-    case 1:
-        return a.hi;
-    }
-    return _simd_set1_epi32(0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        a.lo = b;
-        break;
-    case 1:
-        a.hi = b;
-        break;
-    }
-    return a;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        a.lo = b;
-        break;
-    case 1:
-        a.hi = b;
-        break;
-    }
-    return a;
-}
-
-template <simd16mask mask>
-INLINE simd16scalar SIMDAPI _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
-    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));
-
-    return result;
-}
-
-#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
-    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
-    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
-
-    return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
-    simdmask mask_lo = _mm256_movemask_ps(a.lo);
-    simdmask mask_hi = _mm256_movemask_ps(a.hi);
-
-    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 8);
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
-    simdmask mask_lo = _mm256_movemask_pd(a.lo);
-    simdmask mask_hi = _mm256_movemask_pd(a.hi);
-
-    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 4);
-}
-
-INLINE uint64_t SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
-    uint32_t mask_lo = _mm256_movemask_epi8(a.lo);
-    uint32_t mask_hi = _mm256_movemask_epi8(a.hi);
-
-    return static_cast<uint64_t>(mask_lo) | (static_cast<uint64_t>(mask_hi) << 32);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtps_epi32(simd16scalar a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_cvtps_epi32(a.lo);
-    result.hi = _mm256_cvtps_epi32(a.hi);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvttps_epi32(simd16scalar a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_cvttps_epi32(a.lo);
-    result.hi = _mm256_cvttps_epi32(a.hi);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_cvtepi32_ps(simd16scalari a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_cvtepi32_ps(a.lo);
-    result.hi = _mm256_cvtepi32_ps(a.hi);
-
-    return result;
-}
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
-    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
-
-    return result;
-}
-
-#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)
-
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
-
-INLINE simd16scalar SIMDAPI _simd16_castsi_ps(simd16scalari a)
-{
-    return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castps_si(simd16scalar a)
-{
-    return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castsi_pd(simd16scalari a)
-{
-    return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castpd_si(simd16scalard a)
-{
-    return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_castpd_ps(simd16scalard a)
-{
-    return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castps_pd(simd16scalar a)
-{
-    return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_round_ps(a.lo, mode);
-    result.hi = _mm256_round_ps(a.hi, mode);
-
-    return result;
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
-    int lo = _simd_testz_ps(a.lo, b.lo);
-    int hi = _simd_testz_ps(a.hi, b.hi);
-
-    return lo & hi;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_slli_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_slli_epi32(a.lo, imm8);
-    result.hi = _simd_slli_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srai_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_srai_epi32(a.lo, imm8);
-    result.hi = _simd_srai_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srli_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_srli_epi32(a.lo, imm8);
-    result.hi = _simd_srli_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
-{
-    simd16scalar result;
-
-    result.lo = _simd_i32gather_ps(m, index.lo, scale);
-    result.hi = _simd_i32gather_ps(m, index.hi, scale);
-
-    return result;
-}
-
-#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
-    simd16scalar result;
-
-    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
-    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);
-
-    return result;
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
-SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
-
-INLINE simd16scalar SIMDAPI _simd16_permute_ps(simd16scalar a, simd16scalari i)
-{
-    simd16scalar result;
-
-    const simdscalari mask = _simd_set1_epi32(7);
-
-    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
-    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));
-
-    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
-    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));
-
-    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
-    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_permute_epi32(simd16scalari a, simd16scalari i)
-{
-    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
-{
-    simd16scalard result;
-
-    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
-{
-    simd16scalari result;
-
-    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
-    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
-{
-    simd16scalard result;
-
-    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
-    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));
-
-    return result;
-}
-
-#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi16(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi32(__m128i a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu8_epi32(a);
-    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi32(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi64(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu16_epi64(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu16_epi64(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu32_epi64(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu32_epi64(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu32_epi64(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
-    return mask;
-}
-
-INLINE int SIMDAPI SIMDAPI _simd16_mask2int(simd16mask mask)
-{
-    return mask;
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
-    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
-    simd16scalari temp = _simd16_set1_epi32(mask);
-
-    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
-    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
-    return _simd16_castsi_ps(result);
-}
-
+#if KNOB_SIMD16_WIDTH == 16
+typedef SIMD512                             SIMD16;
 #else
-
-INLINE simd16mask SIMDAPI _simd16_scalari2mask(simd16scalari mask)
-{
-    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
-}
-
-INLINE simd16mask SIMDAPI _simd16_scalard2mask(simd16scalard mask)
-{
-    return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
-}
-
-#define _simd16_setzero_ps      _mm512_setzero_ps
-#define _simd16_setzero_si      _mm512_setzero_si512
-#define _simd16_set1_ps         _mm512_set1_ps
-#define _simd16_set1_epi8       _mm512_set1_epi8
-#define _simd16_set1_epi32      _mm512_set1_epi32
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-#define _simd16_load_ps         _mm512_load_ps
-#define _simd16_loadu_ps        _mm512_loadu_ps
-#if 1
-#define _simd16_load1_ps        _simd16_broadcast_ss
-#endif
-#define _simd16_load_si         _mm512_load_si512
-#define _simd16_loadu_si        _mm512_loadu_si512
-#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
-#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
-#define _simd16_store_ps        _mm512_store_ps
-#define _simd16_store_si        _mm512_store_si512
-#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
-#define _simd16_extract_si      _mm512_extracti64x4_epi64
-#define _simd16_insert_ps(a, b, imm8)  _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
-#define _simd16_insert_si       _mm512_inserti64x4
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
-    simd16mask k = _simd16_scalari2mask(mask);
-
-    _mm512_mask_store_ps(m, k, a);
-}
-
-#define _simd16_blend_ps(a, b, mask)    _mm512_mask_blend_ps(mask, a, b)
-
-INLINE simd16scalar SIMDAPI _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
-{
-    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
-    return _mm512_mask_blend_ps(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
-    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
-    return _mm512_mask_blend_epi32(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
-    simd16mask k = _simd16_scalari2mask(mask);
-
-    return _mm512_mask_blend_epi32(k, a, b);
-}
-
-#define _simd16_mul_ps          _mm512_mul_ps
-#define _simd16_div_ps          _mm512_div_ps
-#define _simd16_add_ps          _mm512_add_ps
-#define _simd16_sub_ps          _mm512_sub_ps
-#define _simd16_rsqrt_ps        _mm512_rsqrt14_ps
-#define _simd16_min_ps          _mm512_min_ps
-#define _simd16_max_ps          _mm512_max_ps
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
-    // movemask_ps only checks the top bit of the float single elements
-    return  _simd16_scalari2mask(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x80000000)));
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
-    // movemask_pd only checks the top bit of the float double elements
-    return  _simd16_scalard2mask(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x8000000000000000))));
-}
-
-#if 0
-INLINE int SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
-    return  _simd16_scalar2mask(a);
-}
-#endif
-
-#define _simd16_cvtps_epi32     _mm512_cvtps_epi32
-#define _simd16_cvttps_epi32    _mm512_cvttps_epi32
-#define _simd16_cvtepi32_ps     _mm512_cvtepi32_ps
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);
-
-    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
-}
-
-#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-#define _simd16_castsi_ps           _mm512_castsi512_ps
-#define _simd16_castps_si           _mm512_castps_si512
-#define _simd16_castsi_pd           _mm512_castsi512_pd
-#define _simd16_castpd_si           _mm512_castpd_si512
-#define _simd16_castpd_ps           _mm512_castpd_ps
-#define _simd16_castps_pd           _mm512_castps_pd
-
-// _mm512_and_ps (and other bitwise operations) exist in AVX512DQ,
-// while the functionally equivalent _mm512_and_epi32 is in AVX512F.
-// Define the _simd16_*_ps versions in terms of AVX512F for broader
-// support.
-#define _simd16_logicop_ps(a, b, op) _simd16_castsi_ps(op##_epi32(_simd16_castps_si(a), _simd16_castps_si(b)))
-
-#define _simd16_and_ps(a, b)        _simd16_logicop_ps(a, b, _mm512_and)
-#define _simd16_andnot_ps(a, b)     _simd16_logicop_ps(a, b, _mm512_andnot)
-#define _simd16_or_ps(a, b)         _simd16_logicop_ps(a, b, _mm512_or)
-#define _simd16_xor_ps(a, b)        _simd16_logicop_ps(a, b, _mm512_xor)
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
-    return _mm512_roundscale_ps(a, mode);
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-#define _simd16_mul_epi32         _mm512_mul_epi32
-#define _simd16_mullo_epi32       _mm512_mullo_epi32
-#define _simd16_sub_epi32         _mm512_sub_epi32
-#define _simd16_sub_epi64         _mm512_sub_epi64
-#define _simd16_min_epi32         _mm512_min_epi32
-#define _simd16_max_epi32         _mm512_max_epi32
-#define _simd16_min_epu32         _mm512_min_epu32
-#define _simd16_max_epu32         _mm512_max_epu32
-#define _simd16_add_epi32         _mm512_add_epi32
-
-#define _simd16_and_si            _mm512_and_si512
-#define _simd16_andnot_si         _mm512_andnot_si512
-#define _simd16_or_si             _mm512_or_si512
-#define _simd16_xor_si            _mm512_xor_si512
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmplt_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
-    int lo = _simd_testz_ps(_simd16_extract_ps(a, 0), _simd16_extract_ps(b, 0));
-    int hi = _simd_testz_ps(_simd16_extract_ps(a, 1), _simd16_extract_ps(b, 1));
-
-    return lo & hi;
-}
-
-#define _simd16_unpacklo_ps       _mm512_unpacklo_ps
-#define _simd16_unpackhi_ps       _mm512_unpackhi_ps
-#define _simd16_unpacklo_pd       _mm512_unpacklo_pd
-#define _simd16_unpackhi_pd       _mm512_unpackhi_pd
-#define _simd16_unpacklo_epi8     _mm512_unpacklo_epi8
-#define _simd16_unpackhi_epi8     _mm512_unpackhi_epi8
-#define _simd16_unpacklo_epi16    _mm512_unpacklo_epi16
-#define _simd16_unpackhi_epi16    _mm512_unpackhi_epi16
-#define _simd16_unpacklo_epi32    _mm512_unpacklo_epi32
-#define _simd16_unpackhi_epi32    _mm512_unpackhi_epi32
-#define _simd16_unpacklo_epi64    _mm512_unpacklo_epi64
-#define _simd16_unpackhi_epi64    _mm512_unpackhi_epi64
-#define _simd16_slli_epi32        _mm512_slli_epi32
-#define _simd16_srli_epi32        _mm512_srli_epi32
-#define _simd16_srai_epi32        _mm512_srai_epi32
-#define _simd16_fmadd_ps          _mm512_fmadd_ps
-#define _simd16_fmsub_ps          _mm512_fmsub_ps
-#define _simd16_adds_epu8         _mm512_adds_epu8
-#define _simd16_subs_epu8         _mm512_subs_epu8
-#define _simd16_add_epi8          _mm512_add_epi8
-#define _simd16_shuffle_epi8      _mm512_shuffle_epi8
-
-#define _simd16_fmadd_ps          _mm512_fmadd_ps
-#define _simd16_fmsub_ps          _mm512_fmsub_ps
-
-#define _simd16_i32gather_ps(m, index, scale)               _mm512_i32gather_ps(index, m, scale)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
-    __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());
-
-    return _mm512_mask_i32gather_ps(a, k, index, m, scale);
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-#define _simd16_abs_epi32         _mm512_abs_epi32
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
-{
-    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);
-
-    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
-{
-    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);
-
-    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
-{
-    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);
-
-    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
-{
-    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);
-
-    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
-{
-    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);
-
-    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
-{
-    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);
-
-    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-#define _simd16_permute_ps(a, i)        _mm512_permutexvar_ps(i, a)
-#define _simd16_permute_epi32(a, i)     _mm512_permutexvar_epi32(i, a)
-#define _simd16_sllv_epi32              _mm512_srlv_epi32
-#define _simd16_srlv_epi32              _mm512_sllv_epi32
-#define _simd16_permute2f128_ps         _mm512_shuffle_f32x4
-#define _simd16_permute2f128_pd         _mm512_shuffle_f64x2
-#define _simd16_permute2f128_si         _mm512_shuffle_i32x4
-#define _simd16_shuffle_ps              _mm512_shuffle_ps
-#define _simd16_shuffle_pd              _mm512_shuffle_pd
-#define _simd16_cvtepu8_epi16           _mm512_cvtepu8_epi16
-#define _simd16_cvtepu8_epi32           _mm512_cvtepu8_epi32
-#define _simd16_cvtepu16_epi32          _mm512_cvtepu16_epi32
-#define _simd16_cvtepu16_epi64          _mm512_cvtepu16_epi64
-#define _simd16_cvtepu32_epi64          _mm512_cvtepu32_epi64
-#define _simd16_packus_epi16            _mm512_packus_epi16
-#define _simd16_packs_epi16             _mm512_packs_epi16
-#define _simd16_packus_epi32            _mm512_packus_epi32
-#define _simd16_packs_epi32             _mm512_packs_epi32
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
-    return _mm512_int2mask(mask);
-}
-
-INLINE int SIMDAPI _simd16_mask2int(simd16mask mask)
-{
-    return _mm512_mask2int(mask);
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
-    return _mm512_cmplt_ps_mask(a, b);
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
-    simd16scalari temp = _simd16_set1_epi32(mask);
-
-    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
-    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
-    return _simd16_castsi_ps(result);
-}
-
-#endif//ENABLE_AVX512_EMULATION
+#error Unsupported vector width
+#endif//KNOB_SIMD16_WIDTH == 16
+
+#define _simd16_setzero_ps                  SIMD16::setzero_ps
+#define _simd16_setzero_si                  SIMD16::setzero_si
+#define _simd16_set1_ps                     SIMD16::set1_ps
+#define _simd16_set1_epi8                   SIMD16::set1_epi8
+#define _simd16_set1_epi32                  SIMD16::set1_epi32
+#define _simd16_set_ps                      SIMD16::set_ps
+#define _simd16_set_epi32                   SIMD16::set_epi32
+#define _simd16_load_ps                     SIMD16::load_ps
+#define _simd16_loadu_ps                    SIMD16::loadu_ps
+#if 1                                       
+#define _simd16_load1_ps                    SIMD16::broadcast_ss
+#endif                                      
+#define _simd16_load_si                     SIMD16::load_si
+#define _simd16_loadu_si                    SIMD16::loadu_si
+#define _simd16_broadcast_ss(m)             SIMD16::broadcast_ss((float const*)m)
+#define _simd16_store_ps                    SIMD16::store_ps
+#define _simd16_store_si                    SIMD16::store_si
+#define _simd16_extract_ps(a, imm8)         SIMD16::extract_ps<imm8>(a)
+#define _simd16_extract_si(a, imm8)         SIMD16::extract_si<imm8>(a)
+#define _simd16_insert_ps(a, b, imm8)       SIMD16::insert_ps<imm8>(a, b)
+#define _simd16_insert_si(a, b, imm8)       SIMD16::insert_si<imm8>(a, b)
+#define _simd16_maskstore_ps                SIMD16::maskstore_ps
+#define _simd16_blend_ps(a, b, mask)        SIMD16::blend_ps<mask>(a, b)
+#define _simd16_blendv_ps                   SIMD16::blendv_ps
+#define _simd16_blendv_epi32                SIMD16::blendv_epi32
+#define _simd16_mul_ps                      SIMD16::mul_ps
+#define _simd16_div_ps                      SIMD16::div_ps
+#define _simd16_add_ps                      SIMD16::add_ps
+#define _simd16_sub_ps                      SIMD16::sub_ps
+#define _simd16_rsqrt_ps                    SIMD16::rsqrt_ps
+#define _simd16_min_ps                      SIMD16::min_ps
+#define _simd16_max_ps                      SIMD16::max_ps
+#define _simd16_movemask_ps                 SIMD16::movemask_ps
+#define _simd16_movemask_pd                 SIMD16::movemask_pd
+#define _simd16_cvtps_epi32                 SIMD16::cvtps_epi32
+#define _simd16_cvttps_epi32                SIMD16::cvttps_epi32
+#define _simd16_cvtepi32_ps                 SIMD16::cvtepi32_ps
+#define _simd16_cmp_ps(a, b, comp)          SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
+#define _simd16_cmplt_ps                    SIMD16::cmplt_ps
+#define _simd16_cmpgt_ps                    SIMD16::cmpgt_ps
+#define _simd16_cmpneq_ps                   SIMD16::cmpneq_ps
+#define _simd16_cmpeq_ps                    SIMD16::cmpeq_ps
+#define _simd16_cmpge_ps                    SIMD16::cmpge_ps
+#define _simd16_cmple_ps                    SIMD16::cmple_ps
+#define _simd16_castsi_ps                   SIMD16::castsi_ps
+#define _simd16_castps_si                   SIMD16::castps_si
+#define _simd16_castsi_pd                   SIMD16::castsi_pd
+#define _simd16_castpd_si                   SIMD16::castpd_si
+#define _simd16_castpd_ps                   SIMD16::castpd_ps
+#define _simd16_castps_pd                   SIMD16::castps_pd
+#define _simd16_and_ps                      SIMD16::and_ps
+#define _simd16_andnot_ps                   SIMD16::andnot_ps
+#define _simd16_or_ps                       SIMD16::or_ps
+#define _simd16_xor_ps                      SIMD16::xor_ps
+#define _simd16_round_ps(a, mode)           SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
+#define _simd16_mul_epi32                   SIMD16::mul_epi32
+#define _simd16_mullo_epi32                 SIMD16::mullo_epi32
+#define _simd16_sub_epi32                   SIMD16::sub_epi32
+#define _simd16_sub_epi64                   SIMD16::sub_epi64
+#define _simd16_min_epi32                   SIMD16::min_epi32
+#define _simd16_max_epi32                   SIMD16::max_epi32
+#define _simd16_min_epu32                   SIMD16::min_epu32
+#define _simd16_max_epu32                   SIMD16::max_epu32
+#define _simd16_add_epi32                   SIMD16::add_epi32
+#define _simd16_and_si                      SIMD16::and_si
+#define _simd16_andnot_si                   SIMD16::andnot_si
+#define _simd16_or_si                       SIMD16::or_si
+#define _simd16_xor_si                      SIMD16::xor_si
+#define _simd16_cmpeq_epi32                 SIMD16::cmpeq_epi32
+#define _simd16_cmpgt_epi32                 SIMD16::cmpgt_epi32
+#define _simd16_cmplt_epi32                 SIMD16::cmplt_epi32
+#define _simd16_testz_ps                    SIMD16::testz_ps
+#define _simd16_unpacklo_ps                 SIMD16::unpacklo_ps
+#define _simd16_unpackhi_ps                 SIMD16::unpackhi_ps
+#define _simd16_unpacklo_pd                 SIMD16::unpacklo_pd
+#define _simd16_unpackhi_pd                 SIMD16::unpackhi_pd
+#define _simd16_unpacklo_epi8               SIMD16::unpacklo_epi8
+#define _simd16_unpackhi_epi8               SIMD16::unpackhi_epi8
+#define _simd16_unpacklo_epi16              SIMD16::unpacklo_epi16
+#define _simd16_unpackhi_epi16              SIMD16::unpackhi_epi16
+#define _simd16_unpacklo_epi32              SIMD16::unpacklo_epi32
+#define _simd16_unpackhi_epi32              SIMD16::unpackhi_epi32
+#define _simd16_unpacklo_epi64              SIMD16::unpacklo_epi64
+#define _simd16_unpackhi_epi64              SIMD16::unpackhi_epi64
+#define _simd16_slli_epi32(a, i)            SIMD16::slli_epi32<i>(a)
+#define _simd16_srli_epi32(a, i)            SIMD16::srli_epi32<i>(a)
+#define _simd16_srai_epi32(a, i)            SIMD16::srai_epi32<i>(a)
+#define _simd16_fmadd_ps                    SIMD16::fmadd_ps
+#define _simd16_fmsub_ps                    SIMD16::fmsub_ps
+#define _simd16_adds_epu8                   SIMD16::adds_epu8
+#define _simd16_subs_epu8                   SIMD16::subs_epu8
+#define _simd16_add_epi8                    SIMD16::add_epi8
+#define _simd16_shuffle_epi8                SIMD16::shuffle_epi8
+
+#define _simd16_i32gather_ps(m, index, scale)               SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(index, m)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
+
+#define _simd16_abs_epi32                   SIMD16::abs_epi32
+
+#define _simd16_cmpeq_epi64                 SIMD16::cmpeq_epi64
+#define _simd16_cmpgt_epi64                 SIMD16::cmpgt_epi64
+#define _simd16_cmpeq_epi16                 SIMD16::cmpeq_epi16
+#define _simd16_cmpgt_epi16                 SIMD16::cmpgt_epi16
+#define _simd16_cmpeq_epi8                  SIMD16::cmpeq_epi8
+#define _simd16_cmpgt_epi8                  SIMD16::cmpgt_epi8
+
+#define _simd16_permute_ps                  SIMD16::permute_ps
+#define _simd16_permute_epi32               SIMD16::permute_epi32
+#define _simd16_sllv_epi32                  SIMD16::sllv_epi32
+#define _simd16_srlv_epi32                  SIMD16::sllv_epi32
+#define _simd16_permute2f128_ps(a, b, i)    SIMD16::permute2f128_ps<i>(a, b)
+#define _simd16_permute2f128_pd(a, b, i)    SIMD16::permute2f128_pd<i>(a, b)
+#define _simd16_permute2f128_si(a, b, i)    SIMD16::permute2f128_si<i>(a, b)
+#define _simd16_shuffle_ps(a, b, i)         SIMD16::shuffle_ps<i>(a, b)
+#define _simd16_shuffle_pd(a, b, i)         SIMD16::shuffle_pd<i>(a, b)
+#define _simd16_shuffle_epi32(a, b, imm8)   SIMD16::shuffle_epi32<imm8>(a, b)
+#define _simd16_shuffle_epi64(a, b, imm8)   SIMD16::shuffle_epi64<imm8>(a, b)
+#define _simd16_cvtepu8_epi16               SIMD16::cvtepu8_epi16
+#define _simd16_cvtepu8_epi32               SIMD16::cvtepu8_epi32
+#define _simd16_cvtepu16_epi32              SIMD16::cvtepu16_epi32
+#define _simd16_cvtepu16_epi64              SIMD16::cvtepu16_epi64
+#define _simd16_cvtepu32_epi64              SIMD16::cvtepu32_epi64
+#define _simd16_packus_epi16                SIMD16::packus_epi16
+#define _simd16_packs_epi16                 SIMD16::packs_epi16
+#define _simd16_packus_epi32                SIMD16::packus_epi32
+#define _simd16_packs_epi32                 SIMD16::packs_epi32
+#define _simd16_cmplt_ps_mask               SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
+#define _simd16_int2mask(mask)              simd16mask(mask)
+#define _simd16_mask2int(mask)              int(mask)
 
 #endif//ENABLE_AVX512_SIMD16
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 5ccb6c3ea95..f95c109e6fe 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -24,749 +24,218 @@
 #ifndef __SWR_SIMDINTRIN_H__
 #define __SWR_SIMDINTRIN_H__
 
-#include "common/os.h"
 #include "common/intrin.h"
+#include "common/simdlib.hpp"
 
 #if KNOB_SIMD_WIDTH == 8
-#define _simd128_maskstore_ps _mm_maskstore_ps
-#define _simd_load_ps _mm256_load_ps
-#define _simd_load1_ps _mm256_broadcast_ss
-#define _simd_loadu_ps _mm256_loadu_ps
-#define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps   _mm256_set1_ps
-#define _simd_blend_ps  _mm256_blend_ps
-#define _simd_blendv_ps _mm256_blendv_ps
-#define _simd_store_ps _mm256_store_ps
-#define _simd_mul_ps _mm256_mul_ps
-#define _simd_add_ps _mm256_add_ps
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_rsqrt_ps _mm256_rsqrt_ps
-#define _simd_min_ps _mm256_min_ps
-#define _simd_max_ps _mm256_max_ps
-#define _simd_movemask_ps _mm256_movemask_ps
-#define _simd_cvtps_epi32 _mm256_cvtps_epi32
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
-#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
-#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
-#define _simd_and_ps _mm256_and_ps
-#define _simd_or_ps _mm256_or_ps
-
-#define _simd_rcp_ps _mm256_rcp_ps
-#define _simd_div_ps _mm256_div_ps
-#define _simd_castsi_ps _mm256_castsi256_ps
-#define _simd_andnot_ps _mm256_andnot_ps
-#define _simd_round_ps _mm256_round_ps
-#define _simd_castpd_ps _mm256_castpd_ps
-#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
-#define _simd_stream_ps _mm256_stream_ps
-
-#define _simd_load_sd _mm256_load_sd
-#define _simd_movemask_pd _mm256_movemask_pd
-#define _simd_castsi_pd _mm256_castsi256_pd
-
-// emulated integer simd
-#define SIMD_EMU_EPI(func, intrin) \
-INLINE \
-__m256i func(__m256i a, __m256i b)\
-{\
-    __m128i aHi = _mm256_extractf128_si256(a, 1);\
-    __m128i bHi = _mm256_extractf128_si256(b, 1);\
-    __m128i aLo = _mm256_castsi256_si128(a);\
-    __m128i bLo = _mm256_castsi256_si128(b);\
-\
-    __m128i subLo = intrin(aLo, bLo);\
-    __m128i subHi = intrin(aHi, bHi);\
-\
-    __m256i result = _mm256_castsi128_si256(subLo);\
-            result = _mm256_insertf128_si256(result, subHi, 1);\
-\
-    return result;\
-}
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-INLINE
-__m256 _simdemu_permute_ps(__m256 a, __m256i b)
-{
-    __m128 aHi = _mm256_extractf128_ps(a, 1);
-    __m128i bHi = _mm256_extractf128_si256(b, 1);
-    __m128 aLo = _mm256_castps256_ps128(a);
-    __m128i bLo = _mm256_castsi256_si128(b);
-
-    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
-    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
-    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
-    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
-    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
-    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
-    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
-    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
-    __m256 result = _mm256_castps128_ps256(blendLowRes);
-    result = _mm256_insertf128_ps(result, blendHiRes, 1);
-
-    return result;
-}
-
-INLINE
-__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
-{
-    return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
-}
-
-INLINE
-__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-
-INLINE
-__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-#define _simd_mul_epi32 _simdemu_mul_epi32
-#define _simd_mullo_epi32 _simdemu_mullo_epi32
-#define _simd_sub_epi32 _simdemu_sub_epi32
-#define _simd_sub_epi64 _simdemu_sub_epi64
-#define _simd_min_epi32 _simdemu_min_epi32
-#define _simd_min_epu32 _simdemu_min_epu32
-#define _simd_max_epi32 _simdemu_max_epi32
-#define _simd_max_epu32 _simdemu_max_epu32
-#define _simd_add_epi32 _simdemu_add_epi32
-#define _simd_and_si _simdemu_and_si
-#define _simd_andnot_si _simdemu_andnot_si
-#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
-#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
-#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
-#define _simd_or_si _simdemu_or_si
-#define _simd_xor_si _simdemu_xor_si
-#define _simd_castps_si _mm256_castps_si256
-#define _simd_adds_epu8 _simdemu_adds_epu8
-#define _simd_subs_epu8 _simdemu_subs_epu8
-#define _simd_add_epi8 _simdemu_add_epi8
-#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
-#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
-#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
-#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
-#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
-#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
-#define _simd_movemask_epi8 _simdemu_movemask_epi8
-#define _simd_permute_ps _simdemu_permute_ps
-#define _simd_permute_epi32 _simdemu_permute_epi32
-#define _simd_srlv_epi32 _simdemu_srlv_epi32
-#define _simd_sllv_epi32 _simdemu_sllv_epi32
-
-SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
-SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
-SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
-SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
-SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
-SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
-SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
-SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
-SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
-SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
-SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
-SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
-SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
-SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
-SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
-
-#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
-#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
-#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
-#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
-#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-#define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-
-#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
-#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
-#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-
-#define _simd128_fmadd_ps _mm_fmaddemu_ps
-#define _simd_fmadd_ps _mm_fmaddemu256_ps
-#define _simd_fmsub_ps _mm_fmsubemu256_ps
-#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 
-SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
-
-INLINE
-__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
-{
-    __m128 res = _mm_mul_ps(a, b);
-    res = _mm_add_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
-{
-    __m256 res = _mm256_mul_ps(a, b);
-    res = _mm256_add_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
-{
-    __m256 res = _mm256_mul_ps(a, b);
-    res = _mm256_sub_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
-{
-    uint32_t *pOffsets = (uint32_t*)&vOffsets;
-    simdscalar vResult;
-    float* pResult = (float*)&vResult;
-    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-    {
-        uint32_t offset = pOffsets[i];
-        offset = offset * scale;
-        pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
-    }
-
-    return vResult;
-}
-
-INLINE
-__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
-{
-    uint32_t *pOffsets = (uint32_t*)&vOffsets;
-    simdscalar vResult = vSrc;
-    float* pResult = (float*)&vResult;
-    DWORD index;
-    uint32_t mask = _simd_movemask_ps(vMask);
-    while (_BitScanForward(&index, mask))
-    {
-        mask &= ~(1 << index);
-        uint32_t offset = pOffsets[index];
-        offset = offset * scale;
-        pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
-    }
-
-    return vResult;
-}
-
-INLINE
-__m256i _simd_abs_epi32(__m256i a)
-{
-        __m128i aHi = _mm256_extractf128_si256(a, 1);
-        __m128i aLo = _mm256_castsi256_si128(a);
-        __m128i absLo = _mm_abs_epi32(aLo);
-        __m128i absHi = _mm_abs_epi32(aHi);
-        __m256i result = _mm256_castsi128_si256(absLo);
-        result = _mm256_insertf128_si256(result, absHi, 1);
-        return result;
-}
-
-INLINE 
-int _simdemu_movemask_epi8(__m256i a)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    int resHi = _mm_movemask_epi8(aHi);
-    int resLo = _mm_movemask_epi8(aLo);
-
-    return (resHi << 16) | resLo;
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi16(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu8_epi16(a);
-    __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi32(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu8_epi32(a);
-    __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi32(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu16_epi32(a);
-    __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi64(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu16_epi64(a);
-    __m128i resulthi = _mm_cvtepu16_epi64(_mm_srli_si128(a, 4));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu32_epi64(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu32_epi64(a);
-    __m128i resulthi = _mm_cvtepu32_epi64(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi16(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packus_epi16(alo, blo);
-    __m128i resulthi = _mm_packus_epi16(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi16(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packs_epi16(alo, blo);
-    __m128i resulthi = _mm_packs_epi16(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi32(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packus_epi32(alo, blo);
-    __m128i resulthi = _mm_packus_epi32(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi32(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packs_epi32(alo, blo);
-    __m128i resulthi = _mm_packs_epi32(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
+typedef SIMD256                             SIMD;
 #else
-
-#define _simd_mul_epi32 _mm256_mul_epi32
-#define _simd_mullo_epi32 _mm256_mullo_epi32
-#define _simd_sub_epi32 _mm256_sub_epi32
-#define _simd_sub_epi64 _mm256_sub_epi64
-#define _simd_min_epi32 _mm256_min_epi32
-#define _simd_max_epi32 _mm256_max_epi32
-#define _simd_min_epu32 _mm256_min_epu32
-#define _simd_max_epu32 _mm256_max_epu32
-#define _simd_add_epi32 _mm256_add_epi32
-#define _simd_and_si _mm256_and_si256
-#define _simd_andnot_si _mm256_andnot_si256
-#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
-#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
-#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
-#define _simd_or_si _mm256_or_si256
-#define _simd_xor_si _mm256_xor_si256
-#define _simd_castps_si _mm256_castps_si256
-
-#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
-#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
-#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
-#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
-#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
-#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
-#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
-#define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
-
-#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
-#define _simd_slli_epi32 _mm256_slli_epi32
-#define _simd_srai_epi32 _mm256_srai_epi32
-#define _simd_srli_epi32 _mm256_srli_epi32
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-#define _simd128_fmadd_ps _mm_fmadd_ps
-#define _simd_fmadd_ps _mm256_fmadd_ps
-#define _simd_fmsub_ps _mm256_fmsub_ps
-#define _simd_shuffle_epi8 _mm256_shuffle_epi8 
-#define _simd_adds_epu8 _mm256_adds_epu8
-#define _simd_subs_epu8 _mm256_subs_epu8
-#define _simd_add_epi8 _mm256_add_epi8
-#define _simd_i32gather_ps _mm256_i32gather_ps
-#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
-#define _simd_abs_epi32 _mm256_abs_epi32
-
-#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
-#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
-#define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
-#define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
-#define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
-#define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
-#define _simd_movemask_epi8 _mm256_movemask_epi8
-#define _simd_permute_ps _mm256_permutevar8x32_ps
-#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
-#define _simd_srlv_epi32 _mm256_srlv_epi32
-#define _simd_sllv_epi32 _mm256_sllv_epi32
-#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
-#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
-#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
-#define _simd_cvtepu16_epi64 _mm256_cvtepu16_epi64
-#define _simd_cvtepu32_epi64 _mm256_cvtepu32_epi64
-#define _simd_packus_epi16 _mm256_packus_epi16
-#define _simd_packs_epi16 _mm256_packs_epi16
-#define _simd_packus_epi32 _mm256_packus_epi32
-#define _simd_packs_epi32 _mm256_packs_epi32
-
-#endif
-
-#define _simd_unpacklo_ps _mm256_unpacklo_ps
-#define _simd_unpackhi_ps _mm256_unpackhi_ps
-#define _simd_unpacklo_pd _mm256_unpacklo_pd
-#define _simd_unpackhi_pd _mm256_unpackhi_pd
-#define _simd_insertf128_ps _mm256_insertf128_ps
-#define _simd_insertf128_pd _mm256_insertf128_pd
-#define _simd_insertf128_si _mm256_insertf128_si256
-#define _simd_extractf128_ps _mm256_extractf128_ps
-#define _simd_extractf128_pd _mm256_extractf128_pd
-#define _simd_extractf128_si _mm256_extractf128_si256
-#define _simd_permute2f128_ps _mm256_permute2f128_ps
-#define _simd_permute2f128_pd _mm256_permute2f128_pd
-#define _simd_permute2f128_si _mm256_permute2f128_si256
-#define _simd_shuffle_ps _mm256_shuffle_ps
-#define _simd_shuffle_pd _mm256_shuffle_pd
-#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
-#define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
-#define _simd_set1_epi32 _mm256_set1_epi32
-#define _simd_set_epi32 _mm256_set_epi32
-#define _simd_set1_epi8 _mm256_set1_epi8
-#define _simd_setzero_si _mm256_setzero_si256
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_store_si _mm256_store_si256
-#define _simd_broadcast_ss _mm256_broadcast_ss
-#define _simd_maskstore_ps _mm256_maskstore_ps
-#define _simd_load_si _mm256_load_si256
-#define _simd_loadu_si _mm256_loadu_si256
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_testz_ps _mm256_testz_ps
-#define _simd_testz_si _mm256_testz_si256
-#define _simd_xor_ps _mm256_xor_ps
-
-INLINE
-simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
-{
-    __m128i lo = _mm_loadu_si128(loaddr);
-    __m128i hi = _mm_loadu_si128(hiaddr);
-
-    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-}
-
-INLINE
-void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
-{
-    _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
-    _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
-{
-    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
-{
-    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
-}
-
-template<int mask>
-INLINE
-__m128i _simd_blend4_epi32(__m128i a, __m128i b)
-{
-    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), mask));
+#error Unsupported vector width
+#endif//KNOB_SIMD16_WIDTH == 16
+
+
+#define _simd128_maskstore_ps               SIMD128::maskstore_ps
+#define _simd128_fmadd_ps                   SIMD128::fmadd_ps
+
+#define _simd_load_ps                       SIMD::load_ps
+#define _simd_load1_ps                      SIMD::broadcast_ss
+#define _simd_loadu_ps                      SIMD::loadu_ps
+#define _simd_setzero_ps                    SIMD::setzero_ps
+#define _simd_set1_ps                       SIMD::set1_ps
+#define _simd_blend_ps(a, b, i)             SIMD::blend_ps<i>(a, b)
+#define _simd_blend_epi32(a, b, i)          SIMD::blend_epi32<i>(a, b)
+#define _simd_blendv_ps                     SIMD::blendv_ps
+#define _simd_store_ps                      SIMD::store_ps
+#define _simd_mul_ps                        SIMD::mul_ps
+#define _simd_add_ps                        SIMD::add_ps
+#define _simd_sub_ps                        SIMD::sub_ps
+#define _simd_rsqrt_ps                      SIMD::rsqrt_ps
+#define _simd_min_ps                        SIMD::min_ps
+#define _simd_max_ps                        SIMD::max_ps
+#define _simd_movemask_ps                   SIMD::movemask_ps
+#define _simd_cvtps_epi32                   SIMD::cvtps_epi32
+#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
+#define _simd_cvtepi32_ps                   SIMD::cvtepi32_ps
+#define _simd_cmplt_ps                      SIMD::cmplt_ps
+#define _simd_cmpgt_ps                      SIMD::cmpgt_ps
+#define _simd_cmpneq_ps                     SIMD::cmpneq_ps
+#define _simd_cmpeq_ps                      SIMD::cmpeq_ps
+#define _simd_cmpge_ps                      SIMD::cmpge_ps
+#define _simd_cmple_ps                      SIMD::cmple_ps
+#define _simd_cmp_ps(a, b, imm)             SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
+#define _simd_and_ps                        SIMD::and_ps
+#define _simd_or_ps                         SIMD::or_ps
+#define _simd_rcp_ps                        SIMD::rcp_ps
+#define _simd_div_ps                        SIMD::div_ps
+#define _simd_castsi_ps                     SIMD::castsi_ps
+#define _simd_castps_pd                     SIMD::castps_pd
+#define _simd_castpd_ps                     SIMD::castpd_ps
+#define _simd_andnot_ps                     SIMD::andnot_ps
+#define _simd_round_ps(a, i)                SIMD::round_ps<SIMD::RoundMode(i)>(a)
+#define _simd_castpd_ps                     SIMD::castpd_ps
+#define _simd_broadcast_ps(a)               SIMD::broadcast_ps((SIMD128::Float const *)(a))
+#define _simd_stream_ps                     SIMD::stream_ps
+
+#define _simd_movemask_pd                   SIMD::movemask_pd
+#define _simd_castsi_pd                     SIMD::castsi_pd
+
+#define _simd_mul_epi32                     SIMD::mul_epi32
+#define _simd_mullo_epi32                   SIMD::mullo_epi32
+#define _simd_sub_epi32                     SIMD::sub_epi32
+#define _simd_sub_epi64                     SIMD::sub_epi64
+#define _simd_min_epi32                     SIMD::min_epi32
+#define _simd_min_epu32                     SIMD::min_epu32
+#define _simd_max_epi32                     SIMD::max_epi32
+#define _simd_max_epu32                     SIMD::max_epu32
+#define _simd_add_epi32                     SIMD::add_epi32
+#define _simd_and_si                        SIMD::and_si
+#define _simd_andnot_si                     SIMD::andnot_si
+#define _simd_cmpeq_epi32                   SIMD::cmpeq_epi32
+#define _simd_cmplt_epi32                   SIMD::cmplt_epi32
+#define _simd_cmpgt_epi32                   SIMD::cmpgt_epi32
+#define _simd_or_si                         SIMD::or_si
+#define _simd_xor_si                        SIMD::xor_si
+#define _simd_castps_si                     SIMD::castps_si
+#define _simd_adds_epu8                     SIMD::adds_epu8
+#define _simd_subs_epu8                     SIMD::subs_epu8
+#define _simd_add_epi8                      SIMD::add_epi8
+#define _simd_cmpeq_epi64                   SIMD::cmpeq_epi64
+#define _simd_cmpgt_epi64                   SIMD::cmpgt_epi64
+#define _simd_cmpgt_epi8                    SIMD::cmpgt_epi8
+#define _simd_cmpeq_epi8                    SIMD::cmpeq_epi8
+#define _simd_cmpgt_epi16                   SIMD::cmpgt_epi16
+#define _simd_cmpeq_epi16                   SIMD::cmpeq_epi16
+#define _simd_movemask_epi8                 SIMD::movemask_epi8
+#define _simd_permute_ps                    SIMD::permute_ps
+#define _simd_permute_epi32                 SIMD::permute_epi32
+#define _simd_srlv_epi32                    SIMD::srlv_epi32
+#define _simd_sllv_epi32                    SIMD::sllv_epi32
+
+#define _simd_unpacklo_epi8                 SIMD::unpacklo_epi8
+#define _simd_unpackhi_epi8                 SIMD::unpackhi_epi8
+#define _simd_unpacklo_epi16                SIMD::unpacklo_epi16
+#define _simd_unpackhi_epi16                SIMD::unpackhi_epi16
+#define _simd_unpacklo_epi32                SIMD::unpacklo_epi32
+#define _simd_unpackhi_epi32                SIMD::unpackhi_epi32
+#define _simd_unpacklo_epi64                SIMD::unpacklo_epi64
+#define _simd_unpackhi_epi64                SIMD::unpackhi_epi64
+
+#define _simd_slli_epi32(a,i)               SIMD::slli_epi32<i>(a)
+#define _simd_srai_epi32(a,i)               SIMD::srai_epi32<i>(a)
+#define _simd_srli_epi32(a,i)               SIMD::srli_epi32<i>(a)
+#define _simd_srlisi_ps(a,i)                SIMD::srlisi_ps<i>(a)
+
+#define _simd_fmadd_ps                      SIMD::fmadd_ps
+#define _simd_fmsub_ps                      SIMD::fmsub_ps
+#define _simd_shuffle_epi8                  SIMD::shuffle_epi8
+
+#define _simd_i32gather_ps(p, o, s)         SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
+#define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
+#define _simd_abs_epi32                     SIMD::abs_epi32
+
+#define _simd_cvtepu8_epi16                 SIMD::cvtepu8_epi16
+#define _simd_cvtepu8_epi32                 SIMD::cvtepu8_epi32
+#define _simd_cvtepu16_epi32                SIMD::cvtepu16_epi32
+#define _simd_cvtepu16_epi64                SIMD::cvtepu16_epi64
+#define _simd_cvtepu32_epi64                SIMD::cvtepu32_epi64
+
+#define _simd_packus_epi16                  SIMD::packus_epi16
+#define _simd_packs_epi16                   SIMD::packs_epi16
+#define _simd_packus_epi32                  SIMD::packus_epi32
+#define _simd_packs_epi32                   SIMD::packs_epi32
+
+#define _simd_unpacklo_ps                   SIMD::unpacklo_ps
+#define _simd_unpackhi_ps                   SIMD::unpackhi_ps
+#define _simd_unpacklo_pd                   SIMD::unpacklo_pd
+#define _simd_unpackhi_pd                   SIMD::unpackhi_pd
+#define _simd_insertf128_ps                 SIMD::insertf128_ps
+#define _simd_insertf128_pd                 SIMD::insertf128_pd
+#define _simd_insertf128_si(a, b, i)        SIMD::insertf128_si<i>(a, b)
+#define _simd_extractf128_ps(a, i)          SIMD::extractf128_ps<i>(a)
+#define _simd_extractf128_pd(a, i)          SIMD::extractf128_pd<i>(a)
+#define _simd_extractf128_si(a, i)          SIMD::extractf128_si<i>(a)
+#define _simd_permute2f128_ps(a, b, i)      SIMD::permute2f128_ps<i>(a, b)
+#define _simd_permute2f128_pd(a, b, i)      SIMD::permute2f128_pd<i>(a, b)
+#define _simd_permute2f128_si(a, b, i)      SIMD::permute2f128_si<i>(a, b)
+#define _simd_shuffle_ps(a, b, i)           SIMD::shuffle_ps<i>(a, b)
+#define _simd_shuffle_pd(a, b, i)           SIMD::shuffle_pd<i>(a, b)
+#define _simd_shuffle_epi32(a, b, imm8)     SIMD::shuffle_epi32<imm8>(a, b)
+#define _simd_shuffle_epi64(a, b, imm8)     SIMD::shuffle_epi64<imm8>(a, b)
+#define _simd_set1_epi32                    SIMD::set1_epi32
+#define _simd_set_epi32                     SIMD::set_epi32
+#define _simd_set_ps                        SIMD::set_ps
+#define _simd_set1_epi8                     SIMD::set1_epi8
+#define _simd_setzero_si                    SIMD::setzero_si
+#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
+#define _simd_store_si                      SIMD::store_si
+#define _simd_broadcast_ss                  SIMD::broadcast_ss
+#define _simd_maskstore_ps                  SIMD::maskstore_ps
+#define _simd_load_si                       SIMD::load_si
+#define _simd_loadu_si                      SIMD::loadu_si
+#define _simd_sub_ps                        SIMD::sub_ps
+#define _simd_testz_ps                      SIMD::testz_ps
+#define _simd_testz_si                      SIMD::testz_si
+#define _simd_xor_ps                        SIMD::xor_ps
+
+#define _simd_loadu2_si                     SIMD::loadu2_si
+#define _simd_storeu2_si                    SIMD::storeu2_si
+
+#define _simd_blendv_epi32                  SIMD::blendv_epi32
+
+template<int mask> SIMDINLINE
+SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
+{
+    return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
 }
 
 // convert bitmask to vector mask
-INLINE
-simdscalar vMask(int32_t mask)
+SIMDINLINE
+SIMD256::Float vMask(int32_t mask)
 {
-    __m256i vec = _mm256_set1_epi32(mask);
-    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = _simd_and_si(vec, bit);
-    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-    return _simd_castsi_ps(vec);
+    SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+    const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = SIMD256::and_si(vec, bit);
+    vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
+    return SIMD256::castsi_ps(vec);
 }
 
-INLINE
-simdscalari vMaski(int32_t mask)
+SIMDINLINE
+SIMD256::Integer vMaski(int32_t mask)
 {
-    __m256i vec = _mm256_set1_epi32(mask);
-    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = _simd_and_si(vec, bit);
-    return _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+    SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+    const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = SIMD256::and_si(vec, bit);
+    return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
 }
 
-INLINE
+SIMDINLINE
 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
 {
     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
-    _mm256_store_ps(rArray, r);
-    _mm256_store_ps(sArray, s);
+    SIMD256::store_ps(rArray, r);
+    SIMD256::store_ps(sArray, s);
     rArray[rlane] = sArray[slane];
-    r = _mm256_load_ps(rArray);
+    r = SIMD256::load_ps(rArray);
 }
 
-INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_slli_epi32(aHi, i);
-    __m128i resLo = _mm_slli_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-            result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_srai_epi32(aHi, i);
-    __m128i resLo = _mm_srai_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-            result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_srli_epi32(aHi, i);
-    __m128i resLo = _mm_srli_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-    result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE
-void _simdvec_transpose(simdvector &v)
-{
-    SWR_INVALID("Need to implement 8 wide version");
-}
-
-#else
-#error Unsupported vector width
-#endif
-
 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
-INLINE
-void _simdvec_load_ps(simdvector& r, const float *p)
-{
-    r[0] = _simd_set1_ps(p[0]);
-    r[1] = _simd_set1_ps(p[1]);
-    r[2] = _simd_set1_ps(p[2]);
-    r[3] = _simd_set1_ps(p[3]);
-}
+#define _simdvec_load_ps SIMD::vec4_load1_ps
 
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector& r, const simdscalar& s)
 {
-    r[0] = s;
-    r[1] = s;
-    r[2] = s;
-    r[3] = s;
+    SIMD::vec4_set1_vps(r, s);
 }
 
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector& r, const simdvector& v)
 {
-    r[0] = v[0];
-    r[1] = v[1];
-    r[2] = v[2];
-    r[3] = v[3];
+    r = v;
 }
 
 #if 0
 // just move a lane from the source simdvector to dest simdvector
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
 {
     _simd_mov(r[0], rlane, s[0], slane);
@@ -776,330 +245,23 @@ void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int
 }
 
 #endif
-INLINE
-void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
-    simdscalar tmp;
-    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
-
-    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
 
-    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
-    simdscalar tmp;
-    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
-
-    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
-
-    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
-    tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-simdscalar _simdvec_rcp_length_ps(const simdvector& v)
-{
-    simdscalar length;
-    _simdvec_dp4_ps(length, v, v);
-    return _simd_rsqrt_ps(length);
-}
-
-INLINE
-void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
-{
-    simdscalar vecLength;
-    vecLength = _simdvec_rcp_length_ps(v);
-
-    r[0] = _simd_mul_ps(v[0], vecLength);
-    r[1] = _simd_mul_ps(v[1], vecLength);
-    r[2] = _simd_mul_ps(v[2], vecLength);
-    r[3] = _simd_mul_ps(v[3], vecLength);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
-{
-    r[0] = _simd_mul_ps(v[0], s);
-    r[1] = _simd_mul_ps(v[1], s);
-    r[2] = _simd_mul_ps(v[2], s);
-    r[3] = _simd_mul_ps(v[3], s);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
-    r[0] = _simd_mul_ps(v0[0], v1[0]);
-    r[1] = _simd_mul_ps(v0[1], v1[1]);
-    r[2] = _simd_mul_ps(v0[2], v1[2]);
-    r[3] = _simd_mul_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
-    r[0] = _simd_add_ps(v0[0], v1[0]);
-    r[1] = _simd_add_ps(v0[1], v1[1]);
-    r[2] = _simd_add_ps(v0[2], v1[2]);
-    r[3] = _simd_add_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
-    r[0] = _simd_min_ps(v0[0], s);
-    r[1] = _simd_min_ps(v0[1], s);
-    r[2] = _simd_min_ps(v0[2], s);
-    r[3] = _simd_min_ps(v0[3], s);
-}
-
-INLINE
-void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
-    r[0] = _simd_max_ps(v0[0], s);
-    r[1] = _simd_max_ps(v0[1], s);
-    r[2] = _simd_max_ps(v0[2], s);
-    r[3] = _simd_max_ps(v0[3], s);
-}
-
-// Matrix4x4 * Vector4
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
-INLINE
-void _simd_mat4x4_vec4_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[2] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[3] = r0;
-}
-
-// Matrix4x4 * Vector3 - Direction Vector where w = 0.
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
-INLINE
-void _simd_mat3x3_vec3_w0_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[2] = r0;
-
-    result[3] = _simd_setzero_ps();
-}
-
-// Matrix4x4 * Vector3 - Position vector where w = 1.
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
-INLINE
-void _simd_mat4x4_vec3_w1_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[2] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
-    result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-}
-
-INLINE
-void _simd_mat4x3_vec3_w1_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[2] = r0;
-    result[3] = _simd_set1_ps(1.0f);
-}
+#define _simdvec_dp3_ps                 SIMD::vec4_dp3_ps
+#define _simdvec_dp4_ps                 SIMD::vec4_dp4_ps
+#define _simdvec_rcp_length_ps          SIMD::vec4_rcp_length_ps
+#define _simdvec_normalize_ps           SIMD::vec4_normalize_ps
+#define _simdvec_mul_ps                 SIMD::vec4_mul_ps
+#define _simdvec_add_ps                 SIMD::vec4_add_ps
+#define _simdvec_min_ps                 SIMD::vec4_min_ps
+#define _simdvec_max_ps                 SIMD::vec4_max_ps
+#define _simd_mat4x4_vec4_multiply      SIMD::mat4x4_vec4_multiply
+#define _simd_mat3x3_vec3_w0_multiply   SIMD::mat3x3_vec3_w0_multiply
+#define _simd_mat4x4_vec3_w1_multiply   SIMD::mat4x4_vec3_w1_multiply
+#define _simd_mat4x3_vec3_w1_multiply   SIMD::mat4x3_vec3_w1_multiply
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
 {
     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
     vOut = _simd_fmadd_ps(vB, vY, vOut);
@@ -1108,9 +270,9 @@ INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscal
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC, simd4scalar &vX, simd4scalar &vY)
 {
-    __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
+    simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
     vOut = _simd128_fmadd_ps(vB, vY, vOut);
     return vOut;
 }
@@ -1121,7 +283,7 @@ INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &v
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
@@ -1141,7 +303,7 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
 /// @brief Interpolates a single component (flat shade).
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
 
@@ -1156,34 +318,35 @@ static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
 
-    __m128 vA = _mm_broadcast_ss(pInterpA);
-    __m128 vB = _mm_broadcast_ss(pInterpB);
-    __m128 vC = _mm_broadcast_ss(pInterpC);
+    simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
+    simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
+    simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
 
-    __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
-    vC = _mm_mul_ps(vk, vC);
+    simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
+    vC = SIMD128::mul_ps(vk, vC);
 
-    return vplaneps128(vA, vB, vC, vI, vJ);
+    return vplaneps(vA, vB, vC, vI, vJ);
 }
 
-static INLINE __m128 _simd128_abs_ps(__m128 a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar a)
 {
-    __m128i ai = _mm_castps_si128(a);
-    return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
+    simd4scalari ai = SIMD128::castps_si(a);
+    return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
 }
 
-static INLINE simdscalar _simd_abs_ps(simdscalar a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar a)
 {
     simdscalari ai = _simd_castps_si(a);
     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
 }
 
+
 #if ENABLE_AVX512_SIMD16
 #include "simd16intrin.h"
 #endif//ENABLE_AVX512_SIMD16
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
new file mode 100644
index 00000000000..fb1113204d5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -0,0 +1,550 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#include "simdlib_types.hpp"
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+    namespace SIMD128Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        struct AVXImpl
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_128_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        struct AVX2Impl : AVXImpl
+        {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_128_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl : AVX2Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_128_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD128Impl::Float;
+            using Double    = SIMD128Impl::Double;
+            using Integer   = SIMD128Impl::Integer;
+            using Vec4      = SIMD128Impl::Vec4;
+            using Mask      = SIMD128Impl::Mask;
+        };
+    } // ns SIMD128Impl
+
+    namespace SIMD256Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        struct AVXImpl
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_256_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        struct AVX2Impl : AVXImpl
+        {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_256_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl : AVX2Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_256_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD256Impl::Float;
+            using Double    = SIMD256Impl::Double;
+            using Integer   = SIMD256Impl::Integer;
+            using Vec4      = SIMD256Impl::Vec4;
+            using Mask      = SIMD256Impl::Mask;
+        };
+    } // ns SIMD256Impl
+
+    namespace SIMD512Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        template<typename SIMD256T>
+        struct AVXImplBase
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_512_emu.inl"
+#include "simdlib_512_emu_masks.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImplBase
+        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_512_avx512.inl"
+#include "simdlib_512_avx512_masks.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD512Impl::Float;
+            using Double    = SIMD512Impl::Double;
+            using Integer   = SIMD512Impl::Integer;
+            using Vec4      = SIMD512Impl::Vec4;
+            using Mask      = SIMD512Impl::Mask;
+        };
+    } // ns SIMD512Impl
+} // ns SIMDImpl
+
+template <typename Traits>
+struct SIMDBase : Traits::IsaImpl
+{
+    using CompareType   = typename Traits::CompareType;
+    using ScaleFactor   = typename Traits::ScaleFactor;
+    using RoundMode     = typename Traits::RoundMode;
+    using SIMD          = typename Traits::IsaImpl;
+    using Float         = typename Traits::Float;
+    using Double        = typename Traits::Double;
+    using Integer       = typename Traits::Integer;
+    using Vec4          = typename Traits::Vec4;
+    using Mask          = typename Traits::Mask;
+
+    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
+    static SIMDINLINE
+    void vec4_load1_ps(Vec4& r, const float *p)
+    {
+        r[0] = SIMD::set1_ps(p[0]);
+        r[1] = SIMD::set1_ps(p[1]);
+        r[2] = SIMD::set1_ps(p[2]);
+        r[3] = SIMD::set1_ps(p[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_set1_vps(Vec4& r, Float s)
+    {
+        r[0] = s;
+        r[1] = s;
+        r[2] = s;
+        r[3] = s;
+    }
+
+    static SIMDINLINE
+    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
+    {
+        Float tmp, r;
+        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+
+        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        return r;
+    }
+
+    static SIMDINLINE
+    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
+    {
+        Float tmp, r;
+        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+
+        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        tmp = SIMD::mul_ps(v0[3], v1[3]);     // (v0.w*v1.w)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        return r;
+    }
+
+    static SIMDINLINE
+    Float vec4_rcp_length_ps(const Vec4& v)
+    {
+        Float length = vec4_dp4_ps(v, v);
+        return SIMD::rsqrt_ps(length);
+    }
+
+    static SIMDINLINE
+    void vec4_normalize_ps(Vec4& r, const Vec4& v)
+    {
+        Float rcpLength = vec4_rcp_length_ps(v);
+
+        r[0] = SIMD::mul_ps(v[0], rcpLength);
+        r[1] = SIMD::mul_ps(v[1], rcpLength);
+        r[2] = SIMD::mul_ps(v[2], rcpLength);
+        r[3] = SIMD::mul_ps(v[3], rcpLength);
+    }
+
+    static SIMDINLINE
+    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
+    {
+        r[0] = SIMD::mul_ps(v[0], s);
+        r[1] = SIMD::mul_ps(v[1], s);
+        r[2] = SIMD::mul_ps(v[2], s);
+        r[3] = SIMD::mul_ps(v[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    {
+        r[0] = SIMD::mul_ps(v0[0], v1[0]);
+        r[1] = SIMD::mul_ps(v0[1], v1[1]);
+        r[2] = SIMD::mul_ps(v0[2], v1[2]);
+        r[3] = SIMD::mul_ps(v0[3], v1[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::add_ps(v0[0], s);
+        r[1] = SIMD::add_ps(v0[1], s);
+        r[2] = SIMD::add_ps(v0[2], s);
+        r[3] = SIMD::add_ps(v0[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    {
+        r[0] = SIMD::add_ps(v0[0], v1[0]);
+        r[1] = SIMD::add_ps(v0[1], v1[1]);
+        r[2] = SIMD::add_ps(v0[2], v1[2]);
+        r[3] = SIMD::add_ps(v0[3], v1[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::min_ps(v0[0], s);
+        r[1] = SIMD::min_ps(v0[1], s);
+        r[2] = SIMD::min_ps(v0[2], s);
+        r[3] = SIMD::min_ps(v0[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::max_ps(v0[0], s);
+        r[1] = SIMD::max_ps(v0[1], s);
+        r[2] = SIMD::max_ps(v0[2], s);
+        r[3] = SIMD::max_ps(v0[3], s);
+    }
+
+    // Matrix4x4 * Vector4
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+    static SIMDINLINE
+    void SIMDCALL mat4x4_vec4_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        result[2] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
+        result[3] = r0;
+    }
+
+    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
+    static SIMDINLINE
+    void SIMDCALL mat3x3_vec3_w0_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[2] = r0;
+
+        result[3] = SIMD::setzero_ps();
+    }
+
+    // Matrix4x4 * Vector3 - Position vector where w = 1.
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+    static SIMDINLINE
+    void SIMDCALL mat4x4_vec3_w1_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        result[2] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+        result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+    }
+
+    static SIMDINLINE
+    void SIMDCALL mat4x3_vec3_w1_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m00 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+        result[2] = r0;
+        result[3] = SIMD::set1_ps(1.0f);
+    }
+}; // struct SIMDBase
+
+using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
+using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
+using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
new file mode 100644
index 00000000000..5bcedf39713
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
@@ -0,0 +1,545 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return _mm_##op(a);\
+    }
+
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return intrin(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)    // return (a * b) + c
+{
+    return add_ps(mul_ps(a, b), c);
+}
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c)    // return (a * b) - c
+{
+    return sub_ps(mul_ps(a, b), c);
+}
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return _mm_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);                             // return a & b       (float treated as int)
+SIMD_IWRAPPER_2_(and_si, _mm_and_si128);        // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);                          // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128);  // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);                              // return a | b       (float treated as int)
+SIMD_IWRAPPER_2_(or_si, _mm_or_si128);          // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);                             // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);        // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
+{
+    int32_t a, count;
+    a = _mm_extract_epi32(vA, 0);
+    count = _mm_extract_epi32(vB, 0);
+    a <<= count;
+    vA = _mm_insert_epi32(vA, a, 0);
+
+    a = _mm_extract_epi32(vA, 1);
+    count = _mm_extract_epi32(vB, 1);
+    a <<= count;
+    vA = _mm_insert_epi32(vA, a, 1);
+
+    a = _mm_extract_epi32(vA, 2);
+    count = _mm_extract_epi32(vB, 2);
+    a <<= count;
+    vA = _mm_insert_epi32(vA, a, 2);
+
+    a = _mm_extract_epi32(vA, 3);
+    count = _mm_extract_epi32(vB, 3);
+    a <<= count;
+    vA = _mm_insert_epi32(vA, a, 3);
+
+    return vA;
+}
+
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
+{
+    int32_t a, count;
+    a = _mm_extract_epi32(vA, 0);
+    count = _mm_extract_epi32(vB, 0);
+    a >>= count;
+    vA = _mm_insert_epi32(vA, a, 0);
+
+    a = _mm_extract_epi32(vA, 1);
+    count = _mm_extract_epi32(vB, 1);
+    a >>= count;
+    vA = _mm_insert_epi32(vA, a, 1);
+
+    a = _mm_extract_epi32(vA, 2);
+    count = _mm_extract_epi32(vB, 2);
+    a >>= count;
+    vA = _mm_insert_epi32(vA, a, 2);
+
+    a = _mm_extract_epi32(vA, 3);
+    count = _mm_extract_epi32(vB, 3);
+    a >>= count;
+    vA = _mm_insert_epi32(vA, a, 3);
+
+    return vA;
+}
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+{
+    return _mm_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+{
+    return _mm_castps_si128(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+{
+    return _mm_castsi128_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return _mm_castps_pd(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+{
+    return _mm_castsi128_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm_cvtepi32_ps(a);
+}
+
+SIMD_IWRAPPER_1(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return _mm_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != _mm_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != _mm_testz_si128(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm_broadcast_ss(p);
+}
+
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm_permutevar_ps(a, swiz);
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;
+
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+
+//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        uint32_t offset = pOffsets[i];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return _mm_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return _mm_load_si128(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm_lddqu_si128(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult = old;
+    float* pResult = (float*)&vResult;
+    DWORD index;
+    uint32_t umask = movemask_ps(mask);
+    while (_BitScanForward(&index, umask))
+    {
+        umask &= ~(1 << index);
+        uint32_t offset = pOffsets[index];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    _mm_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    return static_cast<uint32_t>(_mm_movemask_epi8(a));
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    return static_cast<uint32_t>(_mm_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    return static_cast<uint32_t>(_mm_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm_setzero_si128();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm_store_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+{
+    _mm_storeu_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm_stream_ps(p, a);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)
+{
+    return _mm_set_ps(in3, in2, in1, in0);
+}
+
+template <int ImmT>
+static SIMDINLINE float SIMDCALL extract_ps(Float a)
+{
+    int tmp = _mm_extract_ps(a, ImmT);
+    return *reinterpret_cast<float*>(&tmp);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2I
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
new file mode 100644
index 00000000000..e8ee0b4d87b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
@@ -0,0 +1,68 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD4 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below ones that replace AVX (1) operations.
+// Only 2 shifts and 2 gathers were introduced with AVX 2
+// Also, add native support for FMA operations
+//============================================================================
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm_##op(a, b, c);\
+    }
+
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
+{
+    return _mm_sllv_epi32(vA, vB);
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
+{
+    return _mm_srlv_epi32(vA, vB);
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));
+}
+
+#undef SIMD_WRAPPER_3
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
new file mode 100644
index 00000000000..3ab41a23651
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -0,0 +1,408 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:
+    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps128_ps512(r.v); }
+    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }
+    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); }
+    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps128(r); }
+    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd128(r); }
+    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }
+public:
+
+#define SIMD_WRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))
+#endif
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xf));     // return 1.0f / a
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xf));   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xf));    // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xf));     // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xf));    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,);   // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+//{
+//    return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+//    return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);
+SIMD_IWRAPPER_2_32(unpacklo_epi32);
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return __conv(_mm512_maskz_load_ps(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return __conv(_mm512_maskz_load_epi32(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p));
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p));
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return __conv(_mm512_mask_i32gather_ps(
+                    _mm512_setzero_ps(),
+                    __mmask16(0xf),
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    __mmask16 m = 0xf;
+    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+                                _mm512_set1_epi32(0x8000000));
+    return __conv(_mm512_mask_i32gather_ps(
+                    __conv(old),
+                    m,
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    __mmask64 m = 0xffffull;
+    return static_cast<uint32_t>(
+        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#endif
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    __mmask16 m = 0xf;
+    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    _mm512_mask_store_ps(p, m, __conv(src));
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_mask_store_ps(p, __mmask16(0xf), __conv(a));
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_mask_store_epi32(p, __mmask16(0xf), __conv(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
new file mode 100644
index 00000000000..aec79e31590
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -0,0 +1,757 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+using SIMD128T = SIMD128Impl::AVXImpl;
+
+//============================================================================
+// SIMD256 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return  _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm256_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IFWRAPPER_2I(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
+    }
+
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)
+
+#define SIMD_IWRAPPER_3(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    {\
+        return _mm256_##op(a, b, c);\
+    }
+
+// emulated integer simd
+#define SIMD_EMU_IWRAPPER_1(op) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::op(a.v4[0]),\
+            SIMD128T::op(a.v4[1]),\
+        };\
+    }
+#define SIMD_EMU_IWRAPPER_1L(op, shift) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer \
+        {\
+            SIMD128T::op(a.v4[0]), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
+        };\
+    }\
+    static SIMDINLINE \
+    Integer SIMDCALL op(SIMD128Impl::Integer a)\
+    {\
+        return Integer \
+        {\
+            SIMD128T::op(a), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_1I(op) \
+    template <int ImmT> static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::template op<ImmT>(a.v4[0]),\
+            SIMD128T::template op<ImmT>(a.v4[1]),\
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_2(op) \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::op(a.v4[0], b.v4[0]),\
+            SIMD128T::op(a.v4[1], b.v4[1]),\
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_2I(op) \
+    template <int ImmT> static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::template op<ImmT>(a.v4[0], b.v[0]),\
+            SIMD128T::template op<ImmT>(a.v4[1], b.v[1]),\
+        };\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+{
+    return add_ps(mul_ps(a, b), c);
+}
+
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return _mm256_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_EMU_IWRAPPER_2(mullo_epi32);
+SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);         // return a & b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(and_si);    // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);      // return (~a) & b    (float treated as int)
+SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);          // return a | b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(or_si);     // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);         // return a ^ b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b      (uint32)
+{
+    int32_t aHi, aLow, countHi, countLow;
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi <<= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow <<= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+SIMD_EMU_IWRAPPER_1I(srai_epi32);   // return a >> ImmT   (int32)
+SIMD_EMU_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32)
+SIMD_EMU_IWRAPPER_1I(srli_si);      // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b      (uint32)
+{
+    int32_t aHi, aLow, countHi, countLow;
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi >>= countHi;
+    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow >>= countLow;
+    vALow = _mm_insert_epi32(vALow, aLow, 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+{
+    return _mm256_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+{
+    return _mm256_castps_si256(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+{
+    return _mm256_castsi256_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return _mm256_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
+{
+    return _mm256_castpd_si256(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+{
+    return _mm256_castsi256_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm256_cvtepi32_ps(a);
+}
+
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);                  // return (int16)a    (uint8 --> int16)
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);                  // return (int32)a    (uint8 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8);                 // return (int32)a    (uint16 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4);                 // return (int64)a    (uint16 --> int64)
+SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8);                 // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return _mm256_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm256_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+SIMD_EMU_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_EMU_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != _mm256_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != _mm256_testz_si256(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps);  // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm256_broadcast_ss(p);
+}
+
+SIMD_EMU_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    Integer result;
+
+    // Ugly slow implementation
+    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0xF & pSwiz[i]];
+    }
+
+    return result;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    Float result;
+
+    // Ugly slow implementation
+    float const *pA = reinterpret_cast<float const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    float *pResult = reinterpret_cast<float *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0xF & pSwiz[i]];
+    }
+
+    return result;
+}
+
+SIMD_WRAPPER_2I(permute2f128_ps);
+SIMD_DWRAPPER_2I(permute2f128_pd);
+SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256);
+
+
+SIMD_EMU_IWRAPPER_1I(shuffle_epi32);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_EMU_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
+SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        uint32_t offset = pOffsets[i];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return _mm256_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return _mm256_load_si256(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm256_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm256_lddqu_si256(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult = old;
+    float* pResult = (float*)&vResult;
+    DWORD index;
+    uint32_t umask = movemask_ps(mask);
+    while (_BitScanForward(&index, umask))
+    {
+        umask &= ~(1 << index);
+        uint32_t offset = pOffsets[index];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index] = *(float const *)(((uint8_t const *)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    _mm256_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    return SIMD128T::movemask_epi8(a.v4[0]) |
+           (SIMD128T::movemask_epi8(a.v4[1]) << 16);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    return static_cast<uint32_t>(_mm256_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    return static_cast<uint32_t>(_mm256_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm256_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm256_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm256_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm256_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm256_setzero_si256();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm256_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm256_store_si256(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm256_stream_ps(p, a);
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p)
+{
+    return _mm256_broadcast_ps(&p->v);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a)
+{
+    return _mm256_extractf128_pd(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float a)
+{
+    return _mm256_extractf128_ps(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a)
+{
+    return _mm256_extractf128_si256(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b)
+{
+    return _mm256_insertf128_pd(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b)
+{
+    return _mm256_insertf128_ps(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b)
+{
+    return _mm256_insertf128_si256(a, b, ImmT);
+}
+
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
+    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+#endif
+
+#ifndef _mm256_loadu2_m128i
+#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
+                            /* SIMD128Impl::Integer const* */ loaddr) \
+    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
+#endif
+
+static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo)
+{
+    return _mm256_loadu2_m128i(&phi->v, &plo->v);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src)
+{
+    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IFWRAPPER_2I
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_3
+#undef SIMD_EMU_IWRAPPER_1
+#undef SIMD_EMU_IWRAPPER_1I
+#undef SIMD_EMU_IWRAPPER_2
+#undef SIMD_EMU_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
new file mode 100644
index 00000000000..0a812039300
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -0,0 +1,234 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below ones that replace AVX (1) operations.
+// Mostly these are integer operations that are no longer emulated with SSE
+//============================================================================
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1L(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(_mm256_castsi256_si128(a));\
+    }\
+
+#define SIMD_IWRAPPER_1I(op)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a, ImmT);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##intrin(a, ImmT);\
+    }
+
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##intrin(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+//-----------------------------------------------------------------------
+// Floating point arithmetic operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)   // return (a * b) + c
+{
+    return _mm256_fmadd_ps(a, b, c);
+}
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_si256);     // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si256);  // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_si256);      // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_si256);     // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (uint32)
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32)
+SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                          // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1L(cvtepu8_epi16);    // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1L(cvtepu8_epi32);    // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi32);   // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi64);   // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1L(cvtepu32_epi64);   // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+{
+    return cmpgt_epi32(b, a);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm256_permutevar8x32_ps(a, swiz);
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+	// g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
+	// Only for this intrinsic - not sure why. :(
+    return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    return static_cast<uint32_t>(_mm256_movemask_epi8(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1L
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
new file mode 100644
index 00000000000..76afbd01c05
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -0,0 +1,409 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:
+    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps256_ps512(r.v); }
+    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd256_pd512(r.v); }
+    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi256_si512(r.v); }
+    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps256(r); }
+    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd256(r); }
+    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si256(r); }
+public:
+
+#define SIMD_WRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+//SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xff));     // return 1.0f / a
+//SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xff));   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32);  // return a * b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xff));    // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xff));     // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xff));    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8,);   // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+//{
+//    return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+//    return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);
+SIMD_IWRAPPER_2_32(unpacklo_epi32);
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return __conv(_mm512_maskz_load_ps(__mmask16(0xff), p));
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return __conv(_mm512_maskz_load_epi32(__mmask16(0xff), p));
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return __conv(_mm512_mask_i32gather_ps(
+                    _mm512_setzero_ps(),
+                    __mmask16(0xff),
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    __mmask16 m = 0xff;
+    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+                                _mm512_set1_epi32(0x8000000));
+    return __conv(_mm512_mask_i32gather_ps(
+                    __conv(old),
+                    m,
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    __mmask64 m = 0xffffffffull;
+    return static_cast<uint32_t>(
+        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#endif
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    __mmask16 m = 0xff;
+    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    _mm512_mask_store_ps(p, m, __conv(src));
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_mask_store_ps(p, __mmask16(0xff), __conv(a));
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_mask_store_epi32(p, __mmask16(0xff), __conv(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
new file mode 100644
index 00000000000..226952e282e
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -0,0 +1,682 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation
+//
+//  TODO: Optimize for KNL / KNH or for SKX??
+//      For now probably optimizing more for KNL as that's where
+//      immediate customers are.
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 16;
+using SIMD256T = SIMD256Impl::AVX2Impl;
+
+#define SIMD_WRAPPER_1_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return intrin(a);\
+    }
+
+#define SIMD_WRAPPER_1(op)  \
+    SIMD_WRAPPER_1_(op, _mm512_##op)
+
+#define SIMD_WRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+#define SIMD_WRAPPERI_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_castsi512_ps(_mm512_##intrin(\
+            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+    }
+
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+
+#define SIMD_DWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm512_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+#define SIMD_IWRAPPER_1_8(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1_4(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+
+#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return cmp(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+    static SIMDINLINE Integer vmask(__mmask8 m)
+    {
+        return _mm512_maskz_set1_epi64(m, -1LL);
+    }
+    static SIMDINLINE Integer vmask(__mmask16 m)
+    {
+        return _mm512_maskz_set1_epi32(m, -1);
+    }
+    static SIMDINLINE Integer vmask(__mmask32 m)
+    {
+        return _mm512_maskz_set1_epi16(m, -1);
+    }
+    static SIMDINLINE Integer vmask(__mmask64 m)
+    {
+        return _mm512_maskz_set1_epi8(m, -1);
+    }
+
+public:
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp28_ps);       // return 1.0f / a
+SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt28_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return _mm512_roundscale_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+                            // return (a * b) & 0xFFFFFFFF
+                            //
+                            // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+                            // and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)
+
+#if defined(AVX512F_STRICT)
+
+SIMD_WRAPPERI_2_(and_ps, and_epi32);          // return a & b       (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);    // return (~a) & b    (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32);            // return a | b       (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32);          // return a ^ b       (float treated as int)
+
+#else
+
+SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
+SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
+SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
+SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
+
+#endif
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_1I_(srli_si, srli_si512);     // return a >> (ImmT*8) (uint)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));
+}
+
+SIMD_IWRAPPER_2(srlv_epi32);
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)
+{
+    return _mm512_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)
+{
+    return _mm512_castps_si512(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)
+{
+    return _mm512_castsi512_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return _mm512_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)
+{
+    return _mm512_castpd_si512(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)
+{
+    return _mm512_castsi512_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm512_cvtepi32_ps(a);
+}
+
+SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1_4(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1_8(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1_4(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1_8(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return _mm512_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm512_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+    return _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
+}
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    // Legacy vector mask generator
+    __mmask16 result = cmp_ps_mask<CmpTypeT>(a, b);
+    return castsi_ps(vmask(result));
+}
+
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
+{
+    // Legacy vector mask generator
+    __mmask64 result = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(result);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
+{
+    // Legacy vector mask generator
+    __mmask32 result = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(result);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
+{
+    // Legacy vector mask generator
+    __mmask16 result = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(result);
+}
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
+{
+    // Legacy vector mask generator
+    __mmask8 result = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(result);
+}
+
+SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
+SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return (0 == static_cast<int>(_mm512_test_epi32_mask(castps_si(a), castps_si(b))));
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return (0 == static_cast<int>(_mm512_test_epi32_mask(a, b)));
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+template <int ImmT>
+static SIMDINLINE Float blend_ps(Float a, Float b) // return ImmT ? b : a  (float)
+{
+    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
+}
+
+template <int ImmT>
+static SIMDINLINE Float blend_epi32(Integer a, Integer b) // return ImmT ? b : a  (int32)
+{
+    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
+}
+
+static SIMDINLINE Float blendv_ps(Float a, Float b, Float mask) // return mask ? b : a  (float)
+{
+    return _mm512_mask_blend_ps(__mmask16(movemask_ps(mask)), a, b);
+}
+
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm512_set1_ps(*p);
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+    return _mm256_castpd_ps(_mm512_extractf64x4_pd(_mm512_castps_pd(a), imm));
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+    return _mm512_extractf64x4_pd(a, imm);
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+    return _mm512_extracti64x4_epi64(a, imm);
+}
+
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+    return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(a), _mm256_castps_pd(b), imm));
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+    return _mm512_insertf64x4(a, b, imm);
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+    return _mm512_inserti64x4(a, b, imm);
+}
+
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm512_permutexvar_epi32(swiz, a);
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm512_permutexvar_ps(swiz, a);
+}
+
+SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
+SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
+SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi16);
+
+//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm512_unpackhi_ps);
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+    return castps_si(_mm512_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    uint32_t *pOffsets = (uint32_t*)&idx;
+    Float vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        uint32_t offset = pOffsets[i];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i] = *(float const*)(((uint8_t const*)p + offset));
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return _mm512_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return _mm512_load_si512(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm512_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm512_loadu_si512(p);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    __mmask16 k = _mm512_cmpneq_ps_mask(mask, setzero_ps());
+
+    return _mm512_mask_i32gather_ps(old, k, idx, p, ScaleT);
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    Mask m = _mm512_cmplt_epi32_mask(mask, setzero_si());
+    _mm512_mask_store_ps(p, m, src);
+}
+
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+    __mmask64 m = _mm512_cmplt_epi8_mask(a, setzero_si());
+    return static_cast<uint64_t>(m);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    __mmask8 m = _mm512_cmplt_pd_mask(a, setzero_pd());
+    return static_cast<uint32_t>(m);
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    __mmask16 m = _mm512_cmplt_ps_mask(a, setzero_ps());
+    return static_cast<uint32_t>(m);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm512_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm512_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm512_set1_ps(f);
+}
+
+static SIMDINLINE Double SIMDCALL setzero_pd()      // return 0 (double)
+{
+    return _mm512_setzero_pd();
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm512_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm512_setzero_si512();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_store_si512(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+{
+    _mm512_storeu_si512(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm512_stream_ps(p, a);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return _mm512_set_epi32(
+        i15, i14, i13, i12, i11, i10, i9, i8,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return set_epi32(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return _mm512_set_ps(
+        i15, i14, i13, i12, i11, i10, i9, i8,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return set_ps(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+    return castsi_ps(_mm512_maskz_mov_epi32(__mmask16(mask), set1_epi32(-1)));
+}
+
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2I
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
new file mode 100644
index 00000000000..3e36ce5bd36
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
@@ -0,0 +1,27 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
new file mode 100644
index 00000000000..a45429f4b6b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -0,0 +1,842 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX (1) implementation
+//============================================================================
+
+static const int TARGET_SIMD_WIDTH = 8;
+using SIMD128T = SIMD128Impl::AVXImpl;
+
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::op(a.v8[0]),\
+            SIMD256T::op(a.v8[1]),\
+        };\
+    }
+
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_WRAPPER_2I_1(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_WRAPPER_3(op)  \
+        static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+        {\
+            return Float\
+            {\
+                SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+                SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+            };\
+        }
+
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0]),\
+            SIMD256T::op(a.v8[1]),\
+        };\
+    }
+
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_IWRAPPER_2I_1(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_IWRAPPER_2I_2(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+#define SIMD_IWRAPPER_3(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+        };\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    return Float
+    {
+        SIMD256T::template round_ps<RMT>(a.v8[0]),
+        SIMD256T::template round_ps<RMT>(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
+SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
+SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a)      // return a << ImmT
+{
+    return Integer
+    {
+        SIMD256T::template slli_epi32<ImmT>(a.v8[0]),
+        SIMD256T::template slli_epi32<ImmT>(a.v8[1]),
+    };
+}
+
+SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a)      // return a >> ImmT   (int32)
+{
+    return Integer
+    {
+        SIMD256T::template srai_epi32<ImmT>(a.v8[0]),
+        SIMD256T::template srai_epi32<ImmT>(a.v8[1]),
+    };
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a)      // return a >> ImmT   (uint32)
+{
+    return Integer
+    {
+        SIMD256T::template srli_epi32<ImmT>(a.v8[0]),
+        SIMD256T::template srli_epi32<ImmT>(a.v8[1]),
+    };
+}
+
+template<int ImmT>                                          // for each 128-bit lane:
+static SIMDINLINE Integer SIMDCALL srli_si(Integer a)         //  return a >> (ImmT*8) (uint)
+{
+    return Integer
+    {
+        SIMD256T::template srli_si<ImmT>(a.v8[0]),
+        SIMD256T::template srli_si<ImmT>(a.v8[1]),
+    };
+}
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)       // same as srli_si, but with Float cast to int
+{
+    return Float
+    {
+        SIMD256T::template srlisi_ps<ImmT>(a.v8[0]),
+        SIMD256T::template srlisi_ps<ImmT>(a.v8[1]),
+    };
+}
+
+SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b      (uint32)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)              // return *(Float*)(&a)
+{
+    return Float
+    {
+        SIMD256T::castpd_ps(a.v8[0]),
+        SIMD256T::castpd_ps(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)              // return *(Integer*)(&a)
+{
+    return Integer
+    {
+        SIMD256T::castps_si(a.v8[0]),
+        SIMD256T::castps_si(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)              // return *(Double*)(&a)
+{
+    return Double
+    {
+        SIMD256T::castsi_pd(a.v8[0]),
+        SIMD256T::castsi_pd(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)
+{
+    return Double
+    {
+        SIMD256T::castps_pd(a.v8[0]),
+        SIMD256T::castps_pd(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)              // return *(Float*)(&a)
+{
+    return Float
+    {
+        SIMD256T::castsi_ps(a.v8[0]),
+        SIMD256T::castsi_ps(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)            // return (float)a    (int32 --> float)
+{
+    return Float
+    {
+        SIMD256T::cvtepi32_ps(a.v8[0]),
+        SIMD256T::cvtepi32_ps(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a)          // return (int16)a    (uint8 --> int16)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu8_epi16(a.v4[0]),
+        SIMD256T::cvtepu8_epi16(a.v4[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a)          // return (int32)a    (uint8 --> int32)
+{
+    return Integer
+	{
+        SIMD256T::cvtepu8_epi32(a.v4[0]),
+        SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0])),
+	};
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a)         // return (int32)a    (uint16 --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu16_epi32(a.v4[0]),
+        SIMD256T::cvtepu16_epi32(a.v4[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a)         // return (int64)a    (uint16 --> int64)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu16_epi64(a.v4[0]),
+        SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0])),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a)         // return (int64)a    (uint32 --> int64)
+{
+    return Integer
+    {
+        SIMD256T::cvtepu32_epi64(a.v4[0]),
+        SIMD256T::cvtepu32_epi64(a.v4[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvtps_epi32(a.v8[0]),
+        SIMD256T::cvtps_epi32(a.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return Integer
+    {
+        SIMD256T::cvtps_epi32(a.v8[0]),
+        SIMD256T::cvtps_epi32(a.v8[1]),
+    };
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return Float
+    {
+        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]),
+        SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]),
+    };
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }
+
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+    return static_cast<Mask>(movemask_ps(cmp_ps<CmpTypeT>(a, b)));
+}
+
+
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != (SIMD256T::testz_ps(a.v8[0], b.v8[0]) &
+                  SIMD256T::testz_ps(a.v8[1], b.v8[1]));
+}
+
+static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != (SIMD256T::testz_si(a.v8[0], b.v8[0]) &
+                  SIMD256T::testz_si(a.v8[1], b.v8[1]));
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return Integer
+    {
+        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return Integer
+    {
+        SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]),
+        SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)         // return *p (all elements in vector get same value)
+{
+    float f = *p;
+    return Float
+    {
+        SIMD256T::set1_ps(f),
+        SIMD256T::set1_ps(f),
+    };
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+    return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+    return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+    return a.v8[imm];
+}
+
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+    a.v8[imm] = b;
+    return a;
+}
+
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+    a.v8[imm] = b;
+    return a;
+}
+
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+    SWR_ASSERT(imm == 0 || imm == 1, "Invalid control code: %d", imm);
+    a.v8[imm] = b;
+    return a;
+}
+
+SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    Integer result;
+
+    // Ugly slow implementation
+    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0xF & pSwiz[i]];
+    }
+
+    return result;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    Float result;
+
+    // Ugly slow implementation
+    float const *pA = reinterpret_cast<float const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    float *pResult = reinterpret_cast<float *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0xF & pSwiz[i]];
+    }
+
+    return result;
+}
+
+// All of the 512-bit permute2f128_XX intrinsics do the following:
+//
+//      SELECT4(src, control) {
+//          CASE(control[1:0])
+//              0:	tmp[127:0] : = src[127:0]
+//              1 : tmp[127:0] : = src[255:128]
+//              2 : tmp[127:0] : = src[383:256]
+//              3 : tmp[127:0] : = src[511:384]
+//              ESAC
+//              RETURN tmp[127:0]
+//      }
+//      
+//      dst[127:0]   : = SELECT4(a[511:0], imm8[1:0])
+//      dst[255:128] : = SELECT4(a[511:0], imm8[3:2])
+//      dst[383:256] : = SELECT4(b[511:0], imm8[5:4])
+//      dst[511:384] : = SELECT4(b[511:0], imm8[7:6])
+//      dst[MAX:512] : = 0
+//
+// Since the 256-bit AVX instructions use a 4-bit control field (instead
+// of 2-bit for AVX512), we need to expand the control bits sent to the
+// AVX instructions for emulation.
+//
+template <int shuf>
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+{
+    return Float
+    {
+        SIMD256T::template permute2f128_ps<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+        SIMD256T::template permute2f128_ps<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    };
+}
+
+template <int shuf>
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+{
+    return Double
+    {
+        SIMD256T::template permute2f128_pd<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+        SIMD256T::template permute2f128_pd<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+    };
+}
+
+template <int shuf>
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+{
+    return Integer
+	{
+        SIMD256T::template permute2f128_si<((shuf & 0x03) << 0) | ((shuf & 0x0C) << 2)>(a.v8[0], a.v8[1]),
+        SIMD256T::template permute2f128_si<((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2)>(b.v8[0], b.v8[1]),
+	};
+}
+
+SIMD_IWRAPPER_2I_1(shuffle_epi32);
+SIMD_IWRAPPER_2I_2(shuffle_epi64);
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_WRAPPER_2I_1(shuffle_pd);
+SIMD_WRAPPER_2I_1(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_WRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_WRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return Float
+    {
+        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]),
+        SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return Float
+    {
+        SIMD256T::load_ps(p),
+        SIMD256T::load_ps(p + TARGET_SIMD_WIDTH)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return Integer
+    {
+        SIMD256T::load_si(&p->v8[0]),
+        SIMD256T::load_si(&p->v8[1]),
+    };
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return Float
+    {
+        SIMD256T::loadu_ps(p),
+        SIMD256T::loadu_ps(p + TARGET_SIMD_WIDTH)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return Integer
+    {
+        SIMD256T::loadu_si(&p->v8[0]),
+        SIMD256T::loadu_si(&p->v8[1]),
+    };
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    return Float
+    {
+        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]),
+        SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]),
+    };
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
+    SIMD256T::maskstore_ps(p + TARGET_SIMD_WIDTH, mask.v8[1], src.v8[1]);
+}
+
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+    uint64_t mask = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
+             mask |= static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1])) << (TARGET_SIMD_WIDTH * 4);
+
+    return mask;
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
+             mask |= static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1])) << (TARGET_SIMD_WIDTH / 2);
+
+    return mask;
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    uint32_t mask = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
+             mask |= static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1])) << TARGET_SIMD_WIDTH;
+
+    return mask;
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return Integer
+    {
+        SIMD256T::set1_epi32(i),
+        SIMD256T::set1_epi32(i)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return Integer
+    {
+        SIMD256T::set1_epi8(i),
+        SIMD256T::set1_epi8(i)
+    };
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return Float
+    {
+        SIMD256T::set1_ps(f),
+        SIMD256T::set1_ps(f)
+    };
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return Float
+    {
+        SIMD256T::setzero_ps(),
+        SIMD256T::setzero_ps()
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return Integer
+    {
+        SIMD256T::setzero_si(),
+        SIMD256T::setzero_si()
+    };
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    SIMD256T::store_ps(p, a.v8[0]);
+    SIMD256T::store_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    SIMD256T::store_si(&p->v8[0], a.v8[0]);
+    SIMD256T::store_si(&p->v8[1], a.v8[1]);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    SIMD256T::stream_ps(p, a.v8[0]);
+    SIMD256T::stream_ps(p + TARGET_SIMD_WIDTH, a.v8[1]);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return Integer
+    {
+        SIMD256T::set_epi32(
+            i7, i6, i5, i4, i3, i2, i1, i0),
+        SIMD256T::set_epi32(
+            i15, i14, i13, i12, i11, i10, i9, i8)
+    };
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    return set_epi32(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return Float
+    {
+        SIMD256T::set_ps(
+            i7, i6, i5, i4, i3, i2, i1, i0),
+        SIMD256T::set_ps(
+            i15, i14, i13, i12, i11, i10, i9, i8)
+    };
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    return set_ps(
+        0, 0, 0, 0, 0, 0, 0, 0,
+        i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+    Integer vec = set1_epi32(mask);
+    const Integer bit = set_epi32(
+        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
+        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = and_si(vec, bit);
+    vec = cmplt_epi32(setzero_si(), vec);
+    return castsi_ps(vec);
+}
+
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_2I_1
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_1
+#undef SIMD_IWRAPPER_3
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
new file mode 100644
index 00000000000..bc5bff477a4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
@@ -0,0 +1,28 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// no backwards compatibility for simd mask-enabled functions
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
new file mode 100644
index 00000000000..df2df1b09cd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
@@ -0,0 +1,428 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+#if 0
+//===========================================================================
+// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
+//===========================================================================
+struct SIMD256 // or SIMD4 or SIMD16
+{
+    //=======================================================================
+    // SIMD Types
+    //
+    // These typedefs are examples. The SIMD256 and SIMD16 implementations will
+    // use different base types with this same naming.
+    using Float     = __m256;  // Packed single-precision float vector
+    using Double    = __m256d; // Packed double-precision float vector
+    using Integer   = __m256i; // Packed integer vector (mutable element widths)
+    using Mask      = uint8_t; // Integer representing mask bits
+
+    //=======================================================================
+    // Standard interface
+    // (available in both SIMD256 and SIMD16 widths)
+    //=======================================================================
+
+    //-----------------------------------------------------------------------
+    // Single precision floating point arithmetic operations
+    //-----------------------------------------------------------------------
+    static Float    add_ps(Float a, Float b);               // return a + b
+    static Float    div_ps(Float a, Float b);               // return a / b
+    static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
+    static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
+    static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
+    static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
+    static Float    mul_ps(Float a, Float b);               // return a * b
+    static Float    rcp_ps(Float a);                        // return 1.0f / a
+    static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)
+    static Float    sub_ps(Float a, Float b);               // return a - b
+
+    enum class RoundMode
+    {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + (signof(value))0.5)
+        TO_NEG_INF      = 0x01, // Round to negative infinity
+        TO_POS_INF      = 0x02, // Round to positive infinity
+        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
+
+        RAISE_EXC       = 0x00, // Raise exception on overflow
+        NO_EXC          = 0x08, // Suppress exceptions
+
+        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
+        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
+        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
+        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
+        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
+        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
+        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+    };
+
+    // return round_func(a)
+    //
+    // round_func is chosen on the RMT template parameter.  See the documentation
+    // for the RoundMode enumeration above.
+    template <RoundMode RMT>
+    static Float    round_ps(Float a);                  // return round(a) 
+
+
+    //-----------------------------------------------------------------------
+    // Integer (various width) arithmetic operations
+    //-----------------------------------------------------------------------
+    static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
+    static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
+    static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
+    static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+    static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
+    static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
+    static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
+    static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)
+    static Integer  mul_epi32(Integer a, Integer b);    // return a * b (int32)
+
+    // return (a * b) & 0xFFFFFFFF
+    //
+    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+    // and store the low 32 bits of the intermediate integers in dst.
+    static Float    mullo_epi32(Integer a, Integer b);
+
+    static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
+    static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
+    static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)
+
+    //-----------------------------------------------------------------------
+    // Logical operations
+    //-----------------------------------------------------------------------
+    static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
+    static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
+    static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
+    static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
+    static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
+    static Float    or_si(Integer a, Integer b);        // return a | b       (int)
+    static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
+    static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)
+
+    //-----------------------------------------------------------------------
+    // Shift operations
+    //-----------------------------------------------------------------------
+    template<int ImmT>
+    static Integer  slli_epi32(Integer a);              // return a << ImmT
+    static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
+    template<int ImmT>
+    static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
+    template<int ImmT>
+    static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)
+    template<int ImmT>                                  // for each 128-bit lane:
+    static Integer  srli_si(Integer a);                 //  return a >> (ImmT*8) (uint)
+    template<int ImmT>
+    static Float    srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
+    static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)
+
+    //-----------------------------------------------------------------------
+    // Conversion operations
+    //-----------------------------------------------------------------------
+    static Float    castpd_ps(Double a);                // return *(Float*)(&a)
+    static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
+    static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
+    static Double   castps_pd(Float a);                 // return *(Double*)(&a)
+    static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
+    static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
+    static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
+    static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
+    static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
+    static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
+    static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
+    static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
+    static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)
+
+    //-----------------------------------------------------------------------
+    // Comparison operations
+    //-----------------------------------------------------------------------
+
+    // Comparison types used with cmp_ps:
+    //   - ordered comparisons are always false if either operand is NaN
+    //   - unordered comparisons are always true if either operand is NaN
+    //   - signaling comparisons raise an exception if either operand is NaN
+    //   - non-signaling comparisons will never raise an exception
+    // 
+    // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
+    // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
+    enum class CompareType
+    {
+        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS      = 0x01, // Less-than (ordered, signaling)
+        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q    = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q      = 0x07, // Ordered (nonsignaling)
+        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
+        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
+        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
+        EQ_OS      = 0x10, // Equal (ordered, signaling)
+        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S    = 0x13, // Unordered (signaling)
+        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S      = 0x17, // Ordered (signaling)
+        EQ_US      = 0x18, // Equal (unordered, signaling)
+        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS   = 0x1B, // False (ordered, signaling)
+        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US    = 0x1F, // True (unordered, signaling)
+    };
+
+    // return a (CmpTypeT) b (float)
+    //
+    // See documentation for CompareType above for valid values for CmpTypeT.
+    template<CompareType CmpTypeT>
+    static Float    cmp_ps(Float a, Float b);           // return a (CmtTypeT) b (see above)
+    static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
+    static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
+    static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
+    static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
+    static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
+    static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
+    static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
+    static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
+    static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
+    static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
+    static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
+    static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
+    static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
+    static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
+    static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
+    static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
+    static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)
+
+    //-----------------------------------------------------------------------
+    // Blend / shuffle / permute operations
+    //-----------------------------------------------------------------------
+    template<int ImmT>
+    static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
+    static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
+    static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
+    static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
+    static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+    static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+    static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+    static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+    static Float    permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
+    static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)
+    template<int SwizT>
+    static Integer  shuffle_epi32(Integer a, Integer b);    
+    template<int SwizT>
+    static Integer  shuffle_epi64(Integer a, Integer b);
+    static Integer  shuffle_epi8(Integer a, Integer b);
+    template<int SwizT>
+    static Float    shuffle_pd(Double a, Double b);
+    template<int SwizT>
+    static Float    shuffle_ps(Float a, Float b);
+    static Integer  unpackhi_epi16(Integer a, Integer b);
+    static Integer  unpackhi_epi32(Integer a, Integer b);
+    static Integer  unpackhi_epi64(Integer a, Integer b);
+    static Integer  unpackhi_epi8(Integer a, Integer b);
+    static Float    unpackhi_pd(Double a, Double b);
+    static Float    unpackhi_ps(Float a, Float b);
+    static Integer  unpacklo_epi16(Integer a, Integer b);
+    static Integer  unpacklo_epi32(Integer a, Integer b);
+    static Integer  unpacklo_epi64(Integer a, Integer b);
+    static Integer  unpacklo_epi8(Integer a, Integer b);
+    static Float    unpacklo_pd(Double a, Double b);
+    static Float    unpacklo_ps(Float a, Float b);
+
+    //-----------------------------------------------------------------------
+    // Load / store operations
+    //-----------------------------------------------------------------------
+    enum class ScaleFactor
+    {
+        SF_1,   // No scaling
+        SF_2,   // Scale offset by 2
+        SF_4,   // Scale offset by 4
+        SF_8,   // Scale offset by 8
+    };
+
+    template<ScaleFactor ScaleT>
+    static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
+    static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
+    static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
+    static Integer  load_si(Integer const *p);                  // return *p
+    static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
+    static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)
+
+    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+    template<int ScaleT>
+    static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
+
+    static void     maskstore_ps(float *p, Integer mask, Float src);
+    static int      movemask_epi8(Integer a);
+    static int      movemask_pd(Double a);
+    static int      movemask_ps(Float a);
+    static Integer  set1_epi32(int i);                          // return i (all elements are same value)
+    static Integer  set1_epi8(char i);                          // return i (all elements are same value)
+    static Float    set1_ps(float f);                           // return f (all elements are same value)
+    static Float    setzero_ps();                               // return 0 (float)
+    static Integer  setzero_si();                               // return 0 (integer)
+    static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
+    static void     store_si(Integer *p, Integer a);            // *p = a
+    static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)
+
+    //=======================================================================
+    // Legacy interface (available only in SIMD256 width)
+    //=======================================================================
+
+    static Float    broadcast_ps(__m128 const *p);
+    template<int ImmT>
+    static __m128d  extractf128_pd(Double a);
+    template<int ImmT>
+    static __m128   extractf128_ps(Float a);
+    template<int ImmT>
+    static __m128i  extractf128_si(Integer a);
+    template<int ImmT>
+    static Double   insertf128_pd(Double a, __m128d b);
+    template<int ImmT>
+    static Float    insertf128_ps(Float a, __m128 b);
+    template<int ImmT>
+    static Integer  insertf128_si(Integer a, __m128i b);
+    static Integer  loadu2_si(__m128 const* phi, __m128 const* plo);
+    template<int ImmT>
+    static Double   permute2f128_pd(Double a, Double b);
+    template<int ImmT>
+    static Float    permute2f128_ps(Float a, Float b);
+    template<int ImmT>
+    static Integer  permute2f128_si(Integer a, Integer b);
+    static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
+    static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);
+
+    //=======================================================================
+    // Advanced masking interface (currently available only in SIMD16 width)
+    //=======================================================================
+
+
+    //=======================================================================
+    // Extended Utility Functions (common to SIMD256 and SIMD16)
+    //=======================================================================
+
+    //-----------------------------------------------------------------------
+    // Extended Types
+    //-----------------------------------------------------------------------
+
+    // Vec4, an SOA SIMD set of 4-dimensional vectors
+    union Vec4
+    {
+        Vec4() = default;
+        Vec4(Float in)
+        {
+            s.x = in;
+            s.y = in;
+            s.z = in;
+            s.w = in;
+        }
+        Vec4(Float x, Float y, Float z, Float w)
+        {
+            s.x = x;
+            s.y = y;
+            s.z = z;
+            s.w = w;
+        }
+
+        Float      v[4];
+        Integer      vi[4];
+        struct
+        {
+            Float  x;
+            Float  y;
+            Float  z;
+            Float  w;
+        } s;
+        Float& operator[] (const int i) { return v[i]; }
+        Float const & operator[] (const int i) const { return v[i]; }
+    };
+
+    //-----------------------------------------------------------------------
+    // Extended Functions
+    //-----------------------------------------------------------------------
+    static void     vec4_set1_ps(Vec4& r, const float *p);                  // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
+    static void     vec4_set1_vps(Vec4& r, Float s);                        // r[0] = s, r[1] = s, ...
+    static Float    vec4_dp3_ps(const Vec4& v0, const Vec4& v1);            // return dp3(v0, v1)
+    static Float    vec4_dp4_ps(const Vec4& v0, const Vec4& v1);            // return dp4(v0, v1)
+    static Float    vec4_rcp_length_ps(const Vec4& v);                      // return 1.0f / sqrt(dp4(v, v))
+    static void     vec4_normalize_ps(Vec4& r, const Vec4& v);              // r = v * rcp_length(v)
+    static void     vec4_mul_ps(Vec4& r, const Vec4& v, Float s);           // r = v * set1_vps(s)
+    static void     vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1);   // r = v0 * v1
+    static void     vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1);   // r = v0 + v1
+    static void     vec4_min_ps(Vec4& r, const Vec4& v0, Float s);          // r = (v0 < s) ? v0 : s
+    static void     vec4_max_ps(Vec4& r, const Vec4& v0, Float s);          // r = (v0 > s) ? v0 : s
+
+    // Matrix4x4 * Vector4
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
+    static void mat4x4_vec4_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
+    static void mat3x3_vec3_w0_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x4 * Vector3 - Position vector where w = 1.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
+    static void mat4x4_vec3_w1_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x3 * Vector3 - Position vector where w = 1.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+    //   result.s.w = 1
+    static void mat4x3_vec3_w1_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+};
+#endif // #if 0
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
new file mode 100644
index 00000000000..1964ef47027
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -0,0 +1,377 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#if !defined(__cplusplus)
+#error C++ compilation required
+#endif
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdint.h>
+
+#define SIMD_ARCH_AVX       0
+#define SIMD_ARCH_AVX2      1
+#define SIMD_ARCH_AVX512    2
+
+#if !defined(SIMD_ARCH)
+#define SIMD_ARCH SIMD_ARCH_AVX
+#endif
+
+#if defined(_MSC_VER)
+#define SIMDCALL __vectorcall
+#define SIMDINLINE __forceinline
+#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
+#else
+#define SIMDCALL
+#define SIMDINLINE inline
+#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
+#endif
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+    enum class CompareType
+    {
+        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS      = 0x01, // Less-than (ordered, signaling)
+        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q    = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q      = 0x07, // Ordered (nonsignaling)
+        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
+        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
+        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
+        EQ_OS      = 0x10, // Equal (ordered, signaling)
+        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S    = 0x13, // Unordered (signaling)
+        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S      = 0x17, // Ordered (signaling)
+        EQ_US      = 0x18, // Equal (unordered, signaling)
+        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS   = 0x1B, // False (ordered, signaling)
+        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US    = 0x1F, // True (unordered, signaling)
+    };
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+    enum class CompareTypeInt
+    {
+        EQ  = _MM_CMPINT_EQ,    // Equal
+        LT  = _MM_CMPINT_LT,    // Less than
+        LE  = _MM_CMPINT_LE,    // Less than or Equal
+        NE  = _MM_CMPINT_NE,    // Not Equal
+        GE  = _MM_CMPINT_GE,    // Greater than or Equal
+        GT  = _MM_CMPINT_GT,    // Greater than
+    };
+#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
+
+    enum class ScaleFactor
+    {
+        SF_1 = 1,   // No scaling
+        SF_2 = 2,   // Scale offset by 2
+        SF_4 = 4,   // Scale offset by 4
+        SF_8 = 8,   // Scale offset by 8
+    };
+
+    enum class RoundMode
+    {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer == TRUNCATE(value + 0.5)
+        TO_NEG_INF      = 0x01, // Round to negative infinity
+        TO_POS_INF      = 0x02, // Round to positive infinity
+        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
+        
+        RAISE_EXC       = 0x00, // Raise exception on overflow
+        NO_EXC          = 0x08, // Suppress exceptions
+        
+        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
+        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
+        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
+        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
+        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
+        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
+        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+    };
+
+    struct Traits
+    {
+        using CompareType = SIMDImpl::CompareType;
+        using ScaleFactor = SIMDImpl::ScaleFactor;
+        using RoundMode   = SIMDImpl::RoundMode;
+    };
+
+    // Attribute, 4-dimensional attribute in SIMD SOA layout
+    template<typename Float, typename Integer, typename Double>
+    union Vec4
+    {
+        Float   v[4];
+        Integer vi[4];
+        Double  vd[4];
+        struct
+        {
+            Float  x;
+            Float  y;
+            Float  z;
+            Float  w;
+        };
+        SIMDINLINE Float& operator[] (const int i) { return v[i]; }
+        SIMDINLINE Float const & operator[] (const int i) const { return v[i]; }
+        SIMDINLINE Vec4& operator=(Vec4 const & in)
+        {
+            v[0] = in.v[0];
+            v[1] = in.v[1];
+            v[2] = in.v[2];
+            v[3] = in.v[3];
+            return *this;
+        }
+    };
+
+    namespace SIMD128Impl
+    {
+        union Float
+        {
+            SIMDINLINE Float() = default;
+            SIMDINLINE Float(__m128 in) : v(in) {}
+            SIMDINLINE Float& operator=(__m128 in) { v = in; return *this; }
+            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m128() const { return v; }
+
+            SIMDALIGN(__m128, 16) v;
+        };
+
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+            SIMDINLINE Integer(__m128i in) : v(in) {}
+            SIMDINLINE Integer& operator=(__m128i in) { v = in; return *this; }
+            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m128i() const { return v; }
+            SIMDALIGN(__m128i, 16) v;
+        };
+
+        union Double
+        {
+            SIMDINLINE Double() = default;
+            SIMDINLINE Double(__m128d in) : v(in) {}
+            SIMDINLINE Double& operator=(__m128d in) { v = in; return *this; }
+            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m128d() const { return v; }
+            SIMDALIGN(__m128d, 16) v;
+        };
+
+        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+        using Mask = uint8_t;
+
+        static const uint32_t SIMD_WIDTH = 4;
+    } // ns SIMD128Impl
+
+    namespace SIMD256Impl
+    {
+        union Float
+        {
+            SIMDINLINE Float() = default;
+            SIMDINLINE Float(__m256 in) : v(in) {}
+            SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
+            {
+                v = _mm256_insertf128_ps(_mm256_castps128_ps256(in_lo), in_hi, 0x1);
+            }
+            SIMDINLINE Float& operator=(__m256 in) { v = in; return *this; }
+            SIMDINLINE Float& operator=(Float const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m256() const { return v; }
+
+            SIMDALIGN(__m256, 32) v;
+            SIMD128Impl::Float v4[2];
+        };
+
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+            SIMDINLINE Integer(__m256i in) : v(in) {}
+            SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
+            {
+                v = _mm256_insertf128_si256(_mm256_castsi128_si256(in_lo), in_hi, 0x1);
+            }
+            SIMDINLINE Integer& operator=(__m256i in) { v = in; return *this; }
+            SIMDINLINE Integer& operator=(Integer const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m256i() const { return v; }
+
+            SIMDALIGN(__m256i, 32) v;
+            SIMD128Impl::Integer v4[2];
+        };
+
+        union Double
+        {
+            SIMDINLINE Double() = default;
+            SIMDINLINE Double(__m256d in) : v(in) {}
+            SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
+            {
+                v = _mm256_insertf128_pd(_mm256_castpd128_pd256(in_lo), in_hi, 0x1);
+            }
+            SIMDINLINE Double& operator=(__m256d in) { v = in; return *this; }
+            SIMDINLINE Double& operator=(Double const & in) { v = in.v; return *this; }
+            SIMDINLINE operator __m256d() const { return v; }
+
+            SIMDALIGN(__m256d, 32) v;
+            SIMD128Impl::Double v4[2];
+        };
+
+        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+        using Mask = uint8_t;
+
+        static const uint32_t SIMD_WIDTH = 8;
+    } // ns SIMD256Impl
+
+    namespace SIMD512Impl
+    {
+#if !defined(_MM_K0_REG)
+        // Define AVX512 types if not included via immintrin.h.
+        // All data members of these types are ONLY to viewed
+        // in a debugger.  Do NOT access them via code!
+        union __m512
+        {
+        private:
+            float m512_f32[16];
+        };
+        struct __m512d
+        {
+        private:
+            double m512d_f64[8];
+        };
+
+        union __m512i
+        {
+        private:
+            int8_t              m512i_i8[64];
+            int16_t             m512i_i16[32];
+            int32_t             m512i_i32[16];
+            int64_t             m512i_i64[8];
+            uint8_t             m512i_u8[64];
+            uint16_t            m512i_u16[32];
+            uint32_t            m512i_u32[16];
+            uint64_t            m512i_u64[8];
+        };
+
+        using __mmask16 = uint16_t;
+#endif
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+#define SIMD_ALIGNMENT_BYTES 64
+#else
+#define SIMD_ALIGNMENT_BYTES 32
+#endif
+
+        union Float
+        {
+            SIMDINLINE Float() = default;
+            SIMDINLINE Float(__m512 in) : v(in) {}
+            SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Float& operator=(__m512 in) { v = in; return *this; }
+            SIMDINLINE Float& operator=(Float const & in)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = in.v;
+#else
+                v8[0] = in.v8[0];
+                v8[1] = in.v8[1];
+#endif
+                return *this;
+            }
+            SIMDINLINE operator __m512() const { return v; }
+
+            SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Float v8[2];
+        };
+
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+            SIMDINLINE Integer(__m512i in) : v(in) {}
+            SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Integer& operator=(__m512i in) { v = in; return *this; }
+            SIMDINLINE Integer& operator=(Integer const & in)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = in.v;
+#else
+                v8[0] = in.v8[0];
+                v8[1] = in.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512i() const { return v; }
+
+            SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Integer v8[2];
+        };
+
+        union Double
+        {
+            SIMDINLINE Double() = default;
+            SIMDINLINE Double(__m512d in) : v(in) {}
+            SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd()) { v8[0] = in_lo; v8[1] = in_hi; }
+            SIMDINLINE Double& operator=(__m512d in) { v = in; return *this; }
+            SIMDINLINE Double& operator=(Double const & in)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = in.v;
+#else
+                v8[0] = in.v8[0];
+                v8[1] = in.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512d() const { return v; }
+
+            SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Double v8[2];
+        };
+
+        typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
+        using Mask = __mmask16;
+
+        static const uint32_t SIMD_WIDTH = 16;
+
+#undef SIMD_ALIGNMENT_BYTES
+    } // ns SIMD512Impl
+} // ns SIMDImpl
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
index e1518719840..2e32e4d32cb 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend_impl.h
@@ -43,10 +43,10 @@ enum SWR_BACKEND_FUNCS
 };
 
 #if KNOB_SIMD_WIDTH == 8
-static const simdscalar vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
-static const simdscalar vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
-static const simdscalar vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-static const simdscalar vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
+static const __m256 vCenterOffsetsX = __m256{0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+static const __m256 vCenterOffsetsY = __m256{0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+static const __m256 vULOffsetsX = __m256{0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+static const __m256 vULOffsetsY = __m256{0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
 #define MASK 0xff
 #endif
 
@@ -163,52 +163,52 @@ struct generateInputCoverage
             uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
             if(T::MultisampleT::numSamples == 1)
             {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
             }
             else if(T::MultisampleT::numSamples == 2)
             {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
             }
             else if(T::MultisampleT::numSamples == 4)
             {
-                sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+                sampleCoverage[0] = _simd_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
             }
             else if(T::MultisampleT::numSamples == 8)
             {
-                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+                sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
             }
             else if(T::MultisampleT::numSamples == 16)
             {
-                sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
-                sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+                sampleCoverage[0] = _simd_set1_epi32(centerCoverage);
+                sampleCoverage[1] = _simd_set1_epi32(centerCoverage);
             }
         }
         else
         {
-            __m256i src = _mm256_set1_epi32(0);
-            __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+            simdscalari src = _simd_set1_epi32(0);
+            simdscalari index0 = _simd_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
 
             if(T::MultisampleT::numSamples == 1)
             {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+                mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
             }
             else if(T::MultisampleT::numSamples == 2)
             {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+                mask[0] = _simd_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
             }
             else if(T::MultisampleT::numSamples == 4)
             {
-                mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+                mask[0] = _simd_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
             }
             else if(T::MultisampleT::numSamples == 8)
             {
-                mask[0] = _mm256_set1_epi32(-1);
+                mask[0] = _simd_set1_epi32(-1);
             }
             else if(T::MultisampleT::numSamples == 16)
             {
-                mask[0] = _mm256_set1_epi32(-1);
-                mask[1] = _mm256_set1_epi32(-1);
-                index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+                mask[0] = _simd_set1_epi32(-1);
+                mask[1] = _simd_set1_epi32(-1);
+                index1 = _simd_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
             }
 
             // gather coverage for samples 0-7
@@ -253,14 +253,14 @@ struct generateInputCoverage
             packedSampleCoverage = packedCoverage0;
         }
     #else
-        simdscalari permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+        simdscalari permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
         // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane 
         packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
 
         simdscalari packedSampleCoverage;
         if(T::MultisampleT::numSamples > 8)
         {
-            permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+            permMask = _simd_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
             // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
             packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
 
@@ -293,7 +293,7 @@ struct generateInputCoverage
     {
         uint32_t inputMask[KNOB_SIMD_WIDTH];
         generateInputCoverage<T, T::InputCoverage>(coverageMask, inputMask, sampleMask);
-        inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+        inputCoverage = _simd_castsi_ps(_simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
     }
 
 };
@@ -305,10 +305,10 @@ struct generateInputCoverage<T, SWR_INPUT_COVERAGE_INNER_CONSERVATIVE>
     {
         // will need to update for avx512
         assert(KNOB_SIMD_WIDTH == 8);
-        simdscalari vec = _mm256_set1_epi32(coverageMask[0]);
-        const simdscalari bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+        simdscalari vec = _simd_set1_epi32(coverageMask[0]);
+        const simdscalari bit = _simd_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
         vec = _simd_and_si(vec, bit);
-        vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+        vec = _simd_cmplt_epi32(_simd_setzero_si(), vec);
         vec = _simd_blendv_epi32(_simd_setzero_si(), _simd_set1_epi32(1), vec);
         inputCoverage = _simd_castsi_ps(vec);
     }
@@ -357,7 +357,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
     (inputMask[7] > 0) ? (_BitScanForward(&sampleNum[7], inputMask[7])) : (sampleNum[7] = 0);
 
     // look up and set the sample offsets from UL pixel corner for first covered sample 
-    __m256 vXSample = _mm256_set_ps(samplePos.X(sampleNum[7]),
+    simdscalar vXSample = _simd_set_ps(samplePos.X(sampleNum[7]),
                                     samplePos.X(sampleNum[6]),
                                     samplePos.X(sampleNum[5]),
                                     samplePos.X(sampleNum[4]),
@@ -366,7 +366,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
                                     samplePos.X(sampleNum[1]),
                                     samplePos.X(sampleNum[0]));
 
-    __m256 vYSample = _mm256_set_ps(samplePos.Y(sampleNum[7]),
+    simdscalar vYSample = _simd_set_ps(samplePos.Y(sampleNum[7]),
                                     samplePos.Y(sampleNum[6]),
                                     samplePos.Y(sampleNum[5]),
                                     samplePos.Y(sampleNum[4]),
@@ -380,7 +380,7 @@ INLINE void CalcCentroidPos(SWR_PS_CONTEXT &psContext, const SWR_MULTISAMPLE_POS
 
     // Case (1) and case (3b) - All samples covered or not covered with full SampleMask
     static const simdscalari vFullyCoveredMask = T::MultisampleT::FullSampleMask();
-    simdscalari vInputCoveragei =  _mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
+    simdscalari vInputCoveragei =  _simd_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]);
     simdscalari vAllSamplesCovered = _simd_cmpeq_epi32(vInputCoveragei, vFullyCoveredMask);
 
     static const simdscalari vZero = _simd_setzero_si();
diff --git a/src/gallium/drivers/swr/rasterizer/core/binner.cpp b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
index 29d2f1ce42c..34789cf0356 100644
--- a/src/gallium/drivers/swr/rasterizer/core/binner.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/binner.cpp
@@ -88,7 +88,7 @@ INLINE void ProcessAttributes(
             inputSlot = backendState.vertexAttribOffset + i;
         }
 
-        __m128 attrib[3];    // triangle attribs (always 4 wide)
+        simd4scalar attrib[3];    // triangle attribs (always 4 wide)
         float* pAttribStart = pBuffer;
 
         if (HasConstantInterpT::value || IsDegenerate::value)
@@ -128,7 +128,7 @@ INLINE void ProcessAttributes(
 
                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
                 {
-                    _mm_store_ps(pBuffer, attrib[vid]);
+                    SIMD128::store_ps(pBuffer, attrib[vid]);
                     pBuffer += 4;
                 }
             }
@@ -138,7 +138,7 @@ INLINE void ProcessAttributes(
 
                 for (uint32_t i = 0; i < NumVertsT::value; ++i)
                 {
-                    _mm_store_ps(pBuffer, attrib[i]);
+                    SIMD128::store_ps(pBuffer, attrib[i]);
                     pBuffer += 4;
                 }
             }
@@ -149,7 +149,7 @@ INLINE void ProcessAttributes(
 
             for (uint32_t i = 0; i < NumVertsT::value; ++i)
             {
-                _mm_store_ps(pBuffer, attrib[i]);
+                SIMD128::store_ps(pBuffer, attrib[i]);
                 pBuffer += 4;
             }
         }
@@ -160,7 +160,7 @@ INLINE void ProcessAttributes(
         // effect of the missing vertices in the triangle interpolation.
         for (uint32_t v = NumVertsT::value; v < 3; ++v)
         {
-            _mm_store_ps(pBuffer, attrib[NumVertsT::value - 1]);
+            SIMD128::store_ps(pBuffer, attrib[NumVertsT::value - 1]);
             pBuffer += 4;
         }
 
@@ -279,8 +279,7 @@ struct GatherScissors_simd16<16>
 {
     static void Gather(const SWR_RECT* pScissorsInFixedPoint, const uint32_t* pViewportIndex,
         simd16scalari &scisXmin, simd16scalari &scisYmin,
-        simd16scalari &scisXmax, simd16scalari &scisYmax)
-    {
+        simd16scalari &scisXmax, simd16scalari &scisYmax) {
         scisXmin = _simd16_set_epi32(pScissorsInFixedPoint[pViewportIndex[0]].xmin,
             pScissorsInFixedPoint[pViewportIndex[1]].xmin,
             pScissorsInFixedPoint[pViewportIndex[2]].xmin,
@@ -390,14 +389,14 @@ void ProcessUserClipDist(PA_STATE& pa, uint32_t primIndex, uint8_t clipDistMask,
         uint32_t clipAttribSlot = clipSlot == 0 ?
             VERTEX_CLIPCULL_DIST_LO_SLOT : VERTEX_CLIPCULL_DIST_HI_SLOT;
 
-        __m128 primClipDist[3];
+        simd4scalar primClipDist[3];
         pa.AssembleSingle(clipAttribSlot, primIndex, primClipDist);
 
         float vertClipDist[NumVerts];
         for (uint32_t e = 0; e < NumVerts; ++e)
         {
             OSALIGNSIMD(float) aVertClipDist[4];
-            _mm_store_ps(aVertClipDist, primClipDist[e]);
+            SIMD128::store_ps(aVertClipDist, primClipDist[e]);
             vertClipDist[e] = aVertClipDist[clipComp];
         };
 
@@ -625,13 +624,14 @@ void BinTriangles(
             (SWR_INPUT_COVERAGE)pDC->pState->state.psState.inputCoverage, EdgeValToEdgeState(ALL_EDGES_VALID), (state.scissorsTileAligned == false));
     }
 
+    simdBBox bbox;
+
     if (!triMask)
     {
         goto endBinTriangles;
     }
 
     // Calc bounding box of triangles
-    simdBBox bbox;
     calcBoundingBoxIntVertical<CT>(tri, vXi, vYi, bbox);
 
     // determine if triangle falls between pixel centers and discard
@@ -673,28 +673,30 @@ void BinTriangles(
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    // Make triangle bbox inclusive
-    bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
-    bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
+        // Make triangle bbox inclusive
+        bbox.xmax = _simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1));
+        bbox.ymax = _simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1));
 
-    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
-    bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(bbox.xmax, scisXmax);
+        bbox.ymax = _simd_min_epi32(bbox.ymax, scisYmax);
+    }
 
     if (CT::IsConservativeT::value)
     {
@@ -768,7 +770,7 @@ endBinTriangles:
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+    simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
     vTranspose3x8(vHorizX, tri[0].x, tri[1].x, tri[2].x);
     vTranspose3x8(vHorizY, tri[0].y, tri[1].y, tri[2].y);
     vTranspose3x8(vHorizZ, tri[0].z, tri[1].z, tri[2].z);
@@ -837,10 +839,10 @@ endBinTriangles:
         // store triangle vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
 
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[triIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[triIndex]);
 
         // store user clip distances
         if (rastState.clipDistanceMask)
@@ -870,7 +872,7 @@ endBinTriangles:
 
 #if USE_SIMD16_FRONTEND
 template <typename CT>
-void SIMDAPI BinTriangles_simd16(
+void SIMDCALL BinTriangles_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
@@ -1124,29 +1126,31 @@ void SIMDAPI BinTriangles_simd16(
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
     // Gather the AOS effective scissor rects based on the per-prim VP index.
     /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    // Make triangle bbox inclusive
-    bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
-    bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
+        // Make triangle bbox inclusive
+        bbox.xmax = _simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1));
+        bbox.ymax = _simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1));
 
-    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
-    bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd16_min_epi32(bbox.xmax, scisXmax);
+        bbox.ymax = _simd16_min_epi32(bbox.ymax, scisYmax);
+    }
 
     if (CT::IsConservativeT::value)
     {
@@ -1221,10 +1225,10 @@ endBinTriangles:
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
 
     vTranspose3x8(vHorizX[0], _simd16_extract_ps(tri[0].x, 0), _simd16_extract_ps(tri[1].x, 0), _simd16_extract_ps(tri[2].x, 0));
     vTranspose3x8(vHorizY[0], _simd16_extract_ps(tri[0].y, 0), _simd16_extract_ps(tri[1].y, 0), _simd16_extract_ps(tri[2].y, 0));
@@ -1547,24 +1551,26 @@ void BinPostSetupPoints(
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.backendState.readViewportArrayIndex)
-        {
-            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-                scisXmin, scisYmin, scisXmax, scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
         {
-            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
+            simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+            if (state.backendState.readViewportArrayIndex)
+            {
+                GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                    scisXmin, scisYmin, scisXmax, scisYmax);
+            }
+            else // broadcast fast path for non-VPAI case.
+            {
+                scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+                scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+                scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+                scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+            }
 
-        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+            bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+            bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+            bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+            bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        }
 
         // Cull bloated points completely outside scissor
         simdscalari maskOutsideScissorX = _simd_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -1934,24 +1940,26 @@ void BinPostSetupPoints_simd16(
         // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
         // Gather the AOS effective scissor rects based on the per-prim VP index.
         /// @todo:  Look at speeding this up -- weigh against corresponding costs in rasterizer.
-        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-        if (state.backendState.readViewportArrayIndex)
         {
-            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-                scisXmin, scisYmin, scisXmax, scisYmax);
-        }
-        else // broadcast fast path for non-VPAI case.
-        {
-            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-        }
+            simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+            if (state.backendState.readViewportArrayIndex)
+            {
+                GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                    scisXmin, scisYmin, scisXmax, scisYmax);
+            }
+            else // broadcast fast path for non-VPAI case.
+            {
+                scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+                scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+                scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+                scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+            }
 
-        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+            bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+            bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+            bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+            bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        }
 
         // Cull bloated points completely outside scissor
         simd16scalari maskOutsideScissorX = _simd16_cmpgt_epi32(bbox.xmin, bbox.xmax);
@@ -2071,7 +2079,7 @@ void BinPostSetupPoints_simd16(
     AR_END(FEBinPoints, 1);
 }
 
-void SIMDAPI BinPoints_simd16(
+void SIMDCALL BinPoints_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
@@ -2168,6 +2176,8 @@ void BinPostSetupLines(
     simdscalar& vRecipW0 = recipW[0];
     simdscalar& vRecipW1 = recipW[1];
 
+    simd4scalar vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
+
     // convert to fixed point
     simdscalari vXi[2], vYi[2];
     vXi[0] = fpToFixedPointVertical(prim[0].x);
@@ -2214,24 +2224,26 @@ void BinPostSetupLines(
     bbox.ymax = _simd_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simdscalari scisXmin, scisYmin, scisXmax, scisYmax;
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors<KNOB_SIMD_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+        bbox.xmin = _simd_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd_min_epi32(_simd_sub_epi32(bbox.xmax, _simd_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd_min_epi32(_simd_sub_epi32(bbox.ymax, _simd_set1_epi32(1)), scisYmax);
+    }
 
     // Cull prims completely outside scissor
     {
@@ -2261,7 +2273,6 @@ void BinPostSetupLines(
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[8], vHorizY[8], vHorizZ[8], vHorizW[8];
     vTranspose3x8(vHorizX, prim[0].x, prim[1].x, vUnused);
     vTranspose3x8(vHorizY, prim[0].y, prim[1].y, vUnused);
     vTranspose3x8(vHorizZ, prim[0].z, prim[1].z, vUnused);
@@ -2310,10 +2321,10 @@ void BinPostSetupLines(
 
         // store line vertex data
         desc.pTriBuffer = (float*)pArena->AllocAligned(4 * 4 * sizeof(float), 16);
-        _mm_store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
-        _mm_store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[0], vHorizX[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[4], vHorizY[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[8], vHorizZ[primIndex]);
+        SIMD128::store_ps(&desc.pTriBuffer[12], vHorizW[primIndex]);
 
         // store user clip distances
         if (rastState.clipDistanceMask)
@@ -2417,25 +2428,27 @@ void BinPostSetupLines_simd16(
     bbox.ymax = _simd16_blendv_epi32(bloatBox.ymax, bbox.ymax, vYmajorMask);
 
     // Intersect with scissor/viewport. Subtract 1 ULP in x.8 fixed point since xmax/ymax edge is exclusive.
-    simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
-
-    if (state.backendState.readViewportArrayIndex)
-    {
-        GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
-            scisXmin, scisYmin, scisXmax, scisYmax);
-    }
-    else // broadcast fast path for non-VPAI case.
     {
-        scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
-        scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
-        scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
-        scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
-    }
+        simd16scalari scisXmin, scisYmin, scisXmax, scisYmax;
+
+        if (state.backendState.readViewportArrayIndex)
+        {
+            GatherScissors_simd16<KNOB_SIMD16_WIDTH>::Gather(&state.scissorsInFixedPoint[0], pViewportIndex,
+                scisXmin, scisYmin, scisXmax, scisYmax);
+        }
+        else // broadcast fast path for non-VPAI case.
+        {
+            scisXmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmin);
+            scisYmin = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymin);
+            scisXmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].xmax);
+            scisYmax = _simd16_set1_epi32(state.scissorsInFixedPoint[0].ymax);
+        }
 
-    bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
-    bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
-    bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
-    bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+        bbox.xmin = _simd16_max_epi32(bbox.xmin, scisXmin);
+        bbox.ymin = _simd16_max_epi32(bbox.ymin, scisYmin);
+        bbox.xmax = _simd16_min_epi32(_simd16_sub_epi32(bbox.xmax, _simd16_set1_epi32(1)), scisXmax);
+        bbox.ymax = _simd16_min_epi32(_simd16_sub_epi32(bbox.ymax, _simd16_set1_epi32(1)), scisYmax);
+    }
 
     // Cull prims completely outside scissor
     {
@@ -2468,10 +2481,10 @@ void BinPostSetupLines_simd16(
 
     // transpose verts needed for backend
     /// @todo modify BE to take non-transformed verts
-    __m128 vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
-    __m128 vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizX[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizY[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizZ[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
+    simd4scalar vHorizW[2][KNOB_SIMD_WIDTH]; // KNOB_SIMD16_WIDTH
 
     vTranspose3x8(vHorizX[0], _simd16_extract_ps(prim[0].x, 0), _simd16_extract_ps(prim[1].x, 0), unused);
     vTranspose3x8(vHorizY[0], _simd16_extract_ps(prim[0].y, 0), _simd16_extract_ps(prim[1].y, 0), unused);
@@ -2650,7 +2663,7 @@ void BinLines(
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDAPI BinLines_simd16(
+void SIMDCALL BinLines_simd16(
     DRAW_CONTEXT *pDC,
     PA_STATE& pa,
     uint32_t workerId,
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index bd62b58f32a..bf542f18c18 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -188,7 +188,7 @@ void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector p
 }
 
 #if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipTriangles, pDC->drawId);
@@ -203,7 +203,7 @@ void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t work
     AR_END(FEClipTriangles, 1);
 }
 
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipLines, pDC->drawId);
@@ -218,7 +218,7 @@ void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId
     AR_END(FEClipLines, 1);
 }
 
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId)
 {
     SWR_CONTEXT *pContext = pDC->pContext;
     AR_BEGIN(FEClipPoints, pDC->drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 4f940d931c4..8a4fe6dcfa0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -1095,7 +1095,7 @@ public:
             AR_BEGIN(FEGuardbandClip, pa.pDC->drawId);
             // we have to clip tris, execute the clipper, which will also
             // call the binner
-            ClipSimd(vMask16(primMask), vMask16(clipMask), pa, primId);
+            ClipSimd(_simd16_vmask_ps(primMask), _simd16_vmask_ps(clipMask), pa, primId);
             AR_END(FEGuardbandClip, 1);
         }
         else if (validMask)
@@ -1180,7 +1180,7 @@ private:
     {
         simd16scalari vOffsets = ComputeOffsets(attrib, vIndices, component);
         simd16scalar vSrc = _simd16_setzero_ps();
-        return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, _simd16_castps_si(vMask), 1);
+        return _simd16_mask_i32gather_ps(vSrc, pBuffer, vOffsets, vMask, 1);
     }
 
 #endif
@@ -1895,8 +1895,8 @@ void ClipTriangles(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvecto
 void ClipLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
 void ClipPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[], uint32_t primMask, simdscalari primId);
 #if USE_SIMD16_FRONTEND
-void SIMDAPI ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDAPI ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
-void SIMDAPI ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipTriangles_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
+void SIMDCALL ClipPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[], uint32_t primMask, simd16scalari primId);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 81bf9ff7114..a694f2d4112 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -218,7 +218,7 @@ typedef void(*PFN_PROCESS_PRIMS)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t worke
 
 #if ENABLE_AVX512_SIMD16
 // function signature for pipeline stages that execute after primitive assembly
-typedef void(SIMDAPI *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
+typedef void(SIMDCALL *PFN_PROCESS_PRIMS_SIMD16)(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[],
     uint32_t primMask, simd16scalari primID);
 
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 087657bce4a..4e642f8c2e4 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -202,7 +202,7 @@ INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
 /// @param pSrc - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT SrcFormat>
-INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
+INLINE void SIMDCALL LoadSOA(const uint8_t *pSrc, simd16vector &dst)
 {
     // fast path for float32
     if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@@ -247,7 +247,7 @@ INLINE void SIMDAPI LoadSOA(const uint8_t *pSrc, simd16vector &dst)
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Clamp(simd16scalar vComp, uint32_t Component)
 {
     if (FormatTraits<Format>::isNormalized(Component))
     {
@@ -293,7 +293,7 @@ INLINE simd16scalar SIMDAPI Clamp(simd16scalar vComp, uint32_t Component)
 /// @param vComp - SIMD vector of floats
 /// @param Component - component
 template<SWR_FORMAT Format>
-INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
+INLINE simd16scalar SIMDCALL Normalize(simd16scalar vComp, uint32_t Component)
 {
     if (FormatTraits<Format>::isNormalized(Component))
     {
@@ -309,7 +309,7 @@ INLINE simd16scalar SIMDAPI Normalize(simd16scalar vComp, uint32_t Component)
 /// @param src - source data in SOA form
 /// @param dst - output data in SOA form
 template<SWR_FORMAT DstFormat>
-INLINE void SIMDAPI StoreSOA(const simd16vector &src, uint8_t *pDst)
+INLINE void SIMDCALL StoreSOA(const simd16vector &src, uint8_t *pDst)
 {
     // fast path for float32
     if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index 1ad3d61927a..43053b646f6 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -43,7 +43,7 @@ struct PackTraits
     static simdscalar pack(simdscalar &in) = delete;
 #if ENABLE_AVX512_SIMD16
     static simd16scalar loadSOA_16(const uint8_t *pSrc) = delete;
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) = delete;
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) = delete;
     static simd16scalar unpack(simd16scalar &in) = delete;
     static simd16scalar pack(simd16scalar &in) = delete;
 #endif
@@ -63,7 +63,7 @@ struct PackTraits<0, false>
     static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
 #if ENABLE_AVX512_SIMD16
     static simd16scalar loadSOA_16(const uint8_t *pSrc) { return _simd16_setzero_ps(); }
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src) { return; }
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src) { return; }
     static simd16scalar unpack(simd16scalar &in) { return _simd16_setzero_ps(); }
     static simd16scalar pack(simd16scalar &in) { return _simd16_setzero_ps(); }
 #endif
@@ -109,7 +109,7 @@ struct PackTraits<8, false>
 
         __m256i result = _mm256_castsi128_si256(resLo);
         result = _mm256_insertf128_si256(result, resHi, 1);
-        return _mm256_castsi256_ps(result);
+        return simdscalar{ _mm256_castsi256_ps(result) };
 #else
         return _mm256_castsi256_ps(_mm256_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(in))));
 #endif
@@ -144,7 +144,7 @@ struct PackTraits<8, false>
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         // store simd16 bytes
         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@@ -152,7 +152,8 @@ struct PackTraits<8, false>
 
     static simd16scalar unpack(simd16scalar &in)
     {
-        simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
+        simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+        simd16scalari result = _simd16_cvtepu8_epi32(tmp);
 
         return _simd16_castsi_ps(result);
     }
@@ -259,7 +260,7 @@ struct PackTraits<8, true>
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         // store simd16 bytes
         _mm_store_ps(reinterpret_cast<float *>(pDst), _mm256_castps256_ps128(_simd16_extract_ps(src, 0)));
@@ -267,7 +268,8 @@ struct PackTraits<8, true>
 
     static simd16scalar unpack(simd16scalar &in)
     {
-        simd16scalari result = _simd16_cvtepu8_epi32(_mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0))));
+        simd4scalari tmp = _mm_castps_si128(_mm256_castps256_ps128(_simd16_extract_ps(in, 0)));
+        simd16scalari result = _simd16_cvtepu8_epi32(tmp);
 
         return _simd16_castsi_ps(result);
     }
@@ -370,7 +372,7 @@ struct PackTraits<16, false>
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
     }
@@ -469,7 +471,7 @@ struct PackTraits<16, true>
         return result;
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         _simd_store_ps(reinterpret_cast<float *>(pDst), _simd16_extract_ps(src, 0));
     }
@@ -514,7 +516,7 @@ struct PackTraits<32, false>
         return _simd16_load_ps(reinterpret_cast<const float *>(pSrc));
     }
 
-    static void SIMDAPI storeSOA(uint8_t *pDst, simd16scalar src)
+    static void SIMDCALL storeSOA(uint8_t *pDst, simd16scalar src)
     {
         _simd16_store_ps(reinterpret_cast<float *>(pDst), src);
     }
@@ -812,7 +814,7 @@ static inline __m128 ConvertFloatToSRGB2(__m128& Src)
 
 #if ENABLE_AVX512_SIMD16
 template< unsigned expnum, unsigned expden, unsigned coeffnum, unsigned coeffden >
-inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
+inline static simd16scalar SIMDCALL fastpow(simd16scalar value)
 {
     static const float factor1 = exp2(127.0f * expden / expnum - 127.0f)
         * powf(1.0f * coeffnum / coeffden, 1.0f * expden / expnum);
@@ -834,7 +836,7 @@ inline static simd16scalar SIMDAPI fastpow(simd16scalar value)
     return result;
 }
 
-inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
+inline static simd16scalar SIMDCALL pow512_4(simd16scalar arg)
 {
     // 5/12 is too small, so compute the 4th root of 20/12 instead.
     // 20/12 = 5/3 = 1 + 2/3 = 2 - 1/3. 2/3 is a suitable argument for fastpow.
@@ -855,7 +857,7 @@ inline static simd16scalar SIMDAPI pow512_4(simd16scalar arg)
     return xavg;
 }
 
-inline static simd16scalar SIMDAPI powf_wrapper(const simd16scalar base, float exp)
+inline static simd16scalar SIMDCALL powf_wrapper(const simd16scalar base, float exp)
 {
     const float *f = reinterpret_cast<const float *>(&base);
 
@@ -1410,7 +1412,7 @@ struct ComponentTraits
         return TypeTraits<X, NumBitsX>::loadSOA_16(pSrc);
     }
 
-    INLINE static void SIMDAPI storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
+    INLINE static void SIMDCALL storeSOA(uint32_t comp, uint8_t *pDst, simd16scalar src)
     {
         switch (comp)
         {
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_utils.h b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
index 94b6c1b09e3..576f14bcafd 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_utils.h
@@ -31,58 +31,58 @@
 #include "common/simdintrin.h"
 
 INLINE
-void vTranspose(__m128 &row0, __m128 &row1, __m128 &row2, __m128 &row3)
+void vTranspose(simd4scalar &row0, simd4scalar &row1, simd4scalar &row2, simd4scalar &row3)
 {
-    __m128i row0i = _mm_castps_si128(row0);
-    __m128i row1i = _mm_castps_si128(row1);
-    __m128i row2i = _mm_castps_si128(row2);
-    __m128i row3i = _mm_castps_si128(row3);
+    simd4scalari row0i = SIMD128::castps_si(row0);
+    simd4scalari row1i = SIMD128::castps_si(row1);
+    simd4scalari row2i = SIMD128::castps_si(row2);
+    simd4scalari row3i = SIMD128::castps_si(row3);
 
-    __m128i vTemp = row2i;
-    row2i = _mm_unpacklo_epi32(row2i, row3i);
-    vTemp = _mm_unpackhi_epi32(vTemp, row3i);
+    simd4scalari vTemp = row2i;
+    row2i = SIMD128::unpacklo_epi32(row2i, row3i);
+    vTemp = SIMD128::unpackhi_epi32(vTemp, row3i);
 
     row3i = row0i;
-    row0i = _mm_unpacklo_epi32(row0i, row1i);
-    row3i = _mm_unpackhi_epi32(row3i, row1i);
+    row0i = SIMD128::unpacklo_epi32(row0i, row1i);
+    row3i = SIMD128::unpackhi_epi32(row3i, row1i);
 
     row1i = row0i;
-    row0i = _mm_unpacklo_epi64(row0i, row2i);
-    row1i = _mm_unpackhi_epi64(row1i, row2i);
+    row0i = SIMD128::unpacklo_epi64(row0i, row2i);
+    row1i = SIMD128::unpackhi_epi64(row1i, row2i);
 
     row2i = row3i;
-    row2i = _mm_unpacklo_epi64(row2i, vTemp);
-    row3i = _mm_unpackhi_epi64(row3i, vTemp);
+    row2i = SIMD128::unpacklo_epi64(row2i, vTemp);
+    row3i = SIMD128::unpackhi_epi64(row3i, vTemp);
 
-    row0 = _mm_castsi128_ps(row0i);
-    row1 = _mm_castsi128_ps(row1i);
-    row2 = _mm_castsi128_ps(row2i);
-    row3 = _mm_castsi128_ps(row3i);
+    row0 = SIMD128::castsi_ps(row0i);
+    row1 = SIMD128::castsi_ps(row1i);
+    row2 = SIMD128::castsi_ps(row2i);
+    row3 = SIMD128::castsi_ps(row3i);
 }
 
 INLINE
-void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
+void vTranspose(simd4scalari &row0, simd4scalari &row1, simd4scalari &row2, simd4scalari &row3)
 {
-    __m128i vTemp = row2;
-    row2 = _mm_unpacklo_epi32(row2, row3);
-    vTemp = _mm_unpackhi_epi32(vTemp, row3);
+    simd4scalari vTemp = row2;
+    row2 = SIMD128::unpacklo_epi32(row2, row3);
+    vTemp = SIMD128::unpackhi_epi32(vTemp, row3);
 
     row3 = row0;
-    row0 = _mm_unpacklo_epi32(row0, row1);
-    row3 = _mm_unpackhi_epi32(row3, row1);
+    row0 = SIMD128::unpacklo_epi32(row0, row1);
+    row3 = SIMD128::unpackhi_epi32(row3, row1);
 
     row1 = row0;
-    row0 = _mm_unpacklo_epi64(row0, row2);
-    row1 = _mm_unpackhi_epi64(row1, row2);
+    row0 = SIMD128::unpacklo_epi64(row0, row2);
+    row1 = SIMD128::unpackhi_epi64(row1, row2);
 
     row2 = row3;
-    row2 = _mm_unpacklo_epi64(row2, vTemp);
-    row3 = _mm_unpackhi_epi64(row3, vTemp);
+    row2 = SIMD128::unpacklo_epi64(row2, vTemp);
+    row3 = SIMD128::unpackhi_epi64(row3, vTemp);
 }
 
 #if KNOB_SIMD_WIDTH == 8
 INLINE
-void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
+void vTranspose3x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2)
 {
     simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);                  //x0z0x1z1 x4z4x5z5
     simdscalar r1rx = _simd_unpacklo_ps(vSrc1, _simd_setzero_ps());     //y0w0y1w1 y4w4y5w5
@@ -94,10 +94,10 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);              //x2y2z2w2 x6y6z6w6
     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);              //x3y3z3w3 x7y7z7w7
 
-    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
-    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
-    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
-    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
+    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
+    vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
+    vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
 
     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@@ -106,7 +106,7 @@ void vTranspose3x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
 }
 
 INLINE
-void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
+void vTranspose4x8(simd4scalar (&vDst)[8], const simdscalar &vSrc0, const simdscalar &vSrc1, const simdscalar &vSrc2, const simdscalar &vSrc3)
 {
     simdscalar r0r2 = _simd_unpacklo_ps(vSrc0, vSrc2);      //x0z0x1z1 x4z4x5z5
     simdscalar r1rx = _simd_unpacklo_ps(vSrc1, vSrc3);      //y0w0y1w1 y4w4y5w5
@@ -118,10 +118,10 @@ void vTranspose4x8(__m128 (&vDst)[8], const simdscalar &vSrc0, const simdscalar
     simdscalar r02r1xhilo = _simd_unpacklo_ps(r0r2, r1rx);  //x2y2z2w2 x6y6z6w6
     simdscalar r02r1xhihi = _simd_unpackhi_ps(r0r2, r1rx);  //x3y3z3w3 x7y7z7w7
 
-    vDst[0] = _mm256_castps256_ps128(r02r1xlolo);
-    vDst[1] = _mm256_castps256_ps128(r02r1xlohi);
-    vDst[2] = _mm256_castps256_ps128(r02r1xhilo);
-    vDst[3] = _mm256_castps256_ps128(r02r1xhihi);
+    vDst[0] = _simd_extractf128_ps(r02r1xlolo, 0);
+    vDst[1] = _simd_extractf128_ps(r02r1xlohi, 0);
+    vDst[2] = _simd_extractf128_ps(r02r1xhilo, 0);
+    vDst[3] = _simd_extractf128_ps(r02r1xhihi, 0);
 
     vDst[4] = _simd_extractf128_ps(r02r1xlolo, 1);
     vDst[5] = _simd_extractf128_ps(r02r1xlohi, 1);
@@ -227,16 +227,16 @@ struct Transpose8_8_8_8
 
 #if KNOB_SIMD_WIDTH == 8
 #if KNOB_ARCH <= KNOB_ARCH_AVX
-        __m128i c0c1 = _mm256_castsi256_si128(src);                                           // rrrrrrrrgggggggg
-        __m128i c2c3 = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(src), 1));  // bbbbbbbbaaaaaaaa
-        __m128i c0c2 = _mm_unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
-        __m128i c1c3 = _mm_unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
-        __m128i c01 = _mm_unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
-        __m128i c23 = _mm_unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
-        __m128i c0123lo = _mm_unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
-        __m128i c0123hi = _mm_unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
-        _mm_store_si128((__m128i*)pDst, c0123lo);
-        _mm_store_si128((__m128i*)(pDst + 16), c0123hi);
+        simd4scalari c0c1 = src.v4[0];                                                          // rrrrrrrrgggggggg
+        simd4scalari c2c3 = SIMD128::castps_si(_simd_extractf128_ps(_simd_castsi_ps(src), 1));  // bbbbbbbbaaaaaaaa
+        simd4scalari c0c2 = SIMD128::unpacklo_epi64(c0c1, c2c3);                                        // rrrrrrrrbbbbbbbb
+        simd4scalari c1c3 = SIMD128::unpackhi_epi64(c0c1, c2c3);                                        // ggggggggaaaaaaaa
+        simd4scalari c01 = SIMD128::unpacklo_epi8(c0c2, c1c3);                                          // rgrgrgrgrgrgrgrg
+        simd4scalari c23 = SIMD128::unpackhi_epi8(c0c2, c1c3);                                          // babababababababa
+        simd4scalari c0123lo = SIMD128::unpacklo_epi16(c01, c23);                                       // rgbargbargbargba
+        simd4scalari c0123hi = SIMD128::unpackhi_epi16(c01, c23);                                       // rgbargbargbargba
+        SIMD128::store_si((simd4scalari*)pDst, c0123lo);
+        SIMD128::store_si((simd4scalari*)(pDst + 16), c0123hi);
 #else
         simdscalari dst01 = _simd_shuffle_epi8(src,
             _simd_set_epi32(0x0f078080, 0x0e068080, 0x0d058080, 0x0c048080, 0x80800b03, 0x80800a02, 0x80800901, 0x80800800));
@@ -254,10 +254,10 @@ struct Transpose8_8_8_8
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
-        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
-        __m128i src2 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
-        __m128i src3 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
+        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
+        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
+        simd4scalari src2 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 2); // bbbbbbbbbbbbbbbb
+        simd4scalari src3 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 3); // aaaaaaaaaaaaaaaa
 
         simd16scalari cvt0 = _simd16_cvtepu8_epi32(src0);
         simd16scalari cvt1 = _simd16_cvtepu8_epi32(src1);
@@ -305,10 +305,10 @@ struct Transpose8_8
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src = _simd_load_si((const simdscalari*)pSrc);
 
-        __m128i rg = _mm256_castsi256_si128(src);           // rrrrrrrr gggggggg
-        __m128i g = _mm_unpackhi_epi64(rg, rg);             // gggggggg gggggggg
-        rg = _mm_unpacklo_epi8(rg, g);
-        _mm_store_si128((__m128i*)pDst, rg);
+        simd4scalari rg = src.v4[0];           // rrrrrrrr gggggggg
+        simd4scalari g = SIMD128::unpackhi_epi64(rg, rg);             // gggggggg gggggggg
+        rg = SIMD128::unpacklo_epi8(rg, g);
+        SIMD128::store_si((simd4scalari*)pDst, rg);
 #else
 #error Unsupported vector width
 #endif
@@ -317,8 +317,8 @@ struct Transpose8_8
 
     INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
     {
-        __m128i src0 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc));     // rrrrrrrrrrrrrrrr
-        __m128i src1 = _mm_load_si128(reinterpret_cast<const __m128i *>(pSrc) + 1); // gggggggggggggggg
+        simd4scalari src0 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc));     // rrrrrrrrrrrrrrrr
+        simd4scalari src1 = SIMD128::load_si(reinterpret_cast<const simd4scalari *>(pSrc) + 1); // gggggggggggggggg
 
         simdscalari cvt0 = _simd_cvtepu8_epi16(src0);
         simdscalari cvt1 = _simd_cvtepu8_epi16(src1);
@@ -349,16 +349,16 @@ struct Transpose32_32_32_32
         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
         simdscalar src3 = _simd_load_ps((const float*)pSrc + 24);
 
-        __m128 vDst[8];
+        simd4scalar vDst[8];
         vTranspose4x8(vDst, src0, src1, src2, src3);
-        _mm_store_ps((float*)pDst, vDst[0]);
-        _mm_store_ps((float*)pDst+4, vDst[1]);
-        _mm_store_ps((float*)pDst+8, vDst[2]);
-        _mm_store_ps((float*)pDst+12, vDst[3]);
-        _mm_store_ps((float*)pDst+16, vDst[4]);
-        _mm_store_ps((float*)pDst+20, vDst[5]);
-        _mm_store_ps((float*)pDst+24, vDst[6]);
-        _mm_store_ps((float*)pDst+28, vDst[7]);
+        SIMD128::store_ps((float*)pDst, vDst[0]);
+        SIMD128::store_ps((float*)pDst+4, vDst[1]);
+        SIMD128::store_ps((float*)pDst+8, vDst[2]);
+        SIMD128::store_ps((float*)pDst+12, vDst[3]);
+        SIMD128::store_ps((float*)pDst+16, vDst[4]);
+        SIMD128::store_ps((float*)pDst+20, vDst[5]);
+        SIMD128::store_ps((float*)pDst+24, vDst[6]);
+        SIMD128::store_ps((float*)pDst+28, vDst[7]);
 #else
 #error Unsupported vector width
 #endif
@@ -400,16 +400,16 @@ struct Transpose32_32_32
         simdscalar src1 = _simd_load_ps((const float*)pSrc + 8);
         simdscalar src2 = _simd_load_ps((const float*)pSrc + 16);
 
-        __m128 vDst[8];
+        simd4scalar vDst[8];
         vTranspose3x8(vDst, src0, src1, src2);
-        _mm_store_ps((float*)pDst, vDst[0]);
-        _mm_store_ps((float*)pDst + 4, vDst[1]);
-        _mm_store_ps((float*)pDst + 8, vDst[2]);
-        _mm_store_ps((float*)pDst + 12, vDst[3]);
-        _mm_store_ps((float*)pDst + 16, vDst[4]);
-        _mm_store_ps((float*)pDst + 20, vDst[5]);
-        _mm_store_ps((float*)pDst + 24, vDst[6]);
-        _mm_store_ps((float*)pDst + 28, vDst[7]);
+        SIMD128::store_ps((float*)pDst, vDst[0]);
+        SIMD128::store_ps((float*)pDst + 4, vDst[1]);
+        SIMD128::store_ps((float*)pDst + 8, vDst[2]);
+        SIMD128::store_ps((float*)pDst + 12, vDst[3]);
+        SIMD128::store_ps((float*)pDst + 16, vDst[4]);
+        SIMD128::store_ps((float*)pDst + 20, vDst[5]);
+        SIMD128::store_ps((float*)pDst + 24, vDst[6]);
+        SIMD128::store_ps((float*)pDst + 28, vDst[7]);
 #else
 #error Unsupported vector width
 #endif
@@ -448,21 +448,21 @@ struct Transpose32_32
     {
 #if KNOB_SIMD_WIDTH == 8
         const float* pfSrc = (const float*)pSrc;
-        __m128 src_r0 = _mm_load_ps(pfSrc + 0);
-        __m128 src_r1 = _mm_load_ps(pfSrc + 4);
-        __m128 src_g0 = _mm_load_ps(pfSrc + 8);
-        __m128 src_g1 = _mm_load_ps(pfSrc + 12);
+        simd4scalar src_r0 = SIMD128::load_ps(pfSrc + 0);
+        simd4scalar src_r1 = SIMD128::load_ps(pfSrc + 4);
+        simd4scalar src_g0 = SIMD128::load_ps(pfSrc + 8);
+        simd4scalar src_g1 = SIMD128::load_ps(pfSrc + 12);
 
-        __m128 dst0 = _mm_unpacklo_ps(src_r0, src_g0);
-        __m128 dst1 = _mm_unpackhi_ps(src_r0, src_g0);
-        __m128 dst2 = _mm_unpacklo_ps(src_r1, src_g1);
-        __m128 dst3 = _mm_unpackhi_ps(src_r1, src_g1);
+        simd4scalar dst0 = SIMD128::unpacklo_ps(src_r0, src_g0);
+        simd4scalar dst1 = SIMD128::unpackhi_ps(src_r0, src_g0);
+        simd4scalar dst2 = SIMD128::unpacklo_ps(src_r1, src_g1);
+        simd4scalar dst3 = SIMD128::unpackhi_ps(src_r1, src_g1);
 
         float* pfDst = (float*)pDst;
-        _mm_store_ps(pfDst + 0, dst0);
-        _mm_store_ps(pfDst + 4, dst1);
-        _mm_store_ps(pfDst + 8, dst2);
-        _mm_store_ps(pfDst + 12, dst3);
+        SIMD128::store_ps(pfDst + 0, dst0);
+        SIMD128::store_ps(pfDst + 4, dst1);
+        SIMD128::store_ps(pfDst + 8, dst2);
+        SIMD128::store_ps(pfDst + 12, dst3);
 #else
 #error Unsupported vector width
 #endif
@@ -504,25 +504,25 @@ struct Transpose16_16_16_16
         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
         simdscalari src_ba = _simd_load_si((const simdscalari*)(pSrc + sizeof(simdscalari)));
 
-        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
-        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
-        __m128i src_b = _mm256_extractf128_si256(src_ba, 0);
-        __m128i src_a = _mm256_extractf128_si256(src_ba, 1);
-
-        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
-        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
-        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
-        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
-
-        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
-        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
-        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
-        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
-
-        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
-        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
-        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
-        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
+        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
+        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
+        simd4scalari src_b = _simd_extractf128_si(src_ba, 0);
+        simd4scalari src_a = _simd_extractf128_si(src_ba, 1);
+
+        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
+        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
+        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
+        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
+
+        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
+        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
+        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
+        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
+
+        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
+        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
+        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
+        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
 #else
 #error Unsupported vector width
 #endif
@@ -573,25 +573,25 @@ struct Transpose16_16_16
 #if KNOB_SIMD_WIDTH == 8
         simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
 
-        __m128i src_r = _mm256_extractf128_si256(src_rg, 0);
-        __m128i src_g = _mm256_extractf128_si256(src_rg, 1);
-        __m128i src_b = _mm_load_si128((const __m128i*)(pSrc + sizeof(simdscalari)));
-        __m128i src_a = _mm_undefined_si128();
-
-        __m128i rg0 = _mm_unpacklo_epi16(src_r, src_g);
-        __m128i rg1 = _mm_unpackhi_epi16(src_r, src_g);
-        __m128i ba0 = _mm_unpacklo_epi16(src_b, src_a);
-        __m128i ba1 = _mm_unpackhi_epi16(src_b, src_a);
-
-        __m128i dst0 = _mm_unpacklo_epi32(rg0, ba0);
-        __m128i dst1 = _mm_unpackhi_epi32(rg0, ba0);
-        __m128i dst2 = _mm_unpacklo_epi32(rg1, ba1);
-        __m128i dst3 = _mm_unpackhi_epi32(rg1, ba1);
-
-        _mm_store_si128(((__m128i*)pDst) + 0, dst0);
-        _mm_store_si128(((__m128i*)pDst) + 1, dst1);
-        _mm_store_si128(((__m128i*)pDst) + 2, dst2);
-        _mm_store_si128(((__m128i*)pDst) + 3, dst3);
+        simd4scalari src_r = _simd_extractf128_si(src_rg, 0);
+        simd4scalari src_g = _simd_extractf128_si(src_rg, 1);
+        simd4scalari src_b = SIMD128::load_si((const simd4scalari*)(pSrc + sizeof(simdscalari)));
+        simd4scalari src_a = SIMD128::setzero_si();
+
+        simd4scalari rg0 = SIMD128::unpacklo_epi16(src_r, src_g);
+        simd4scalari rg1 = SIMD128::unpackhi_epi16(src_r, src_g);
+        simd4scalari ba0 = SIMD128::unpacklo_epi16(src_b, src_a);
+        simd4scalari ba1 = SIMD128::unpackhi_epi16(src_b, src_a);
+
+        simd4scalari dst0 = SIMD128::unpacklo_epi32(rg0, ba0);
+        simd4scalari dst1 = SIMD128::unpackhi_epi32(rg0, ba0);
+        simd4scalari dst2 = SIMD128::unpacklo_epi32(rg1, ba1);
+        simd4scalari dst3 = SIMD128::unpackhi_epi32(rg1, ba1);
+
+        SIMD128::store_si(((simd4scalari*)pDst) + 0, dst0);
+        SIMD128::store_si(((simd4scalari*)pDst) + 1, dst1);
+        SIMD128::store_si(((simd4scalari*)pDst) + 2, dst2);
+        SIMD128::store_si(((simd4scalari*)pDst) + 3, dst3);
 #else
 #error Unsupported vector width
 #endif
@@ -642,17 +642,17 @@ struct Transpose16_16
 #if KNOB_SIMD_WIDTH == 8
         simdscalar src = _simd_load_ps((const float*)pSrc);
 
-        __m128 comp0 = _mm256_castps256_ps128(src);
-        __m128 comp1 = _mm256_extractf128_ps(src, 1);
+        simd4scalar comp0 = _simd_extractf128_ps(src, 0);
+        simd4scalar comp1 = _simd_extractf128_ps(src, 1);
 
-        __m128i comp0i = _mm_castps_si128(comp0);
-        __m128i comp1i = _mm_castps_si128(comp1);
+        simd4scalari comp0i = SIMD128::castps_si(comp0);
+        simd4scalari comp1i = SIMD128::castps_si(comp1);
 
-        __m128i resLo = _mm_unpacklo_epi16(comp0i, comp1i);
-        __m128i resHi = _mm_unpackhi_epi16(comp0i, comp1i);
+        simd4scalari resLo = SIMD128::unpacklo_epi16(comp0i, comp1i);
+        simd4scalari resHi = SIMD128::unpackhi_epi16(comp0i, comp1i);
 
-        _mm_store_si128((__m128i*)pDst, resLo);
-        _mm_store_si128((__m128i*)pDst + 1, resHi);
+        SIMD128::store_si((simd4scalari*)pDst, resLo);
+        SIMD128::store_si((simd4scalari*)pDst + 1, resHi);
 #else
 #error Unsupported vector width
 #endif
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index 9e2f35725c5..8796878c586 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -527,7 +527,7 @@ static void StreamOut(
         // Write all entries into primitive data buffer for SOS.
         while (_BitScanForward(&slot, soMask))
         {
-            __m128 attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
+            simd4scalar attrib[MAX_NUM_VERTS_PER_PRIM];    // prim attribs (always 4 wide)
             uint32_t paSlot = slot + soState.vertexAttribOffset[streamIndex];
             pa.AssembleSingle(paSlot, primIndex, attrib);
 
@@ -941,7 +941,9 @@ static void GeometryShaderStage(
 
                             if (HasStreamOutT::value)
                             {
+#if ENABLE_AVX512_SIMD16
                                 gsPa.useAlternateOffset = false;
+#endif
                                 StreamOut(pDC, gsPa, workerId, pSoPrimData, stream);
                             }
 
@@ -1279,7 +1281,9 @@ static void TessellationStages(
             {
                 if (HasStreamOutT::value)
                 {
+#if ENABLE_AVX512_SIMD16
                     tessPa.useAlternateOffset = false;
+#endif
                     StreamOut(pDC, tessPa, workerId, pSoPrimData, 0);
                 }
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index 3c2361e85dd..3d7b26dd0f1 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -391,7 +391,7 @@ struct PA_STATE_BASE;  // forward decl
 void BinPoints(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
 void BinLines(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simdvector prims[3], uint32_t primMask, simdscalari primID);
 #if USE_SIMD16_FRONTEND
-void SIMDAPI BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
-void SIMDAPI BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinPoints_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
+void SIMDCALL BinLines_simd16(DRAW_CONTEXT *pDC, PA_STATE& pa, uint32_t workerId, simd16vector prims[3], uint32_t primMask, simd16scalari primID);
 #endif
 
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index bdd01beedac..77b4f7f59a9 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -92,7 +92,7 @@ struct PA_STATE
 #if ENABLE_AVX512_SIMD16
     virtual bool Assemble_simd16(uint32_t slot, simd16vector verts[]) = 0;
 #endif
-    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[]) = 0;
+    virtual void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[]) = 0;
     virtual bool NextPrim() = 0;
     virtual SIMDVERTEX& GetNextVsOutput() = 0;
     virtual bool GetNextStreamOutput() = 0;
@@ -139,7 +139,7 @@ struct PA_STATE_OPT : public PA_STATE
 #if ENABLE_AVX512_SIMD16
     typedef bool(*PFN_PA_FUNC_SIMD16)(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+    typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
     PFN_PA_FUNC        pfnPaFunc{ nullptr };        // PA state machine function for assembling 4 triangles.
 #if ENABLE_AVX512_SIMD16
@@ -205,7 +205,7 @@ struct PA_STATE_OPT : public PA_STATE
 
 #endif
     // Assembles 1 primitive. Each simdscalar is a vertex (xyzw).
-    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
     {
         return this->pfnPaSingleFunc(*this, slot, primIndex, verts);
     }
@@ -767,7 +767,7 @@ PRAGMA_WARNING_POP()
     }
 
 #endif
-    void AssembleSingle(uint32_t slot, uint32_t triIndex, __m128 tri[3])
+    void AssembleSingle(uint32_t slot, uint32_t triIndex, simd4scalar tri[3])
     {
         // move to slot
         for (uint32_t v = 0; v < this->vertsPerPrim; ++v)
@@ -1253,7 +1253,7 @@ struct PA_TESS : PA_STATE
                     _simd16_setzero_ps(),
                     pBase,
                     indices,
-                    mask,
+                    _simd16_castsi_ps(mask),
                     4 /* gcc doesn't like sizeof(float) */);
 
                 verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0);
@@ -1263,7 +1263,7 @@ struct PA_TESS : PA_STATE
                     pBase,
                     indices,
                     _simd_castsi_ps(mask),
-                    4 /* gcc doesn't like sizeof(float) */);
+                    4); // gcc doesn't like sizeof(float)
 #endif
                 pBase += m_attributeStrideInVectors * SIMD_WIDTH;
             }
@@ -1302,7 +1302,7 @@ struct PA_TESS : PA_STATE
                     _simd16_setzero_ps(),
                     pBase,
                     indices,
-                    mask,
+                    _simd16_castsi_ps(mask),
                     4 /* gcc doesn't like sizeof(float) */);
 #else
                 simdscalar temp = _simd_mask_i32gather_ps(
@@ -1321,7 +1321,7 @@ struct PA_TESS : PA_STATE
     }
 
 #endif
-    void AssembleSingle(uint32_t slot, uint32_t primIndex, __m128 verts[])
+    void AssembleSingle(uint32_t slot, uint32_t primIndex, simd4scalar verts[])
     {
         SWR_ASSERT(slot < m_numAttributes);
         SWR_ASSERT(primIndex < PA_TESS::NumPrims());
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
index e710746296c..e53389b63fc 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp
@@ -34,103 +34,103 @@
 
 #if (KNOB_SIMD_WIDTH == 8)
 
-INLINE __m128 swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane0(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane1(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane2(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane3(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 0);
 }
 
-INLINE __m128 swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane4(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane5(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpacklo_ps(x, z);
     simdscalar tmp1 = _mm256_unpacklo_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane6(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpacklo_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
+INLINE simd4scalar swizzleLane7(const simdscalar &x, const simdscalar &y, const simdscalar &z, const simdscalar &w)
 {
     simdscalar tmp0 = _mm256_unpackhi_ps(x, z);
     simdscalar tmp1 = _mm256_unpackhi_ps(y, w);
     return _mm256_extractf128_ps(_mm256_unpackhi_ps(tmp0, tmp1), 1);
 }
 
-INLINE __m128 swizzleLane0(const simdvector &v)
+INLINE simd4scalar swizzleLane0(const simdvector &v)
 {
     return swizzleLane0(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane1(const simdvector &v)
+INLINE simd4scalar swizzleLane1(const simdvector &v)
 {
     return swizzleLane1(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane2(const simdvector &v)
+INLINE simd4scalar swizzleLane2(const simdvector &v)
 {
     return swizzleLane2(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane3(const simdvector &v)
+INLINE simd4scalar swizzleLane3(const simdvector &v)
 {
     return swizzleLane3(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane4(const simdvector &v)
+INLINE simd4scalar swizzleLane4(const simdvector &v)
 {
     return swizzleLane4(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane5(const simdvector &v)
+INLINE simd4scalar swizzleLane5(const simdvector &v)
 {
     return swizzleLane5(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane6(const simdvector &v)
+INLINE simd4scalar swizzleLane6(const simdvector &v)
 {
     return swizzleLane6(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLane7(const simdvector &v)
+INLINE simd4scalar swizzleLane7(const simdvector &v)
 {
     return swizzleLane7(v.x, v.y, v.z, v.w);
 }
 
-INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simdvector &v, int lane)
 {
     switch (lane)
     {
@@ -156,87 +156,87 @@ INLINE __m128 swizzleLaneN(const simdvector &v, int lane)
 }
 
 #if ENABLE_AVX512_SIMD16
-INLINE __m128 swizzleLane0(const simd16vector &v)
+INLINE simd4scalar swizzleLane0(const simd16vector &v)
 {
     return swizzleLane0(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane1(const simd16vector &v)
+INLINE simd4scalar swizzleLane1(const simd16vector &v)
 {
     return swizzleLane1(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane2(const simd16vector &v)
+INLINE simd4scalar swizzleLane2(const simd16vector &v)
 {
     return swizzleLane2(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane3(const simd16vector &v)
+INLINE simd4scalar swizzleLane3(const simd16vector &v)
 {
     return swizzleLane3(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane4(const simd16vector &v)
+INLINE simd4scalar swizzleLane4(const simd16vector &v)
 {
     return swizzleLane4(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane5(const simd16vector &v)
+INLINE simd4scalar swizzleLane5(const simd16vector &v)
 {
     return swizzleLane5(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane6(const simd16vector &v)
+INLINE simd4scalar swizzleLane6(const simd16vector &v)
 {
     return swizzleLane6(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane7(const simd16vector &v)
+INLINE simd4scalar swizzleLane7(const simd16vector &v)
 {
     return swizzleLane7(_simd16_extract_ps(v.x, 0), _simd16_extract_ps(v.y, 0), _simd16_extract_ps(v.z, 0), _simd16_extract_ps(v.w, 0));
 }
 
-INLINE __m128 swizzleLane8(const simd16vector &v)
+INLINE simd4scalar swizzleLane8(const simd16vector &v)
 {
     return swizzleLane0(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLane9(const simd16vector &v)
+INLINE simd4scalar swizzleLane9(const simd16vector &v)
 {
     return swizzleLane1(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneA(const simd16vector &v)
+INLINE simd4scalar swizzleLaneA(const simd16vector &v)
 {
     return swizzleLane2(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneB(const simd16vector &v)
+INLINE simd4scalar swizzleLaneB(const simd16vector &v)
 {
     return swizzleLane3(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneC(const simd16vector &v)
+INLINE simd4scalar swizzleLaneC(const simd16vector &v)
 {
     return swizzleLane4(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneD(const simd16vector &v)
+INLINE simd4scalar swizzleLaneD(const simd16vector &v)
 {
     return swizzleLane5(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneE(const simd16vector &v)
+INLINE simd4scalar swizzleLaneE(const simd16vector &v)
 {
     return swizzleLane6(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneF(const simd16vector &v)
+INLINE simd4scalar swizzleLaneF(const simd16vector &v)
 {
     return swizzleLane7(_simd16_extract_ps(v.x, 1), _simd16_extract_ps(v.y, 1), _simd16_extract_ps(v.z, 1), _simd16_extract_ps(v.w, 1));
 }
 
-INLINE __m128 swizzleLaneN(const simd16vector &v, int lane)
+INLINE simd4scalar swizzleLaneN(const simd16vector &v, int lane)
 {
     switch (lane)
     {
@@ -286,7 +286,7 @@ bool PaTriList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaTriStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -294,7 +294,7 @@ bool PaTriStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaTriStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaTriFan0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -302,7 +302,7 @@ bool PaTriFan1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaQuadList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -310,7 +310,7 @@ bool PaQuadList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaQuadList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaLineLoop0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -318,7 +318,7 @@ bool PaLineLoop1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineLoop0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaLineList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -326,7 +326,7 @@ bool PaLineList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaLineStrip0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -334,13 +334,13 @@ bool PaLineStrip1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaLineStrip0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaPoints0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 #if ENABLE_AVX512_SIMD16
 bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 bool PaRectList0(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
 bool PaRectList1(PA_STATE_OPT& pa, uint32_t slot, simdvector verts[]);
@@ -350,10 +350,10 @@ bool PaRectList0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaRectList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 bool PaRectList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]);
 #endif
-void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
+void PaRectListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[]);
 
 template <uint32_t TotalControlPoints>
-void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaPatchListSingle(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
     // We have an input of KNOB_SIMD_WIDTH * TotalControlPoints and we output
     // KNOB_SIMD_WIDTH * 1 patch.  This function is called once per attribute.
@@ -788,7 +788,7 @@ bool PaTriList2_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@@ -1057,7 +1057,7 @@ bool PaTriStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@@ -1325,7 +1325,7 @@ bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaTriFanSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.first, slot);
@@ -1491,7 +1491,7 @@ bool PaQuadList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaQuadListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@@ -1741,7 +1741,7 @@ bool PaLineLoop1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineLoopSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
     PaLineStripSingle0(pa, slot, primIndex, verts);
 
@@ -1855,7 +1855,7 @@ bool PaLineList1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineListSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@@ -2075,7 +2075,7 @@ bool PaLineStrip1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaLineStripSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, pa.prev, slot);
@@ -2239,7 +2239,7 @@ bool PaPoints0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[])
 }
 
 #endif
-void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[])
+void PaPointsSingle0(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, simd4scalar verts[])
 {
 #if USE_SIMD16_FRONTEND
     const simd16vector &a = PaGetSimdVector_simd16(pa, 0, slot);
@@ -2529,7 +2529,7 @@ void PaRectListSingle0(
     PA_STATE_OPT& pa,
     uint32_t slot,
     uint32_t primIndex,
-    __m128 verts[])
+    simd4scalar verts[])
 {
     // We have 12 simdscalars contained within 3 simdvectors which
     // hold at least 8 triangles worth of data. We want to assemble a single
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index 12a5f3d8ce1..c3d14e95092 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -199,15 +199,15 @@ struct StorePixels<32, 2>
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[2])
     {
         // Each 4-pixel row is 16-bytes
-        __m128i *pZRow01 = (__m128i*)pSrc;
-        __m128i vQuad00 = _mm_load_si128(pZRow01);
-        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+        simd4scalari *pZRow01 = (simd4scalari*)pSrc;
+        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
+        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
 
-        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
-        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
+        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
 
-        _mm_storeu_si128((__m128i*)ppDsts[0], vRow00);
-        _mm_storeu_si128((__m128i*)ppDsts[1], vRow10);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vRow00);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vRow10);
     }
 };
 
@@ -218,20 +218,20 @@ struct StorePixels<32, 4>
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
     {
         // 4 x 16 bytes = 64 bytes, 16 pixels
-        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
 
-        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
 
         // Unswizzle from SWR-Z order
-        __m128i quad0 = _mm_load_si128(&pSrc128[0]);                        // 0 1 2 3
-        __m128i quad1 = _mm_load_si128(&pSrc128[1]);                        // 4 5 6 7
-        __m128i quad2 = _mm_load_si128(&pSrc128[2]);                        // 8 9 A B
-        __m128i quad3 = _mm_load_si128(&pSrc128[3]);                        // C D E F
-
-        _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1));   // 0 1 4 5
-        _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1));   // 2 3 6 7
-        _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3));   // 8 9 C D
-        _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3));   // A B E F
+        simd4scalari quad0 = SIMD128::load_si(&pSrc128[0]);                        // 0 1 2 3
+        simd4scalari quad1 = SIMD128::load_si(&pSrc128[1]);                        // 4 5 6 7
+        simd4scalari quad2 = SIMD128::load_si(&pSrc128[2]);                        // 8 9 A B
+        simd4scalari quad3 = SIMD128::load_si(&pSrc128[3]);                        // C D E F
+
+        SIMD128::storeu_si(ppDsts128[0], SIMD128::unpacklo_epi64(quad0, quad1));   // 0 1 4 5
+        SIMD128::storeu_si(ppDsts128[1], SIMD128::unpackhi_epi64(quad0, quad1));   // 2 3 6 7
+        SIMD128::storeu_si(ppDsts128[2], SIMD128::unpacklo_epi64(quad2, quad3));   // 8 9 C D
+        SIMD128::storeu_si(ppDsts128[3], SIMD128::unpackhi_epi64(quad2, quad3));   // A B E F
     }
 };
 
@@ -251,10 +251,10 @@ struct StorePixels<64, 4>
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
     {
         // Each 4-pixel row is 32 bytes.
-        const __m128i* pPixSrc = (const __m128i*)pSrc;
+        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
 
         // order of pointers match SWR-Z layout
-        __m128i** pvDsts = (__m128i**)&ppDsts[0];
+        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
         *pvDsts[0] = pPixSrc[0];
         *pvDsts[1] = pPixSrc[1];
         *pvDsts[2] = pPixSrc[2];
@@ -269,9 +269,9 @@ struct StorePixels<64, 8>
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
     {
         // 8 x 16 bytes = 128 bytes, 16 pixels
-        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
 
-        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
 
         // order of pointers match SWR-Z layout
         *ppDsts128[0] = pSrc128[0];     // 0 1
@@ -301,10 +301,10 @@ struct StorePixels<128, 8>
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
     {
         // Each 4-pixel row is 64 bytes.
-        const __m128i* pPixSrc = (const __m128i*)pSrc;
+        const simd4scalari* pPixSrc = (const simd4scalari*)pSrc;
 
         // Unswizzle from SWR-Z order
-        __m128i** pvDsts = (__m128i**)&ppDsts[0];
+        simd4scalari** pvDsts = (simd4scalari**)&ppDsts[0];
         *pvDsts[0] = pPixSrc[0];
         *pvDsts[1] = pPixSrc[2];
         *pvDsts[2] = pPixSrc[1];
@@ -323,9 +323,9 @@ struct StorePixels<128, 16>
     static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
     {
         // 16 x 16 bytes = 256 bytes, 16 pixels
-        const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+        const simd4scalari *pSrc128 = reinterpret_cast<const simd4scalari *>(pSrc);
 
-        __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+        simd4scalari **ppDsts128 = reinterpret_cast<simd4scalari **>(ppDsts);
 
         for (uint32_t i = 0; i < 16; i += 4)
         {
@@ -563,8 +563,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
         temp = _simd16_permute_epi32(temp, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
 
         // merge/store data into destination but don't overwrite the X8 bits
-        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
-        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
+        simdscalari destlo = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]));
+        simdscalari desthi = _simd_loadu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]));
 
         simd16scalari dest = _simd16_setzero_si();
 
@@ -575,8 +575,8 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
 
         dest = _simd16_or_si(_simd16_andnot_si(mask, dest), _simd16_and_si(mask, temp));
 
-        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), _simd16_extract_si(dest, 0));
-        _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), _simd16_extract_si(dest, 1));
+        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[1]), reinterpret_cast<simd4scalari *>(ppDsts[0]), _simd16_extract_si(dest, 0));
+        _simd_storeu2_si(reinterpret_cast<simd4scalari *>(ppDsts[3]), reinterpret_cast<simd4scalari *>(ppDsts[2]), _simd16_extract_si(dest, 1));
 #else
         static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
 
@@ -593,25 +593,25 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
 
         // Store data into destination but don't overwrite the X8 bits
         // Each 4-pixel row is 16-bytes
-        __m128i *pZRow01 = (__m128i*)aosTile;
-        __m128i vQuad00 = _mm_load_si128(pZRow01);
-        __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
+        simd4scalari *pZRow01 = (simd4scalari*)aosTile;
+        simd4scalari vQuad00 = SIMD128::load_si(pZRow01);
+        simd4scalari vQuad01 = SIMD128::load_si(pZRow01 + 1);
 
-        __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
-        __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
+        simd4scalari vRow00 = SIMD128::unpacklo_epi64(vQuad00, vQuad01);
+        simd4scalari vRow10 = SIMD128::unpackhi_epi64(vQuad00, vQuad01);
 
-        __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
-        __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
+        simd4scalari vDst0 = SIMD128::loadu_si((const simd4scalari*)ppDsts[0]);
+        simd4scalari vDst1 = SIMD128::loadu_si((const simd4scalari*)ppDsts[1]);
 
-        __m128i vMask = _mm_set1_epi32(0xFFFFFF);
+        simd4scalari vMask = _mm_set1_epi32(0xFFFFFF);
 
-        vDst0 = _mm_andnot_si128(vMask, vDst0);
-        vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
-        vDst1 = _mm_andnot_si128(vMask, vDst1);
-        vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
+        vDst0 = SIMD128::andnot_si(vMask, vDst0);
+        vDst0 = SIMD128::or_si(vDst0, SIMD128::and_si(vRow00, vMask));
+        vDst1 = SIMD128::andnot_si(vMask, vDst1);
+        vDst1 = SIMD128::or_si(vDst1, SIMD128::and_si(vRow10, vMask));
 
-        _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
-        _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[0], vDst0);
+        SIMD128::storeu_si((simd4scalari*)ppDsts[1], vDst1);
 #endif
     }
 };
@@ -683,8 +683,8 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
     // store 8x2 memory order:
     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
 }
 
 #endif
@@ -736,15 +736,15 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
 
     // splitting into two sets of 4 wide integer vector types
     // because AVX doesn't have instructions to support this operation at 8 wide
-    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
-    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
-    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
-    __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
+    simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+    simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+    simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+    simd4scalari srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
 
-    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
-    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
-    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
-    __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
+    simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+    simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+    simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+    simd4scalari srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
 
     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
@@ -753,18 +753,18 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
     srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
     srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
-    srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
+    srcLo2 = SIMD128::or_si(srcLo2, srcLo3); // ab00ab00ab00ab00
 
-    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
-    srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
+    srcHi2 = SIMD128::or_si(srcHi2, srcHi3); // ab00ab00ab00ab00
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
-    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // abgrabgrabgrabgr
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // abgrabgrabgrabgr
 
     // unpack into rows that get the tiling order correct
-    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
-    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+    simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // abgrabgrabgrabgrabgrabgrabgrabgr
+    simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
 
     simdscalari final = _mm256_castsi128_si256(vRow00);
     final = _mm256_insertf128_si256(final, vRow10, 1);
@@ -785,7 +785,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
     final = _mm256_permute4x64_epi64(final, 0xD8);
 #endif
 
-    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+    _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
 }
 
 #if USE_8x2_TILE_BACKEND
@@ -848,8 +848,8 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8
     // store 8x2 memory order:
     //  row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
     //  row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
-    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst1), reinterpret_cast<simd4scalari *>(pDst0), _simd16_extract_si(final, 0));
+    _simd_storeu2_si(reinterpret_cast<simd4scalari *>(pDst3), reinterpret_cast<simd4scalari *>(pDst2), _simd16_extract_si(final, 1));
 }
 
 #endif
@@ -894,29 +894,29 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
 
     // splitting into two sets of 4 wide integer vector types
     // because AVX doesn't have instructions to support this operation at 8 wide
-    __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
-    __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
-    __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
+    simd4scalari srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
+    simd4scalari srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
+    simd4scalari srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
 
-    __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
-    __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
-    __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
+    simd4scalari srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
+    simd4scalari srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
+    simd4scalari srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
 
     srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
     srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
     srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
     srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo1); // 00gr00gr00gr00gr
 
-    srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi1); // 00gr00gr00gr00gr
 
-    srcLo0 = _mm_or_si128(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
-    srcHi0 = _mm_or_si128(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
+    srcLo0 = SIMD128::or_si(srcLo0, srcLo2); // 0bgr0bgr0bgr0bgr
+    srcHi0 = SIMD128::or_si(srcHi0, srcHi2); // 0bgr0bgr0bgr0bgr
 
     // unpack into rows that get the tiling order correct
-    __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
-    __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
+    simd4scalari vRow00 = SIMD128::unpacklo_epi64(srcLo0, srcHi0);  // 0bgr0bgr0bgr0bgr0bgr0bgr0bgr0bgr
+    simd4scalari vRow10 = SIMD128::unpackhi_epi64(srcLo0, srcHi0);
 
     simdscalari final = _mm256_castsi128_si256(vRow00);
     final = _mm256_insertf128_si256(final, vRow10, 1);
@@ -936,7 +936,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
 
 #endif
 
-    _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
+    _simd_storeu2_si((simd4scalari*)pDst1, (simd4scalari*)pDst, final);
 }
 
 template<>
author	Tim Rowley <[email protected]>	2017-06-15 15:24:07 -0500
committer	Tim Rowley <[email protected]>	2017-06-30 13:26:19 -0500
commit	fc4f6c44c479a97b9cad5d08f0d9cd71a8e1e5f8 (patch)
tree	a8ea649f549dc856f402b0b5d9323c5cef080e34 /src/gallium/drivers/swr
parent	8b66d18a3b4f6d6a4f0ea9d71459dac68e5e0295 (diff)