swr/rast: Switch intrinsic usage to SIMDLib

Switch from a macro-based SIMD intrinsics layer to a more idiomatic C++ implementation, which also adds AVX512 optimizations to 128-bit and 256-bit SIMD.

Reviewed-by: Bruce Cherniak <bruce.cherniak at intel.com>
author: Tim Rowley <[email protected]> 2017-06-15 15:24:07 -0500
committer: Tim Rowley <[email protected]> 2017-06-30 13:26:19 -0500
commit: fc4f6c44c479a97b9cad5d08f0d9cd71a8e1e5f8 (patch)
tree: a8ea649f549dc856f402b0b5d9323c5cef080e34 /src/gallium/drivers/swr/rasterizer/common
parent: 8b66d18a3b4f6d6a4f0ea9d71459dac68e5e0295 (diff)
16 files changed, 5724 insertions, 2213 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/intrin.h b/src/gallium/drivers/swr/rasterizer/common/intrin.h
index f45b2e55880..33d37e3cece 100644
--- a/src/gallium/drivers/swr/rasterizer/common/intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/intrin.h
@@ -26,89 +26,37 @@
 
 #include "os.h"
 
-#include <cassert>
-
-#include <emmintrin.h>
-#include <immintrin.h>
-#include <xmmintrin.h>
+#define SIMD_ARCH KNOB_ARCH
+#include "simdlib_types.hpp"
+
+typedef SIMDImpl::SIMD128Impl::Float                      simd4scalar;
+typedef SIMDImpl::SIMD128Impl::Double                     simd4scalard;
+typedef SIMDImpl::SIMD128Impl::Integer                    simd4scalari;
+typedef SIMDImpl::SIMD128Impl::Vec4                       simd4vector;
+typedef SIMDImpl::SIMD128Impl::Mask                       simd4mask;
+
+typedef SIMDImpl::SIMD256Impl::Float                      simd8scalar;
+typedef SIMDImpl::SIMD256Impl::Double                     simd8scalard;
+typedef SIMDImpl::SIMD256Impl::Integer                    simd8scalari;
+typedef SIMDImpl::SIMD256Impl::Vec4                       simd8vector;
+typedef SIMDImpl::SIMD256Impl::Mask                       simd8mask;
+
+typedef SIMDImpl::SIMD512Impl::Float                      simd16scalar;
+typedef SIMDImpl::SIMD512Impl::Double                     simd16scalard;
+typedef SIMDImpl::SIMD512Impl::Integer                    simd16scalari;
+typedef SIMDImpl::SIMD512Impl::Vec4                       simd16vector;
+typedef SIMDImpl::SIMD512Impl::Mask                       simd16mask;
 
 #if KNOB_SIMD_WIDTH == 8 
-typedef __m256 simdscalar;
-typedef __m256i simdscalari;
-typedef uint8_t simdmask;
-#else
-#error Unsupported vector width
-#endif
-
-// simd vector
-OSALIGNSIMD(union) simdvector
-{
-    simdscalar  v[4];
-    struct
-    {
-        simdscalar x, y, z, w;
-    };
-
-    simdscalar& operator[] (const int i) { return v[i]; }
-    const simdscalar& operator[] (const int i) const { return v[i]; }
-};
-
-#if ENABLE_AVX512_SIMD16
-
-#if KNOB_SIMD16_WIDTH == 16
-
-#if ENABLE_AVX512_EMULATION
-struct simd16scalar
-{
-    __m256  lo;
-    __m256  hi;
-};
-struct simd16scalard
-{
-    __m256d lo;
-    __m256d hi;
-};
-struct simd16scalari
-{
-    __m256i lo;
-    __m256i hi;
-};
-typedef uint16_t simd16mask;
-
-#else
-typedef __m512 simd16scalar;
-typedef __m512d simd16scalard;
-typedef __m512i simd16scalari;
-typedef __mmask16 simd16mask;
-#endif//ENABLE_AVX512_EMULATION
+typedef simd8scalar     simdscalar;
+typedef simd8scalard    simdscalard;
+typedef simd8scalari    simdscalari;
+typedef simd8vector     simdvector;
+typedef simd8mask       simdmask;
 #else
 #error Unsupported vector width
-#endif//KNOB_SIMD16_WIDTH == 16
-
-#define _simd16_masklo(mask) ((mask) & 0xFF)
-#define _simd16_maskhi(mask) (((mask) >> 8) & 0xFF)
-#define _simd16_setmask(hi, lo) (((hi) << 8) | (lo))
-
-#if defined(_WIN32)
-#define SIMDAPI __vectorcall
-#else
-#define SIMDAPI
 #endif
 
-OSALIGN(union, KNOB_SIMD16_BYTES) simd16vector
-{
-    simd16scalar  v[4];
-    struct
-    {
-        simd16scalar x, y, z, w;
-    };
-
-    simd16scalar& operator[] (const int i) { return v[i]; }
-    const simd16scalar& operator[] (const int i) const { return v[i]; }
-};
-
-#endif // ENABLE_AVX512_SIMD16
-
 INLINE
 UINT pdep_u32(UINT a, UINT mask)
 {
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index a822420ae37..29151682e07 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -26,1096 +26,141 @@
 
 #if ENABLE_AVX512_SIMD16
 
-#if ENABLE_AVX512_EMULATION
-
-#define SIMD16_EMU_AVX512_0(type, func, intrin) \
-INLINE type SIMDAPI func()\
-{\
-    type result;\
-\
-    result.lo = intrin();\
-    result.hi = intrin();\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_1(type, func, intrin) \
-INLINE type SIMDAPI func(type a)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo);\
-    result.hi = intrin(a.hi);\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_2(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo, b.lo);\
-    result.hi = intrin(a.hi, b.hi);\
-\
-    return result;\
-}
-
-#define SIMD16_EMU_AVX512_3(type, func, intrin) \
-INLINE type SIMDAPI func(type a, type b, type c)\
-{\
-    type result;\
-\
-    result.lo = intrin(a.lo, b.lo, c.lo);\
-    result.hi = intrin(a.hi, b.hi, c.hi);\
-\
-    return result;\
-}
-
-SIMD16_EMU_AVX512_0(simd16scalar, _simd16_setzero_ps, _mm256_setzero_ps)
-SIMD16_EMU_AVX512_0(simd16scalari, _simd16_setzero_si, _mm256_setzero_si256)
-
-INLINE simd16scalar SIMDAPI _simd16_set1_ps(float a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set1_ps(a);
-    result.hi = _mm256_set1_ps(a);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi8(char a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set1_epi8(a);
-    result.hi = _mm256_set1_epi8(a);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set1_epi32(int a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set1_epi32(a);
-    result.hi = _mm256_set1_epi32(a);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_ps(e15, e14, e13, e12, e11, e10, e9, e8);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_ps(e7, e6, e5, e4, e3, e2, e1, e0);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-    result.hi = _mm256_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load_ps(float const *m)
-{
-    simd16scalar result;
-
-    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
-    result.lo = _mm256_load_ps(m);
-    result.hi = _mm256_load_ps(n);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_loadu_ps(float const *m)
-{
-    simd16scalar result;
-
-    float const *n = reinterpret_cast<float const *>(reinterpret_cast<uint8_t const *>(m) + sizeof(result.lo));
-
-    result.lo = _mm256_loadu_ps(m);
-    result.hi = _mm256_loadu_ps(n);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_load1_ps(float const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ss(m);
-    result.hi = _mm256_broadcast_ss(m);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_load_si(simd16scalari const *m)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_load_si256(&m[0].lo);
-    result.hi = _mm256_load_si256(&m[0].hi);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_loadu_si(simd16scalari const *m)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_loadu_si256(&m[0].lo);
-    result.hi = _mm256_loadu_si256(&m[0].hi);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ss(float const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ss(m);
-    result.hi = _mm256_broadcast_ss(m);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_broadcast_ps(__m128 const *m)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_broadcast_ps(m);
-    result.hi = _mm256_broadcast_ps(m);
-
-    return result;
-}
-
-INLINE void SIMDAPI _simd16_store_ps(float *m, simd16scalar a)
-{
-    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
-    _mm256_store_ps(m, a.lo);
-    _mm256_store_ps(n, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
-    float *n = reinterpret_cast<float *>(reinterpret_cast<uint8_t *>(m) + sizeof(a.lo));
-
-    _mm256_maskstore_ps(m, mask.lo, a.lo);
-    _mm256_maskstore_ps(n, mask.hi, a.hi);
-}
-
-INLINE void SIMDAPI _simd16_store_si(simd16scalari *m, simd16scalari a)
-{
-    _mm256_store_si256(&m[0].lo, a.lo);
-    _mm256_store_si256(&m[0].hi, a.hi);
-}
-
-INLINE simdscalar SIMDAPI _simd16_extract_ps(simd16scalar a, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        return a.lo;
-    case 1:
-        return a.hi;
-    }
-    return _simd_set1_ps(0.0f);
-}
-
-INLINE simdscalari SIMDAPI _simd16_extract_si(simd16scalari a, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        return a.lo;
-    case 1:
-        return a.hi;
-    }
-    return _simd_set1_epi32(0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_insert_ps(simd16scalar a, simdscalar b, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        a.lo = b;
-        break;
-    case 1:
-        a.hi = b;
-        break;
-    }
-    return a;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_insert_si(simd16scalari a, simdscalari b, int imm8)
-{
-    switch (imm8)
-    {
-    case 0:
-        a.lo = b;
-        break;
-    case 1:
-        a.hi = b;
-        break;
-    }
-    return a;
-}
-
-template <simd16mask mask>
-INLINE simd16scalar SIMDAPI _simd16_blend_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_blend_ps(a.lo, b.lo, _simd16_masklo(mask));
-    result.hi = _mm256_blend_ps(a.hi, b.hi, _simd16_maskhi(mask));
-
-    return result;
-}
-
-#define _simd16_blend_ps(a, b, mask) _simd16_blend_ps_temp<mask>(a, b)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_blendv_ps, _mm256_blendv_ps)
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), mask.lo));
-    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), mask.hi));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.lo), _mm256_castsi256_ps(b.lo), _mm256_castsi256_ps(mask.lo)));
-    result.hi = _mm256_castps_si256(_mm256_blendv_ps(_mm256_castsi256_ps(a.hi), _mm256_castsi256_ps(b.hi), _mm256_castsi256_ps(mask.hi)));
-
-    return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_mul_ps, _mm256_mul_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_add_ps, _mm256_add_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_sub_ps, _mm256_sub_ps)
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rsqrt_ps, _mm256_rsqrt_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_min_ps, _mm256_min_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_max_ps, _mm256_max_ps)
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
-    simdmask mask_lo = _mm256_movemask_ps(a.lo);
-    simdmask mask_hi = _mm256_movemask_ps(a.hi);
-
-    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 8);
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
-    simdmask mask_lo = _mm256_movemask_pd(a.lo);
-    simdmask mask_hi = _mm256_movemask_pd(a.hi);
-
-    return static_cast<simd16mask>(mask_lo) | (static_cast<simd16mask>(mask_hi) << 4);
-}
-
-INLINE uint64_t SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
-    uint32_t mask_lo = _mm256_movemask_epi8(a.lo);
-    uint32_t mask_hi = _mm256_movemask_epi8(a.hi);
-
-    return static_cast<uint64_t>(mask_lo) | (static_cast<uint64_t>(mask_hi) << 32);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtps_epi32(simd16scalar a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_cvtps_epi32(a.lo);
-    result.hi = _mm256_cvtps_epi32(a.hi);
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvttps_epi32(simd16scalar a)
-{
-    simd16scalari result;
-
-    result.lo = _mm256_cvttps_epi32(a.lo);
-    result.hi = _mm256_cvttps_epi32(a.hi);
-
-    return result;
-}
-
-INLINE simd16scalar SIMDAPI _simd16_cvtepi32_ps(simd16scalari a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_cvtepi32_ps(a.lo);
-    result.hi = _mm256_cvtepi32_ps(a.hi);
-
-    return result;
-}
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_cmp_ps(a.lo, b.lo, comp);
-    result.hi = _mm256_cmp_ps(a.hi, b.hi, comp);
-
-    return result;
-}
-
-#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps)
-
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
-
-INLINE simd16scalar SIMDAPI _simd16_castsi_ps(simd16scalari a)
-{
-    return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castps_si(simd16scalar a)
-{
-    return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castsi_pd(simd16scalari a)
-{
-    return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_castpd_si(simd16scalard a)
-{
-    return *reinterpret_cast<simd16scalari *>(&a);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_castpd_ps(simd16scalard a)
-{
-    return *reinterpret_cast<simd16scalar *>(&a);
-}
-
-INLINE simd16scalard SIMDAPI _simd16_castps_pd(simd16scalar a)
-{
-    return *reinterpret_cast<simd16scalard *>(&a);
-}
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
-    simd16scalar result;
-
-    result.lo = _mm256_round_ps(a.lo, mode);
-    result.hi = _mm256_round_ps(a.hi, mode);
-
-    return result;
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
-    int lo = _simd_testz_ps(a.lo, b.lo);
-    int hi = _simd_testz_ps(a.hi, b.hi);
-
-    return lo & hi;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
-SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi64, _simd_unpackhi_epi64)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_slli_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_slli_epi32(a.lo, imm8);
-    result.hi = _simd_slli_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_slli_epi32(a, imm8) _simd16_slli_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srai_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_srai_epi32(a.lo, imm8);
-    result.hi = _simd_srai_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_srai_epi32(a, imm8) _simd16_srai_epi32_temp<imm8>(a)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_srli_epi32_temp(simd16scalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_srli_epi32(a.lo, imm8);
-    result.hi = _simd_srli_epi32(a.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
-
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
-{
-    simd16scalar result;
-
-    result.lo = _simd_i32gather_ps(m, index.lo, scale);
-    result.hi = _simd_i32gather_ps(m, index.hi, scale);
-
-    return result;
-}
-
-#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
-    simd16scalar result;
-
-    result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
-    result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);
-
-    return result;
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
-SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
-
-INLINE simd16scalar SIMDAPI _simd16_permute_ps(simd16scalar a, simd16scalari i)
-{
-    simd16scalar result;
-
-    const simdscalari mask = _simd_set1_epi32(7);
-
-    simdscalar lolo = _simd_permute_ps(a.lo, _simd_and_si(i.lo, mask));
-    simdscalar lohi = _simd_permute_ps(a.hi, _simd_and_si(i.lo, mask));
-
-    simdscalar hilo = _simd_permute_ps(a.lo, _simd_and_si(i.hi, mask));
-    simdscalar hihi = _simd_permute_ps(a.hi, _simd_and_si(i.hi, mask));
-
-    result.lo = _simd_blendv_ps(lolo, lohi, _simd_castsi_ps(_simd_cmpgt_epi32(i.lo, mask)));
-    result.hi = _simd_blendv_ps(hilo, hihi, _simd_castsi_ps(_simd_cmpgt_epi32(i.hi, mask)));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_permute_epi32(simd16scalari a, simd16scalari i)
-{
-    return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _simd_permute2f128_ps(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_ps(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_ps(a, b, imm8) _simd16_permute2f128_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_permute2f128_pd_temp(simd16scalard a, simd16scalard b)
-{
-    simd16scalard result;
-
-    result.lo = _simd_permute2f128_pd(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_pd(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_pd(a, b, imm8) _simd16_permute2f128_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_permute2f128_si_temp(simd16scalari a, simd16scalari b)
-{
-    simd16scalari result;
-
-    result.lo = _simd_permute2f128_si(a.lo, a.hi, ((imm8 & 0x03) << 0) | ((imm8 & 0x0C) << 2));
-    result.hi = _simd_permute2f128_si(b.lo, b.hi, ((imm8 & 0x30) >> 4) | ((imm8 & 0xC0) >> 2));
-
-    return result;
-}
-
-#define _simd16_permute2f128_si(a, b, imm8) _simd16_permute2f128_si_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalar SIMDAPI _simd16_shuffle_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16scalar result;
-
-    result.lo = _simd_shuffle_ps(a.lo, b.lo, imm8);
-    result.hi = _simd_shuffle_ps(a.hi, b.hi, imm8);
-
-    return result;
-}
-
-#define _simd16_shuffle_ps(a, b, imm8) _simd16_shuffle_ps_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalard SIMDAPI _simd16_shuffle_pd_temp(simd16scalard a, simd16scalard b)
-{
-    simd16scalard result;
-
-    result.lo = _simd_shuffle_pd(a.lo, b.lo, (imm8 & 15));
-    result.hi = _simd_shuffle_pd(a.hi, b.hi, (imm8 >> 4));
-
-    return result;
-}
-
-#define _simd16_shuffle_pd(a, b, imm8) _simd16_shuffle_pd_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi16(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu8_epi32(__m128i a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu8_epi32(a);
-    result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi32(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu16_epi64(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu16_epi64(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu16_epi64(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cvtepu32_epi64(simdscalari a)
-{
-    simd16scalari result;
-
-    result.lo = _simd_cvtepu32_epi64(_mm256_extractf128_si256(a, 0));
-    result.hi = _simd_cvtepu32_epi64(_mm256_extractf128_si256(a, 1));
-
-    return result;
-}
-
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi16, _simd_packus_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi16, _simd_packs_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packus_epi32, _simd_packus_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_packs_epi32, _simd_packs_epi32)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
-    return mask;
-}
-
-INLINE int SIMDAPI SIMDAPI _simd16_mask2int(simd16mask mask)
-{
-    return mask;
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
-    return _simd16_movemask_ps(_simd16_cmplt_ps(a, b));
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
-    simd16scalari temp = _simd16_set1_epi32(mask);
-
-    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
-    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
-    return _simd16_castsi_ps(result);
-}
-
+#if KNOB_SIMD16_WIDTH == 16
+typedef SIMD512                             SIMD16;
 #else
-
-INLINE simd16mask SIMDAPI _simd16_scalari2mask(simd16scalari mask)
-{
-    return _mm512_cmpneq_epu32_mask(mask, _mm512_setzero_epi32());
-}
-
-INLINE simd16mask SIMDAPI _simd16_scalard2mask(simd16scalard mask)
-{
-    return _mm512_cmpneq_epu64_mask(_mm512_castpd_si512(mask), _mm512_setzero_si512());
-}
-
-#define _simd16_setzero_ps      _mm512_setzero_ps
-#define _simd16_setzero_si      _mm512_setzero_si512
-#define _simd16_set1_ps         _mm512_set1_ps
-#define _simd16_set1_epi8       _mm512_set1_epi8
-#define _simd16_set1_epi32      _mm512_set1_epi32
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e15, float e14, float e13, float e12, float e11, float e10, float e9, float e8, float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    return _mm512_set_ps(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e15, int e14, int e13, int e12, int e11, int e10, int e9, int e8, int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    return _mm512_set_epi32(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalar SIMDAPI _simd16_set_ps(float e7, float e6, float e5, float e4, float e3, float e2, float e1, float e0)
-{
-    return _mm512_set_ps(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, int e2, int e1, int e0)
-{
-    return _mm512_set_epi32(e7, e6, e5, e4, e3, e2, e1, e0, e7, e6, e5, e4, e3, e2, e1, e0);
-}
-
-#define _simd16_load_ps         _mm512_load_ps
-#define _simd16_loadu_ps        _mm512_loadu_ps
-#if 1
-#define _simd16_load1_ps        _simd16_broadcast_ss
-#endif
-#define _simd16_load_si         _mm512_load_si512
-#define _simd16_loadu_si        _mm512_loadu_si512
-#define _simd16_broadcast_ss(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_1X16, 0)
-#define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0)
-#define _simd16_store_ps        _mm512_store_ps
-#define _simd16_store_si        _mm512_store_si512
-#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8))
-#define _simd16_extract_si      _mm512_extracti64x4_epi64
-#define _simd16_insert_ps(a, b, imm8)  _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8))
-#define _simd16_insert_si       _mm512_inserti64x4
-
-INLINE void SIMDAPI _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a)
-{
-    simd16mask k = _simd16_scalari2mask(mask);
-
-    _mm512_mask_store_ps(m, k, a);
-}
-
-#define _simd16_blend_ps(a, b, mask)    _mm512_mask_blend_ps(mask, a, b)
-
-INLINE simd16scalar SIMDAPI _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd16scalar mask)
-{
-    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
-    return _mm512_mask_blend_ps(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask)
-{
-    simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask));
-
-    return _mm512_mask_blend_epi32(k, a, b);
-}
-
-INLINE simd16scalari SIMDAPI _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask)
-{
-    simd16mask k = _simd16_scalari2mask(mask);
-
-    return _mm512_mask_blend_epi32(k, a, b);
-}
-
-#define _simd16_mul_ps          _mm512_mul_ps
-#define _simd16_div_ps          _mm512_div_ps
-#define _simd16_add_ps          _mm512_add_ps
-#define _simd16_sub_ps          _mm512_sub_ps
-#define _simd16_rsqrt_ps        _mm512_rsqrt14_ps
-#define _simd16_min_ps          _mm512_min_ps
-#define _simd16_max_ps          _mm512_max_ps
-
-INLINE simd16mask SIMDAPI _simd16_movemask_ps(simd16scalar a)
-{
-    // movemask_ps only checks the top bit of the float single elements
-    return  _simd16_scalari2mask(_mm512_and_si512(_mm512_castps_si512(a), _mm512_set1_epi32(0x80000000)));
-}
-
-INLINE simd16mask SIMDAPI _simd16_movemask_pd(simd16scalard a)
-{
-    // movemask_pd only checks the top bit of the float double elements
-    return  _simd16_scalard2mask(_mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(a), _mm512_set1_epi64(0x8000000000000000))));
-}
-
-#if 0
-INLINE int SIMDAPI _simd16_movemask_epi8(simd16scalari a)
-{
-    return  _simd16_scalar2mask(a);
-}
-#endif
-
-#define _simd16_cvtps_epi32     _mm512_cvtps_epi32
-#define _simd16_cvttps_epi32    _mm512_cvttps_epi32
-#define _simd16_cvtepi32_ps     _mm512_cvtepi32_ps
-
-template <int comp>
-INLINE simd16scalar SIMDAPI _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b)
-{
-    simd16mask k = _mm512_cmp_ps_mask(a, b, comp);
-
-    return _mm512_castsi512_ps(_mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF)));
-}
-
-#define _simd16_cmp_ps(a, b, comp)  _simd16_cmp_ps_temp<comp>(a, b)
-
-#define _simd16_cmplt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd16_cmpgt_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd16_cmpneq_ps(a, b)     _simd16_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd16_cmpeq_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd16_cmpge_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd16_cmple_ps(a, b)      _simd16_cmp_ps(a, b, _CMP_LE_OQ)
-
-#define _simd16_castsi_ps           _mm512_castsi512_ps
-#define _simd16_castps_si           _mm512_castps_si512
-#define _simd16_castsi_pd           _mm512_castsi512_pd
-#define _simd16_castpd_si           _mm512_castpd_si512
-#define _simd16_castpd_ps           _mm512_castpd_ps
-#define _simd16_castps_pd           _mm512_castps_pd
-
-// _mm512_and_ps (and other bitwise operations) exist in AVX512DQ,
-// while the functionally equivalent _mm512_and_epi32 is in AVX512F.
-// Define the _simd16_*_ps versions in terms of AVX512F for broader
-// support.
-#define _simd16_logicop_ps(a, b, op) _simd16_castsi_ps(op##_epi32(_simd16_castps_si(a), _simd16_castps_si(b)))
-
-#define _simd16_and_ps(a, b)        _simd16_logicop_ps(a, b, _mm512_and)
-#define _simd16_andnot_ps(a, b)     _simd16_logicop_ps(a, b, _mm512_andnot)
-#define _simd16_or_ps(a, b)         _simd16_logicop_ps(a, b, _mm512_or)
-#define _simd16_xor_ps(a, b)        _simd16_logicop_ps(a, b, _mm512_xor)
-
-template <int mode>
-INLINE simd16scalar SIMDAPI _simd16_round_ps_temp(simd16scalar a)
-{
-    return _mm512_roundscale_ps(a, mode);
-}
-
-#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-
-#define _simd16_mul_epi32         _mm512_mul_epi32
-#define _simd16_mullo_epi32       _mm512_mullo_epi32
-#define _simd16_sub_epi32         _mm512_sub_epi32
-#define _simd16_sub_epi64         _mm512_sub_epi64
-#define _simd16_min_epi32         _mm512_min_epi32
-#define _simd16_max_epi32         _mm512_max_epi32
-#define _simd16_min_epu32         _mm512_min_epu32
-#define _simd16_max_epu32         _mm512_max_epu32
-#define _simd16_add_epi32         _mm512_add_epi32
-
-#define _simd16_and_si            _mm512_and_si512
-#define _simd16_andnot_si         _mm512_andnot_si512
-#define _simd16_or_si             _mm512_or_si512
-#define _simd16_xor_si            _mm512_xor_si512
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmpeq_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmpgt_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmplt_epi32(simd16scalari a, simd16scalari b)
-{
-    simd16mask k = _mm512_cmplt_epi32_mask(a, b);
-
-    return _mm512_mask_blend_epi32(k, _mm512_setzero_epi32(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE int SIMDAPI _simd16_testz_ps(simd16scalar a, simd16scalar b)
-{
-    int lo = _simd_testz_ps(_simd16_extract_ps(a, 0), _simd16_extract_ps(b, 0));
-    int hi = _simd_testz_ps(_simd16_extract_ps(a, 1), _simd16_extract_ps(b, 1));
-
-    return lo & hi;
-}
-
-#define _simd16_unpacklo_ps       _mm512_unpacklo_ps
-#define _simd16_unpackhi_ps       _mm512_unpackhi_ps
-#define _simd16_unpacklo_pd       _mm512_unpacklo_pd
-#define _simd16_unpackhi_pd       _mm512_unpackhi_pd
-#define _simd16_unpacklo_epi8     _mm512_unpacklo_epi8
-#define _simd16_unpackhi_epi8     _mm512_unpackhi_epi8
-#define _simd16_unpacklo_epi16    _mm512_unpacklo_epi16
-#define _simd16_unpackhi_epi16    _mm512_unpackhi_epi16
-#define _simd16_unpacklo_epi32    _mm512_unpacklo_epi32
-#define _simd16_unpackhi_epi32    _mm512_unpackhi_epi32
-#define _simd16_unpacklo_epi64    _mm512_unpacklo_epi64
-#define _simd16_unpackhi_epi64    _mm512_unpackhi_epi64
-#define _simd16_slli_epi32        _mm512_slli_epi32
-#define _simd16_srli_epi32        _mm512_srli_epi32
-#define _simd16_srai_epi32        _mm512_srai_epi32
-#define _simd16_fmadd_ps          _mm512_fmadd_ps
-#define _simd16_fmsub_ps          _mm512_fmsub_ps
-#define _simd16_adds_epu8         _mm512_adds_epu8
-#define _simd16_subs_epu8         _mm512_subs_epu8
-#define _simd16_add_epi8          _mm512_add_epi8
-#define _simd16_shuffle_epi8      _mm512_shuffle_epi8
-
-#define _simd16_fmadd_ps          _mm512_fmadd_ps
-#define _simd16_fmsub_ps          _mm512_fmsub_ps
-
-#define _simd16_i32gather_ps(m, index, scale)               _mm512_i32gather_ps(index, m, scale)
-
-template <int scale>
-INLINE simd16scalar SIMDAPI _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
-{
-    __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512());
-
-    return _mm512_mask_i32gather_ps(a, k, index, m, scale);
-}
-
-#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask)
-
-#define _simd16_abs_epi32         _mm512_abs_epi32
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi64(simd16scalari a, simd16scalari b)
-{
-    __mmask8 k = _mm512_cmpeq_epi64_mask(a, b);
-
-    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi64(simd16scalari a, simd16scalari b)
-{
-    __mmask8 k = _mm512_cmpgt_epi64_mask(a, b);
-
-    return _mm512_mask_blend_epi64(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi16(simd16scalari a, simd16scalari b)
-{
-    __mmask32 k = _mm512_cmpeq_epi16_mask(a, b);
-
-    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi16(simd16scalari a, simd16scalari b)
-{
-    __mmask32 k = _mm512_cmpgt_epi16_mask(a, b);
-
-    return _mm512_mask_blend_epi16(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpeq_epi8(simd16scalari a, simd16scalari b)
-{
-    __mmask64 k = _mm512_cmpeq_epi8_mask(a, b);
-
-    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-INLINE simd16scalari SIMDAPI _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
-{
-    __mmask64 k = _mm512_cmpgt_epi8_mask(a, b);
-
-    return _mm512_mask_blend_epi8(k, _mm512_setzero_si512(), _mm512_set1_epi32(0xFFFFFFFF));
-}
-
-#define _simd16_permute_ps(a, i)        _mm512_permutexvar_ps(i, a)
-#define _simd16_permute_epi32(a, i)     _mm512_permutexvar_epi32(i, a)
-#define _simd16_sllv_epi32              _mm512_srlv_epi32
-#define _simd16_srlv_epi32              _mm512_sllv_epi32
-#define _simd16_permute2f128_ps         _mm512_shuffle_f32x4
-#define _simd16_permute2f128_pd         _mm512_shuffle_f64x2
-#define _simd16_permute2f128_si         _mm512_shuffle_i32x4
-#define _simd16_shuffle_ps              _mm512_shuffle_ps
-#define _simd16_shuffle_pd              _mm512_shuffle_pd
-#define _simd16_cvtepu8_epi16           _mm512_cvtepu8_epi16
-#define _simd16_cvtepu8_epi32           _mm512_cvtepu8_epi32
-#define _simd16_cvtepu16_epi32          _mm512_cvtepu16_epi32
-#define _simd16_cvtepu16_epi64          _mm512_cvtepu16_epi64
-#define _simd16_cvtepu32_epi64          _mm512_cvtepu32_epi64
-#define _simd16_packus_epi16            _mm512_packus_epi16
-#define _simd16_packs_epi16             _mm512_packs_epi16
-#define _simd16_packus_epi32            _mm512_packus_epi32
-#define _simd16_packs_epi32             _mm512_packs_epi32
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castps_si(_simd16_shuffle_ps(_simd16_castsi_ps(a), _simd16_castsi_ps(b), imm8));
-}
-
-#define _simd16_shuffle_epi32(a, b, imm8) _simd16_shuffle_epi32_temp<imm8>(a, b)
-
-template <int imm8>
-INLINE simd16scalari SIMDAPI _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b)
-{
-    return _simd16_castpd_si(_simd16_shuffle_pd(_simd16_castsi_pd(a), _simd16_castsi_pd(b), imm8));
-}
-
-#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
-
-INLINE simd16mask SIMDAPI _simd16_int2mask(int mask)
-{
-    return _mm512_int2mask(mask);
-}
-
-INLINE int SIMDAPI _simd16_mask2int(simd16mask mask)
-{
-    return _mm512_mask2int(mask);
-}
-
-INLINE simd16mask SIMDAPI _simd16_cmplt_ps_mask(simd16scalar a, simd16scalar b)
-{
-    return _mm512_cmplt_ps_mask(a, b);
-}
-
-// convert bitmask to vector mask
-INLINE simd16scalar SIMDAPI vMask16(int32_t mask)
-{
-    simd16scalari temp = _simd16_set1_epi32(mask);
-
-    simd16scalari bits = _simd16_set_epi32(0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100, 0x0080, 0x0040, 0x0020, 0x0010, 0x0008, 0x0004, 0x0002, 0x0001);
-
-    simd16scalari result = _simd16_cmplt_epi32(_simd16_setzero_si(), _simd16_and_si(temp, bits));
-
-    return _simd16_castsi_ps(result);
-}
-
-#endif//ENABLE_AVX512_EMULATION
+#error Unsupported vector width
+#endif//KNOB_SIMD16_WIDTH == 16
+
+#define _simd16_setzero_ps                  SIMD16::setzero_ps
+#define _simd16_setzero_si                  SIMD16::setzero_si
+#define _simd16_set1_ps                     SIMD16::set1_ps
+#define _simd16_set1_epi8                   SIMD16::set1_epi8
+#define _simd16_set1_epi32                  SIMD16::set1_epi32
+#define _simd16_set_ps                      SIMD16::set_ps
+#define _simd16_set_epi32                   SIMD16::set_epi32
+#define _simd16_load_ps                     SIMD16::load_ps
+#define _simd16_loadu_ps                    SIMD16::loadu_ps
+#if 1                                       
+#define _simd16_load1_ps                    SIMD16::broadcast_ss
+#endif                                      
+#define _simd16_load_si                     SIMD16::load_si
+#define _simd16_loadu_si                    SIMD16::loadu_si
+#define _simd16_broadcast_ss(m)             SIMD16::broadcast_ss((float const*)(m))
+#define _simd16_store_ps                    SIMD16::store_ps
+#define _simd16_store_si                    SIMD16::store_si
+#define _simd16_extract_ps(a, imm8)         SIMD16::extract_ps<imm8>(a)
+#define _simd16_extract_si(a, imm8)         SIMD16::extract_si<imm8>(a)
+#define _simd16_insert_ps(a, b, imm8)       SIMD16::insert_ps<imm8>(a, b)
+#define _simd16_insert_si(a, b, imm8)       SIMD16::insert_si<imm8>(a, b)
+#define _simd16_maskstore_ps                SIMD16::maskstore_ps
+#define _simd16_blend_ps(a, b, mask)        SIMD16::blend_ps<mask>(a, b)
+#define _simd16_blendv_ps                   SIMD16::blendv_ps
+#define _simd16_blendv_epi32                SIMD16::blendv_epi32
+#define _simd16_mul_ps                      SIMD16::mul_ps
+#define _simd16_div_ps                      SIMD16::div_ps
+#define _simd16_add_ps                      SIMD16::add_ps
+#define _simd16_sub_ps                      SIMD16::sub_ps
+#define _simd16_rsqrt_ps                    SIMD16::rsqrt_ps
+#define _simd16_min_ps                      SIMD16::min_ps
+#define _simd16_max_ps                      SIMD16::max_ps
+#define _simd16_movemask_ps                 SIMD16::movemask_ps
+#define _simd16_movemask_pd                 SIMD16::movemask_pd
+#define _simd16_cvtps_epi32                 SIMD16::cvtps_epi32
+#define _simd16_cvttps_epi32                SIMD16::cvttps_epi32
+#define _simd16_cvtepi32_ps                 SIMD16::cvtepi32_ps
+#define _simd16_cmp_ps(a, b, comp)          SIMD16::cmp_ps<SIMD16::CompareType(comp)>(a, b)
+#define _simd16_cmplt_ps                    SIMD16::cmplt_ps
+#define _simd16_cmpgt_ps                    SIMD16::cmpgt_ps
+#define _simd16_cmpneq_ps                   SIMD16::cmpneq_ps
+#define _simd16_cmpeq_ps                    SIMD16::cmpeq_ps
+#define _simd16_cmpge_ps                    SIMD16::cmpge_ps
+#define _simd16_cmple_ps                    SIMD16::cmple_ps
+#define _simd16_castsi_ps                   SIMD16::castsi_ps
+#define _simd16_castps_si                   SIMD16::castps_si
+#define _simd16_castsi_pd                   SIMD16::castsi_pd
+#define _simd16_castpd_si                   SIMD16::castpd_si
+#define _simd16_castpd_ps                   SIMD16::castpd_ps
+#define _simd16_castps_pd                   SIMD16::castps_pd
+#define _simd16_and_ps                      SIMD16::and_ps
+#define _simd16_andnot_ps                   SIMD16::andnot_ps
+#define _simd16_or_ps                       SIMD16::or_ps
+#define _simd16_xor_ps                      SIMD16::xor_ps
+#define _simd16_round_ps(a, mode)           SIMD16::round_ps<SIMD16::RoundMode(mode)>(a)
+#define _simd16_mul_epi32                   SIMD16::mul_epi32
+#define _simd16_mullo_epi32                 SIMD16::mullo_epi32
+#define _simd16_sub_epi32                   SIMD16::sub_epi32
+#define _simd16_sub_epi64                   SIMD16::sub_epi64
+#define _simd16_min_epi32                   SIMD16::min_epi32
+#define _simd16_max_epi32                   SIMD16::max_epi32
+#define _simd16_min_epu32                   SIMD16::min_epu32
+#define _simd16_max_epu32                   SIMD16::max_epu32
+#define _simd16_add_epi32                   SIMD16::add_epi32
+#define _simd16_and_si                      SIMD16::and_si
+#define _simd16_andnot_si                   SIMD16::andnot_si
+#define _simd16_or_si                       SIMD16::or_si
+#define _simd16_xor_si                      SIMD16::xor_si
+#define _simd16_cmpeq_epi32                 SIMD16::cmpeq_epi32
+#define _simd16_cmpgt_epi32                 SIMD16::cmpgt_epi32
+#define _simd16_cmplt_epi32                 SIMD16::cmplt_epi32
+#define _simd16_testz_ps                    SIMD16::testz_ps
+#define _simd16_unpacklo_ps                 SIMD16::unpacklo_ps
+#define _simd16_unpackhi_ps                 SIMD16::unpackhi_ps
+#define _simd16_unpacklo_pd                 SIMD16::unpacklo_pd
+#define _simd16_unpackhi_pd                 SIMD16::unpackhi_pd
+#define _simd16_unpacklo_epi8               SIMD16::unpacklo_epi8
+#define _simd16_unpackhi_epi8               SIMD16::unpackhi_epi8
+#define _simd16_unpacklo_epi16              SIMD16::unpacklo_epi16
+#define _simd16_unpackhi_epi16              SIMD16::unpackhi_epi16
+#define _simd16_unpacklo_epi32              SIMD16::unpacklo_epi32
+#define _simd16_unpackhi_epi32              SIMD16::unpackhi_epi32
+#define _simd16_unpacklo_epi64              SIMD16::unpacklo_epi64
+#define _simd16_unpackhi_epi64              SIMD16::unpackhi_epi64
+#define _simd16_slli_epi32(a, i)            SIMD16::slli_epi32<i>(a)
+#define _simd16_srli_epi32(a, i)            SIMD16::srli_epi32<i>(a)
+#define _simd16_srai_epi32(a, i)            SIMD16::srai_epi32<i>(a)
+#define _simd16_fmadd_ps                    SIMD16::fmadd_ps
+#define _simd16_fmsub_ps                    SIMD16::fmsub_ps
+#define _simd16_adds_epu8                   SIMD16::adds_epu8
+#define _simd16_subs_epu8                   SIMD16::subs_epu8
+#define _simd16_add_epi8                    SIMD16::add_epi8
+#define _simd16_shuffle_epi8                SIMD16::shuffle_epi8
+
+#define _simd16_i32gather_ps(m, index, scale)               SIMD16::i32gather_ps<SIMD16::ScaleFactor(scale)>(index, m)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) SIMD16::mask_i32gather_ps<SIMD16::ScaleFactor(scale)>(a, m, index, mask)
+
+#define _simd16_abs_epi32                   SIMD16::abs_epi32
+
+#define _simd16_cmpeq_epi64                 SIMD16::cmpeq_epi64
+#define _simd16_cmpgt_epi64                 SIMD16::cmpgt_epi64
+#define _simd16_cmpeq_epi16                 SIMD16::cmpeq_epi16
+#define _simd16_cmpgt_epi16                 SIMD16::cmpgt_epi16
+#define _simd16_cmpeq_epi8                  SIMD16::cmpeq_epi8
+#define _simd16_cmpgt_epi8                  SIMD16::cmpgt_epi8
+
+#define _simd16_permute_ps                  SIMD16::permute_ps
+#define _simd16_permute_epi32               SIMD16::permute_epi32
+#define _simd16_sllv_epi32                  SIMD16::sllv_epi32
+#define _simd16_srlv_epi32                  SIMD16::srlv_epi32
+#define _simd16_permute2f128_ps(a, b, i)    SIMD16::permute2f128_ps<i>(a, b)
+#define _simd16_permute2f128_pd(a, b, i)    SIMD16::permute2f128_pd<i>(a, b)
+#define _simd16_permute2f128_si(a, b, i)    SIMD16::permute2f128_si<i>(a, b)
+#define _simd16_shuffle_ps(a, b, i)         SIMD16::shuffle_ps<i>(a, b)
+#define _simd16_shuffle_pd(a, b, i)         SIMD16::shuffle_pd<i>(a, b)
+#define _simd16_shuffle_epi32(a, b, imm8)   SIMD16::shuffle_epi32<imm8>(a, b)
+#define _simd16_shuffle_epi64(a, b, imm8)   SIMD16::shuffle_epi64<imm8>(a, b)
+#define _simd16_cvtepu8_epi16               SIMD16::cvtepu8_epi16
+#define _simd16_cvtepu8_epi32               SIMD16::cvtepu8_epi32
+#define _simd16_cvtepu16_epi32              SIMD16::cvtepu16_epi32
+#define _simd16_cvtepu16_epi64              SIMD16::cvtepu16_epi64
+#define _simd16_cvtepu32_epi64              SIMD16::cvtepu32_epi64
+#define _simd16_packus_epi16                SIMD16::packus_epi16
+#define _simd16_packs_epi16                 SIMD16::packs_epi16
+#define _simd16_packus_epi32                SIMD16::packus_epi32
+#define _simd16_packs_epi32                 SIMD16::packs_epi32
+#define _simd16_cmplt_ps_mask               SIMD16::cmp_ps_mask<SIMD16::CompareType::LT_OQ>
+#define _simd16_int2mask(mask)              simd16mask(mask)
+#define _simd16_mask2int(mask)              int(mask)
 
 #endif//ENABLE_AVX512_SIMD16
 
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 5ccb6c3ea95..f95c109e6fe 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -24,749 +24,218 @@
 #ifndef __SWR_SIMDINTRIN_H__
 #define __SWR_SIMDINTRIN_H__
 
-#include "common/os.h"
 #include "common/intrin.h"
+#include "common/simdlib.hpp"
 
 #if KNOB_SIMD_WIDTH == 8
-#define _simd128_maskstore_ps _mm_maskstore_ps
-#define _simd_load_ps _mm256_load_ps
-#define _simd_load1_ps _mm256_broadcast_ss
-#define _simd_loadu_ps _mm256_loadu_ps
-#define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps   _mm256_set1_ps
-#define _simd_blend_ps  _mm256_blend_ps
-#define _simd_blendv_ps _mm256_blendv_ps
-#define _simd_store_ps _mm256_store_ps
-#define _simd_mul_ps _mm256_mul_ps
-#define _simd_add_ps _mm256_add_ps
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_rsqrt_ps _mm256_rsqrt_ps
-#define _simd_min_ps _mm256_min_ps
-#define _simd_max_ps _mm256_max_ps
-#define _simd_movemask_ps _mm256_movemask_ps
-#define _simd_cvtps_epi32 _mm256_cvtps_epi32
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_cvtepi32_ps _mm256_cvtepi32_ps
-#define _simd_cmplt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LT_OQ)
-#define _simd_cmpgt_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GT_OQ)
-#define _simd_cmpneq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_NEQ_OQ)
-#define _simd_cmpeq_ps(a, b) _mm256_cmp_ps(a, b, _CMP_EQ_OQ)
-#define _simd_cmpge_ps(a, b) _mm256_cmp_ps(a, b, _CMP_GE_OQ)
-#define _simd_cmple_ps(a, b) _mm256_cmp_ps(a, b, _CMP_LE_OQ)
-#define _simd_cmp_ps(a, b, imm) _mm256_cmp_ps(a, b, imm)
-#define _simd_and_ps _mm256_and_ps
-#define _simd_or_ps _mm256_or_ps
-
-#define _simd_rcp_ps _mm256_rcp_ps
-#define _simd_div_ps _mm256_div_ps
-#define _simd_castsi_ps _mm256_castsi256_ps
-#define _simd_andnot_ps _mm256_andnot_ps
-#define _simd_round_ps _mm256_round_ps
-#define _simd_castpd_ps _mm256_castpd_ps
-#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
-#define _simd_stream_ps _mm256_stream_ps
-
-#define _simd_load_sd _mm256_load_sd
-#define _simd_movemask_pd _mm256_movemask_pd
-#define _simd_castsi_pd _mm256_castsi256_pd
-
-// emulated integer simd
-#define SIMD_EMU_EPI(func, intrin) \
-INLINE \
-__m256i func(__m256i a, __m256i b)\
-{\
-    __m128i aHi = _mm256_extractf128_si256(a, 1);\
-    __m128i bHi = _mm256_extractf128_si256(b, 1);\
-    __m128i aLo = _mm256_castsi256_si128(a);\
-    __m128i bLo = _mm256_castsi256_si128(b);\
-\
-    __m128i subLo = intrin(aLo, bLo);\
-    __m128i subHi = intrin(aHi, bHi);\
-\
-    __m256i result = _mm256_castsi128_si256(subLo);\
-            result = _mm256_insertf128_si256(result, subHi, 1);\
-\
-    return result;\
-}
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
-INLINE
-__m256 _simdemu_permute_ps(__m256 a, __m256i b)
-{
-    __m128 aHi = _mm256_extractf128_ps(a, 1);
-    __m128i bHi = _mm256_extractf128_si256(b, 1);
-    __m128 aLo = _mm256_castps256_ps128(a);
-    __m128i bLo = _mm256_castsi256_si128(b);
-
-    __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
-    __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
-    __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
-    __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
-    indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
-    resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
-    resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
-    __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
-
-    __m256 result = _mm256_castps128_ps256(blendLowRes);
-    result = _mm256_insertf128_ps(result, blendHiRes, 1);
-
-    return result;
-}
-
-INLINE
-__m256i _simdemu_permute_epi32(__m256i a, __m256i b)
-{
-    return _mm256_castps_si256(_simdemu_permute_ps(_mm256_castsi256_ps(a), b));
-}
-
-INLINE
-__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi >>= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow >>= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-
-INLINE
-__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
-{
-    int32_t aHi, aLow, countHi, countLow;
-    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
-    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
-    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
-    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
-
-    aHi = _mm_extract_epi32(vAHi, 0);
-    countHi = _mm_extract_epi32(vCountHi, 0);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 0);
-
-    aLow = _mm_extract_epi32(vALow, 0);
-    countLow = _mm_extract_epi32(vCountLow, 0);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 0);
-
-    aHi = _mm_extract_epi32(vAHi, 1);
-    countHi = _mm_extract_epi32(vCountHi, 1);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 1);
-
-    aLow = _mm_extract_epi32(vALow, 1);
-    countLow = _mm_extract_epi32(vCountLow, 1);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 1);
-
-    aHi = _mm_extract_epi32(vAHi, 2);
-    countHi = _mm_extract_epi32(vCountHi, 2);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 2);
-
-    aLow = _mm_extract_epi32(vALow, 2);
-    countLow = _mm_extract_epi32(vCountLow, 2);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 2);
-
-    aHi = _mm_extract_epi32(vAHi, 3);
-    countHi = _mm_extract_epi32(vCountHi, 3);
-    aHi <<= countHi;
-    vAHi = _mm_insert_epi32(vAHi, aHi, 3);
-
-    aLow = _mm_extract_epi32(vALow, 3);
-    countLow = _mm_extract_epi32(vCountLow, 3);
-    aLow <<= countLow;
-    vALow = _mm_insert_epi32(vALow, aLow, 3);
-
-    __m256i ret = _mm256_set1_epi32(0);
-    ret = _mm256_insertf128_si256(ret, vAHi, 1);
-    ret = _mm256_insertf128_si256(ret, vALow, 0);
-    return ret;
-}
-
-#define _simd_mul_epi32 _simdemu_mul_epi32
-#define _simd_mullo_epi32 _simdemu_mullo_epi32
-#define _simd_sub_epi32 _simdemu_sub_epi32
-#define _simd_sub_epi64 _simdemu_sub_epi64
-#define _simd_min_epi32 _simdemu_min_epi32
-#define _simd_min_epu32 _simdemu_min_epu32
-#define _simd_max_epi32 _simdemu_max_epi32
-#define _simd_max_epu32 _simdemu_max_epu32
-#define _simd_add_epi32 _simdemu_add_epi32
-#define _simd_and_si _simdemu_and_si
-#define _simd_andnot_si _simdemu_andnot_si
-#define _simd_cmpeq_epi32 _simdemu_cmpeq_epi32
-#define _simd_cmplt_epi32 _simdemu_cmplt_epi32
-#define _simd_cmpgt_epi32 _simdemu_cmpgt_epi32
-#define _simd_or_si _simdemu_or_si
-#define _simd_xor_si _simdemu_xor_si
-#define _simd_castps_si _mm256_castps_si256
-#define _simd_adds_epu8 _simdemu_adds_epu8
-#define _simd_subs_epu8 _simdemu_subs_epu8
-#define _simd_add_epi8 _simdemu_add_epi8
-#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
-#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
-#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
-#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
-#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
-#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
-#define _simd_movemask_epi8 _simdemu_movemask_epi8
-#define _simd_permute_ps _simdemu_permute_ps
-#define _simd_permute_epi32 _simdemu_permute_epi32
-#define _simd_srlv_epi32 _simdemu_srlv_epi32
-#define _simd_sllv_epi32 _simdemu_sllv_epi32
-
-SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
-SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi32, _mm_sub_epi32)
-SIMD_EMU_EPI(_simdemu_sub_epi64, _mm_sub_epi64)
-SIMD_EMU_EPI(_simdemu_min_epi32, _mm_min_epi32)
-SIMD_EMU_EPI(_simdemu_min_epu32, _mm_min_epu32)
-SIMD_EMU_EPI(_simdemu_max_epi32, _mm_max_epi32)
-SIMD_EMU_EPI(_simdemu_max_epu32, _mm_max_epu32)
-SIMD_EMU_EPI(_simdemu_add_epi32, _mm_add_epi32)
-SIMD_EMU_EPI(_simdemu_and_si, _mm_and_si128)
-SIMD_EMU_EPI(_simdemu_andnot_si, _mm_andnot_si128)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi32, _mm_cmpeq_epi32)
-SIMD_EMU_EPI(_simdemu_cmplt_epi32, _mm_cmplt_epi32)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi32, _mm_cmpgt_epi32)
-SIMD_EMU_EPI(_simdemu_or_si, _mm_or_si128)
-SIMD_EMU_EPI(_simdemu_xor_si, _mm_xor_si128)
-SIMD_EMU_EPI(_simdemu_adds_epu8, _mm_adds_epu8)
-SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
-SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
-SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
-SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
-SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
-SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
-
-#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
-#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
-#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
-#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
-#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
-#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-#define _simd_unpackhi_epi64(a, b) _mm256_castpd_si256(_mm256_unpackhi_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
-
-#define _simd_slli_epi32(a,i) _simdemu_slli_epi32(a,i)
-#define _simd_srai_epi32(a,i) _simdemu_srai_epi32(a,i)
-#define _simd_srli_epi32(a,i) _simdemu_srli_epi32(a,i)
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-
-#define _simd128_fmadd_ps _mm_fmaddemu_ps
-#define _simd_fmadd_ps _mm_fmaddemu256_ps
-#define _simd_fmsub_ps _mm_fmsubemu256_ps
-#define _simd_shuffle_epi8 _simdemu_shuffle_epi8 
-SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
-
-INLINE
-__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
-{
-    __m128 res = _mm_mul_ps(a, b);
-    res = _mm_add_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
-{
-    __m256 res = _mm256_mul_ps(a, b);
-    res = _mm256_add_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
-{
-    __m256 res = _mm256_mul_ps(a, b);
-    res = _mm256_sub_ps(res, c);
-    return res;
-}
-
-INLINE
-__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
-{
-    uint32_t *pOffsets = (uint32_t*)&vOffsets;
-    simdscalar vResult;
-    float* pResult = (float*)&vResult;
-    for (uint32_t i = 0; i < KNOB_SIMD_WIDTH; ++i)
-    {
-        uint32_t offset = pOffsets[i];
-        offset = offset * scale;
-        pResult[i] = *(float*)(((const uint8_t*)pBase + offset));
-    }
-
-    return vResult;
-}
-
-INLINE
-__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
-{
-    uint32_t *pOffsets = (uint32_t*)&vOffsets;
-    simdscalar vResult = vSrc;
-    float* pResult = (float*)&vResult;
-    DWORD index;
-    uint32_t mask = _simd_movemask_ps(vMask);
-    while (_BitScanForward(&index, mask))
-    {
-        mask &= ~(1 << index);
-        uint32_t offset = pOffsets[index];
-        offset = offset * scale;
-        pResult[index] = *(float*)(((const uint8_t*)pBase + offset));
-    }
-
-    return vResult;
-}
-
-INLINE
-__m256i _simd_abs_epi32(__m256i a)
-{
-        __m128i aHi = _mm256_extractf128_si256(a, 1);
-        __m128i aLo = _mm256_castsi256_si128(a);
-        __m128i absLo = _mm_abs_epi32(aLo);
-        __m128i absHi = _mm_abs_epi32(aHi);
-        __m256i result = _mm256_castsi128_si256(absLo);
-        result = _mm256_insertf128_si256(result, absHi, 1);
-        return result;
-}
-
-INLINE 
-int _simdemu_movemask_epi8(__m256i a)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    int resHi = _mm_movemask_epi8(aHi);
-    int resLo = _mm_movemask_epi8(aLo);
-
-    return (resHi << 16) | resLo;
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi16(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu8_epi16(a);
-    __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu8_epi32(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu8_epi32(a);
-    __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi32(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu16_epi32(a);
-    __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu16_epi64(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu16_epi64(a);
-    __m128i resulthi = _mm_cvtepu16_epi64(_mm_srli_si128(a, 4));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_cvtepu32_epi64(__m128i a)
-{
-    __m128i resultlo = _mm_cvtepu32_epi64(a);
-    __m128i resulthi = _mm_cvtepu32_epi64(_mm_srli_si128(a, 8));
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi16(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packus_epi16(alo, blo);
-    __m128i resulthi = _mm_packus_epi16(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi16(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packs_epi16(alo, blo);
-    __m128i resulthi = _mm_packs_epi16(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packus_epi32(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packus_epi32(alo, blo);
-    __m128i resulthi = _mm_packus_epi32(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
-INLINE
-__m256i _simd_packs_epi32(__m256i a, __m256i b)
-{
-    __m128i alo = _mm256_extractf128_si256(a, 0);
-    __m128i ahi = _mm256_extractf128_si256(a, 1);
-
-    __m128i blo = _mm256_extractf128_si256(b, 0);
-    __m128i bhi = _mm256_extractf128_si256(b, 1);
-
-    __m128i resultlo = _mm_packs_epi32(alo, blo);
-    __m128i resulthi = _mm_packs_epi32(ahi, bhi);
-
-    __m256i result = _mm256_castsi128_si256(resultlo);
-
-    return _mm256_insertf128_si256(result, resulthi, 1);
-}
-
+typedef SIMD256                             SIMD;
 #else
-
-#define _simd_mul_epi32 _mm256_mul_epi32
-#define _simd_mullo_epi32 _mm256_mullo_epi32
-#define _simd_sub_epi32 _mm256_sub_epi32
-#define _simd_sub_epi64 _mm256_sub_epi64
-#define _simd_min_epi32 _mm256_min_epi32
-#define _simd_max_epi32 _mm256_max_epi32
-#define _simd_min_epu32 _mm256_min_epu32
-#define _simd_max_epu32 _mm256_max_epu32
-#define _simd_add_epi32 _mm256_add_epi32
-#define _simd_and_si _mm256_and_si256
-#define _simd_andnot_si _mm256_andnot_si256
-#define _simd_cmpeq_epi32 _mm256_cmpeq_epi32
-#define _simd_cmplt_epi32(a,b) _mm256_cmpgt_epi32(b,a)
-#define _simd_cmpgt_epi32(a,b) _mm256_cmpgt_epi32(a,b)
-#define _simd_or_si _mm256_or_si256
-#define _simd_xor_si _mm256_xor_si256
-#define _simd_castps_si _mm256_castps_si256
-
-#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
-#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
-#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
-#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
-#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
-#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
-#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
-#define _simd_unpackhi_epi64 _mm256_unpackhi_epi64
-
-#define _simd_srli_si(a,i) _simdemu_srli_si128<i>(a)
-#define _simd_slli_epi32 _mm256_slli_epi32
-#define _simd_srai_epi32 _mm256_srai_epi32
-#define _simd_srli_epi32 _mm256_srli_epi32
-#define _simd_srlisi_ps(a,i) _mm256_castsi256_ps(_simdemu_srli_si128<i>(_mm256_castps_si256(a)))
-#define _simd128_fmadd_ps _mm_fmadd_ps
-#define _simd_fmadd_ps _mm256_fmadd_ps
-#define _simd_fmsub_ps _mm256_fmsub_ps
-#define _simd_shuffle_epi8 _mm256_shuffle_epi8 
-#define _simd_adds_epu8 _mm256_adds_epu8
-#define _simd_subs_epu8 _mm256_subs_epu8
-#define _simd_add_epi8 _mm256_add_epi8
-#define _simd_i32gather_ps _mm256_i32gather_ps
-#define _simd_mask_i32gather_ps _mm256_mask_i32gather_ps
-#define _simd_abs_epi32 _mm256_abs_epi32
-
-#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
-#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
-#define _simd_cmpgt_epi8  _mm256_cmpgt_epi8
-#define _simd_cmpeq_epi8  _mm256_cmpeq_epi8
-#define _simd_cmpgt_epi16  _mm256_cmpgt_epi16
-#define _simd_cmpeq_epi16  _mm256_cmpeq_epi16
-#define _simd_movemask_epi8 _mm256_movemask_epi8
-#define _simd_permute_ps _mm256_permutevar8x32_ps
-#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
-#define _simd_srlv_epi32 _mm256_srlv_epi32
-#define _simd_sllv_epi32 _mm256_sllv_epi32
-#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
-#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
-#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
-#define _simd_cvtepu16_epi64 _mm256_cvtepu16_epi64
-#define _simd_cvtepu32_epi64 _mm256_cvtepu32_epi64
-#define _simd_packus_epi16 _mm256_packus_epi16
-#define _simd_packs_epi16 _mm256_packs_epi16
-#define _simd_packus_epi32 _mm256_packus_epi32
-#define _simd_packs_epi32 _mm256_packs_epi32
-
-#endif
-
-#define _simd_unpacklo_ps _mm256_unpacklo_ps
-#define _simd_unpackhi_ps _mm256_unpackhi_ps
-#define _simd_unpacklo_pd _mm256_unpacklo_pd
-#define _simd_unpackhi_pd _mm256_unpackhi_pd
-#define _simd_insertf128_ps _mm256_insertf128_ps
-#define _simd_insertf128_pd _mm256_insertf128_pd
-#define _simd_insertf128_si _mm256_insertf128_si256
-#define _simd_extractf128_ps _mm256_extractf128_ps
-#define _simd_extractf128_pd _mm256_extractf128_pd
-#define _simd_extractf128_si _mm256_extractf128_si256
-#define _simd_permute2f128_ps _mm256_permute2f128_ps
-#define _simd_permute2f128_pd _mm256_permute2f128_pd
-#define _simd_permute2f128_si _mm256_permute2f128_si256
-#define _simd_shuffle_ps _mm256_shuffle_ps
-#define _simd_shuffle_pd _mm256_shuffle_pd
-#define _simd_shuffle_epi32(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b), imm8))
-#define _simd_shuffle_epi64(a, b, imm8) _mm256_castps_si256(_mm256_shuffle_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b), imm8))
-#define _simd_set1_epi32 _mm256_set1_epi32
-#define _simd_set_epi32 _mm256_set_epi32
-#define _simd_set1_epi8 _mm256_set1_epi8
-#define _simd_setzero_si _mm256_setzero_si256
-#define _simd_cvttps_epi32 _mm256_cvttps_epi32
-#define _simd_store_si _mm256_store_si256
-#define _simd_broadcast_ss _mm256_broadcast_ss
-#define _simd_maskstore_ps _mm256_maskstore_ps
-#define _simd_load_si _mm256_load_si256
-#define _simd_loadu_si _mm256_loadu_si256
-#define _simd_sub_ps _mm256_sub_ps
-#define _simd_testz_ps _mm256_testz_ps
-#define _simd_testz_si _mm256_testz_si256
-#define _simd_xor_ps _mm256_xor_ps
-
-INLINE
-simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
-{
-    __m128i lo = _mm_loadu_si128(loaddr);
-    __m128i hi = _mm_loadu_si128(hiaddr);
-
-    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
-}
-
-INLINE
-void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
-{
-    _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
-    _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
-{
-    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
-}
-
-INLINE
-simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalari mask)
-{
-    return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), _simd_castsi_ps(mask)));
-}
-
-template<int mask>
-INLINE
-__m128i _simd_blend4_epi32(__m128i a, __m128i b)
-{
-    return _mm_castps_si128(_mm_blend_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), mask));
+#error Unsupported vector width
+#endif//KNOB_SIMD16_WIDTH == 16
+
+
+#define _simd128_maskstore_ps               SIMD128::maskstore_ps
+#define _simd128_fmadd_ps                   SIMD128::fmadd_ps
+
+#define _simd_load_ps                       SIMD::load_ps
+#define _simd_load1_ps                      SIMD::broadcast_ss
+#define _simd_loadu_ps                      SIMD::loadu_ps
+#define _simd_setzero_ps                    SIMD::setzero_ps
+#define _simd_set1_ps                       SIMD::set1_ps
+#define _simd_blend_ps(a, b, i)             SIMD::blend_ps<i>(a, b)
+#define _simd_blend_epi32(a, b, i)          SIMD::blend_epi32<i>(a, b)
+#define _simd_blendv_ps                     SIMD::blendv_ps
+#define _simd_store_ps                      SIMD::store_ps
+#define _simd_mul_ps                        SIMD::mul_ps
+#define _simd_add_ps                        SIMD::add_ps
+#define _simd_sub_ps                        SIMD::sub_ps
+#define _simd_rsqrt_ps                      SIMD::rsqrt_ps
+#define _simd_min_ps                        SIMD::min_ps
+#define _simd_max_ps                        SIMD::max_ps
+#define _simd_movemask_ps                   SIMD::movemask_ps
+#define _simd_cvtps_epi32                   SIMD::cvtps_epi32
+#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
+#define _simd_cvtepi32_ps                   SIMD::cvtepi32_ps
+#define _simd_cmplt_ps                      SIMD::cmplt_ps
+#define _simd_cmpgt_ps                      SIMD::cmpgt_ps
+#define _simd_cmpneq_ps                     SIMD::cmpneq_ps
+#define _simd_cmpeq_ps                      SIMD::cmpeq_ps
+#define _simd_cmpge_ps                      SIMD::cmpge_ps
+#define _simd_cmple_ps                      SIMD::cmple_ps
+#define _simd_cmp_ps(a, b, imm)             SIMD::cmp_ps<SIMD::CompareType(imm)>(a, b)
+#define _simd_and_ps                        SIMD::and_ps
+#define _simd_or_ps                         SIMD::or_ps
+#define _simd_rcp_ps                        SIMD::rcp_ps
+#define _simd_div_ps                        SIMD::div_ps
+#define _simd_castsi_ps                     SIMD::castsi_ps
+#define _simd_castps_pd                     SIMD::castps_pd
+#define _simd_castpd_ps                     SIMD::castpd_ps
+#define _simd_andnot_ps                     SIMD::andnot_ps
+#define _simd_round_ps(a, i)                SIMD::round_ps<SIMD::RoundMode(i)>(a)
+#define _simd_castpd_ps                     SIMD::castpd_ps
+#define _simd_broadcast_ps(a)               SIMD::broadcast_ps((SIMD128::Float const *)(a))
+#define _simd_stream_ps                     SIMD::stream_ps
+
+#define _simd_movemask_pd                   SIMD::movemask_pd
+#define _simd_castsi_pd                     SIMD::castsi_pd
+
+#define _simd_mul_epi32                     SIMD::mul_epi32
+#define _simd_mullo_epi32                   SIMD::mullo_epi32
+#define _simd_sub_epi32                     SIMD::sub_epi32
+#define _simd_sub_epi64                     SIMD::sub_epi64
+#define _simd_min_epi32                     SIMD::min_epi32
+#define _simd_min_epu32                     SIMD::min_epu32
+#define _simd_max_epi32                     SIMD::max_epi32
+#define _simd_max_epu32                     SIMD::max_epu32
+#define _simd_add_epi32                     SIMD::add_epi32
+#define _simd_and_si                        SIMD::and_si
+#define _simd_andnot_si                     SIMD::andnot_si
+#define _simd_cmpeq_epi32                   SIMD::cmpeq_epi32
+#define _simd_cmplt_epi32                   SIMD::cmplt_epi32
+#define _simd_cmpgt_epi32                   SIMD::cmpgt_epi32
+#define _simd_or_si                         SIMD::or_si
+#define _simd_xor_si                        SIMD::xor_si
+#define _simd_castps_si                     SIMD::castps_si
+#define _simd_adds_epu8                     SIMD::adds_epu8
+#define _simd_subs_epu8                     SIMD::subs_epu8
+#define _simd_add_epi8                      SIMD::add_epi8
+#define _simd_cmpeq_epi64                   SIMD::cmpeq_epi64
+#define _simd_cmpgt_epi64                   SIMD::cmpgt_epi64
+#define _simd_cmpgt_epi8                    SIMD::cmpgt_epi8
+#define _simd_cmpeq_epi8                    SIMD::cmpeq_epi8
+#define _simd_cmpgt_epi16                   SIMD::cmpgt_epi16
+#define _simd_cmpeq_epi16                   SIMD::cmpeq_epi16
+#define _simd_movemask_epi8                 SIMD::movemask_epi8
+#define _simd_permute_ps                    SIMD::permute_ps
+#define _simd_permute_epi32                 SIMD::permute_epi32
+#define _simd_srlv_epi32                    SIMD::srlv_epi32
+#define _simd_sllv_epi32                    SIMD::sllv_epi32
+
+#define _simd_unpacklo_epi8                 SIMD::unpacklo_epi8
+#define _simd_unpackhi_epi8                 SIMD::unpackhi_epi8
+#define _simd_unpacklo_epi16                SIMD::unpacklo_epi16
+#define _simd_unpackhi_epi16                SIMD::unpackhi_epi16
+#define _simd_unpacklo_epi32                SIMD::unpacklo_epi32
+#define _simd_unpackhi_epi32                SIMD::unpackhi_epi32
+#define _simd_unpacklo_epi64                SIMD::unpacklo_epi64
+#define _simd_unpackhi_epi64                SIMD::unpackhi_epi64
+
+#define _simd_slli_epi32(a,i)               SIMD::slli_epi32<i>(a)
+#define _simd_srai_epi32(a,i)               SIMD::srai_epi32<i>(a)
+#define _simd_srli_epi32(a,i)               SIMD::srli_epi32<i>(a)
+#define _simd_srlisi_ps(a,i)                SIMD::srlisi_ps<i>(a)
+
+#define _simd_fmadd_ps                      SIMD::fmadd_ps
+#define _simd_fmsub_ps                      SIMD::fmsub_ps
+#define _simd_shuffle_epi8                  SIMD::shuffle_epi8
+
+#define _simd_i32gather_ps(p, o, s)         SIMD::i32gather_ps<SIMD::ScaleFactor(s)>(p, o)
+#define _simd_mask_i32gather_ps(r, p, o, m, s) SIMD::mask_i32gather_ps<SIMD::ScaleFactor(s)>(r, p, o, m)
+#define _simd_abs_epi32                     SIMD::abs_epi32
+
+#define _simd_cvtepu8_epi16                 SIMD::cvtepu8_epi16
+#define _simd_cvtepu8_epi32                 SIMD::cvtepu8_epi32
+#define _simd_cvtepu16_epi32                SIMD::cvtepu16_epi32
+#define _simd_cvtepu16_epi64                SIMD::cvtepu16_epi64
+#define _simd_cvtepu32_epi64                SIMD::cvtepu32_epi64
+
+#define _simd_packus_epi16                  SIMD::packus_epi16
+#define _simd_packs_epi16                   SIMD::packs_epi16
+#define _simd_packus_epi32                  SIMD::packus_epi32
+#define _simd_packs_epi32                   SIMD::packs_epi32
+
+#define _simd_unpacklo_ps                   SIMD::unpacklo_ps
+#define _simd_unpackhi_ps                   SIMD::unpackhi_ps
+#define _simd_unpacklo_pd                   SIMD::unpacklo_pd
+#define _simd_unpackhi_pd                   SIMD::unpackhi_pd
+#define _simd_insertf128_ps                 SIMD::insertf128_ps
+#define _simd_insertf128_pd                 SIMD::insertf128_pd
+#define _simd_insertf128_si(a, b, i)        SIMD::insertf128_si<i>(a, b)
+#define _simd_extractf128_ps(a, i)          SIMD::extractf128_ps<i>(a)
+#define _simd_extractf128_pd(a, i)          SIMD::extractf128_pd<i>(a)
+#define _simd_extractf128_si(a, i)          SIMD::extractf128_si<i>(a)
+#define _simd_permute2f128_ps(a, b, i)      SIMD::permute2f128_ps<i>(a, b)
+#define _simd_permute2f128_pd(a, b, i)      SIMD::permute2f128_pd<i>(a, b)
+#define _simd_permute2f128_si(a, b, i)      SIMD::permute2f128_si<i>(a, b)
+#define _simd_shuffle_ps(a, b, i)           SIMD::shuffle_ps<i>(a, b)
+#define _simd_shuffle_pd(a, b, i)           SIMD::shuffle_pd<i>(a, b)
+#define _simd_shuffle_epi32(a, b, imm8)     SIMD::shuffle_epi32<imm8>(a, b)
+#define _simd_shuffle_epi64(a, b, imm8)     SIMD::shuffle_epi64<imm8>(a, b)
+#define _simd_set1_epi32                    SIMD::set1_epi32
+#define _simd_set_epi32                     SIMD::set_epi32
+#define _simd_set_ps                        SIMD::set_ps
+#define _simd_set1_epi8                     SIMD::set1_epi8
+#define _simd_setzero_si                    SIMD::setzero_si
+#define _simd_cvttps_epi32                  SIMD::cvttps_epi32
+#define _simd_store_si                      SIMD::store_si
+#define _simd_broadcast_ss                  SIMD::broadcast_ss
+#define _simd_maskstore_ps                  SIMD::maskstore_ps
+#define _simd_load_si                       SIMD::load_si
+#define _simd_loadu_si                      SIMD::loadu_si
+#define _simd_sub_ps                        SIMD::sub_ps
+#define _simd_testz_ps                      SIMD::testz_ps
+#define _simd_testz_si                      SIMD::testz_si
+#define _simd_xor_ps                        SIMD::xor_ps
+
+#define _simd_loadu2_si                     SIMD::loadu2_si
+#define _simd_storeu2_si                    SIMD::storeu2_si
+
+#define _simd_blendv_epi32                  SIMD::blendv_epi32
+
+template<int mask> SIMDINLINE
+SIMD128::Integer _simd_blend4_epi32(SIMD128::Integer a, SIMD128::Integer b)
+{
+    return SIMD128::castps_si(SIMD128::blend_ps<mask>(SIMD128::castsi_ps(a), SIMD128::castsi_ps(b)));
 }
 
 // convert bitmask to vector mask
-INLINE
-simdscalar vMask(int32_t mask)
+SIMDINLINE
+SIMD256::Float vMask(int32_t mask)
 {
-    __m256i vec = _mm256_set1_epi32(mask);
-    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = _simd_and_si(vec, bit);
-    vec = _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
-    return _simd_castsi_ps(vec);
+    SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+    const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = SIMD256::and_si(vec, bit);
+    vec = SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
+    return SIMD256::castsi_ps(vec);
 }
 
-INLINE
-simdscalari vMaski(int32_t mask)
+SIMDINLINE
+SIMD256::Integer vMaski(int32_t mask)
 {
-    __m256i vec = _mm256_set1_epi32(mask);
-    const __m256i bit = _mm256_set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
-    vec = _simd_and_si(vec, bit);
-    return _simd_cmplt_epi32(_mm256_setzero_si256(), vec);
+    SIMD256::Integer vec = SIMD256::set1_epi32(mask);
+    const SIMD256::Integer bit = SIMD256::set_epi32(0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+    vec = SIMD256::and_si(vec, bit);
+    return SIMD256::cmplt_epi32(SIMD256::setzero_si(), vec);
 }
 
-INLINE
+SIMDINLINE
 void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int slane)
 {
     OSALIGNSIMD(float) rArray[KNOB_SIMD_WIDTH], sArray[KNOB_SIMD_WIDTH];
-    _mm256_store_ps(rArray, r);
-    _mm256_store_ps(sArray, s);
+    SIMD256::store_ps(rArray, r);
+    SIMD256::store_ps(sArray, s);
     rArray[rlane] = sArray[slane];
-    r = _mm256_load_ps(rArray);
+    r = SIMD256::load_ps(rArray);
 }
 
-INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_slli_epi32(aHi, i);
-    __m128i resLo = _mm_slli_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-            result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_srai_epi32(aHi, i);
-    __m128i resLo = _mm_srai_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-            result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
-{
-    __m128i aHi = _mm256_extractf128_si256(a, 1);
-    __m128i aLo = _mm256_castsi256_si128(a);
-
-    __m128i resHi = _mm_srli_epi32(aHi, i);
-    __m128i resLo = _mm_srli_epi32(aLo, i);
-
-    __m256i result = _mm256_castsi128_si256(resLo);
-    result = _mm256_insertf128_si256(result, resHi, 1);
-
-    return result;
-}
-
-INLINE
-void _simdvec_transpose(simdvector &v)
-{
-    SWR_INVALID("Need to implement 8 wide version");
-}
-
-#else
-#error Unsupported vector width
-#endif
-
 // Populates a simdvector from a vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
-INLINE
-void _simdvec_load_ps(simdvector& r, const float *p)
-{
-    r[0] = _simd_set1_ps(p[0]);
-    r[1] = _simd_set1_ps(p[1]);
-    r[2] = _simd_set1_ps(p[2]);
-    r[3] = _simd_set1_ps(p[3]);
-}
+#define _simdvec_load_ps SIMD::vec4_load1_ps
 
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector& r, const simdscalar& s)
 {
-    r[0] = s;
-    r[1] = s;
-    r[2] = s;
-    r[3] = s;
+    SIMD::vec4_set1_vps(r, s);
 }
 
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector& r, const simdvector& v)
 {
-    r[0] = v[0];
-    r[1] = v[1];
-    r[2] = v[2];
-    r[3] = v[3];
+    r = v;
 }
 
 #if 0
 // just move a lane from the source simdvector to dest simdvector
-INLINE
+SIMDINLINE
 void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
 {
     _simd_mov(r[0], rlane, s[0], slane);
@@ -776,330 +245,23 @@ void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int
 }
 
 #endif
-INLINE
-void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
-    simdscalar tmp;
-    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
-
-    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
 
-    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
-{
-    simdscalar tmp;
-    r   = _simd_mul_ps(v0[0], v1[0]);   // (v0.x*v1.x)
-
-    tmp = _simd_mul_ps(v0[1], v1[1]);       // (v0.y*v1.y)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y)
-
-    tmp = _simd_mul_ps(v0[2], v1[2]);   // (v0.z*v1.z)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-
-    tmp = _simd_mul_ps(v0[3], v1[3]);   // (v0.w*v1.w)
-    r   = _simd_add_ps(r, tmp);         // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
-}
-
-INLINE
-simdscalar _simdvec_rcp_length_ps(const simdvector& v)
-{
-    simdscalar length;
-    _simdvec_dp4_ps(length, v, v);
-    return _simd_rsqrt_ps(length);
-}
-
-INLINE
-void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
-{
-    simdscalar vecLength;
-    vecLength = _simdvec_rcp_length_ps(v);
-
-    r[0] = _simd_mul_ps(v[0], vecLength);
-    r[1] = _simd_mul_ps(v[1], vecLength);
-    r[2] = _simd_mul_ps(v[2], vecLength);
-    r[3] = _simd_mul_ps(v[3], vecLength);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
-{
-    r[0] = _simd_mul_ps(v[0], s);
-    r[1] = _simd_mul_ps(v[1], s);
-    r[2] = _simd_mul_ps(v[2], s);
-    r[3] = _simd_mul_ps(v[3], s);
-}
-
-INLINE
-void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
-    r[0] = _simd_mul_ps(v0[0], v1[0]);
-    r[1] = _simd_mul_ps(v0[1], v1[1]);
-    r[2] = _simd_mul_ps(v0[2], v1[2]);
-    r[3] = _simd_mul_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
-{
-    r[0] = _simd_add_ps(v0[0], v1[0]);
-    r[1] = _simd_add_ps(v0[1], v1[1]);
-    r[2] = _simd_add_ps(v0[2], v1[2]);
-    r[3] = _simd_add_ps(v0[3], v1[3]);
-}
-
-INLINE
-void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
-    r[0] = _simd_min_ps(v0[0], s);
-    r[1] = _simd_min_ps(v0[1], s);
-    r[2] = _simd_min_ps(v0[2], s);
-    r[3] = _simd_min_ps(v0[3], s);
-}
-
-INLINE
-void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
-{
-    r[0] = _simd_max_ps(v0[0], s);
-    r[1] = _simd_max_ps(v0[1], s);
-    r[2] = _simd_max_ps(v0[2], s);
-    r[3] = _simd_max_ps(v0[3], s);
-}
-
-// Matrix4x4 * Vector4
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
-INLINE
-void _simd_mat4x4_vec4_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[2] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
-    r1  = _simd_mul_ps(m, v[3]);                // (m3 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
-    result[3] = r0;
-}
-
-// Matrix4x4 * Vector3 - Direction Vector where w = 0.
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 0)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 0)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 0)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
-INLINE
-void _simd_mat3x3_vec3_w0_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    result[2] = r0;
-
-    result[3] = _simd_setzero_ps();
-}
-
-// Matrix4x4 * Vector3 - Position vector where w = 1.
-//   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
-//   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
-//   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
-//   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
-INLINE
-void _simd_mat4x4_vec3_w1_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[2] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 3*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 3*4 + 3);    // m[row][3]
-    result[3]   = _simd_add_ps(r0, m);          // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-}
-
-INLINE
-void _simd_mat4x3_vec3_w1_multiply(
-    simdvector& result,
-    const float *pMatrix,
-    const simdvector& v)
-{
-    simdscalar m;
-    simdscalar r0;
-    simdscalar r1;
-
-    m   = _simd_load1_ps(pMatrix + 0*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 0*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[0] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 1*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 1*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[1] = r0;
-
-    m   = _simd_load1_ps(pMatrix + 2*4 + 0);    // m[row][0]
-    r0  = _simd_mul_ps(m, v[0]);                // (m00 * v.x)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 1);    // m[row][1]
-    r1  = _simd_mul_ps(m, v[1]);                // (m1 * v.y)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 2);    // m[row][2]
-    r1  = _simd_mul_ps(m, v[2]);                // (m2 * v.z)
-    r0  = _simd_add_ps(r0, r1);                 // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
-    m   = _simd_load1_ps(pMatrix + 2*4 + 3);    // m[row][3]
-    r0  = _simd_add_ps(r0, m);                  // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
-    result[2] = r0;
-    result[3] = _simd_set1_ps(1.0f);
-}
+#define _simdvec_dp3_ps                 SIMD::vec4_dp3_ps
+#define _simdvec_dp4_ps                 SIMD::vec4_dp4_ps
+#define _simdvec_rcp_length_ps          SIMD::vec4_rcp_length_ps
+#define _simdvec_normalize_ps           SIMD::vec4_normalize_ps
+#define _simdvec_mul_ps                 SIMD::vec4_mul_ps
+#define _simdvec_add_ps                 SIMD::vec4_add_ps
+#define _simdvec_min_ps                 SIMD::vec4_min_ps
+#define _simdvec_max_ps                 SIMD::vec4_max_ps
+#define _simd_mat4x4_vec4_multiply      SIMD::mat4x4_vec4_multiply
+#define _simd_mat3x3_vec3_w0_multiply   SIMD::mat3x3_vec3_w0_multiply
+#define _simd_mat4x4_vec3_w1_multiply   SIMD::mat4x4_vec3_w1_multiply
+#define _simd_mat4x3_vec3_w1_multiply   SIMD::mat4x3_vec3_w1_multiply
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
+SIMDINLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscalar &vX, simdscalar &vY)
 {
     simdscalar vOut = _simd_fmadd_ps(vA, vX, vC);
     vOut = _simd_fmadd_ps(vB, vY, vOut);
@@ -1108,9 +270,9 @@ INLINE simdscalar vplaneps(simdscalar vA, simdscalar vB, simdscalar vC, simdscal
 
 //////////////////////////////////////////////////////////////////////////
 /// @brief Compute plane equation vA * vX + vB * vY + vC
-INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &vY)
+SIMDINLINE simd4scalar vplaneps(simd4scalar vA, simd4scalar vB, simd4scalar vC, simd4scalar &vX, simd4scalar &vY)
 {
-    __m128 vOut = _simd128_fmadd_ps(vA, vX, vC);
+    simd4scalar vOut = _simd128_fmadd_ps(vA, vX, vC);
     vOut = _simd128_fmadd_ps(vB, vY, vOut);
     return vOut;
 }
@@ -1121,7 +283,7 @@ INLINE __m128 vplaneps128(__m128 vA, __m128 vB, __m128 vC, __m128 &vX, __m128 &v
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
@@ -1141,7 +303,7 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
 /// @brief Interpolates a single component (flat shade).
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
+static SIMDINLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
 
@@ -1156,34 +318,35 @@ static INLINE simdscalar InterpolateComponentFlat(const float *pInterpBuffer)
 /// @param vJ - barycentric J
 /// @param pInterpBuffer - pointer to attribute barycentric coeffs
 template<UINT Attrib, UINT Comp, UINT numComponents = 4>
-static INLINE __m128 InterpolateComponent(__m128 vI, __m128 vJ, const float *pInterpBuffer)
+static SIMDINLINE simd4scalar InterpolateComponent(simd4scalar vI, simd4scalar vJ, const float *pInterpBuffer)
 {
     const float *pInterpA = &pInterpBuffer[Attrib * 3 * numComponents + 0 + Comp];
     const float *pInterpB = &pInterpBuffer[Attrib * 3 * numComponents + numComponents + Comp];
     const float *pInterpC = &pInterpBuffer[Attrib * 3 * numComponents + numComponents * 2 + Comp];
 
-    __m128 vA = _mm_broadcast_ss(pInterpA);
-    __m128 vB = _mm_broadcast_ss(pInterpB);
-    __m128 vC = _mm_broadcast_ss(pInterpC);
+    simd4scalar vA = SIMD128::broadcast_ss(pInterpA);
+    simd4scalar vB = SIMD128::broadcast_ss(pInterpB);
+    simd4scalar vC = SIMD128::broadcast_ss(pInterpC);
 
-    __m128 vk = _mm_sub_ps(_mm_sub_ps(_mm_set1_ps(1.0f), vI), vJ);
-    vC = _mm_mul_ps(vk, vC);
+    simd4scalar vk = SIMD128::sub_ps(SIMD128::sub_ps(SIMD128::set1_ps(1.0f), vI), vJ);
+    vC = SIMD128::mul_ps(vk, vC);
 
-    return vplaneps128(vA, vB, vC, vI, vJ);
+    return vplaneps(vA, vB, vC, vI, vJ);
 }
 
-static INLINE __m128 _simd128_abs_ps(__m128 a)
+static SIMDINLINE simd4scalar _simd128_abs_ps(simd4scalar a)
 {
-    __m128i ai = _mm_castps_si128(a);
-    return _mm_castsi128_ps(_mm_and_si128(ai, _mm_set1_epi32(0x7fffffff)));
+    simd4scalari ai = SIMD128::castps_si(a);
+    return SIMD128::castsi_ps(SIMD128::and_si(ai, SIMD128::set1_epi32(0x7fffffff)));
 }
 
-static INLINE simdscalar _simd_abs_ps(simdscalar a)
+static SIMDINLINE simdscalar _simd_abs_ps(simdscalar a)
 {
     simdscalari ai = _simd_castps_si(a);
     return _simd_castsi_ps(_simd_and_si(ai, _simd_set1_epi32(0x7fffffff)));
 }
 
+
 #if ENABLE_AVX512_SIMD16
 #include "simd16intrin.h"
 #endif//ENABLE_AVX512_SIMD16
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
new file mode 100644
index 00000000000..fb1113204d5
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib.hpp
@@ -0,0 +1,550 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#include "simdlib_types.hpp"
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+    namespace SIMD128Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        struct AVXImpl
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_128_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        struct AVX2Impl : AVXImpl
+        {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_128_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl : AVX2Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_128_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD128Impl::Float;
+            using Double    = SIMD128Impl::Double;
+            using Integer   = SIMD128Impl::Integer;
+            using Vec4      = SIMD128Impl::Vec4;
+            using Mask      = SIMD128Impl::Mask;
+        };
+    } // ns SIMD128Impl
+
+    namespace SIMD256Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        struct AVXImpl
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_256_avx.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImpl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        struct AVX2Impl : AVXImpl
+        {
+#define __SIMD_LIB_AVX2_HPP__
+#include "simdlib_256_avx2.inl"
+#undef __SIMD_LIB_AVX2_HPP__
+        }; // struct AVX2Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl : AVX2Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_256_avx512.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD256Impl::Float;
+            using Double    = SIMD256Impl::Double;
+            using Integer   = SIMD256Impl::Integer;
+            using Vec4      = SIMD256Impl::Vec4;
+            using Mask      = SIMD256Impl::Mask;
+        };
+    } // ns SIMD256Impl
+
+    namespace SIMD512Impl
+    {
+#if SIMD_ARCH >= SIMD_ARCH_AVX
+        template<typename SIMD256T>
+        struct AVXImplBase
+        {
+#define __SIMD_LIB_AVX_HPP__
+#include "simdlib_512_emu.inl"
+#include "simdlib_512_emu_masks.inl"
+#undef __SIMD_LIB_AVX_HPP__
+        }; // struct AVXImplBase
+        using AVXImpl = AVXImplBase<SIMD256Impl::AVXImpl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX2
+        using AVX2Impl = AVXImplBase<SIMD256Impl::AVX2Impl>;
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX2
+
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+        struct AVX512Impl
+        {
+#define __SIMD_LIB_AVX512_HPP__
+#include "simdlib_512_avx512.inl"
+#include "simdlib_512_avx512_masks.inl"
+#undef __SIMD_LIB_AVX512_HPP__
+        }; // struct AVX512Impl
+#endif // #if SIMD_ARCH >= SIMD_ARCH_AVX512
+
+        struct Traits : SIMDImpl::Traits
+        {
+#if SIMD_ARCH == SIMD_ARCH_AVX
+            using IsaImpl = AVXImpl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX2
+            using IsaImpl = AVX2Impl;
+#elif SIMD_ARCH == SIMD_ARCH_AVX512
+            using IsaImpl = AVX512Impl;
+#else
+#error Invalid value for SIMD_ARCH
+#endif
+
+            using Float     = SIMD512Impl::Float;
+            using Double    = SIMD512Impl::Double;
+            using Integer   = SIMD512Impl::Integer;
+            using Vec4      = SIMD512Impl::Vec4;
+            using Mask      = SIMD512Impl::Mask;
+        };
+    } // ns SIMD512Impl
+} // ns SIMDImpl
+
+template <typename Traits>
+struct SIMDBase : Traits::IsaImpl
+{
+    using CompareType   = typename Traits::CompareType;
+    using ScaleFactor   = typename Traits::ScaleFactor;
+    using RoundMode     = typename Traits::RoundMode;
+    using SIMD          = typename Traits::IsaImpl;
+    using Float         = typename Traits::Float;
+    using Double        = typename Traits::Double;
+    using Integer       = typename Traits::Integer;
+    using Vec4          = typename Traits::Vec4;
+    using Mask          = typename Traits::Mask;
+
+    // Populates a SIMD Vec4 from a non-simd vector. So p = xyzw becomes xxxx yyyy zzzz wwww.
+    static SIMDINLINE
+    void vec4_load1_ps(Vec4& r, const float *p)
+    {
+        r[0] = SIMD::set1_ps(p[0]);
+        r[1] = SIMD::set1_ps(p[1]);
+        r[2] = SIMD::set1_ps(p[2]);
+        r[3] = SIMD::set1_ps(p[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_set1_vps(Vec4& r, Float s)
+    {
+        r[0] = s;
+        r[1] = s;
+        r[2] = s;
+        r[3] = s;
+    }
+
+    static SIMDINLINE
+    Float vec4_dp3_ps(const Vec4& v0, const Vec4& v1)
+    {
+        Float tmp, r;
+        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+
+        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        return r;
+    }
+
+    static SIMDINLINE
+    Float vec4_dp4_ps(const Vec4& v0, const Vec4& v1)
+    {
+        Float tmp, r;
+        r   = SIMD::mul_ps(v0[0], v1[0]);     // (v0.x*v1.x)
+
+        tmp = SIMD::mul_ps(v0[1], v1[1]);     // (v0.y*v1.y)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y)
+
+        tmp = SIMD::mul_ps(v0[2], v1[2]);     // (v0.z*v1.z)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+
+        tmp = SIMD::mul_ps(v0[3], v1[3]);     // (v0.w*v1.w)
+        r   = SIMD::add_ps(r, tmp);           // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
+
+        return r;                             // dot product over all four components
+    }
+
+    static SIMDINLINE
+    Float vec4_rcp_length_ps(const Vec4& v)
+    {
+        Float length = vec4_dp4_ps(v, v);     // dot(v, v) over x, y, z and w: the squared length, despite the name
+        return SIMD::rsqrt_ps(length);        // rsqrt of the squared length; NOTE(review): precision depends on the IsaImpl rsqrt_ps - confirm
+    }
+
+    static SIMDINLINE
+    void vec4_normalize_ps(Vec4& r, const Vec4& v)
+    {
+        Float rcpLength = vec4_rcp_length_ps(v);
+
+        r[0] = SIMD::mul_ps(v[0], rcpLength);
+        r[1] = SIMD::mul_ps(v[1], rcpLength);
+        r[2] = SIMD::mul_ps(v[2], rcpLength);
+        r[3] = SIMD::mul_ps(v[3], rcpLength);
+    }
+
+    static SIMDINLINE
+    void vec4_mul_ps(Vec4& r, const Vec4& v, Float s)
+    {
+        r[0] = SIMD::mul_ps(v[0], s);
+        r[1] = SIMD::mul_ps(v[1], s);
+        r[2] = SIMD::mul_ps(v[2], s);
+        r[3] = SIMD::mul_ps(v[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    {
+        r[0] = SIMD::mul_ps(v0[0], v1[0]);
+        r[1] = SIMD::mul_ps(v0[1], v1[1]);
+        r[2] = SIMD::mul_ps(v0[2], v1[2]);
+        r[3] = SIMD::mul_ps(v0[3], v1[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_add_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::add_ps(v0[0], s);
+        r[1] = SIMD::add_ps(v0[1], s);
+        r[2] = SIMD::add_ps(v0[2], s);
+        r[3] = SIMD::add_ps(v0[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1)
+    {
+        r[0] = SIMD::add_ps(v0[0], v1[0]);
+        r[1] = SIMD::add_ps(v0[1], v1[1]);
+        r[2] = SIMD::add_ps(v0[2], v1[2]);
+        r[3] = SIMD::add_ps(v0[3], v1[3]);
+    }
+
+    static SIMDINLINE
+    void vec4_min_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::min_ps(v0[0], s);
+        r[1] = SIMD::min_ps(v0[1], s);
+        r[2] = SIMD::min_ps(v0[2], s);
+        r[3] = SIMD::min_ps(v0[3], s);
+    }
+
+    static SIMDINLINE
+    void vec4_max_ps(Vec4& r, const Vec4& v0, Float s)
+    {
+        r[0] = SIMD::max_ps(v0[0], s);
+        r[1] = SIMD::max_ps(v0[1], s);
+        r[2] = SIMD::max_ps(v0[2], s);
+        r[3] = SIMD::max_ps(v0[3], s);
+    }
+
+    // Matrix4x4 * Vector4
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * v.w)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * v.w)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * v.w)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
+    static SIMDINLINE
+    void SIMDCALL mat4x4_vec4_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[2] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+        r1  = SIMD::mul_ps(m, v[3]);              // (m3 * v.w)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+        result[3] = r0;
+    }
+
+    // Matrix3x3 * Vector3 - Direction Vector where w = 0 (element [row][col] is read from pMatrix[row*4 + col]).
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z)
+    //   outVec.w = 0 (matrix row 3 and column 3 are never read)
+    static SIMDINLINE
+    void SIMDCALL mat3x3_vec3_w0_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        result[2] = r0;
+
+        result[3] = SIMD::setzero_ps();           // w is forced to zero rather than computed
+    }
+
+    // Matrix4x4 * Vector3 - Position vector where w = 1.
+    //   outVec.x = (m00 * v.x) + (m01 * v.y) + (m02 * v.z) + (m03 * 1)
+    //   outVec.y = (m10 * v.x) + (m11 * v.y) + (m12 * v.z) + (m13 * 1)
+    //   outVec.z = (m20 * v.x) + (m21 * v.y) + (m22 * v.z) + (m23 * 1)
+    //   outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
+    static SIMDINLINE
+    void SIMDCALL mat4x4_vec3_w1_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[2] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 3*4 + 3);  // m[row][3]
+        result[3] = SIMD::add_ps(r0, m);        // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+    }
+
+    static SIMDINLINE
+    void SIMDCALL mat4x3_vec3_w1_multiply(
+        Vec4& result,
+        const float *pMatrix,
+        const Vec4& v)
+    {
+        Float m;
+        Float r0;
+        Float r1;
+
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 0*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[0] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 1*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[1] = r0;
+
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 0);  // m[row][0]
+        r0  = SIMD::mul_ps(m, v[0]);              // (m0 * v.x)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 1);  // m[row][1]
+        r1  = SIMD::mul_ps(m, v[1]);              // (m1 * v.y)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 2);  // m[row][2]
+        r1  = SIMD::mul_ps(m, v[2]);              // (m2 * v.z)
+        r0  = SIMD::add_ps(r0, r1);               // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+        m   = SIMD::load1_ps(pMatrix + 2*4 + 3);  // m[row][3]
+        r0  = SIMD::add_ps(r0, m);                // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+        result[2] = r0;
+        result[3] = SIMD::set1_ps(1.0f);          // w is set to 1.0 directly; matrix row 3 is never read
+    }
+}; // struct SIMDBase
+
+using SIMD128 = SIMDBase<SIMDImpl::SIMD128Impl::Traits>;
+using SIMD256 = SIMDBase<SIMDImpl::SIMD256Impl::Traits>;
+using SIMD512 = SIMDBase<SIMDImpl::SIMD512Impl::Traits>;
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
new file mode 100644
index 00000000000..5bcedf39713
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx.inl
@@ -0,0 +1,545 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op)  /* defines Float op(Float a), forwarding to _mm_<op>(a) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return _mm_##op(a);\
+    }
+
+#define SIMD_WRAPPER_2(op)  /* defines Float op(Float a, Float b), forwarding to _mm_<op>(a, b) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_DWRAPPER_2(op)  /* defines Double op(Double a, Double b), forwarding to _mm_<op>(a, b) */ \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I(op)  /* defines template<int ImmT> Float op(Float a, Float b); ImmT is passed as the intrinsic's last argument */ \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+#define SIMD_DWRAPPER_2I(op)  /* defines template<int ImmT> Double op(Double a, Double b); ImmT is passed as the intrinsic's last argument */ \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+#define SIMD_WRAPPER_3(op)  /* defines Float op(Float a, Float b, Float c), forwarding to _mm_<op>(a, b, c) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  /* defines Integer op(Integer a), forwarding to _mm_<op>(a) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_1I_(op, intrin)  /* defines template<int ImmT> Integer op(Integer a), forwarding to the explicitly named intrin(a, ImmT) */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm_##op)  // shorthand: the intrinsic name is _mm_<op>
+
+#define SIMD_IWRAPPER_2_(op, intrin)  /* defines Integer op(Integer a, Integer b), forwarding to the explicitly named intrin(a, b) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return intrin(a, b);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  /* defines Integer op(Integer a, Integer b), forwarding to _mm_<op>(a, b) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm_##op(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  /* defines an Integer op implemented with a float intrinsic: operands are cast to Float, the result is cast back */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IWRAPPER_2I(op)  /* defines template<int ImmT> Integer op(Integer a, Integer b); ImmT is passed as the intrinsic's last argument */ \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm_##op(a, b, ImmT);\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return approximately 1.0f / a       (wraps _mm_rcp_ps, a reduced-precision estimate)
+SIMD_WRAPPER_1(rsqrt_ps);   // return approximately 1.0f / sqrt(a) (wraps _mm_rsqrt_ps, a reduced-precision estimate)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+// Multiply-add built from a separate multiply and add; this AVX (1) layer does
+// not use a fused instruction, so the product is rounded before the add.
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)    // return (a * b) + c
+{
+    Float product = mul_ps(a, b);
+    return add_ps(product, c);
+}
+
+// Multiply-subtract built from a separate multiply and subtract (not fused).
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c)    // return (a * b) - c
+{
+    Float product = mul_ps(a, b);
+    return sub_ps(product, c);
+}
+
+template <RoundMode RMT>                              // RMT: compile-time rounding mode, passed to _mm_round_ps as its immediate operand
+static SIMDINLINE Float SIMDCALL round_ps(Float a)    // return a rounded according to RMT
+{
+    return _mm_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); }    // round_ps with RoundMode::CEIL_NOEXC
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); }  // round_ps with RoundMode::FLOOR_NOEXC
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2(mul_epi32); // return (int64)lo32(a) * (int64)lo32(b) for each 64-bit lane (widening signed multiply, see _mm_mul_epi32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);                             // return a & b       (float treated as int)
+SIMD_IWRAPPER_2_(and_si, _mm_and_si128);        // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);                          // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2_(andnot_si, _mm_andnot_si128);  // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);                              // return a | b       (float treated as int)
+SIMD_IWRAPPER_2_(or_si, _mm_or_si128);          // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);                             // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2_(xor_si, _mm_xor_si128);        // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT   (each 32-bit lane, zeros shifted in)
+
+// Lane-by-lane emulation of the per-lane variable left shift that AVX2
+// provides natively as _mm_sllv_epi32.
+//
+// vA holds four 32-bit values, vB the four matching shift counts; the result
+// is vA[i] << vB[i] for every lane.  Shifting is done on unsigned values and
+// counts above 31 produce 0: this matches the native instruction and avoids
+// the undefined behavior C++ assigns to shifting negative signed values or
+// shifting by the full type width.
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
+{
+    uint32_t a, count;
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 0));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 0));
+    a     = (count < 32) ? (a << count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 0);
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 1));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 1));
+    a     = (count < 32) ? (a << count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 1);
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 2));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 2));
+    a     = (count < 32) ? (a << count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 2);
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 3));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 3));
+    a     = (count < 32) ? (a << count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 3);
+
+    return vA;
+}
+
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32, arithmetic: the sign bit is shifted in)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32, logical: zeros are shifted in)
+SIMD_IWRAPPER_1I_(srli_si, _mm_srli_si128); // return a >> (ImmT*8) (uint; whole 128-bit value shifted right by ImmT bytes)
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a)));  // reinterpret as int, byte-shift, reinterpret back as float
+}
+
+// Lane-by-lane emulation of the per-lane variable *logical* right shift that
+// AVX2 provides natively as _mm_srlv_epi32.
+//
+// vA holds four 32-bit values, vB the four matching shift counts; the result
+// is vA[i] >> vB[i] for every lane with zeros shifted in from the top.  The
+// lanes are handled as unsigned so a set top bit is not sign-extended (a
+// signed '>>' would perform an arithmetic shift), and counts above 31 produce
+// 0, matching the native instruction instead of invoking undefined behavior.
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
+{
+    uint32_t a, count;
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 0));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 0));
+    a     = (count < 32) ? (a >> count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 0);
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 1));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 1));
+    a     = (count < 32) ? (a >> count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 1);
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 2));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 2));
+    a     = (count < 32) ? (a >> count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 2);
+
+    a     = static_cast<uint32_t>(_mm_extract_epi32(vA, 3));
+    count = static_cast<uint32_t>(_mm_extract_epi32(vB, 3));
+    a     = (count < 32) ? (a >> count) : 0;
+    vA    = _mm_insert_epi32(vA, static_cast<int32_t>(a), 3);
+
+    return vA;
+}
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)   (bit reinterpretation only, no value conversion)
+{
+    return _mm_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)   (bit reinterpretation only, no value conversion)
+{
+    return _mm_castps_si128(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)   (bit reinterpretation only, no value conversion)
+{
+    return _mm_castsi128_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)   (bit reinterpretation only, no value conversion)
+{
+    return _mm_castps_pd(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)   (bit reinterpretation only, no value conversion)
+{
+    return _mm_castsi128_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float)
+{
+    return _mm_cvtepi32_ps(a);
+}
+
+SIMD_IWRAPPER_1(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16, zero-extended)
+SIMD_IWRAPPER_1(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32, zero-extended)
+SIMD_IWRAPPER_1(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32, zero-extended)
+SIMD_IWRAPPER_1(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64, zero-extended)
+SIMD_IWRAPPER_1(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64, zero-extended)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32, rounded per the current MXCSR rounding mode, unlike a C cast)
+{
+    return _mm_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>                            // CmpTypeT: compile-time predicate, passed to _mm_cmp_ps as its immediate operand
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b   (per lane: all bits set when true, all bits clear when false)
+{
+    return _mm_cmp_ps(a, b, static_cast<const int>(CmpTypeT));
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }    // a <  b
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }    // a >  b
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); }  // a != b; NOTE(review): presumably an ordered compare, so false when either is NaN - confirm against CompareType
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }    // a == b
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }    // a >= b
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }    // a <= b
+
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_lanes_zero(a & b) ? 1 : 0 (float)
+{
+    return  0 != _mm_testz_ps(a, b);    // convert the intrinsic's int flag to bool
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_lanes_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != _mm_testz_si128(a, b); // convert the intrinsic's int flag to bool
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float; bit i of ImmT selects the source for lane i)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float; selection uses the sign bit of each mask lane)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask));  // reuse the float blend on reinterpreted bits
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask)));   // same as above, with an integer-typed mask
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm_broadcast_ss(p);
+}
+
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (int)
+{
+    return castps_si(_mm_permutevar_ps(castsi_ps(a), swiz));    // implemented with the float permute on reinterpreted bits
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    return _mm_permutevar_ps(a, swiz);
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);    // See documentation for _mm_shuffle_epi32 (ImmT is the shuffle control)
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) = delete;    // explicitly deleted: not provided by this 128-bit implementation
+
+SIMD_IWRAPPER_2(shuffle_epi8);      // See documentation for _mm_shuffle_epi8
+SIMD_DWRAPPER_2I(shuffle_pd);       // See documentation for _mm_shuffle_pd (ImmT is the shuffle control)
+SIMD_WRAPPER_2I(shuffle_ps);        // See documentation for _mm_shuffle_ps (ImmT is the shuffle control)
+SIMD_IWRAPPER_2(unpackhi_epi16);    // See documentation for _mm_unpackhi_epi16
+
+//SIMD_IFWRAPPER_2(unpackhi_epi32, _mm_unpackhi_ps);
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)    // hand-written form of the commented-out wrapper above
+{
+    return castps_si(_mm_unpackhi_ps(castsi_ps(a), castsi_ps(b)));
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi64);    // See documentation for _mm_unpackhi_epi64
+SIMD_IWRAPPER_2(unpackhi_epi8);     // See documentation for _mm_unpackhi_epi8
+SIMD_DWRAPPER_2(unpackhi_pd);       // See documentation for _mm_unpackhi_pd
+SIMD_WRAPPER_2(unpackhi_ps);        // See documentation for _mm_unpackhi_ps
+SIMD_IWRAPPER_2(unpacklo_epi16);    // See documentation for _mm_unpacklo_epi16
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm_unpacklo_ps);  // integer unpack implemented with the float intrinsic on reinterpreted bits
+SIMD_IWRAPPER_2(unpacklo_epi64);    // See documentation for _mm_unpacklo_epi64
+SIMD_IWRAPPER_2(unpacklo_epi8);     // See documentation for _mm_unpacklo_epi8
+SIMD_DWRAPPER_2(unpacklo_pd);       // See documentation for _mm_unpacklo_pd
+SIMD_WRAPPER_2(unpacklo_ps);        // See documentation for _mm_unpacklo_ps
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+// Software gather: for every lane, read one float located
+// (idx[lane] * ScaleT) bytes past p.  Indices are read as unsigned 32-bit
+// values.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    Float gathered;
+    float* pLanes = (float*)&gathered;
+    uint32_t const* pIndices = (uint32_t*)&idx;
+    uint8_t const* pBase = (uint8_t const*)p;
+
+    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
+    {
+        // Scale the element index into a byte offset from the base pointer.
+        uint32_t const byteOffset = pIndices[lane] * static_cast<uint32_t>(ScaleT);
+        pLanes[lane] = *(float const*)(pBase + byteOffset);
+    }
+
+    return gathered;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory; _mm_load_ps requires 16-byte aligned p)
+{
+    return _mm_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p    (_mm_load_si128 requires 16-byte aligned p)
+{
+    return _mm_load_si128(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm_lddqu_si128(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+//
+// Software masked gather: lanes whose mask sign bit is set are loaded from
+// (idx[lane] * ScaleT) bytes past p; every other lane keeps its value from
+// 'old'.  The mask argument is taken by value and is not written back.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    Float merged = old;
+    float* pLanes = (float*)&merged;
+    uint32_t const* pIndices = (uint32_t*)&idx;
+    uint8_t const* pBase = (uint8_t const*)p;
+
+    // One bit per lane, taken from the sign bit of each mask element.
+    uint32_t pending = movemask_ps(mask);
+    DWORD lane;
+    while (_BitScanForward(&lane, pending))
+    {
+        pending &= ~(1 << lane);    // this lane is being handled; drop it from the work list
+        uint32_t const byteOffset = pIndices[lane] * static_cast<uint32_t>(ScaleT);
+        pLanes[lane] = *(float const *)(pBase + byteOffset);
+    }
+
+    return merged;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)   // p[i] = src[i] for lanes whose mask sign bit is set; other lanes are not written
+{
+    _mm_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)   // return bitmask built from the top bit of each 8-bit element
+{
+    return static_cast<uint32_t>(_mm_movemask_epi8(a));
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)   // return bitmask built from the sign bit of each double element
+{
+    return static_cast<uint32_t>(_mm_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)   // return bitmask built from the sign bit of each float element
+{
+    return static_cast<uint32_t>(_mm_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm_setzero_si128();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory; _mm_store_ps requires 16-byte aligned p)
+{
+    _mm_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a   (_mm_store_si128 requires 16-byte aligned p)
+{
+    _mm_store_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a) // *p = a    (same as store_si but allows for unaligned mem)
+{
+    _mm_storeu_si128(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm_stream_ps(p, a);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float in3, float in2, float in1, float in0)   // return {in0, in1, in2, in3}; in0 lands in the lowest lane
+{
+    return _mm_set_ps(in3, in2, in1, in0);
+}
+
+// Returns lane ImmT of 'a' as a scalar float.
+//
+// _mm_extract_ps hands the lane back as raw bits in an int.  Those bits are
+// moved back into a vector register and read out as a float, which keeps the
+// value bit-exact without dereferencing an int through a float pointer (a
+// strict-aliasing violation).
+template <int ImmT>
+static SIMDINLINE float SIMDCALL extract_ps(Float a)
+{
+    int const bits = _mm_extract_ps(a, ImmT);
+    return _mm_cvtss_f32(_mm_castsi128_ps(_mm_cvtsi32_si128(bits)));
+}
+
+// Retire every helper macro this file defined, listed once each in definition
+// order, so later implementation files can define their own versions
+// (the AVX2 file, for example, defines SIMD_WRAPPER_3 again).
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
new file mode 100644
index 00000000000..e8ee0b4d87b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx2.inl
@@ -0,0 +1,68 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below are ones that replace AVX (1) operations.
+// Only 2 shifts and 2 gathers were introduced with AVX 2
+// Also, add native support for FMA operations
+//============================================================================
+#define SIMD_WRAPPER_3(op)  /* defines Float op(Float a, Float b, Float c), forwarding to _mm_<op>(a, b, c) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm_##op(a, b, c);\
+    }
+
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c   (native _mm_fmadd_ps)
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c   (native _mm_fmsub_ps)
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vB) // return a << b      (uint32)
+{
+    return _mm_sllv_epi32(vA, vB);  // native per-lane variable shift
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vB) // return a >> b      (uint32)
+{
+    return _mm_srlv_epi32(vA, vB);  // native per-lane variable logical shift (zeros shifted in)
+}
+
+template<ScaleFactor ScaleT>    // ScaleT: compile-time byte scale applied to every index
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return _mm_i32gather_ps(p, idx, static_cast<const int>(ScaleT));    // native gather; ScaleT becomes the immediate scale operand
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+template<ScaleFactor ScaleT>    // ScaleT: compile-time byte scale applied to every index
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    return _mm_mask_i32gather_ps(old, p, idx, mask, static_cast<const int>(ScaleT));    // native masked gather; unselected lanes come from 'old'
+}
+
+#undef SIMD_WRAPPER_3   // retire the helper macro defined at the top of this file
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
new file mode 100644
index 00000000000..3ab41a23651
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_128_avx512.inl
@@ -0,0 +1,408 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD128 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:    // register-width conversion helpers used by the wrapper macros below
+    static SIMDINLINE __m512  __conv(Float r) { return _mm512_castps128_ps512(r.v); }   // widen 128-bit float vector to __m512 (cast only; the upper bits are undefined)
+    static SIMDINLINE __m512d __conv(Double r) { return _mm512_castpd128_pd512(r.v); }  // widen 128-bit double vector to __m512d (cast only; the upper bits are undefined)
+    static SIMDINLINE __m512i __conv(Integer r) { return _mm512_castsi128_si512(r.v); } // widen 128-bit integer vector to __m512i (cast only; the upper bits are undefined)
+    static SIMDINLINE Float   __conv(__m512 r) { return _mm512_castps512_ps128(r); }    // narrow: keep the low 128 bits as Float
+    static SIMDINLINE Double  __conv(__m512d r) { return _mm512_castpd512_pd128(r); }   // narrow: keep the low 128 bits as Double
+    static SIMDINLINE Integer __conv(__m512i r) { return _mm512_castsi512_si128(r); }   // narrow: keep the low 128 bits as Integer
+public:
+
+#define SIMD_WRAPPER_1_(op, intrin, mask)  /* defines Float op(Float a) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xf))  // mask 0xf selects the four low 32-bit lanes (the 128-bit portion)
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  /* as SIMD_WRAPPER_1_, with a template immediate ImmT passed as the last argument */ \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xf))  // mask 0xf selects the four low 32-bit lanes
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  /* defines Float op(Float a, Float b) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xf))  // mask 0xf selects the four low 32-bit lanes
+
+#define SIMD_WRAPPER_2I(op)  /* two-operand float wrapper with template immediate ImmT; the mask is the literal 0xf (four low lanes) */ \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  /* defines Float op(Float a, Float b, Float c) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xf))  // mask 0xf selects the four low 32-bit lanes
+
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  /* defines Double op(Double a) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0x3))  // mask 0x3 selects the two low 64-bit lanes (the 128-bit portion)
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  /* as SIMD_DWRAPPER_1_, with a template immediate ImmT passed as the last argument */ \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0x3))  // mask 0x3 selects the two low 64-bit lanes
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  /* defines Double op(Double a, Double b) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0x3))  // mask 0x3 selects the two low 64-bit lanes
+#endif
+
+#define SIMD_DWRAPPER_2I(op)  /* two-operand double wrapper with template immediate ImmT; the mask is the literal 0x3 (two low lanes) */ \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0x3, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  /* defines Integer op(Integer a) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xf))  // 32-bit elements: 4 lanes in 128 bits
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffull))  // 8-bit elements: 16 lanes in 128 bits
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xff))  // 16-bit elements: 8 lanes in 128 bits
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0x3))  // 64-bit elements: 2 lanes in 128 bits
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  /* as SIMD_IWRAPPER_1_, with a template immediate ImmT passed as the last argument */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xf))  // 32-bit elements: 4 lanes in 128 bits
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffull))  // 8-bit elements: 16 lanes in 128 bits
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xff))  // 16-bit elements: 8 lanes in 128 bits
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0x3))  // 64-bit elements: 2 lanes in 128 bits
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  /* defines Integer op(Integer a, Integer b) via _mm512_maskz_<intrin>; lanes not selected by mask are zeroed */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xf))  // 32-bit elements: 4 lanes in 128 bits
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffull))  // 8-bit elements: 16 lanes in 128 bits
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xff))  // 16-bit elements: 8 lanes in 128 bits
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0x3))  // 64-bit elements: 2 lanes in 128 bits
+#endif
+
+#define SIMD_IWRAPPER_2I(op)  /* two-operand integer wrapper with template immediate ImmT; the mask is the literal 0xf */ \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xf));     // return approximately 1.0f / a; NOTE(review): rcp28 is an AVX512ER intrinsic, not plain AVX512F - confirm the target supports it
+SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xf));   // return approximately 1.0f / sqrt(a); NOTE(review): rsqrt28 is an AVX512ER intrinsic - confirm the target supports it
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(mul_epi32);  // return (int64)lo32(a) * (int64)lo32(b) for each 64-bit lane (widening signed multiply, see _mm512_mul_epi32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xf));    // return a & b       (int); NOTE(review): third argument is presumably the lane mask (low four 32-bit lanes) - macro is defined outside this view
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xf)); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xf));     // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xf));    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT   (uint32)
+SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32, per-lane shift count)
+SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32, arithmetic)
+SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32, logical)
+SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32, logical, per-lane shift count)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+//{
+//    return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8 (signed saturation)    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16 (signed saturation)   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // int16 --> uint8 (unsigned saturation)  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // int32 --> uint16 (unsigned saturation) See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif // !defined(AVX512F_STRICT)
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+//    return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32); // return a with its 32-bit elements reordered as selected by ImmT
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32); // interleave the high 32-bit elements of a and b
+SIMD_IWRAPPER_2_32(unpacklo_epi32); // interleave the low 32-bit elements of a and b
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16); // interleave high 16-bit elements
+SIMD_IWRAPPER_2_64(unpackhi_epi64); // interleave high 64-bit elements
+SIMD_IWRAPPER_2_8(unpackhi_epi8);   // interleave high 8-bit elements
+SIMD_IWRAPPER_2_16(unpacklo_epi16); // interleave low 16-bit elements
+SIMD_IWRAPPER_2_64(unpacklo_epi64); // interleave low 64-bit elements
+SIMD_IWRAPPER_2_8(unpacklo_epi8);   // interleave low 8-bit elements
+
+#endif // !defined(AVX512F_STRICT)
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p)); // low 4 lanes only; unaligned form because the aligned 512-bit load demands 64-byte alignment, which a 4-float (16-byte) operand cannot be expected to have
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p)); // low 4 lanes only; unaligned form because the aligned 512-bit load demands 64-byte alignment, which a 128-bit operand cannot be expected to have
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xf), p)); // zero-masked 512-bit load restricted to the low 4 float lanes
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xf), p)); // zero-masked 512-bit load restricted to the low 4 int32 lanes
+}
+
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    return __conv(_mm512_mask_i32gather_ps(
+                    _mm512_setzero_ps(),        // pass-through value for lanes outside the mask
+                    __mmask16(0xf),             // gather only the low 4 lanes
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+// for each element: (mask & (1 << 31)) ? i32gather_ps<ScaleT>(p, idx) : old   (mask is taken by value and is not modified)
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    __mmask16 m = 0xf;                                                      // start from the low 4 lanes
+    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),   // keep the lanes whose mask sign bit (bit 31) is set
+                                _mm512_set1_epi32(0x80000000));
+    return __conv(_mm512_mask_i32gather_ps(
+                    __conv(old),
+                    m,
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) // return one bit per byte of a: bit i is set when byte i has its top bit (0x80) set
+{
+    __mmask64 m = 0xffffull; // restrict the test to the low 16 bytes
+    return static_cast<uint32_t>(
+        _mm512_mask_test_epi8_mask(m, __conv(a), _mm512_set1_epi8(0x80)));
+}
+
+#endif // !defined(AVX512F_STRICT)
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src) // for each lane: if (mask & (1 << 31)) p[lane] = src[lane]
+{
+    __mmask16 m = 0xf;                                                                  // start from the low 4 lanes
+    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));    // keep the lanes whose mask sign bit is set
+    _mm512_mask_storeu_ps(p, m, __conv(src));                                           // unaligned form: the aligned 512-bit store demands 64-byte alignment of p
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory)
+{
+    _mm512_mask_storeu_ps(p, __mmask16(0xf), __conv(a)); // low 4 lanes only; unaligned form because the aligned 512-bit store demands 64-byte alignment, which a 4-float (16-byte) destination cannot be expected to have
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a
+{
+    _mm512_mask_storeu_epi32(p, __mmask16(0xf), __conv(a)); // low 4 lanes only; unaligned form because the aligned 512-bit store demands 64-byte alignment, which a 128-bit destination cannot be expected to have
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+#undef SIMD_WRAPPER_1_ // undefine the wrapper-generator macros so they do not leak past this file
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
new file mode 100644
index 00000000000..aec79e31590
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx.inl
@@ -0,0 +1,757 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+using SIMD128T = SIMD128Impl::AVXImpl; // 128-bit implementation that the emulated 256-bit integer ops below delegate to
+
+//============================================================================
+// SIMD256 AVX (1) implementation
+//============================================================================
+
+#define SIMD_WRAPPER_1(op)  /* Float op(Float a): forwards to _mm256_<op>(a) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_WRAPPER_2(op)  /* Float op(Float a, Float b): forwards to _mm256_<op>(a, b) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_DWRAPPER_2(op)  /* Double op(Double a, Double b): forwards to _mm256_<op>(a, b) */ \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_WRAPPER_2I(op)  /* Float op<ImmT>(Float a, Float b): ImmT is passed as the intrinsic's immediate */ \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return  _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_DWRAPPER_2I(op)  /* Double op<ImmT>(Double a, Double b): ImmT is passed as the intrinsic's immediate */ \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+#define SIMD_WRAPPER_3(op)  /* Float op(Float a, Float b, Float c): forwards to _mm256_<op>(a, b, c) */ \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm256_##op(a, b, c);\
+    }
+
+#define SIMD_IWRAPPER_1(op)  /* Integer op(Integer a): forwards to _mm256_<op>(a) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+#define SIMD_IWRAPPER_2(op)  /* Integer op(Integer a, Integer b): forwards to _mm256_<op>(a, b) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+#define SIMD_IFWRAPPER_2(op, intrin)  /* integer op implemented with a float intrinsic: cast to Float, call intrin, cast back */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+#define SIMD_IFWRAPPER_2I(op, intrin)  /* as SIMD_IFWRAPPER_2, with ImmT passed as the intrinsic's immediate */ \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si( intrin(castsi_ps(a), castsi_ps(b), ImmT) );\
+    }
+
+#define SIMD_IWRAPPER_2I_(op, intrin)  /* Integer op<ImmT>(a, b): forwards to _mm256_<intrin>(a, b, ImmT) */ \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##intrin(a, b, ImmT);\
+    }
+#define SIMD_IWRAPPER_2I(op)  SIMD_IWRAPPER_2I_(op, op)
+
+#define SIMD_IWRAPPER_3(op)  /* Integer op(a, b, c): forwards to _mm256_<op>(a, b, c) */ \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    {\
+        return _mm256_##op(a, b, c);\
+    }
+
+// emulated integer simd: each 256-bit integer op is built from two SIMD128T calls, one per 128-bit half (v4[0], v4[1])
+#define SIMD_EMU_IWRAPPER_1(op) /* unary: apply SIMD128T::op to each half */ \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::op(a.v4[0]),\
+            SIMD128T::op(a.v4[1]),\
+        };\
+    }
+#define SIMD_EMU_IWRAPPER_1L(op, shift) /* widening unary: first half = op(src), second half = op(src shifted right by 'shift' bytes); only a.v4[0] of a 256-bit argument is read */ \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer \
+        {\
+            SIMD128T::op(a.v4[0]), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a.v4[0])), \
+        };\
+    }\
+    static SIMDINLINE \
+    Integer SIMDCALL op(SIMD128Impl::Integer a)\
+    {\
+        return Integer \
+        {\
+            SIMD128T::op(a), \
+            SIMD128T::op(SIMD128T::template srli_si<shift>(a)), \
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_1I(op) /* unary with immediate: apply SIMD128T::op<ImmT> to each half */ \
+    template <int ImmT> static SIMDINLINE \
+    Integer SIMDCALL op(Integer a)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::template op<ImmT>(a.v4[0]),\
+            SIMD128T::template op<ImmT>(a.v4[1]),\
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_2(op) /* binary: apply SIMD128T::op to matching halves of a and b */ \
+    static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::op(a.v4[0], b.v4[0]),\
+            SIMD128T::op(a.v4[1], b.v4[1]),\
+        };\
+    }
+
+#define SIMD_EMU_IWRAPPER_2I(op) /* binary with immediate: apply SIMD128T::op<ImmT> to matching 128-bit halves (v4[]) of a and b */ \
+    template <int ImmT> static SIMDINLINE \
+    Integer SIMDCALL op(Integer a, Integer b)\
+    {\
+        return Integer\
+        {\
+            SIMD128T::template op<ImmT>(a.v4[0], b.v4[0]),\
+            SIMD128T::template op<ImmT>(a.v4[1], b.v4[1]),\
+        };\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c) // return (a * b) + c
+{
+    return add_ps(mul_ps(a, b), c); // emulated with a separate multiply and add (not a fused operation)
+}
+
+static SIMDINLINE Float SIMDCALL fmsub_ps(Float a, Float b, Float c) { return sub_ps(mul_ps(a, b), c); } // return (a * b) - c  (emulated with separate multiply and subtract, like fmadd_ps: _mm256_fmsub_ps is an FMA intrinsic and is not part of AVX (1))
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return approximately 1.0f / a (_mm256_rcp_ps is a reduced-precision estimate, not an exact divide)
+SIMD_WRAPPER_1(rsqrt_ps);   // return approximately 1.0f / sqrt(a) (_mm256_rsqrt_ps is a reduced-precision estimate)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a) // return a rounded per RMT (RMT is passed as the _mm256_round_ps immediate)
+{
+    return _mm256_round_ps(a, static_cast<int>(RMT));
+}
+
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a) { return round_ps<RoundMode::CEIL_NOEXC>(a); } // round_ps with RoundMode::CEIL_NOEXC
+static SIMDINLINE Float SIMDCALL floor_ps(Float a) { return round_ps<RoundMode::FLOOR_NOEXC>(a); } // round_ps with RoundMode::FLOOR_NOEXC
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_EMU_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_EMU_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_EMU_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8) 
+SIMD_EMU_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_EMU_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+SIMD_EMU_IWRAPPER_2(mul_epi32); // return (int64)lo32(a) * (int64)lo32(b) per 64-bit lane (mul_epi32 intrinsic semantics, not a per-int32 product; use mullo_epi32 for that)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_EMU_IWRAPPER_2(mullo_epi32);
+SIMD_EMU_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_EMU_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_EMU_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(and_ps);         // return a & b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(and_si);    // return a & b       (int, emulated per 128-bit half)
+SIMD_WRAPPER_2(andnot_ps);      // return (~a) & b    (float treated as int)
+SIMD_EMU_IWRAPPER_2(andnot_si); // return (~a) & b    (int, emulated per 128-bit half)
+SIMD_WRAPPER_2(or_ps);          // return a | b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(or_si);     // return a | b       (int, emulated per 128-bit half)
+SIMD_WRAPPER_2(xor_ps);         // return a ^ b       (float treated as int)
+SIMD_EMU_IWRAPPER_2(xor_si);    // return a ^ b       (int, emulated per 128-bit half)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_EMU_IWRAPPER_1I(slli_epi32);               // return a << ImmT   (uint32)
+
+static SIMDINLINE Integer SIMDCALL sllv_epi32(Integer vA, Integer vCount) // return a << b      (uint32); lanes with b > 31 yield 0, as _mm256_sllv_epi32 does
+{
+    uint32_t aHi, aLow, countHi, countLow; // unsigned, so each shift is a well-defined logical shift and the count is compared as unsigned
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u; // a C++ shift by >= 32 is undefined, so handle it explicitly
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi << countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow << countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+SIMD_EMU_IWRAPPER_1I(srai_epi32);   // return a >> ImmT   (int32, arithmetic)
+SIMD_EMU_IWRAPPER_1I(srli_epi32);   // return a >> ImmT   (uint32, logical)
+SIMD_EMU_IWRAPPER_1I(srli_si);      // return a >> (ImmT*8) (uint); emulated, so the byte shift is applied to each 128-bit half separately
+
+template<int ImmT>                              // same as srli_si, but with Float cast to int
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    return castsi_ps(srli_si<ImmT>(castps_si(a))); // reinterpret as Integer, byte-shift, reinterpret back
+}
+
+static SIMDINLINE Integer SIMDCALL srlv_epi32(Integer vA, Integer vCount) // return a >> b      (uint32); logical shift, lanes with b > 31 yield 0, as _mm256_srlv_epi32 does
+{
+    uint32_t aHi, aLow, countHi, countLow; // unsigned, so >> shifts in zeros (logical) rather than copies of the sign bit
+    __m128i vAHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 1));
+    __m128i vALow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vA), 0));
+    __m128i vCountHi = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 1));
+    __m128i vCountLow = _mm_castps_si128(_mm256_extractf128_ps(_mm256_castsi256_ps(vCount), 0));
+
+    aHi = _mm_extract_epi32(vAHi, 0);
+    countHi = _mm_extract_epi32(vCountHi, 0);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u; // a C++ shift by >= 32 is undefined, so handle it explicitly
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 0);
+
+    aLow = _mm_extract_epi32(vALow, 0);
+    countLow = _mm_extract_epi32(vCountLow, 0);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 0);
+
+    aHi = _mm_extract_epi32(vAHi, 1);
+    countHi = _mm_extract_epi32(vCountHi, 1);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 1);
+
+    aLow = _mm_extract_epi32(vALow, 1);
+    countLow = _mm_extract_epi32(vCountLow, 1);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 1);
+
+    aHi = _mm_extract_epi32(vAHi, 2);
+    countHi = _mm_extract_epi32(vCountHi, 2);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 2);
+
+    aLow = _mm_extract_epi32(vALow, 2);
+    countLow = _mm_extract_epi32(vCountLow, 2);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 2);
+
+    aHi = _mm_extract_epi32(vAHi, 3);
+    countHi = _mm_extract_epi32(vCountHi, 3);
+    aHi = (countHi < 32) ? (aHi >> countHi) : 0u;
+    vAHi = _mm_insert_epi32(vAHi, static_cast<int>(aHi), 3);
+
+    aLow = _mm_extract_epi32(vALow, 3);
+    countLow = _mm_extract_epi32(vCountLow, 3);
+    aLow = (countLow < 32) ? (aLow >> countLow) : 0u;
+    vALow = _mm_insert_epi32(vALow, static_cast<int>(aLow), 3);
+
+    __m256i ret = _mm256_set1_epi32(0);
+    ret = _mm256_insertf128_si256(ret, vAHi, 1);
+    ret = _mm256_insertf128_si256(ret, vALow, 0);
+    return ret;
+}
+
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)   // return *(Float*)(&a)   (bit reinterpretation, no value conversion)
+{
+    return _mm256_castpd_ps(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)   // return *(Integer*)(&a)   (bit reinterpretation, no value conversion)
+{
+    return _mm256_castps_si256(a);
+}
+
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)   // return *(Double*)(&a)   (bit reinterpretation, no value conversion)
+{
+    return _mm256_castsi256_pd(a);
+}
+
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)   // return *(Double*)(&a)   (bit reinterpretation, no value conversion)
+{
+    return _mm256_castps_pd(a);
+}
+
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a)   // return *(Integer*)(&a)   (bit reinterpretation, no value conversion)
+{
+    return _mm256_castpd_si256(a);
+}
+
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)   // return *(Float*)(&a)   (bit reinterpretation, no value conversion)
+{
+    return _mm256_castsi256_ps(a);
+}
+
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a) // return (float)a    (int32 --> float, value conversion)
+{
+    return _mm256_cvtepi32_ps(a);
+}
+
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi16, 8);                  // return (int16)a    (uint8 --> int16); 2nd argument = byte offset of the source elements that feed the second 128-bit half of the result
+SIMD_EMU_IWRAPPER_1L(cvtepu8_epi32, 4);                  // return (int32)a    (uint8 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi32, 8);                 // return (int32)a    (uint16 --> int32)
+SIMD_EMU_IWRAPPER_1L(cvtepu16_epi64, 4);                 // return (int64)a    (uint16 --> int64)
+SIMD_EMU_IWRAPPER_1L(cvtepu32_epi64, 8);                 // return (int64)a    (uint32 --> int64)
+
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)            // return (int32)a    (float --> int32, rounded per the current rounding mode; see _mm256_cvtps_epi32)
+{
+    return _mm256_cvtps_epi32(a);
+}
+
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)           // return (int32)a    (rnd_to_zero(float) --> int32)
+{
+    return _mm256_cvttps_epi32(a);
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b) // return a (CmpTypeT) b
+{
+    return _mm256_cmp_ps(a, b, static_cast<const int>(CmpTypeT)); // CmpTypeT is passed as the intrinsic's comparison-predicate immediate
+}
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b) { return cmp_ps<CompareType::LT_OQ>(a, b); }  // a < b
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b) { return cmp_ps<CompareType::GT_OQ>(a, b); }  // a > b
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b) { return cmp_ps<CompareType::NEQ_OQ>(a, b); } // a != b; NOTE(review): going by the enumerator name this is the ordered predicate, so a NaN operand compares false
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b) { return cmp_ps<CompareType::EQ_OQ>(a, b); }  // a == b
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b) { return cmp_ps<CompareType::GE_OQ>(a, b); }  // a >= b
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b) { return cmp_ps<CompareType::LE_OQ>(a, b); }  // a <= b
+
+SIMD_EMU_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_EMU_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_EMU_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_EMU_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)  // return all_sign_bits_zero(a & b) ? 1 : 0 (float; _mm256_testz_ps examines only the sign bit of each 32-bit lane)
+{
+    return  0 != _mm256_testz_ps(a, b);
+}
+
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)  // return all_bits_zero(a & b) ? 1 : 0 (int)
+{
+    return  0 != _mm256_testz_si256(a, b);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float; one ImmT bit per lane)
+SIMD_IFWRAPPER_2I(blend_epi32, _mm256_blend_ps);  // return ImmT ? b : a  (int32; done with the float blend on reinterpreted bits)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), mask)); // float blend on reinterpreted bits
+}
+
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask) // return mask ? b : a (int)
+{
+    return castps_si(blendv_ps(castsi_ps(a), castsi_ps(b), castsi_ps(mask))); // same as above, with the mask also reinterpreted as Float
+}
+
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)  // return *p (all elements in vector get same value)
+{
+    return _mm256_broadcast_ss(p);
+}
+
+SIMD_EMU_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_EMU_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_EMU_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_EMU_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz) // return a[swiz[i]] for each 32-bit lane i (int32)
+{
+    Integer result;
+
+    // Ugly slow implementation: scalar lane-by-lane copy through memory
+    uint32_t const *pA = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    uint32_t *pResult = reinterpret_cast<uint32_t *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0x7 & pSwiz[i]]; // only the low 3 index bits select a lane (8 lanes, as in _mm256_permutevar8x32_epi32); a wider mask would read past the end of a
+    }
+
+    return result;
+}
+
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+{
+    Float result;
+
+    // Ugly slow implementation: scalar lane-by-lane copy through memory
+    float const *pA = reinterpret_cast<float const*>(&a);
+    uint32_t const *pSwiz = reinterpret_cast<uint32_t const*>(&swiz);
+    float *pResult = reinterpret_cast<float *>(&result);
+
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i)
+    {
+        pResult[i] = pA[0x7 & pSwiz[i]]; // only the low 3 index bits select a lane (8 lanes, as in _mm256_permutevar8x32_ps); a wider mask would read past the end of a
+    }
+
+    return result;
+}
+
+SIMD_WRAPPER_2I(permute2f128_ps);   // select 128-bit halves from a and b per ImmT (float)
+SIMD_DWRAPPER_2I(permute2f128_pd);  // select 128-bit halves from a and b per ImmT (double)
+SIMD_IWRAPPER_2I_(permute2f128_si, permute2f128_si256); // select 128-bit halves from a and b per ImmT (int)
+
+
+SIMD_EMU_IWRAPPER_1I(shuffle_epi32); // reorder 32-bit elements per ImmT, independently in each 128-bit half
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b) // shuffle_pd applied to the reinterpreted integer bits
+{
+    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+}
+SIMD_EMU_IWRAPPER_2(shuffle_epi8);   // byte shuffle, emulated per 128-bit half
+SIMD_DWRAPPER_2I(shuffle_pd);
+SIMD_WRAPPER_2I(shuffle_ps);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi16);
+SIMD_IFWRAPPER_2(unpackhi_epi32, _mm256_unpackhi_ps); // done with the float unpack on reinterpreted bits
+SIMD_EMU_IWRAPPER_2(unpackhi_epi64);
+SIMD_EMU_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, _mm256_unpacklo_ps); // done with the float unpack on reinterpreted bits
+SIMD_EMU_IWRAPPER_2(unpacklo_epi64);
+SIMD_EMU_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx) // return *(float*)(((int8*)p) + (idx * ScaleT))
+{
+    uint32_t *pOffsets = (uint32_t*)&idx; // view the index vector as scalar lanes
+    Float vResult;
+    float* pResult = (float*)&vResult;
+    for (uint32_t i = 0; i < SIMD_WIDTH; ++i) // scalar emulation: one load per lane
+    {
+        uint32_t offset = pOffsets[i]; // NOTE(review): indices are treated as unsigned here, while the hardware gather sign-extends them; negative indices would behave differently - confirm callers never pass them
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[i] = *(float const*)(((uint8_t const*)p + offset)); // byte offset from p
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)  // return *p    (broadcast 1 value to all elements)
+{
+    return broadcast_ss(p);
+}
+
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)   // return *p    (loads SIMD width elements from memory; aligned-load intrinsic)
+{
+    return _mm256_load_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)  // return *p    (aligned-load intrinsic)
+{
+    return _mm256_load_si256(&p->v);
+}
+
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)  // return *p    (same as load_ps but allows for unaligned mem)
+{
+    return _mm256_loadu_ps(p);
+}
+
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p) // return *p    (same as load_si but allows for unaligned mem)
+{
+    return _mm256_lddqu_si256(&p->v);
+}
+
+// for each element: (mask & (1 << 31)) ? i32gather_ps<ScaleT>(p, idx) : old   (mask is taken by value and is not modified)
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    uint32_t *pOffsets = (uint32_t*)&idx; // view the index vector as scalar lanes
+    Float vResult = old;                  // lanes that are not gathered keep their value from old
+    float* pResult = (float*)&vResult;
+    DWORD index;
+    uint32_t umask = movemask_ps(mask);   // one bit per lane, taken from each lane's sign bit
+    while (_BitScanForward(&index, umask)) // visit each selected lane, lowest set bit first
+    {
+        umask &= ~(1 << index);           // mark this lane as done
+        uint32_t offset = pOffsets[index];
+        offset = offset * static_cast<uint32_t>(ScaleT);
+        pResult[index] = *(float const *)(((uint8_t const *)p + offset)); // byte offset from p
+    }
+
+    return vResult;
+}
+
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src) // for each lane: if (mask & (1 << 31)) p[lane] = src[lane]
+{
+    _mm256_maskstore_ps(p, mask, src);
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a) // return one bit per byte of a (32 bits)
+{
+    return SIMD128T::movemask_epi8(a.v4[0]) |           // bits 0-15 come from the first 128-bit half
+           (SIMD128T::movemask_epi8(a.v4[1]) << 16);    // bits 16-31 come from the second 128-bit half
+}
+
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a) // return one bit per 64-bit lane (its sign bit)
+{
+    return static_cast<uint32_t>(_mm256_movemask_pd(a));
+}
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a) // return one bit per 32-bit lane (its sign bit)
+{
+    return static_cast<uint32_t>(_mm256_movemask_ps(a));
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i) // return i (all elements are same value)
+{
+    return _mm256_set1_epi32(i);
+}
+
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i) // return i (all elements are same value)
+{
+    return _mm256_set1_epi8(i);
+}
+
+static SIMDINLINE Float SIMDCALL set1_ps(float f)  // return f (all elements are same value)
+{
+    return _mm256_set1_ps(f);
+}
+
+static SIMDINLINE Float SIMDCALL setzero_ps()      // return 0 (float)
+{
+    return _mm256_setzero_ps();
+}
+
+static SIMDINLINE Integer SIMDCALL setzero_si()      // return 0 (integer)
+{
+    return _mm256_setzero_si256();
+}
+
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)    // *p = a   (stores all elements contiguously in memory; aligned-store intrinsic)
+{
+    _mm256_store_ps(p, a);
+}
+
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)   // *p = a   (aligned-store intrinsic)
+{
+    _mm256_store_si256(&p->v, a);
+}
+
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)   // *p = a   (same as store_ps, but doesn't keep memory in cache)
+{
+    _mm256_stream_ps(p, a);
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+static SIMDINLINE Float SIMDCALL broadcast_ps(SIMD128Impl::Float const *p) // return *p copied into both 128-bit halves
+{
+    return _mm256_broadcast_ps(&p->v);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Double SIMDCALL extractf128_pd(Double a) // return the 128-bit half of a selected by ImmT
+{
+    return _mm256_extractf128_pd(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Float  SIMDCALL extractf128_ps(Float a) // return the 128-bit half of a selected by ImmT
+{
+    return _mm256_extractf128_ps(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE SIMD128Impl::Integer SIMDCALL extractf128_si(Integer a) // return the 128-bit half of a selected by ImmT
+{
+    return _mm256_extractf128_si256(a, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Double SIMDCALL insertf128_pd(Double a, SIMD128Impl::Double b) // return a with the 128-bit half selected by ImmT replaced by b
+{
+    return _mm256_insertf128_pd(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL insertf128_ps(Float a, SIMD128Impl::Float b) // return a with the 128-bit half selected by ImmT replaced by b
+{
+    return _mm256_insertf128_ps(a, b, ImmT);
+}
+
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL insertf128_si(Integer a, SIMD128Impl::Integer b) // return a with the 128-bit half selected by ImmT replaced by b
+{
+    return _mm256_insertf128_si256(a, b, ImmT);
+}
+
+#ifndef _mm256_set_m128i
+#define _mm256_set_m128i(/* SIMD128Impl::Integer */ hi, /* SIMD128Impl::Integer */ lo) \
+    _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 0x1)
+#endif
+
+#ifndef _mm256_loadu2_m128i
+#define _mm256_loadu2_m128i(/* SIMD128Impl::Integer const* */ hiaddr, \
+                            /* SIMD128Impl::Integer const* */ loaddr) \
+    _mm256_set_m128i(_mm_loadu_si128(hiaddr), _mm_loadu_si128(loaddr))
+#endif
+
+static SIMDINLINE Integer SIMDCALL loadu2_si(SIMD128Impl::Integer const* phi, SIMD128Impl::Integer const* plo) // return { hi = *phi, lo = *plo } from two independent unaligned 128-bit loads
+{
+    return _mm256_loadu2_m128i(&phi->v, &plo->v);
+}
+
+static SIMDINLINE Integer SIMDCALL set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0) // arguments are forwarded in the same order to _mm256_set_epi32
+{
+    return _mm256_set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE Float SIMDCALL set_ps(float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0) // arguments are forwarded in the same order to _mm256_set_ps
+{
+    return _mm256_set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
+}
+
+static SIMDINLINE void SIMDCALL storeu2_si(SIMD128Impl::Integer *phi, SIMD128Impl::Integer *plo, Integer src) // *phi = high half of src, *plo = low half of src; NOTE(review): unlike _mm256_loadu2_m128i above, no fallback definition of _mm256_storeu2_m128i is provided here - confirm every supported compiler supplies it
+{
+    _mm256_storeu2_m128i(&phi->v, &plo->v, src);
+}
+
+#undef SIMD_WRAPPER_1 // undefine the wrapper-generator macros so they do not leak past this file
+#undef SIMD_WRAPPER_2
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IFWRAPPER_2I
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2_ // NOTE(review): no SIMD_IWRAPPER_2_ is defined by this file's macros above; harmless
+#undef SIMD_IWRAPPER_3
+#undef SIMD_EMU_IWRAPPER_1 // NOTE(review): SIMD_EMU_IWRAPPER_1L is defined above but has no matching #undef in this list
+#undef SIMD_EMU_IWRAPPER_1I
+#undef SIMD_EMU_IWRAPPER_2
+#undef SIMD_EMU_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
new file mode 100644
index 00000000000..0a812039300
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx2.inl
@@ -0,0 +1,234 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX2_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (2) implementation
+//
+// Since this implementation inherits from the AVX (1) implementation,
+// the only operations below are ones that replace AVX (1) operations.
+// Mostly these are integer operations that are no longer emulated with SSE
+//============================================================================
+
+// Generates: static Integer op(Integer a), forwarding to _mm256_<op>(a).
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a);\
+    }
+
+// Generates: static Integer op(Integer a), forwarding only the low 128 bits of
+// 'a' to _mm256_<op>().  Used for the widening conversions below, whose source
+// operand is a 128-bit vector.
+// (The closing brace deliberately has no trailing line continuation, so the
+// macro ends here regardless of what follows it.)
+#define SIMD_IWRAPPER_1L(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(_mm256_castsi256_si128(a));\
+    }
+
+// Generates: template<int ImmT> static Integer op(Integer a),
+// forwarding to _mm256_<op>(a, ImmT) with ImmT as the immediate operand.
+#define SIMD_IWRAPPER_1I(op)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##op(a, ImmT);\
+    }
+
+// Same as SIMD_IWRAPPER_1I, but the wrapper name (op) and the intrinsic
+// suffix (intrin) are given separately.
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm256_##intrin(a, ImmT);\
+    }
+
+// Generates: static Integer op(Integer a, Integer b), forwarding to
+// _mm256_<intrin>(a, b); wrapper name and intrinsic suffix are given separately.
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##intrin(a, b);\
+    }
+
+// Generates: static Integer op(Integer a, Integer b), forwarding to _mm256_<op>(a, b).
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b);\
+    }
+
+// Generates: template<int ImmT> static Integer op(Integer a, Integer b),
+// forwarding to _mm256_<op>(a, b, ImmT) with ImmT as the immediate operand.
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm256_##op(a, b, ImmT);\
+    }
+
+//-----------------------------------------------------------------------
+// Floating point arithmetic operations
+//-----------------------------------------------------------------------
+// Fused multiply-add on packed floats: returns (a * b) + c.
+static SIMDINLINE Float SIMDCALL fmadd_ps(Float a, Float b, Float c)
+{
+    const Float fused = _mm256_fmadd_ps(a, b, c);
+    return fused;
+}
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+
+// return (int64)a.lo32 * (int64)b.lo32 for each 64-bit element
+//
+// Widening multiply: only the low signed 32 bits of every 64-bit element of a and b
+// are used, and each product is stored as a full 64-bit result (see _mm256_mul_epi32).
+SIMD_IWRAPPER_2(mul_epi32);
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2_(and_si,    and_si256);     // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si256);  // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_si256);      // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_si256);     // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (uint32)
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32)
+
+// return a >> (ImmT*8) (uint), i.e. a byte-granular right shift.
+// Note: _mm256_srli_si256 shifts each 128-bit lane independently; bytes do not
+// move from the upper lane into the lower lane.
+SIMD_IWRAPPER_1I_(srli_si, srli_si256);
+
+// Same byte shift as srli_si, applied to a Float vector: the bits are
+// reinterpreted as integers, shifted, and reinterpreted back as floats.
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    const Integer rawBits     = castps_si(a);
+    const Integer shiftedBits = srli_si<ImmT>(rawBits);
+    return castsi_ps(shiftedBits);
+}
+
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+// These wrappers read only the low 128 bits of the argument (see SIMD_IWRAPPER_1L)
+// and zero-extend each source element to the wider destination element.
+SIMD_IWRAPPER_1L(cvtepu8_epi16);    // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1L(cvtepu8_epi32);    // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi32);   // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1L(cvtepu16_epi64);   // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1L(cvtepu32_epi64);   // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+// Each result element is a mask: all bits set where the comparison holds,
+// all bits clear where it does not (see the _mm256_cmp* intrinsics).
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+
+// return a < b (int32)
+// Expressed through the greater-than compare with the operands swapped.
+static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)
+{
+    const Integer isLess = cmpgt_epi32(b, a);
+    return isLess;
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+// permute_epi32(a, swiz): return a[swiz[i]] for each 32-bit lane i (int32).
+// The second argument is the index vector passed to _mm256_permutevar8x32_epi32.
+SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+// Cross-lane permute of floats: return a[swiz[i]] for each 32-bit lane i.
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
+{
+    const Float permuted = _mm256_permutevar8x32_ps(a, swiz);
+    return permuted;
+}
+
+SIMD_IWRAPPER_1I(shuffle_epi32);
+
+// 64-bit element shuffle of two integer vectors, implemented by reinterpreting
+// both operands as doubles and reusing shuffle_pd<ImmT>.
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    const auto lhs      = castsi_pd(a);
+    const auto rhs      = castsi_pd(b);
+    const auto shuffled = shuffle_pd<ImmT>(lhs, rhs);
+    return castpd_si(shuffled);
+}
+// Byte shuffle and interleave (unpack) wrappers.  See the documentation of the
+// corresponding _mm256_* intrinsics; they operate within each 128-bit lane.
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+// Gather eight floats: return *(float*)(((int8*)p) + (idx * ScaleT)) for each lane.
+// ScaleT is passed to the intrinsic as its immediate byte-scale operand.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx)
+{
+    const Float gathered = _mm256_i32gather_ps(p, idx, static_cast<int>(ScaleT));
+    return gathered;
+}
+
+// Masked gather of eight floats.
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+//   old  - values kept for lanes whose mask sign bit is clear
+//   p    - base address of the gather
+//   idx  - per-lane 32-bit indices, scaled by ScaleT bytes
+//   mask - per-lane selector; only the sign bit of each element is tested
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    // g++ in debug mode needs the explicit .v suffix instead of relying on operator __m256()
+    // Only for this intrinsic - not sure why. :(
+    return _mm256_mask_i32gather_ps(old.v, p, idx.v, mask.v, static_cast<int>(ScaleT));
+}
+
+// Collect the most significant bit of each of the 32 bytes of a into a 32-bit mask.
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    const int signBits = _mm256_movemask_epi8(a);
+    return static_cast<uint32_t>(signBits);
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+// Remove the wrapper-generator macros defined at the top of this file
+// (one #undef per macro).
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1L
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
new file mode 100644
index 00000000000..76afbd01c05
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_256_avx512.inl
@@ -0,0 +1,409 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD256 AVX (512) implementation
+//
+// Since this implementation inherits from the AVX (2) implementation,
+// the only operations below are ones that replace AVX (2) operations.
+// These use native AVX512 instructions with masking to enable a larger
+// register set.
+//============================================================================
+
+private:
+    // Register-width conversion helpers used by the wrappers below.
+    //
+    // 256 -> 512: reinterpret a 256-bit value as the low half of a 512-bit
+    // register (per the Intel documentation of the cast intrinsics, the upper
+    // 256 bits are undefined).
+    static SIMDINLINE __m512 __conv(Float r)
+    {
+        return _mm512_castps256_ps512(r.v);
+    }
+    static SIMDINLINE __m512d __conv(Double r)
+    {
+        return _mm512_castpd256_pd512(r.v);
+    }
+    static SIMDINLINE __m512i __conv(Integer r)
+    {
+        return _mm512_castsi256_si512(r.v);
+    }
+
+    // 512 -> 256: keep only the low 256 bits of a 512-bit result.
+    static SIMDINLINE Float __conv(__m512 r)
+    {
+        return _mm512_castps512_ps256(r);
+    }
+    static SIMDINLINE Double __conv(__m512d r)
+    {
+        return _mm512_castpd512_pd256(r);
+    }
+    static SIMDINLINE Integer __conv(__m512i r)
+    {
+        return _mm512_castsi512_si256(r);
+    }
+public:
+
+// Float wrapper generators.
+//
+// Each generated function widens its 256-bit arguments with __conv(), calls the
+// zero-masking 512-bit intrinsic _mm512_maskz_<intrin>, and narrows the result
+// back to 256 bits.  The default mask __mmask16(0xff) enables the low eight
+// 32-bit lanes, i.e. exactly the 256 bits that carry data.
+//
+//   *_1  : op(a)          *_1I : op<ImmT>(a)
+//   *_2  : op(a, b)       *_2I : op<ImmT>(a, b)
+//   *_3  : op(a, b, c)
+// Variants with a trailing underscore take the intrinsic suffix and the mask explicitly.
+#define SIMD_WRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_WRAPPER_1(op)  SIMD_WRAPPER_1_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_WRAPPER_1I(op)  SIMD_WRAPPER_1I_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_WRAPPER_2(op)  SIMD_WRAPPER_2_(op, op, __mmask16(0xff))
+
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+    }
+
+#define SIMD_WRAPPER_3_(op, intrin, mask)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b), __conv(c)));\
+    }
+#define SIMD_WRAPPER_3(op)  SIMD_WRAPPER_3_(op, op, __mmask16(0xff))
+
+// Double wrapper generators.
+//
+// Same scheme as the Float generators, but for Double arguments.  The default
+// mask __mmask8(0xf) enables the low four 64-bit lanes (256 bits).  The
+// convenience forms without a trailing underscore are only defined when
+// AVX512F_STRICT is not defined.
+#define SIMD_DWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1(op)  SIMD_DWRAPPER_1_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Double SIMDCALL op(Double a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_1I(op)  SIMD_DWRAPPER_1I_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#if !defined(AVX512F_STRICT)
+#define SIMD_DWRAPPER_2(op)  SIMD_DWRAPPER_2_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_DWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xf, __conv(a), __conv(b), ImmT));\
+    }
+
+// Integer wrapper generators.
+//
+// Same scheme as the Float generators, but for Integer arguments.  The numeric
+// suffix of the convenience forms is the element width in bits, and each one
+// supplies a mask that enables exactly the low 256 bits for that width:
+//   _8  : __mmask64(0xffffffffull)  (32 x  8-bit)
+//   _16 : __mmask32(0xffff)         (16 x 16-bit)
+//   _32 : __mmask16(0xff)           ( 8 x 32-bit)
+//   _64 : __mmask8(0xf)             ( 4 x 64-bit)
+// Only the _32 forms are defined when AVX512F_STRICT is defined.
+#define SIMD_IWRAPPER_1_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a)));\
+    }
+#define SIMD_IWRAPPER_1_32(op)  SIMD_IWRAPPER_1_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1_8(op)   SIMD_IWRAPPER_1_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1_16(op)  SIMD_IWRAPPER_1_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1_64(op)  SIMD_IWRAPPER_1_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_1I_(op, intrin, mask)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), ImmT));\
+    }
+#define SIMD_IWRAPPER_1I_32(op)  SIMD_IWRAPPER_1I_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_1I_8(op)   SIMD_IWRAPPER_1I_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_1I_16(op)  SIMD_IWRAPPER_1I_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_1I_64(op)  SIMD_IWRAPPER_1I_(op, op, __mmask8(0xf))
+#endif
+
+#define SIMD_IWRAPPER_2_(op, intrin, mask)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##intrin((mask), __conv(a), __conv(b)));\
+    }
+#define SIMD_IWRAPPER_2_32(op)  SIMD_IWRAPPER_2_(op, op, __mmask16(0xff))
+#if !defined(AVX512F_STRICT)
+#define SIMD_IWRAPPER_2_8(op)   SIMD_IWRAPPER_2_(op, op, __mmask64(0xffffffffull))
+#define SIMD_IWRAPPER_2_16(op)  SIMD_IWRAPPER_2_(op, op, __mmask32(0xffff))
+#define SIMD_IWRAPPER_2_64(op)  SIMD_IWRAPPER_2_(op, op, __mmask8(0xf))
+#endif
+
+// Two-operand form with an immediate; always uses the literal mask 0xff.
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return __conv(_mm512_maskz_##op(0xff, __conv(a), __conv(b), ImmT));\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+// rcp_ps / rsqrt_ps are left disabled here, so no override is generated for them.
+//SIMD_WRAPPER_1_(rcp_ps, rcp28_ps, __mmask16(0xff));     // return 1.0f / a
+//SIMD_WRAPPER_1_(rsqrt_ps, rsqrt28_ps, __mmask16(0xff));   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1_32(abs_epi32);  // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2_32(add_epi32);  // return a + b (int32)
+SIMD_IWRAPPER_2_32(max_epi32);  // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(max_epu32);  // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2_32(min_epi32);  // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2_32(min_epu32);  // return (a < b) ? a : b (uint32)
+
+// return (int64)a.lo32 * (int64)b.lo32 for each 64-bit element
+//
+// Widening multiply: only the low signed 32 bits of every 64-bit element of a and b
+// are used, and each product is stored as a full 64-bit result (see _mm512_mul_epi32).
+SIMD_IWRAPPER_2_32(mul_epi32);
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(add_epi8);    // return a + b (int8)
+SIMD_IWRAPPER_2_8(adds_epu8);   // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+
+#endif
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2_32(mullo_epi32);
+SIMD_IWRAPPER_2_32(sub_epi32);  // return a - b (int32)
+
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_64(sub_epi64);  // return a - b (int64)
+SIMD_IWRAPPER_2_8(subs_epu8);   // return (b > a) ? 0 : (a - b) (uint8)
+
+#endif
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+// The bitwise operations are generated from the 32-bit-element masked
+// intrinsics; with the low eight lanes enabled they cover all 256 bits.
+SIMD_IWRAPPER_2_(and_si,    and_epi32, __mmask16(0xff));    // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_epi32, __mmask16(0xff)); // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si,     or_epi32, __mmask16(0xff));     // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si,    xor_epi32, __mmask16(0xff));    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I_32(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2_32(sllv_epi32);                // return a << b      (uint32)
+SIMD_IWRAPPER_1I_32(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I_32(srli_epi32);               // return a >> ImmT   (uint32)
+SIMD_IWRAPPER_2_32(srlv_epi32);                // return a >> b      (uint32)
+
+// use AVX2 version
+//SIMD_IWRAPPER_1I_(srli_si, srli_si256);     // return a >> (ImmT*8) (uint)
+
+//-----------------------------------------------------------------------
+// Conversion operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+// SIMD_IWRAPPER_1L(cvtepu8_epi16, 0xffff);    // return (int16)a    (uint8 --> int16)
+// SIMD_IWRAPPER_1L(cvtepu8_epi32, 0xff);      // return (int32)a    (uint8 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi32, 0xff);     // return (int32)a    (uint16 --> int32)
+// SIMD_IWRAPPER_1L(cvtepu16_epi64, 0xf);      // return (int64)a    (uint16 --> int64)
+// SIMD_IWRAPPER_1L(cvtepu32_epi64, 0xf);      // return (int64)a    (uint32 --> int64)
+
+//-----------------------------------------------------------------------
+// Comparison operations (Use AVX2 versions)
+//-----------------------------------------------------------------------
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi8);    // return a == b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi16);   // return a == b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi32);   // return a == b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpeq_epi64);   // return a == b (int64)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi8);    // return a > b (int8)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi16);   // return a > b (int16)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi32);   // return a > b (int32)
+//SIMD_IWRAPPER_2_CMP(cmpgt_epi64);   // return a > b (int64)
+//
+//static SIMDINLINE Integer SIMDCALL cmplt_epi32(Integer a, Integer b)   // return a < b (int32)
+//{
+//    return cmpgt_epi32(b, a);
+//}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+// For the pack wrappers the mask width follows the *result* element size
+// (int16 --> int8 uses the 8-bit-element mask, int32 --> int16 the 16-bit one).
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_8(packs_epi16);     // int16 --> int8    See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2_16(packs_epi32);    // int32 --> int16   See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2_8(packus_epi16);    // uint16 --> uint8  See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2_16(packus_epi32);   // uint32 --> uint16 See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+#endif
+
+// The permute wrappers below are left disabled, so no override is generated for them here.
+// SIMD_IWRAPPER_2_(permute_epi32, permutevar8x32_epi32);
+
+//static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)    // return a[swiz[i]] for each 32-bit lane i (float)
+//{
+//    return _mm256_permutevar8x32_ps(a, swiz);
+//}
+
+SIMD_IWRAPPER_1I_32(shuffle_epi32);
+//template<int ImmT>
+//static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+//{
+//    return castpd_si(shuffle_pd<ImmT>(castsi_pd(a), castsi_pd(b)));
+//}
+//SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_IWRAPPER_2_32(unpackhi_epi32);
+SIMD_IWRAPPER_2_32(unpacklo_epi32);
+
+// Non-32-bit element widths are only generated when AVX512F_STRICT is not defined.
+#if !defined(AVX512F_STRICT)
+
+SIMD_IWRAPPER_2_16(unpackhi_epi16);
+SIMD_IWRAPPER_2_64(unpackhi_epi64);
+SIMD_IWRAPPER_2_8(unpackhi_epi8);
+SIMD_IWRAPPER_2_16(unpacklo_epi16);
+SIMD_IWRAPPER_2_64(unpacklo_epi64);
+SIMD_IWRAPPER_2_8(unpacklo_epi8);
+
+#endif
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+// return *p    (loads SIMD width elements from memory)
+//
+// Implemented with the *unaligned* masked 512-bit load.  The aligned 512-bit form
+// requires a 64-byte aligned address even when only the low eight lanes are
+// enabled, whereas callers of this 256-bit interface only guarantee the 32-byte
+// alignment of a 256-bit aligned load; using it could fault on valid input.
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)
+{
+    return __conv(_mm512_maskz_loadu_ps(__mmask16(0xff), p));
+}
+
+// return *p
+//
+// Implemented with the *unaligned* masked 512-bit load.  The aligned 512-bit form
+// requires a 64-byte aligned address even when only the low eight lanes are
+// enabled, whereas a 256-bit Integer is only guaranteed 32-byte alignment by
+// callers of this interface; using it could fault on valid input.
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)
+{
+    return __conv(_mm512_maskz_loadu_epi32(__mmask16(0xff), p));
+}
+
+// return *p    (same as load_ps but allows for unaligned mem)
+// Only the low eight 32-bit lanes are enabled in the masked 512-bit load.
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)
+{
+    const __mmask16 lowLanes = __mmask16(0xff);
+    const __m512    wide     = _mm512_maskz_loadu_ps(lowLanes, p);
+    return __conv(wide);
+}
+
+// return *p    (same as load_si but allows for unaligned mem)
+// Only the low eight 32-bit lanes are enabled in the masked 512-bit load.
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p)
+{
+    const __mmask16 lowLanes = __mmask16(0xff);
+    const __m512i   wide     = _mm512_maskz_loadu_epi32(lowLanes, p);
+    return __conv(wide);
+}
+
+// Gather eight floats: return *(float*)(((int8*)p) + (idx * ScaleT)) for each lane.
+// Uses the masked 512-bit gather with only the low eight lanes enabled and a
+// zero vector as the pass-through source.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx)
+{
+    const __m512  passThrough = _mm512_setzero_ps();
+    const __m512i wideIdx     = __conv(idx);
+    const __m512  gathered    = _mm512_mask_i32gather_ps(passThrough, __mmask16(0xff), wideIdx, p, static_cast<int>(ScaleT));
+    return __conv(gathered);
+}
+
+// Masked gather of eight floats.
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+//   old  - values kept for lanes whose mask sign bit is clear
+//   p    - base address of the gather
+//   idx  - per-lane 32-bit indices, scaled by ScaleT bytes
+//   mask - per-lane selector; only the sign bit (bit 31) of each element is tested
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    // Convert the vector mask into a k-mask: start with the low eight lanes enabled
+    // and keep only those whose sign bit is set.  The test constant must be
+    // 0x80000000 (bit 31), matching maskstore_ps and the contract above.
+    __mmask16 m = 0xff;
+    m = _mm512_mask_test_epi32_mask(m, _mm512_castps_si512(__conv(mask)),
+                                _mm512_set1_epi32(0x80000000));
+    return __conv(_mm512_mask_i32gather_ps(
+                    __conv(old),
+                    m,
+                    __conv(idx),
+                    p,
+                    static_cast<int>(ScaleT)));
+}
+
+#if !defined(AVX512F_STRICT)
+
+// Collect the most significant bit of each of the low 32 bytes of a into a 32-bit mask.
+// Only the low 32 byte lanes are enabled in the 512-bit test.
+static SIMDINLINE uint32_t SIMDCALL movemask_epi8(Integer a)
+{
+    const __mmask64 lowBytes = 0xffffffffull;
+    const __m512i   signBit  = _mm512_set1_epi8(0x80);
+    const __mmask64 hits     = _mm512_mask_test_epi8_mask(lowBytes, __conv(a), signBit);
+    return static_cast<uint32_t>(hits);
+}
+
+#endif
+
+// Store the lanes of src whose mask element has its sign bit (bit 31) set; other
+// lanes of *p are left untouched.
+//
+// Uses the *unaligned* masked 512-bit store: the aligned form requires p to be
+// 64-byte aligned, which the 256-bit mask-store interface does not require, so it
+// could fault on valid input.
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    // Start with the low eight lanes enabled and keep those whose sign bit is set.
+    __mmask16 m = 0xff;
+    m = _mm512_mask_test_epi32_mask(m, __conv(mask), _mm512_set1_epi32(0x80000000));
+    _mm512_mask_storeu_ps(p, m, __conv(src));
+}
+
+// *p = a   (stores all elements contiguously in memory)
+//
+// Uses the *unaligned* masked 512-bit store with the low eight lanes enabled.
+// The aligned 512-bit form requires a 64-byte aligned address, but callers of
+// this 256-bit interface only guarantee 32-byte alignment.
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)
+{
+    _mm512_mask_storeu_ps(p, __mmask16(0xff), __conv(a));
+}
+
+// *p = a
+//
+// Uses the *unaligned* masked 512-bit store with the low eight lanes enabled.
+// The aligned 512-bit form requires a 64-byte aligned address, but callers of
+// this 256-bit interface only guarantee 32-byte alignment.
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)
+{
+    _mm512_mask_storeu_epi32(p, __mmask16(0xff), __conv(a));
+}
+
+//=======================================================================
+// Legacy interface (available only in SIMD256 width)
+//=======================================================================
+
+// Remove every wrapper-generator macro defined at the top of this file.
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_1I_
+#undef SIMD_WRAPPER_1I
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_DWRAPPER_1_
+#undef SIMD_DWRAPPER_1
+#undef SIMD_DWRAPPER_1I_
+#undef SIMD_DWRAPPER_1I
+#undef SIMD_DWRAPPER_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_IWRAPPER_1_
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_16
+#undef SIMD_IWRAPPER_1_32
+#undef SIMD_IWRAPPER_1_64
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I_8
+#undef SIMD_IWRAPPER_1I_16
+#undef SIMD_IWRAPPER_1I_32
+#undef SIMD_IWRAPPER_1I_64
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2_8
+#undef SIMD_IWRAPPER_2_16
+#undef SIMD_IWRAPPER_2_32
+#undef SIMD_IWRAPPER_2_64
+#undef SIMD_IWRAPPER_2I
+// No width-suffixed SIMD_IWRAPPER_2I_* macros are defined in this file; the
+// lines below are kept disabled.
+//#undef SIMD_IWRAPPER_2I_8
+//#undef SIMD_IWRAPPER_2I_16
+//#undef SIMD_IWRAPPER_2I_32
+//#undef SIMD_IWRAPPER_2I_64
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
new file mode 100644
index 00000000000..226952e282e
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512.inl
@@ -0,0 +1,682 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX512 (F) implementation
+//
+//  TODO: Optimize for KNL / KNH or for SKX??
+//      For now probably optimizing more for KNL as that's where
+//      immediate customers are.
+//============================================================================
+
+// Width targeted by this implementation; presumably counted in 32-bit lanes
+// (16 x 32 bits = 512 bits) - TODO confirm against the users of this constant.
+static const int TARGET_SIMD_WIDTH = 16;
+// Shorthand for the 256-bit AVX2 implementation type.
+using SIMD256T = SIMD256Impl::AVX2Impl;
+
+// Generates `Float op(Float a)` forwarding to `intrin`.  `intrin` must be the
+// complete intrinsic name; this macro adds no prefix.
+#define SIMD_WRAPPER_1_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return intrin(a);\
+    }
+
+// One-operand Float wrapper whose intrinsic is named _mm512_<op>.
+#define SIMD_WRAPPER_1(op)  \
+    SIMD_WRAPPER_1_(op, _mm512_##op)
+
+// Generates `Float op(Float a, Float b)` forwarding to _mm512_<intrin>.
+// Unlike SIMD_WRAPPER_1_, the _mm512_ prefix is added by this macro.
+#define SIMD_WRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+// Two-operand Float wrapper whose name equals the intrinsic suffix.
+#define SIMD_WRAPPER_2(op) SIMD_WRAPPER_2_(op, op)
+
+// Generates `Float op(Float a, Float b)` that applies the integer intrinsic
+// _mm512_<intrin> to the operands' bit patterns (cast to integer and back).
+#define SIMD_WRAPPERI_2_(op, intrin)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_castsi512_ps(_mm512_##intrin(\
+            _mm512_castps_si512(a), _mm512_castps_si512(b)));\
+    }
+
+// Generates `Double op(Double a, Double b)` forwarding to _mm512_<op>.
+#define SIMD_DWRAPPER_2(op)  \
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##op(a, b);\
+    }
+
+// Generates `template<int ImmT> Float op(Float a, Float b)` forwarding to
+// _mm512_<intrin> with ImmT as the trailing immediate operand.
+#define SIMD_WRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+// Immediate Float wrapper whose name equals the intrinsic suffix.
+#define SIMD_WRAPPER_2I(op)  SIMD_WRAPPER_2I_(op, op)
+
+// Generates `template<int ImmT> Double op(Double a, Double b)` forwarding to
+// _mm512_<intrin> with ImmT as the trailing immediate operand.
+#define SIMD_DWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Double SIMDCALL op(Double a, Double b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+// Immediate Double wrapper whose name equals the intrinsic suffix.
+#define SIMD_DWRAPPER_2I(op)  SIMD_DWRAPPER_2I_(op, op)
+
+// Generates `Float op(Float a, Float b, Float c)` forwarding to _mm512_<op>.
+#define SIMD_WRAPPER_3(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+    {\
+        return _mm512_##op(a, b, c);\
+    }
+
+// Generates `Integer op(Integer a)` forwarding to _mm512_<op>.
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+// Same as SIMD_IWRAPPER_1, but the operand is a 256-bit integer vector
+// (SIMD256Impl::Integer) while the result is the full-width Integer.
+#define SIMD_IWRAPPER_1_8(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD256Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+// Same as SIMD_IWRAPPER_1, but the operand is a 128-bit integer vector
+// (SIMD128Impl::Integer) while the result is the full-width Integer.
+#define SIMD_IWRAPPER_1_4(op)  \
+    static SIMDINLINE Integer SIMDCALL op(SIMD128Impl::Integer a)   \
+    {\
+        return _mm512_##op(a);\
+    }
+
+// Generates `template<int ImmT> Integer op(Integer a)` forwarding to `intrin`
+// with ImmT as the immediate operand.  `intrin` must be the complete intrinsic
+// name; this macro adds no prefix.
+#define SIMD_IWRAPPER_1I_(op, intrin)  \
+    template<int ImmT> \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return intrin(a, ImmT);\
+    }
+// Immediate Integer wrapper whose intrinsic is named _mm512_<op>.
+#define SIMD_IWRAPPER_1I(op) SIMD_IWRAPPER_1I_(op, _mm512_##op)
+
+// Generates `Integer op(Integer a, Integer b)` forwarding to _mm512_<intrin>
+// (the _mm512_ prefix is added by this macro).
+#define SIMD_IWRAPPER_2_(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b);\
+    }
+// Two-operand Integer wrapper whose name equals the intrinsic suffix.
+#define SIMD_IWRAPPER_2(op)  SIMD_IWRAPPER_2_(op, op)
+
+// Generates `Integer op(Integer a, Integer b)` forwarding to the callable
+// `cmp`, used verbatim (e.g. an instantiated comparison template).
+#define SIMD_IWRAPPER_2_CMP(op, cmp)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return cmp(a, b);\
+    }
+
+// Generates `Integer op(Integer a, Integer b)` that applies the float
+// intrinsic _mm512_<intrin> to the operands' bit patterns (cast to float and
+// back).
+#define SIMD_IFWRAPPER_2(op, intrin)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return castps_si(_mm512_##intrin(castsi_ps(a), castsi_ps(b)) );\
+    }
+
+// Generates `template<int ImmT> Integer op(Integer a, Integer b)` forwarding
+// to _mm512_<intrin> with ImmT as the trailing immediate operand.
+#define SIMD_IWRAPPER_2I_(op, intrin)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return _mm512_##intrin(a, b, ImmT);\
+    }
+// Immediate Integer wrapper whose name equals the intrinsic suffix.
+#define SIMD_IWRAPPER_2I(op) SIMD_IWRAPPER_2I_(op, op)
+
+private:
+    // Expand a k-register mask into a legacy vector mask: each element whose
+    // mask bit is set becomes all-ones, every other element becomes zero.
+    // The mask type picks the overload and therefore the element width.
+    // The 16-bit and 8-bit element forms use AVX512BW intrinsics.
+
+    // 8 mask bits -> eight 64-bit elements
+    static SIMDINLINE Integer vmask(__mmask8 m)
+    {
+        const Integer expanded = _mm512_maskz_set1_epi64(m, -1LL);
+        return expanded;
+    }
+    // 16 mask bits -> sixteen 32-bit elements
+    static SIMDINLINE Integer vmask(__mmask16 m)
+    {
+        const Integer expanded = _mm512_maskz_set1_epi32(m, -1);
+        return expanded;
+    }
+    // 32 mask bits -> thirty-two 16-bit elements
+    static SIMDINLINE Integer vmask(__mmask32 m)
+    {
+        const Integer expanded = _mm512_maskz_set1_epi16(m, -1);
+        return expanded;
+    }
+    // 64 mask bits -> sixty-four 8-bit elements
+    static SIMDINLINE Integer vmask(__mmask64 m)
+    {
+        const Integer expanded = _mm512_maskz_set1_epi8(m, -1);
+        return expanded;
+    }
+
+public:
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+// NOTE: rcp28/rsqrt28 are approximations (relative error below 2^-28) and
+// belong to the AVX512ER extension, not to baseline AVX512F.
+SIMD_WRAPPER_1_(rcp_ps, _mm512_rcp28_ps);       // return ~(1.0f / a)
+SIMD_WRAPPER_1_(rsqrt_ps, _mm512_rsqrt28_ps);   // return ~(1.0f / sqrt(a))
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+// Rounds every lane of a with the rounding mode chosen at compile time; RMT
+// is handed to the roundscale intrinsic as its immediate operand.
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    const Float rounded = _mm512_roundscale_ps(a, static_cast<int>(RMT));
+    return rounded;
+}
+
+// round_ps specialised to RoundMode::CEIL_NOEXC.
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+
+// round_ps specialised to RoundMode::FLOOR_NOEXC.
+static SIMDINLINE Float SIMDCALL floor_ps(Float a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+// mul_epi32 is a widening multiply: it multiplies the low signed 32-bit
+// integer of each 64-bit element and returns the full 64-bit products.
+SIMD_IWRAPPER_2(mul_epi32); // return (int64)lo32(a) * (int64)lo32(b) (per 64-bit element)
+
+// mullo_epi32: return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate
+// 64-bit integers, and store the low 32 bits of the intermediate integers
+// in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+// Bitwise operations over the whole 512-bit register.
+SIMD_IWRAPPER_2_(and_si, and_si512);        // return a & b       (int)
+SIMD_IWRAPPER_2_(andnot_si, andnot_si512);  // return (~a) & b    (int)
+SIMD_IWRAPPER_2_(or_si, or_si512);          // return a | b       (int)
+SIMD_IWRAPPER_2_(xor_si, xor_si512);        // return a ^ b       (int)
+
+// The floating-point forms _mm512_and_ps / andnot_ps / or_ps / xor_ps are
+// AVX512DQ intrinsics.  With AVX512F_STRICT defined, the same bitwise results
+// are produced through the AVX512F integer intrinsics instead.
+#if defined(AVX512F_STRICT)
+
+SIMD_WRAPPERI_2_(and_ps, and_epi32);          // return a & b       (float treated as int)
+SIMD_WRAPPERI_2_(andnot_ps, andnot_epi32);    // return (~a) & b    (float treated as int)
+SIMD_WRAPPERI_2_(or_ps, or_epi32);            // return a | b       (float treated as int)
+SIMD_WRAPPERI_2_(xor_ps, xor_epi32);          // return a ^ b       (float treated as int)
+
+#else
+
+SIMD_WRAPPER_2(and_ps);                     // return a & b       (float treated as int)
+SIMD_WRAPPER_2(andnot_ps);                  // return (~a) & b    (float treated as int)
+SIMD_WRAPPER_2(or_ps);                      // return a | b       (float treated as int)
+SIMD_WRAPPER_2(xor_ps);                     // return a ^ b       (float treated as int)
+
+#endif
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+SIMD_IWRAPPER_1I(slli_epi32);               // return a << ImmT
+SIMD_IWRAPPER_2(sllv_epi32);                // return a << b      (per-lane shift counts)
+SIMD_IWRAPPER_1I(srai_epi32);               // return a >> ImmT   (int32)
+SIMD_IWRAPPER_1I(srli_epi32);               // return a >> ImmT   (uint32)
+// SIMD_IWRAPPER_1I_ uses its second argument verbatim as the intrinsic name,
+// so the complete name must be given.  The 512-bit counterpart of the
+// srli_si128/256 byte shift is _mm512_bsrli_epi128 (AVX512BW): each 128-bit
+// lane is shifted right by ImmT bytes with zeros shifted in.
+SIMD_IWRAPPER_1I_(srli_si, _mm512_bsrli_epi128);    // return a >> (ImmT*8) per 128-bit lane (uint)
+
+// Float flavour of srli_si: the shift is applied to the raw bits of a and the
+// result is reinterpreted as Float again.
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    const Integer rawBits = castps_si(a);
+    const Integer shifted = srli_si<ImmT>(rawBits);
+    return castsi_ps(shifted);
+}
+
+SIMD_IWRAPPER_2(srlv_epi32);                // return a >> b      (uint32, per-lane shift counts)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+// Reinterprets the bits of a Double vector as Float: *(Float*)(&a).
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)
+{
+    const Float reinterpreted = _mm512_castpd_ps(a);
+    return reinterpreted;
+}
+
+// Reinterprets the bits of a Float vector as Integer: *(Integer*)(&a).
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)
+{
+    const Integer reinterpreted = _mm512_castps_si512(a);
+    return reinterpreted;
+}
+
+// Reinterprets the bits of an Integer vector as Double: *(Double*)(&a).
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)
+{
+    const Double reinterpreted = _mm512_castsi512_pd(a);
+    return reinterpreted;
+}
+
+// Reinterprets the bits of a Float vector as Double: *(Double*)(&a).
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)
+{
+    const Double reinterpreted = _mm512_castps_pd(a);
+    return reinterpreted;
+}
+
+// Reinterprets the bits of a Double vector as Integer: *(Integer*)(&a).
+static SIMDINLINE Integer SIMDCALL castpd_si(Double a)
+{
+    const Integer reinterpreted = _mm512_castpd_si512(a);
+    return reinterpreted;
+}
+
+// Reinterprets the bits of an Integer vector as Float: *(Float*)(&a).
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)
+{
+    const Float reinterpreted = _mm512_castsi512_ps(a);
+    return reinterpreted;
+}
+
+// Converts each int32 lane to float: return (float)a.
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)
+{
+    const Float asFloat = _mm512_cvtepi32_ps(a);
+    return asFloat;
+}
+
+// Widening conversions of unsigned elements: the source is a narrower vector
+// (256-bit for the _1_8 wrappers, 128-bit for the _1_4 wrappers) and the
+// result fills the full-width Integer.
+SIMD_IWRAPPER_1_8(cvtepu8_epi16);     // return (int16)a    (uint8 --> int16)
+SIMD_IWRAPPER_1_4(cvtepu8_epi32);     // return (int32)a    (uint8 --> int32)
+SIMD_IWRAPPER_1_8(cvtepu16_epi32);    // return (int32)a    (uint16 --> int32)
+SIMD_IWRAPPER_1_4(cvtepu16_epi64);    // return (int64)a    (uint16 --> int64)
+SIMD_IWRAPPER_1_8(cvtepu32_epi64);    // return (int64)a    (uint32 --> int64)
+
+// Converts each float lane to int32: return (int32)a.
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)
+{
+    const Integer asInt = _mm512_cvtps_epi32(a);
+    return asInt;
+}
+
+// Converts each float lane to int32 with truncation:
+// return (int32)rnd_to_zero(a).
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)
+{
+    const Integer truncated = _mm512_cvttps_epi32(a);
+    return truncated;
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+// Compares a and b lane by lane with predicate CmpTypeT and returns the
+// result as a bit mask (one bit per lane).
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+    const Mask laneBits = _mm512_cmp_ps_mask(a, b, static_cast<const int>(CmpTypeT));
+    return laneBits;
+}
+
+// return a (CmpTypeT) b
+//
+// Legacy vector-mask form of cmp_ps_mask: each lane of the result is all-ones
+// where the predicate holds and zero elsewhere.
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b)
+{
+    // The explicit __mmask16 selects the 32-bit-element vmask overload.
+    return castsi_ps(vmask(__mmask16(cmp_ps_mask<CmpTypeT>(a, b))));
+}
+
+// Named shorthands for cmp_ps with a fixed ordered, quiet predicate.
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
+}
+
+// Compares the 8-bit elements of a and b with predicate CmpTypeT and returns
+// a legacy vector mask (all-ones element where true, zero elsewhere).
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi8(Integer a, Integer b)
+{
+    const __mmask64 laneBits = _mm512_cmp_epi8_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(laneBits);
+}
+// Compares the 16-bit elements of a and b with predicate CmpTypeT and returns
+// a legacy vector mask (all-ones element where true, zero elsewhere).
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi16(Integer a, Integer b)
+{
+    const __mmask32 laneBits = _mm512_cmp_epi16_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(laneBits);
+}
+// Compares the 32-bit elements of a and b with predicate CmpTypeT and returns
+// a legacy vector mask (all-ones element where true, zero elsewhere).
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi32(Integer a, Integer b)
+{
+    const __mmask16 laneBits = _mm512_cmp_epi32_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(laneBits);
+}
+// Compares the 64-bit elements of a and b with predicate CmpTypeT and returns
+// a legacy vector mask (all-ones element where true, zero elsewhere).
+template<CompareTypeInt CmpTypeT>
+static SIMDINLINE Integer SIMDCALL cmp_epi64(Integer a, Integer b)
+{
+    const __mmask8 laneBits = _mm512_cmp_epi64_mask(a, b, static_cast<const int>(CmpTypeT));
+    return vmask(laneBits);
+}
+
+// Named integer compares built on the cmp_epiN templates; each result element
+// is all-ones where the comparison holds and zero elsewhere.
+SIMD_IWRAPPER_2_CMP(cmpeq_epi8,  cmp_epi8<CompareTypeInt::EQ>);    // return a == b (int8)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi16, cmp_epi16<CompareTypeInt::EQ>);   // return a == b (int16)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi32, cmp_epi32<CompareTypeInt::EQ>);   // return a == b (int32)
+SIMD_IWRAPPER_2_CMP(cmpeq_epi64, cmp_epi64<CompareTypeInt::EQ>);   // return a == b (int64)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi8,  cmp_epi8<CompareTypeInt::GT>);    // return a > b (int8)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi16, cmp_epi16<CompareTypeInt::GT>);   // return a > b (int16)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi32, cmp_epi32<CompareTypeInt::GT>);   // return a > b (int32)
+SIMD_IWRAPPER_2_CMP(cmpgt_epi64, cmp_epi64<CompareTypeInt::GT>);   // return a > b (int64)
+SIMD_IWRAPPER_2_CMP(cmplt_epi32, cmp_epi32<CompareTypeInt::LT>);   // return a < b (int32)
+
+// return all_lanes_zero(a & b) ? 1 : 0 (float)
+// The test runs on the raw bits: it is true when no 32-bit lane of (a & b)
+// has any bit set.
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)
+{
+    const __mmask16 nonZeroLanes = _mm512_test_epi32_mask(castps_si(a), castps_si(b));
+    return static_cast<int>(nonZeroLanes) == 0;
+}
+
+// return all_lanes_zero(a & b) ? 1 : 0 (int)
+// True when no 32-bit lane of (a & b) has any bit set.
+static SIMDINLINE bool SIMDCALL testz_si(Integer a, Integer b)
+{
+    const __mmask16 nonZeroLanes = _mm512_test_epi32_mask(a, b);
+    return static_cast<int>(nonZeroLanes) == 0;
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+// Compile-time blend: bit i of ImmT selects b (1) or a (0) for lane i.
+// Declared SIMDCALL like the other wrappers in this file.
+template <int ImmT>
+static SIMDINLINE Float SIMDCALL blend_ps(Float a, Float b) // return ImmT ? b : a  (float)
+{
+    return _mm512_mask_blend_ps(__mmask16(ImmT), a, b);
+}
+
+// Compile-time blend of 32-bit integer lanes: bit i of ImmT selects b (1) or
+// a (0) for lane i.  The result is an integer vector, so the return type is
+// Integer (the blend intrinsic yields an integer vector, not a Float).
+template <int ImmT>
+static SIMDINLINE Integer SIMDCALL blend_epi32(Integer a, Integer b) // return ImmT ? b : a  (int32)
+{
+    return _mm512_mask_blend_epi32(__mmask16(ImmT), a, b);
+}
+
+// return mask ? b : a  (float)
+// A lane takes b when the sign bit (bit 31) of its mask element is set.
+static SIMDINLINE Float SIMDCALL blendv_ps(Float a, Float b, Float mask)
+{
+    // Test the sign bit with a signed integer compare on the raw bits.  A
+    // floating-point "less than zero" test is false for NaN, and the legacy
+    // all-ones mask pattern is a NaN, so it would never select b.
+    const __mmask16 k = _mm512_cmplt_epi32_mask(castps_si(mask), setzero_si());
+    return _mm512_mask_blend_ps(k, a, b);
+}
+
+
+// return mask ? b : a (int) - integer operands, Float mask.
+// Implemented by running blendv_ps on the operands' raw bits.
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask)
+{
+    const Float aBits   = castsi_ps(a);
+    const Float bBits   = castsi_ps(b);
+    const Float blended = blendv_ps(aBits, bBits, mask);
+    return castps_si(blended);
+}
+
+// return mask ? b : a (int) - integer operands, Integer mask.
+// Implemented by running blendv_ps on the raw bits of all three arguments.
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask)
+{
+    const Float aBits    = castsi_ps(a);
+    const Float bBits    = castsi_ps(b);
+    const Float maskBits = castsi_ps(mask);
+    const Float blended  = blendv_ps(aBits, bBits, maskBits);
+    return castps_si(blended);
+}
+
+// return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)
+{
+    const float value = *p;
+    return _mm512_set1_ps(value);
+}
+
+// Returns 256-bit half number imm of a as a 256-bit Float vector.  The
+// extraction goes through the double-precision view of the register.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+    const __m512d asDouble = _mm512_castps_pd(a);
+    const __m256d half     = _mm512_extractf64x4_pd(asDouble, imm);
+    return _mm256_castpd_ps(half);
+}
+
+// Returns 256-bit half number imm of a as a 256-bit Double vector.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+    const SIMD256Impl::Double half = _mm512_extractf64x4_pd(a, imm);
+    return half;
+}
+
+// Returns 256-bit half number imm of a as a 256-bit Integer vector.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+    const SIMD256Impl::Integer half = _mm512_extracti64x4_epi64(a, imm);
+    return half;
+}
+
+// Returns a with its 256-bit half number imm replaced by b.  The insertion
+// goes through the double-precision view of both operands.
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+    const __m512d wide   = _mm512_castps_pd(a);
+    const __m256d half   = _mm256_castps_pd(b);
+    const __m512d merged = _mm512_insertf64x4(wide, half, imm);
+    return _mm512_castpd_ps(merged);
+}
+
+// Returns a with its 256-bit half number imm replaced by b.
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+    const Double merged = _mm512_insertf64x4(a, b, imm);
+    return merged;
+}
+
+// Returns a with its 256-bit half number imm replaced by b.
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+    const Integer merged = _mm512_inserti64x4(a, b, imm);
+    return merged;
+}
+
+// Saturating narrowing packs; all four are AVX512BW intrinsics.
+SIMD_IWRAPPER_2(packs_epi16);   // See documentation for _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);   // See documentation for _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);  // See documentation for _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);  // See documentation for _mm512_packus_epi32
+
+// return a[swiz[i]] for each 32-bit lane i (int32)
+// Note the operand order: the intrinsic takes the index vector first.
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)
+{
+    const Integer permuted = _mm512_permutexvar_epi32(swiz, a);
+    return permuted;
+}
+
+// return a[swiz[i]] for each 32-bit lane i (float)
+// Note the operand order: the intrinsic takes the index vector first.
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
+{
+    const Float permuted = _mm512_permutexvar_ps(swiz, a);
+    return permuted;
+}
+
+// permute2f128_* forward to the 512-bit 128-bit-lane shuffles; ImmT is passed
+// through to the intrinsic unchanged.
+// NOTE(review): the immediate encoding of _mm512_shuffle_*x* differs from the
+// AVX permute2f128 encoding - confirm callers pass the 512-bit form.
+SIMD_WRAPPER_2I_(permute2f128_ps, shuffle_f32x4);
+SIMD_DWRAPPER_2I_(permute2f128_pd, shuffle_f64x2);
+SIMD_IWRAPPER_2I_(permute2f128_si, shuffle_i32x4);
+
+SIMD_IWRAPPER_1I(shuffle_epi32);    // template<int ImmT>: _mm512_shuffle_epi32(a, ImmT)
+
+SIMD_IWRAPPER_2(shuffle_epi8);      // _mm512_shuffle_epi8(a, b) (AVX512BW)
+SIMD_DWRAPPER_2I(shuffle_pd);       // template<int ImmT>: _mm512_shuffle_pd(a, b, ImmT)
+SIMD_WRAPPER_2I(shuffle_ps);        // template<int ImmT>: _mm512_shuffle_ps(a, b, ImmT)
+
+// 64-bit integer shuffle, implemented by applying shuffle_pd<ImmT> to the
+// operands' raw bits.
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL shuffle_epi64(Integer a, Integer b)
+{
+    const Double aBits    = castsi_pd(a);
+    const Double bBits    = castsi_pd(b);
+    const Double shuffled = shuffle_pd<ImmT>(aBits, bBits);
+    return castpd_si(shuffled);
+}
+
+SIMD_IWRAPPER_2(unpackhi_epi16);    // _mm512_unpackhi_epi16(a, b) (AVX512BW)
+
+// 32-bit integer unpack-high, implemented with the float unpack on the
+// operands' raw bits (hand-written equivalent of
+// SIMD_IFWRAPPER_2(unpackhi_epi32, unpackhi_ps)).
+static SIMDINLINE Integer SIMDCALL unpackhi_epi32(Integer a, Integer b)
+{
+    const Float aBits       = castsi_ps(a);
+    const Float bBits       = castsi_ps(b);
+    const Float interleaved = _mm512_unpackhi_ps(aBits, bBits);
+    return castps_si(interleaved);
+}
+
+// Remaining interleave (unpack) wrappers.  The 8-bit and 16-bit element forms
+// are AVX512BW intrinsics; unpacklo_epi32 is routed through the float unpack.
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_DWRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IFWRAPPER_2(unpacklo_epi32, unpacklo_ps);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_DWRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+// return *(float*)(((int8*)p) + (idx * ScaleT))
+//
+// Software gather: lane i of the result is the float read at byte offset
+// idx[i] * ScaleT from p.  One scalar load is issued per lane.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx)
+{
+    uint8_t const*  pBase    = (uint8_t const*)p;
+    uint32_t const* pIndices = (uint32_t*)&idx;
+    Float gathered;
+    float* pLanes = (float*)&gathered;
+
+    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
+    {
+        // Indices are read as unsigned and scaled in 32-bit arithmetic.
+        uint32_t byteOffset = pIndices[lane];
+        byteOffset = byteOffset * static_cast<uint32_t>(ScaleT);
+        pLanes[lane] = *(float const*)(pBase + byteOffset);
+    }
+
+    return gathered;
+}
+
+// return *p    (broadcast 1 value to all elements)
+// Alias of broadcast_ss.
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)
+{
+    const Float splat = broadcast_ss(p);
+    return splat;
+}
+
+// return *p    (loads SIMD width elements from memory)
+// Aligned load; see loadu_ps for the unaligned variant.
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)
+{
+    const Float loaded = _mm512_load_ps(p);
+    return loaded;
+}
+
+// return *p
+// Aligned load of a full integer vector; see loadu_si for the unaligned one.
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)
+{
+    auto const* pSrc = &p->v;
+    return _mm512_load_si512(pSrc);
+}
+
+// return *p    (same as load_ps but allows for unaligned mem)
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)
+{
+    const Float loaded = _mm512_loadu_ps(p);
+    return loaded;
+}
+
+// return *p    (same as load_si but allows for unaligned mem)
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p)
+{
+    const Integer loaded = _mm512_loadu_si512(p);
+    return loaded;
+}
+
+// for each element: (mask & (1 << 31)) ? i32gather_ps<ScaleT>(p, idx) : old
+//
+// Gathers only the lanes whose mask element has its sign bit set; the other
+// lanes keep the value from old.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    // Test the sign bit on the raw bits.  Comparing the mask against 0.0f as a
+    // float selects any non-zero value and skips -0.0f (sign bit only), which
+    // does not match the sign-bit rule stated above.
+    __mmask16 k = _mm512_cmplt_epi32_mask(castps_si(mask), setzero_si());
+
+    // The intrinsic expects a plain integer scale.
+    return _mm512_mask_i32gather_ps(old, k, idx, p, static_cast<int>(ScaleT));
+}
+
+// Stores the lanes of src whose mask element is negative as an int32 (sign
+// bit set); the remaining floats at p are left untouched.
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    const Integer zero       = setzero_si();
+    const Mask    storeLanes = _mm512_cmplt_epi32_mask(mask, zero);
+    _mm512_mask_store_ps(p, storeLanes, src);
+}
+
+// Returns one bit per 8-bit element of a; a bit is set when that element is
+// negative as a signed byte (its top bit is set).
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+    const __mmask64 signBits = _mm512_cmplt_epi8_mask(a, setzero_si());
+    return static_cast<uint64_t>(signBits);
+}
+
+// Returns one bit per 64-bit element of a; a bit is set when the sign bit
+// (bit 63) of that element is set.
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    // Signed integer compare on the raw bits.  A floating-point "less than
+    // zero" test reports false for NaN (which includes the all-ones mask
+    // pattern) and for -0.0, although both have the sign bit set.
+    __mmask8 m = _mm512_cmplt_epi64_mask(castpd_si(a), setzero_si());
+    return static_cast<uint32_t>(m);
+}
+// Returns one bit per 32-bit element of a; a bit is set when the sign bit
+// (bit 31) of that element is set.
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    // Signed integer compare on the raw bits.  A floating-point "less than
+    // zero" test reports false for NaN (which includes the all-ones mask
+    // pattern) and for -0.0f, although both have the sign bit set.
+    __mmask16 m = _mm512_cmplt_epi32_mask(castps_si(a), setzero_si());
+    return static_cast<uint32_t>(m);
+}
+
+// return i (all 32-bit elements are same value)
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
+{
+    const Integer splat = _mm512_set1_epi32(i);
+    return splat;
+}
+
+// return i (all 8-bit elements are same value)
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
+{
+    const Integer splat = _mm512_set1_epi8(i);
+    return splat;
+}
+
+// return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f)
+{
+    const Float splat = _mm512_set1_ps(f);
+    return splat;
+}
+
+// return 0 (double)
+static SIMDINLINE Double SIMDCALL setzero_pd()
+{
+    const Double zero = _mm512_setzero_pd();
+    return zero;
+}
+
+// return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps()
+{
+    const Float zero = _mm512_setzero_ps();
+    return zero;
+}
+
+// return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si()
+{
+    const Integer zero = _mm512_setzero_si512();
+    return zero;
+}
+
+// *p = a   (stores all elements contiguously in memory)
+// Aligned store.
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)
+{
+    float* const pDst = p;
+    _mm512_store_ps(pDst, a);
+}
+
+// *p = a
+// Aligned store of a full integer vector; see storeu_si for the unaligned one.
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)
+{
+    auto* const pDst = &p->v;
+    _mm512_store_si512(pDst, a);
+}
+
+// *p = a    (same as store_si but allows for unaligned mem)
+static SIMDINLINE void SIMDCALL storeu_si(Integer *p, Integer a)
+{
+    auto* const pDst = &p->v;
+    _mm512_storeu_si512(pDst, a);
+}
+
+// *p = a   (same as store_ps, but doesn't keep memory in cache)
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)
+{
+    float* const pDst = p;
+    _mm512_stream_ps(pDst, a);
+}
+
+// Builds a vector from sixteen int32 values.  Arguments run from the highest
+// lane (i15) down to the lowest (i0), matching _mm512_set_epi32.
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    const Integer packed = _mm512_set_epi32(
+        i15, i14, i13, i12,
+        i11, i10, i9,  i8,
+        i7,  i6,  i5,  i4,
+        i3,  i2,  i1,  i0);
+    return packed;
+}
+
+// Eight-value form: fills the eight low lanes from i7..i0 and zeroes the
+// eight high lanes.
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    const Integer packed = _mm512_set_epi32(
+        0,  0,  0,  0,
+        0,  0,  0,  0,
+        i7, i6, i5, i4,
+        i3, i2, i1, i0);
+    return packed;
+}
+
+// Builds a vector from sixteen float values.  Arguments run from the highest
+// lane (i15) down to the lowest (i0), matching _mm512_set_ps.
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    const Float packed = _mm512_set_ps(
+        i15, i14, i13, i12,
+        i11, i10, i9,  i8,
+        i7,  i6,  i5,  i4,
+        i3,  i2,  i1,  i0);
+    return packed;
+}
+
+// Eight-value form: fills the eight low lanes from i7..i0 and zeroes the
+// eight high lanes.
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    const Float packed = _mm512_set_ps(
+        0.0f, 0.0f, 0.0f, 0.0f,
+        0.0f, 0.0f, 0.0f, 0.0f,
+        i7,   i6,   i5,   i4,
+        i3,   i2,   i1,   i0);
+    return packed;
+}
+
+// Expands the low 16 bits of mask into a legacy Float vector mask: lane i is
+// all-ones when bit i is set and zero otherwise.
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+    const __mmask16 laneBits = __mmask16(mask);
+    return castsi_ps(vmask(laneBits));
+}
+
+// Undefine every wrapper-generator macro defined above so that none of them
+// leaks into (or collides with) the files included after this one.
+#undef SIMD_WRAPPER_1_
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2_
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPERI_2_
+#undef SIMD_DWRAPPER_2
+#undef SIMD_WRAPPER_2I_
+#undef SIMD_WRAPPER_2I
+#undef SIMD_DWRAPPER_2I_
+#undef SIMD_DWRAPPER_2I
+#undef SIMD_WRAPPER_3_
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_1_8
+#undef SIMD_IWRAPPER_1_4
+#undef SIMD_IWRAPPER_1I_
+#undef SIMD_IWRAPPER_1I
+#undef SIMD_IWRAPPER_2_
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2_CMP
+#undef SIMD_IFWRAPPER_2
+#undef SIMD_IWRAPPER_2I_
+#undef SIMD_IWRAPPER_2I
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
new file mode 100644
index 00000000000..3e36ce5bd36
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_avx512_masks.inl
@@ -0,0 +1,27 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX512_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// Implement mask-enabled SIMD functions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
new file mode 100644
index 00000000000..a45429f4b6b
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu.inl
@@ -0,0 +1,842 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+//============================================================================
+// SIMD16 AVX (1) implementation
+//============================================================================
+
+// Number of 32-bit lanes in each 256-bit half (v8[0] / v8[1]) that this
+// emulation operates on.  Used below as the float-pointer offset of the
+// upper half and as the bit offset of the upper half in lane masks.
+static const int TARGET_SIMD_WIDTH = 8;
+// 128-bit implementation used by the widening conversions below to shift a
+// 128-bit source before handing it to SIMD256T.
+using SIMD128T = SIMD128Impl::AVXImpl;
+
+// Defines a unary Float function `op` that applies SIMD256T::op to each
+// 256-bit half (v8[0], v8[1]) of its argument.
+#define SIMD_WRAPPER_1(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::op(a.v8[0]),\
+            SIMD256T::op(a.v8[1]),\
+        };\
+    }
+
+// Defines a binary Float function `op` that applies SIMD256T::op to the
+// matching 256-bit halves of a and b.
+#define SIMD_WRAPPER_2(op)  \
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a binary Float function template `op<ImmT>` whose immediate is
+// split per half: the low 8 bits of ImmT go to the lower half, and the next
+// 8 bits (ImmT >> TARGET_SIMD_WIDTH) go to the upper half.
+#define SIMD_WRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a binary Float function template `op<ImmT>` that passes the same,
+// unmodified immediate to SIMD256T::op for both 256-bit halves.
+#define SIMD_WRAPPER_2I_1(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Float SIMDCALL op(Float a, Float b)   \
+    {\
+        return Float\
+        {\
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a ternary Float function `op` that applies SIMD256T::op to the
+// matching 256-bit halves of a, b and c.
+#define SIMD_WRAPPER_3(op)  \
+        static SIMDINLINE Float SIMDCALL op(Float a, Float b, Float c)   \
+        {\
+            return Float\
+            {\
+                SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+                SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+            };\
+        }
+
+// Defines a unary Integer function `op` that applies SIMD256T::op to each
+// 256-bit half (v8[0], v8[1]) of its argument.
+#define SIMD_IWRAPPER_1(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0]),\
+            SIMD256T::op(a.v8[1]),\
+        };\
+    }
+
+// Defines a binary Integer function `op` that applies SIMD256T::op to the
+// matching 256-bit halves of a and b.
+#define SIMD_IWRAPPER_2(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a binary Integer function template `op<ImmT>` whose immediate is
+// split per half: the low 8 bits of ImmT go to the lower half, and the next
+// 8 bits (ImmT >> TARGET_SIMD_WIDTH) go to the upper half.
+#define SIMD_IWRAPPER_2I(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<0xFF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xFF & (ImmT >> TARGET_SIMD_WIDTH)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a binary Integer function template `op<ImmT>` that passes the
+// same, unmodified immediate to SIMD256T::op for both 256-bit halves.
+#define SIMD_IWRAPPER_2I_1(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<ImmT>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a binary Integer function template `op<ImmT>` whose immediate is
+// split into nibbles: bits [3:0] of ImmT go to the lower half and bits [7:4]
+// go to the upper half.
+#define SIMD_IWRAPPER_2I_2(op)  \
+    template<int ImmT>\
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::template op<0xF & ImmT>(a.v8[0], b.v8[0]),\
+            SIMD256T::template op<0xF & (ImmT >> 4)>(a.v8[1], b.v8[1]),\
+        };\
+    }
+
+// Defines a ternary Integer function `op` that applies SIMD256T::op to the
+// matching 256-bit halves of a, b and c.
+#define SIMD_IWRAPPER_3(op)  \
+    static SIMDINLINE Integer SIMDCALL op(Integer a, Integer b, Integer c)   \
+    {\
+        return Integer\
+        {\
+            SIMD256T::op(a.v8[0], b.v8[0], c.v8[0]),\
+            SIMD256T::op(a.v8[1], b.v8[1], c.v8[1]),\
+        };\
+    }
+
+//-----------------------------------------------------------------------
+// Single precision floating point arithmetic operations
+//-----------------------------------------------------------------------
+// Each function below is generated by a SIMD_WRAPPER_n macro and forwards to
+// the same-named SIMD256T op on both 256-bit halves.
+SIMD_WRAPPER_2(add_ps);     // return a + b
+SIMD_WRAPPER_2(div_ps);     // return a / b
+SIMD_WRAPPER_3(fmadd_ps);   // return (a * b) + c
+SIMD_WRAPPER_3(fmsub_ps);   // return (a * b) - c
+SIMD_WRAPPER_2(max_ps);     // return (a > b) ? a : b
+SIMD_WRAPPER_2(min_ps);     // return (a < b) ? a : b
+SIMD_WRAPPER_2(mul_ps);     // return a * b
+SIMD_WRAPPER_1(rcp_ps);     // return 1.0f / a
+SIMD_WRAPPER_1(rsqrt_ps);   // return 1.0f / sqrt(a)
+SIMD_WRAPPER_2(sub_ps);     // return a - b
+
+// Rounds every float lane using rounding mode RMT by running
+// SIMD256T::round_ps<RMT> on each 256-bit half.
+template <RoundMode RMT>
+static SIMDINLINE Float SIMDCALL round_ps(Float a)
+{
+    auto const lo = SIMD256T::template round_ps<RMT>(a.v8[0]);
+    auto const hi = SIMD256T::template round_ps<RMT>(a.v8[1]);
+    return Float{ lo, hi };
+}
+
+// round_ps with the CEIL_NOEXC rounding mode
+static SIMDINLINE Float SIMDCALL ceil_ps(Float a)
+{
+    return round_ps<RoundMode::CEIL_NOEXC>(a);
+}
+
+// round_ps with the FLOOR_NOEXC rounding mode
+static SIMDINLINE Float SIMDCALL floor_ps(Float a)
+{
+    return round_ps<RoundMode::FLOOR_NOEXC>(a);
+}
+
+//-----------------------------------------------------------------------
+// Integer (various width) arithmetic operations
+//-----------------------------------------------------------------------
+// Each function below is generated by a SIMD_IWRAPPER_n macro and forwards
+// to the same-named SIMD256T op on both 256-bit halves.
+SIMD_IWRAPPER_1(abs_epi32); // return absolute_value(a) (int32)
+SIMD_IWRAPPER_2(add_epi32); // return a + b (int32)
+SIMD_IWRAPPER_2(add_epi8);  // return a + b (int8)
+SIMD_IWRAPPER_2(adds_epu8); // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+SIMD_IWRAPPER_2(max_epi32); // return (a > b) ? a : b (int32)
+SIMD_IWRAPPER_2(max_epu32); // return (a > b) ? a : b (uint32)
+SIMD_IWRAPPER_2(min_epi32); // return (a < b) ? a : b (int32)
+SIMD_IWRAPPER_2(min_epu32); // return (a < b) ? a : b (uint32)
+// NOTE(review): if SIMD256T::mul_epi32 follows _mm256_mul_epi32, it multiplies
+// the low int32 of each 64-bit element into a 64-bit product rather than
+// multiplying every int32 lane -- confirm against the 256-bit implementation.
+SIMD_IWRAPPER_2(mul_epi32); // return a * b (int32)
+
+// return (a * b) & 0xFFFFFFFF
+//
+// Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+// and store the low 32 bits of the intermediate integers in dst.
+SIMD_IWRAPPER_2(mullo_epi32);
+SIMD_IWRAPPER_2(sub_epi32); // return a - b (int32)
+SIMD_IWRAPPER_2(sub_epi64); // return a - b (int64)
+SIMD_IWRAPPER_2(subs_epu8); // return (b > a) ? 0 : (a - b) (uint8)
+
+//-----------------------------------------------------------------------
+// Logical operations
+//-----------------------------------------------------------------------
+// Bitwise ops, generated per 256-bit half; the _ps variants take and return
+// Float, the _si variants take and return Integer.
+SIMD_WRAPPER_2(and_ps);     // return a & b       (float treated as int)
+SIMD_IWRAPPER_2(and_si);    // return a & b       (int)
+SIMD_WRAPPER_2(andnot_ps);  // return (~a) & b    (float treated as int)
+SIMD_IWRAPPER_2(andnot_si); // return (~a) & b    (int)
+SIMD_WRAPPER_2(or_ps);      // return a | b       (float treated as int)
+SIMD_IWRAPPER_2(or_si);     // return a | b       (int)
+SIMD_WRAPPER_2(xor_ps);     // return a ^ b       (float treated as int)
+SIMD_IWRAPPER_2(xor_si);    // return a ^ b       (int)
+
+
+//-----------------------------------------------------------------------
+// Shift operations
+//-----------------------------------------------------------------------
+// return a << ImmT
+//
+// The same immediate shift count is applied to both 256-bit halves.
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL slli_epi32(Integer a)
+{
+    auto const lo = SIMD256T::template slli_epi32<ImmT>(a.v8[0]);
+    auto const hi = SIMD256T::template slli_epi32<ImmT>(a.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// Forwards to SIMD256T::sllv_epi32 on each 256-bit half of a and b.
+SIMD_IWRAPPER_2(sllv_epi32);                                // return a << b      (uint32)
+
+// return a >> ImmT   (int32)
+//
+// The same immediate shift count is applied to both 256-bit halves.
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srai_epi32(Integer a)
+{
+    auto const lo = SIMD256T::template srai_epi32<ImmT>(a.v8[0]);
+    auto const hi = SIMD256T::template srai_epi32<ImmT>(a.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// return a >> ImmT   (uint32)
+//
+// The same immediate shift count is applied to both 256-bit halves.
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_epi32(Integer a)
+{
+    auto const lo = SIMD256T::template srli_epi32<ImmT>(a.v8[0]);
+    auto const hi = SIMD256T::template srli_epi32<ImmT>(a.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// for each 128-bit lane:
+//  return a >> (ImmT*8) (uint)
+template<int ImmT>
+static SIMDINLINE Integer SIMDCALL srli_si(Integer a)
+{
+    auto const lo = SIMD256T::template srli_si<ImmT>(a.v8[0]);
+    auto const hi = SIMD256T::template srli_si<ImmT>(a.v8[1]);
+    return Integer{ lo, hi };
+}
+// same as srli_si, but with Float cast to int
+template<int ImmT>
+static SIMDINLINE Float SIMDCALL srlisi_ps(Float a)
+{
+    auto const lo = SIMD256T::template srlisi_ps<ImmT>(a.v8[0]);
+    auto const hi = SIMD256T::template srlisi_ps<ImmT>(a.v8[1]);
+    return Float{ lo, hi };
+}
+
+// Forwards to SIMD256T::srlv_epi32 on each 256-bit half of a and b.
+SIMD_IWRAPPER_2(srlv_epi32);                                // return a >> b      (uint32)
+
+//-----------------------------------------------------------------------
+// Conversion operations
+//-----------------------------------------------------------------------
+// return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castpd_ps(Double a)
+{
+    auto const lo = SIMD256T::castpd_ps(a.v8[0]);
+    auto const hi = SIMD256T::castpd_ps(a.v8[1]);
+    return Float{ lo, hi };
+}
+
+// return *(Integer*)(&a)
+static SIMDINLINE Integer SIMDCALL castps_si(Float a)
+{
+    auto const lo = SIMD256T::castps_si(a.v8[0]);
+    auto const hi = SIMD256T::castps_si(a.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castsi_pd(Integer a)
+{
+    auto const lo = SIMD256T::castsi_pd(a.v8[0]);
+    auto const hi = SIMD256T::castsi_pd(a.v8[1]);
+    return Double{ lo, hi };
+}
+
+// return *(Double*)(&a)
+static SIMDINLINE Double SIMDCALL castps_pd(Float a)
+{
+    auto const lo = SIMD256T::castps_pd(a.v8[0]);
+    auto const hi = SIMD256T::castps_pd(a.v8[1]);
+    return Double{ lo, hi };
+}
+
+// return *(Float*)(&a)
+static SIMDINLINE Float SIMDCALL castsi_ps(Integer a)
+{
+    auto const lo = SIMD256T::castsi_ps(a.v8[0]);
+    auto const hi = SIMD256T::castsi_ps(a.v8[1]);
+    return Float{ lo, hi };
+}
+
+// return (float)a    (int32 --> float)
+static SIMDINLINE Float SIMDCALL cvtepi32_ps(Integer a)
+{
+    auto const lo = SIMD256T::cvtepi32_ps(a.v8[0]);
+    auto const hi = SIMD256T::cvtepi32_ps(a.v8[1]);
+    return Float{ lo, hi };
+}
+
+// return (int16)a    (uint8 --> int16)
+//
+// The source is a 256-bit vector; each of its 128-bit parts (v4[0], v4[1])
+// is widened by SIMD256T::cvtepu8_epi16 into one 256-bit half of the result.
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi16(SIMD256Impl::Integer a)
+{
+    auto const lo = SIMD256T::cvtepu8_epi16(a.v4[0]);
+    auto const hi = SIMD256T::cvtepu8_epi16(a.v4[1]);
+    return Integer{ lo, hi };
+}
+
+// return (int32)a    (uint8 --> int32)
+//
+// Only the low 128 bits of the source (v4[0]) are read: the lower result
+// half converts it as-is, the upper result half converts it after a right
+// shift by 8 bytes.
+static SIMDINLINE Integer SIMDCALL cvtepu8_epi32(SIMD256Impl::Integer a)
+{
+    auto const lo = SIMD256T::cvtepu8_epi32(a.v4[0]);
+    auto const hi = SIMD256T::cvtepu8_epi32(SIMD128T::template srli_si<8>(a.v4[0]));
+    return Integer{ lo, hi };
+}
+
+// return (int32)a    (uint16 --> int32)
+//
+// Each 128-bit part of the 256-bit source (v4[0], v4[1]) is widened into
+// one 256-bit half of the result.
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi32(SIMD256Impl::Integer a)
+{
+    auto const lo = SIMD256T::cvtepu16_epi32(a.v4[0]);
+    auto const hi = SIMD256T::cvtepu16_epi32(a.v4[1]);
+    return Integer{ lo, hi };
+}
+
+// return (int64)a    (uint16 --> int64)
+//
+// Only the low 128 bits of the source (v4[0]) are read: the lower result
+// half converts it as-is, the upper result half converts it after a right
+// shift by 8 bytes.
+static SIMDINLINE Integer SIMDCALL cvtepu16_epi64(SIMD256Impl::Integer a)
+{
+    auto const lo = SIMD256T::cvtepu16_epi64(a.v4[0]);
+    auto const hi = SIMD256T::cvtepu16_epi64(SIMD128T::template srli_si<8>(a.v4[0]));
+    return Integer{ lo, hi };
+}
+
+// return (int64)a    (uint32 --> int64)
+//
+// Each 128-bit part of the 256-bit source (v4[0], v4[1]) is widened into
+// one 256-bit half of the result.
+static SIMDINLINE Integer SIMDCALL cvtepu32_epi64(SIMD256Impl::Integer a)
+{
+    auto const lo = SIMD256T::cvtepu32_epi64(a.v4[0]);
+    auto const hi = SIMD256T::cvtepu32_epi64(a.v4[1]);
+    return Integer{ lo, hi };
+}
+
+// return (int32)a    (float --> int32)
+static SIMDINLINE Integer SIMDCALL cvtps_epi32(Float a)
+{
+    auto const lo = SIMD256T::cvtps_epi32(a.v8[0]);
+    auto const hi = SIMD256T::cvtps_epi32(a.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// return (int32)a    (rnd_to_zero(float) --> int32)
+//
+// Truncating conversion. This must forward to the truncating 256-bit op
+// (cvttps_epi32), not cvtps_epi32, otherwise the result is rounded instead
+// of truncated toward zero as documented above.
+static SIMDINLINE Integer SIMDCALL cvttps_epi32(Float a)
+{
+    return Integer
+    {
+        SIMD256T::cvttps_epi32(a.v8[0]),
+        SIMD256T::cvttps_epi32(a.v8[1]),
+    };
+}
+
+//-----------------------------------------------------------------------
+// Comparison operations
+//-----------------------------------------------------------------------
+// return a (CmpTypeT) b
+//
+// The comparison is evaluated independently on each 256-bit half.
+template<CompareType CmpTypeT>
+static SIMDINLINE Float SIMDCALL cmp_ps(Float a, Float b)
+{
+    auto const lo = SIMD256T::template cmp_ps<CmpTypeT>(a.v8[0], b.v8[0]);
+    auto const hi = SIMD256T::template cmp_ps<CmpTypeT>(a.v8[1], b.v8[1]);
+    return Float{ lo, hi };
+}
+// Named shorthands for cmp_ps with a fixed CompareType predicate.
+static SIMDINLINE Float SIMDCALL cmplt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpgt_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GT_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpneq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::NEQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpeq_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::EQ_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmpge_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::GE_OQ>(a, b);
+}
+static SIMDINLINE Float SIMDCALL cmple_ps(Float a, Float b)
+{
+    return cmp_ps<CompareType::LE_OQ>(a, b);
+}
+
+// Compares a and b with predicate CmpTypeT and returns the movemask_ps bits
+// of the comparison result, converted to Mask.
+template<CompareType CmpTypeT>
+static SIMDINLINE Mask SIMDCALL cmp_ps_mask(Float a, Float b)
+{
+    Float const laneResult = cmp_ps<CmpTypeT>(a, b);
+    return static_cast<Mask>(movemask_ps(laneResult));
+}
+
+
+// Integer comparisons, each forwarded to the same-named SIMD256T op on both
+// 256-bit halves.
+SIMD_IWRAPPER_2(cmpeq_epi8);    // return a == b (int8)
+SIMD_IWRAPPER_2(cmpeq_epi16);   // return a == b (int16)
+SIMD_IWRAPPER_2(cmpeq_epi32);   // return a == b (int32)
+SIMD_IWRAPPER_2(cmpeq_epi64);   // return a == b (int64)
+SIMD_IWRAPPER_2(cmpgt_epi8);    // return a > b (int8)
+SIMD_IWRAPPER_2(cmpgt_epi16);   // return a > b (int16)
+SIMD_IWRAPPER_2(cmpgt_epi32);   // return a > b (int32)
+SIMD_IWRAPPER_2(cmpgt_epi64);   // return a > b (int64)
+SIMD_IWRAPPER_2(cmplt_epi32);   // return a < b (int32)
+
+// return all_lanes_zero(a & b) ? 1 : 0 (float)
+//
+// True only when SIMD256T::testz_ps reports zero for both 256-bit halves.
+static SIMDINLINE bool SIMDCALL testz_ps(Float a, Float b)
+{
+    auto const loZero = SIMD256T::testz_ps(a.v8[0], b.v8[0]);
+    auto const hiZero = SIMD256T::testz_ps(a.v8[1], b.v8[1]);
+    return 0 != (loZero & hiZero);
+}
+
+// return all_lanes_zero(a & b) ? 1 : 0 (int)
+//
+// Non-zero only when SIMD256T::testz_si reports zero for both 256-bit halves.
+static SIMDINLINE int SIMDCALL testz_si(Integer a, Integer b)
+{
+    auto const loZero = SIMD256T::testz_si(a.v8[0], b.v8[0]);
+    auto const hiZero = SIMD256T::testz_si(a.v8[1], b.v8[1]);
+    return 0 != (loZero & hiZero);
+}
+
+//-----------------------------------------------------------------------
+// Blend / shuffle / permute operations
+//-----------------------------------------------------------------------
+// The immediate blends split ImmT per half (low 8 bits -> lower half, next
+// 8 bits -> upper half); blendv_ps takes its selector as a third vector.
+SIMD_WRAPPER_2I(blend_ps);  // return ImmT ? b : a  (float)
+SIMD_IWRAPPER_2I(blend_epi32);  // return ImmT ? b : a  (int32)
+SIMD_WRAPPER_3(blendv_ps);  // return mask ? b : a  (float)
+// return mask ? b : a (int)
+//
+// Overload taking the selector as a Float vector.
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Float mask)
+{
+    auto const lo = SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]);
+    auto const hi = SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// return mask ? b : a (int)
+//
+// Overload taking the selector as an Integer vector.
+static SIMDINLINE Integer SIMDCALL blendv_epi32(Integer a, Integer b, Integer mask)
+{
+    auto const lo = SIMD256T::blendv_epi32(a.v8[0], b.v8[0], mask.v8[0]);
+    auto const hi = SIMD256T::blendv_epi32(a.v8[1], b.v8[1], mask.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// return *p (all elements in vector get same value)
+static SIMDINLINE Float SIMDCALL broadcast_ss(float const *p)
+{
+    // Read the scalar once, splat it into a 256-bit vector, and use that
+    // vector for both halves.
+    float const scalar = *p;
+    auto const half = SIMD256T::set1_ps(scalar);
+    return Float{ half, half };
+}
+
+// Returns 256-bit half `imm` (0 = lower, 1 = upper) of a.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Float SIMDCALL extract_ps(Float a)
+{
+    // imm is a compile-time constant, so reject a bad index at compile time
+    // instead of relying on a runtime assert before indexing v8[].
+    static_assert(imm == 0 || imm == 1, "Invalid control code");
+    return a.v8[imm];
+}
+
+// Returns 256-bit half `imm` (0 = lower, 1 = upper) of a.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Double SIMDCALL extract_pd(Double a)
+{
+    // imm is a compile-time constant, so reject a bad index at compile time
+    // instead of relying on a runtime assert before indexing v8[].
+    static_assert(imm == 0 || imm == 1, "Invalid control code");
+    return a.v8[imm];
+}
+
+// Returns 256-bit half `imm` (0 = lower, 1 = upper) of a.
+template<int imm>
+static SIMDINLINE SIMD256Impl::Integer SIMDCALL extract_si(Integer a)
+{
+    // imm is a compile-time constant, so reject a bad index at compile time
+    // instead of relying on a runtime assert before indexing v8[].
+    static_assert(imm == 0 || imm == 1, "Invalid control code");
+    return a.v8[imm];
+}
+
+// Returns a copy of a with 256-bit half `imm` (0 = lower, 1 = upper)
+// replaced by b.
+template<int imm>
+static SIMDINLINE Float SIMDCALL insert_ps(Float a, SIMD256Impl::Float b)
+{
+    // imm is a compile-time constant, so reject a bad index at compile time
+    // instead of relying on a runtime assert before writing v8[].
+    static_assert(imm == 0 || imm == 1, "Invalid control code");
+    a.v8[imm] = b;
+    return a;
+}
+
+// Returns a copy of a with 256-bit half `imm` (0 = lower, 1 = upper)
+// replaced by b.
+template<int imm>
+static SIMDINLINE Double SIMDCALL insert_pd(Double a, SIMD256Impl::Double b)
+{
+    // imm is a compile-time constant, so reject a bad index at compile time
+    // instead of relying on a runtime assert before writing v8[].
+    static_assert(imm == 0 || imm == 1, "Invalid control code");
+    a.v8[imm] = b;
+    return a;
+}
+
+// Returns a copy of a with 256-bit half `imm` (0 = lower, 1 = upper)
+// replaced by b.
+template<int imm>
+static SIMDINLINE Integer SIMDCALL insert_si(Integer a, SIMD256Impl::Integer b)
+{
+    // imm is a compile-time constant, so reject a bad index at compile time
+    // instead of relying on a runtime assert before writing v8[].
+    static_assert(imm == 0 || imm == 1, "Invalid control code");
+    a.v8[imm] = b;
+    return a;
+}
+
+// Pack ops are forwarded to SIMD256T separately for each 256-bit half.
+SIMD_IWRAPPER_2(packs_epi16);      // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+SIMD_IWRAPPER_2(packs_epi32);      // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+SIMD_IWRAPPER_2(packus_epi16);     // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+SIMD_IWRAPPER_2(packus_epi32);     // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+
+// return a[swiz[i]] for each 32-bit lane i (int32)
+//
+// Scalar fallback: walks the vectors lane by lane through uint32_t views.
+// Only the low 4 bits of each swizzle entry are used as the source index.
+static SIMDINLINE Integer SIMDCALL permute_epi32(Integer a, Integer swiz)
+{
+    Integer permuted;
+
+    uint32_t const *srcLanes = reinterpret_cast<uint32_t const*>(&a);
+    uint32_t const *selLanes = reinterpret_cast<uint32_t const*>(&swiz);
+    uint32_t *dstLanes = reinterpret_cast<uint32_t *>(&permuted);
+
+    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
+    {
+        uint32_t const srcIndex = 0xF & selLanes[lane];
+        dstLanes[lane] = srcLanes[srcIndex];
+    }
+
+    return permuted;
+}
+
+// return a[swiz[i]] for each 32-bit lane i (float)
+//
+// Scalar fallback: walks the vectors lane by lane through float / uint32_t
+// views.  Only the low 4 bits of each swizzle entry are used as the source
+// index.
+static SIMDINLINE Float SIMDCALL permute_ps(Float a, Integer swiz)
+{
+    Float permuted;
+
+    float const *srcLanes = reinterpret_cast<float const*>(&a);
+    uint32_t const *selLanes = reinterpret_cast<uint32_t const*>(&swiz);
+    float *dstLanes = reinterpret_cast<float *>(&permuted);
+
+    for (uint32_t lane = 0; lane < SIMD_WIDTH; ++lane)
+    {
+        uint32_t const srcIndex = 0xF & selLanes[lane];
+        dstLanes[lane] = srcLanes[srcIndex];
+    }
+
+    return permuted;
+}
+
+// All of the 512-bit permute2f128_XX intrinsics do the following:
+//
+//      SELECT4(src, control) {
+//          CASE(control[1:0])
+//              0 : tmp[127:0] := src[127:0]
+//              1 : tmp[127:0] := src[255:128]
+//              2 : tmp[127:0] := src[383:256]
+//              3 : tmp[127:0] := src[511:384]
+//          ESAC
+//          RETURN tmp[127:0]
+//      }
+//
+//      dst[127:0]   := SELECT4(a[511:0], imm8[1:0])
+//      dst[255:128] := SELECT4(a[511:0], imm8[3:2])
+//      dst[383:256] := SELECT4(b[511:0], imm8[5:4])
+//      dst[511:384] := SELECT4(b[511:0], imm8[7:6])
+//      dst[MAX:512] := 0
+//
+// Since the 256-bit AVX instructions use a 4-bit control field (instead
+// of 2-bit for AVX512), we need to expand the control bits sent to the
+// AVX instructions for emulation.
+//
+// Emulates the 512-bit 128-bit-lane permute with two 256-bit permutes: the
+// lower result half selects lanes from a, the upper result half from b.
+template <int shuf>
+static SIMDINLINE Float SIMDCALL permute2f128_ps(Float a, Float b)
+{
+    // Spread each pair of 2-bit selectors into the [1:0] / [5:4] positions
+    // passed to the 256-bit permute.
+    const int ctlFromA = (shuf & 0x03) | ((shuf & 0x0C) << 2);
+    const int ctlFromB = ((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2);
+
+    auto const lo = SIMD256T::template permute2f128_ps<ctlFromA>(a.v8[0], a.v8[1]);
+    auto const hi = SIMD256T::template permute2f128_ps<ctlFromB>(b.v8[0], b.v8[1]);
+    return Float{ lo, hi };
+}
+
+// Emulates the 512-bit 128-bit-lane permute with two 256-bit permutes: the
+// lower result half selects lanes from a, the upper result half from b.
+template <int shuf>
+static SIMDINLINE Double SIMDCALL permute2f128_pd(Double a, Double b)
+{
+    // Spread each pair of 2-bit selectors into the [1:0] / [5:4] positions
+    // passed to the 256-bit permute.
+    const int ctlFromA = (shuf & 0x03) | ((shuf & 0x0C) << 2);
+    const int ctlFromB = ((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2);
+
+    auto const lo = SIMD256T::template permute2f128_pd<ctlFromA>(a.v8[0], a.v8[1]);
+    auto const hi = SIMD256T::template permute2f128_pd<ctlFromB>(b.v8[0], b.v8[1]);
+    return Double{ lo, hi };
+}
+
+// Emulates the 512-bit 128-bit-lane permute with two 256-bit permutes: the
+// lower result half selects lanes from a, the upper result half from b.
+template <int shuf>
+static SIMDINLINE Integer SIMDCALL permute2f128_si(Integer a, Integer b)
+{
+    // Spread each pair of 2-bit selectors into the [1:0] / [5:4] positions
+    // passed to the 256-bit permute.
+    const int ctlFromA = (shuf & 0x03) | ((shuf & 0x0C) << 2);
+    const int ctlFromB = ((shuf & 0x30) >> 4) | ((shuf & 0xC0) >> 2);
+
+    auto const lo = SIMD256T::template permute2f128_si<ctlFromA>(a.v8[0], a.v8[1]);
+    auto const hi = SIMD256T::template permute2f128_si<ctlFromB>(b.v8[0], b.v8[1]);
+    return Integer{ lo, hi };
+}
+
+// Shuffle / unpack ops, forwarded to SIMD256T on each 256-bit half.
+// The *_2I_1 wrappers reuse the same immediate for both halves; the *_2I_2
+// wrapper gives each half its own 4-bit slice of the immediate.
+//
+// NOTE(review): the _pd entries (shuffle_pd, unpackhi_pd, unpacklo_pd) are
+// generated with the Float-typed wrappers rather than a Double-typed one,
+// and shuffle_pd does not split its immediate the way shuffle_epi64 does --
+// confirm this is intended.
+SIMD_IWRAPPER_2I_1(shuffle_epi32);
+SIMD_IWRAPPER_2I_2(shuffle_epi64);
+SIMD_IWRAPPER_2(shuffle_epi8);
+SIMD_WRAPPER_2I_1(shuffle_pd);
+SIMD_WRAPPER_2I_1(shuffle_ps);
+SIMD_IWRAPPER_2(unpackhi_epi16);
+SIMD_IWRAPPER_2(unpackhi_epi32);
+SIMD_IWRAPPER_2(unpackhi_epi64);
+SIMD_IWRAPPER_2(unpackhi_epi8);
+SIMD_WRAPPER_2(unpackhi_pd);
+SIMD_WRAPPER_2(unpackhi_ps);
+SIMD_IWRAPPER_2(unpacklo_epi16);
+SIMD_IWRAPPER_2(unpacklo_epi32);
+SIMD_IWRAPPER_2(unpacklo_epi64);
+SIMD_IWRAPPER_2(unpacklo_epi8);
+SIMD_WRAPPER_2(unpacklo_pd);
+SIMD_WRAPPER_2(unpacklo_ps);
+
+//-----------------------------------------------------------------------
+// Load / store operations
+//-----------------------------------------------------------------------
+// return *(float*)(((int8*)p) + (idx * ScaleT))
+//
+// Both halves gather from the same base pointer, each with its own half of
+// the index vector.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL i32gather_ps(float const* p, Integer idx)
+{
+    auto const lo = SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[0]);
+    auto const hi = SIMD256T::template i32gather_ps<ScaleT>(p, idx.v8[1]);
+    return Float{ lo, hi };
+}
+
+// return *p    (broadcast 1 value to all elements)
+//
+// Alias for broadcast_ss.
+static SIMDINLINE Float SIMDCALL load1_ps(float const *p)
+{
+    Float const splat = broadcast_ss(p);
+    return splat;
+}
+
+// return *p    (loads SIMD width elements from memory)
+//
+// The upper half is read TARGET_SIMD_WIDTH floats past p.
+static SIMDINLINE Float SIMDCALL load_ps(float const *p)
+{
+    float const *const pUpper = p + TARGET_SIMD_WIDTH;
+    auto const lo = SIMD256T::load_ps(p);
+    auto const hi = SIMD256T::load_ps(pUpper);
+    return Float{ lo, hi };
+}
+
+// return *p
+//
+// Each 256-bit half of *p is loaded through SIMD256T::load_si.
+static SIMDINLINE Integer SIMDCALL load_si(Integer const *p)
+{
+    auto const lo = SIMD256T::load_si(&p->v8[0]);
+    auto const hi = SIMD256T::load_si(&p->v8[1]);
+    return Integer{ lo, hi };
+}
+
+// return *p    (same as load_ps but allows for unaligned mem)
+//
+// The upper half is read TARGET_SIMD_WIDTH floats past p.
+static SIMDINLINE Float SIMDCALL loadu_ps(float const *p)
+{
+    float const *const pUpper = p + TARGET_SIMD_WIDTH;
+    auto const lo = SIMD256T::loadu_ps(p);
+    auto const hi = SIMD256T::loadu_ps(pUpper);
+    return Float{ lo, hi };
+}
+
+// return *p    (same as load_si but allows for unaligned mem)
+//
+// Each 256-bit half of *p is loaded through SIMD256T::loadu_si.
+static SIMDINLINE Integer SIMDCALL loadu_si(Integer const *p)
+{
+    auto const lo = SIMD256T::loadu_si(&p->v8[0]);
+    auto const hi = SIMD256T::loadu_si(&p->v8[1]);
+    return Integer{ lo, hi };
+}
+
+// for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+//
+// Both halves gather from the same base pointer; old, idx and mask are each
+// split into their matching 256-bit halves.
+template<ScaleFactor ScaleT>
+static SIMDINLINE Float SIMDCALL mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask)
+{
+    auto const lo = SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[0], p, idx.v8[0], mask.v8[0]);
+    auto const hi = SIMD256T::template mask_i32gather_ps<ScaleT>(old.v8[1], p, idx.v8[1], mask.v8[1]);
+    return Float{ lo, hi };
+}
+
+// Masked store of src to p, issued as two SIMD256T::maskstore_ps calls:
+// the lower half at p, the upper half TARGET_SIMD_WIDTH floats past p.
+static SIMDINLINE void SIMDCALL maskstore_ps(float *p, Integer mask, Float src)
+{
+    float *const pUpper = p + TARGET_SIMD_WIDTH;
+
+    SIMD256T::maskstore_ps(p, mask.v8[0], src.v8[0]);
+    SIMD256T::maskstore_ps(pUpper, mask.v8[1], src.v8[1]);
+}
+
+// Combines the SIMD256T::movemask_epi8 results of both halves into one
+// 64-bit mask: lower half in the low bits, upper half shifted up by
+// TARGET_SIMD_WIDTH * 4 bits.
+static SIMDINLINE uint64_t SIMDCALL movemask_epi8(Integer a)
+{
+    uint64_t const loBits = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[0]));
+    uint64_t const hiBits = static_cast<uint64_t>(SIMD256T::movemask_epi8(a.v8[1]));
+
+    return loBits | (hiBits << (TARGET_SIMD_WIDTH * 4));
+}
+
+// Combines the SIMD256T::movemask_pd results of both halves into one mask:
+// lower half in the low bits, upper half shifted up by
+// TARGET_SIMD_WIDTH / 2 bits.
+static SIMDINLINE uint32_t SIMDCALL movemask_pd(Double a)
+{
+    uint32_t const loBits = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[0]));
+    uint32_t const hiBits = static_cast<uint32_t>(SIMD256T::movemask_pd(a.v8[1]));
+
+    return loBits | (hiBits << (TARGET_SIMD_WIDTH / 2));
+}
+// Combines the SIMD256T::movemask_ps results of both halves into one mask:
+// lower half in the low bits, upper half shifted up by TARGET_SIMD_WIDTH
+// bits.
+static SIMDINLINE uint32_t SIMDCALL movemask_ps(Float a)
+{
+    uint32_t const loBits = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[0]));
+    uint32_t const hiBits = static_cast<uint32_t>(SIMD256T::movemask_ps(a.v8[1]));
+
+    return loBits | (hiBits << TARGET_SIMD_WIDTH);
+}
+
+// return i (all elements are same value)
+static SIMDINLINE Integer SIMDCALL set1_epi32(int i)
+{
+    // One 256-bit splat is used for both halves.
+    auto const half = SIMD256T::set1_epi32(i);
+    return Integer{ half, half };
+}
+
+// return i (all elements are same value)
+static SIMDINLINE Integer SIMDCALL set1_epi8(char i)
+{
+    // One 256-bit splat is used for both halves.
+    auto const half = SIMD256T::set1_epi8(i);
+    return Integer{ half, half };
+}
+
+// return f (all elements are same value)
+static SIMDINLINE Float SIMDCALL set1_ps(float f)
+{
+    // One 256-bit splat is used for both halves.
+    auto const half = SIMD256T::set1_ps(f);
+    return Float{ half, half };
+}
+
+// return 0 (float)
+static SIMDINLINE Float SIMDCALL setzero_ps()
+{
+    // One zeroed 256-bit vector is used for both halves.
+    auto const zeroHalf = SIMD256T::setzero_ps();
+    return Float{ zeroHalf, zeroHalf };
+}
+
+// return 0 (integer)
+static SIMDINLINE Integer SIMDCALL setzero_si()
+{
+    // One zeroed 256-bit vector is used for both halves.
+    auto const zeroHalf = SIMD256T::setzero_si();
+    return Integer{ zeroHalf, zeroHalf };
+}
+
+// *p = a   (stores all elements contiguously in memory)
+//
+// The upper half is written TARGET_SIMD_WIDTH floats past p.
+static SIMDINLINE void SIMDCALL store_ps(float *p, Float a)
+{
+    float *const pUpper = p + TARGET_SIMD_WIDTH;
+
+    SIMD256T::store_ps(p, a.v8[0]);
+    SIMD256T::store_ps(pUpper, a.v8[1]);
+}
+
+// *p = a
+//
+// Each 256-bit half is written through SIMD256T::store_si.
+static SIMDINLINE void SIMDCALL store_si(Integer *p, Integer a)
+{
+    auto *const pLower = &p->v8[0];
+    auto *const pUpper = &p->v8[1];
+
+    SIMD256T::store_si(pLower, a.v8[0]);
+    SIMD256T::store_si(pUpper, a.v8[1]);
+}
+
+// *p = a   (same as store_ps, but doesn't keep memory in cache)
+//
+// The upper half is written TARGET_SIMD_WIDTH floats past p.
+static SIMDINLINE void SIMDCALL stream_ps(float *p, Float a)
+{
+    float *const pUpper = p + TARGET_SIMD_WIDTH;
+
+    SIMD256T::stream_ps(p, a.v8[0]);
+    SIMD256T::stream_ps(pUpper, a.v8[1]);
+}
+
+// Builds a vector from 16 int32 values, listed from the highest lane (i15)
+// down to the lowest (i0).  i7..i0 form the lower 256-bit half and i15..i8
+// the upper half.
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i15, int i14, int i13, int i12, int i11, int i10, int i9, int i8,
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    auto const lo = SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
+    auto const hi = SIMD256T::set_epi32(i15, i14, i13, i12, i11, i10, i9, i8);
+    return Integer{ lo, hi };
+}
+
+// 8-value overload: i7..i0 fill the lower 256-bit half and the upper half
+// is set to zero.
+static SIMDINLINE Integer SIMDCALL set_epi32(
+    int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0)
+{
+    auto const lo = SIMD256T::set_epi32(i7, i6, i5, i4, i3, i2, i1, i0);
+    auto const hi = SIMD256T::set_epi32(0, 0, 0, 0, 0, 0, 0, 0);
+    return Integer{ lo, hi };
+}
+
+// Builds a vector from 16 float values, listed from the highest lane (i15)
+// down to the lowest (i0).  i7..i0 form the lower 256-bit half and i15..i8
+// the upper half.
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i15, float i14, float i13, float i12, float i11, float i10, float i9, float i8,
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    auto const lo = SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
+    auto const hi = SIMD256T::set_ps(i15, i14, i13, i12, i11, i10, i9, i8);
+    return Float{ lo, hi };
+}
+
+// 8-value overload: i7..i0 fill the lower 256-bit half and the upper half
+// is set to zero.
+static SIMDINLINE Float SIMDCALL set_ps(
+    float i7, float i6, float i5, float i4, float i3, float i2, float i1, float i0)
+{
+    auto const lo = SIMD256T::set_ps(i7, i6, i5, i4, i3, i2, i1, i0);
+    auto const hi = SIMD256T::set_ps(0, 0, 0, 0, 0, 0, 0, 0);
+    return Float{ lo, hi };
+}
+
+// Expands the low 16 bits of a scalar mask into a per-lane vector mask:
+// lane i of the result is the cmplt_epi32 "true" value when bit i of mask
+// is set, and zero otherwise.
+static SIMDINLINE Float SIMDCALL vmask_ps(int32_t mask)
+{
+    // Lane i holds the single bit (1 << i).
+    const Integer laneBit = set_epi32(
+        0x8000, 0x4000, 0x2000, 0x1000, 0x0800, 0x0400, 0x0200, 0x0100,
+        0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01);
+
+    // Keep only each lane's own bit of the broadcast mask, then test 0 < bit.
+    Integer const isolated = and_si(set1_epi32(mask), laneBit);
+    Integer const selected = cmplt_epi32(setzero_si(), isolated);
+    return castsi_ps(selected);
+}
+
+// Undefine every helper macro defined at the top of this file so that none
+// of them leaks into whatever is included next.
+#undef SIMD_WRAPPER_1
+#undef SIMD_WRAPPER_2
+#undef SIMD_WRAPPER_2I
+#undef SIMD_WRAPPER_2I_1
+#undef SIMD_WRAPPER_3
+#undef SIMD_IWRAPPER_1
+#undef SIMD_IWRAPPER_2
+#undef SIMD_IWRAPPER_2I
+#undef SIMD_IWRAPPER_2I_1
+#undef SIMD_IWRAPPER_2I_2
+#undef SIMD_IWRAPPER_3
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
new file mode 100644
index 00000000000..bc5bff477a4
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_512_emu_masks.inl
@@ -0,0 +1,28 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#if !defined(__SIMD_LIB_AVX_HPP__)
+#error Do not include this file directly, use "simdlib.hpp" instead.
+#endif
+
+// no backwards compatibility for simd mask-enabled functions
+
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
new file mode 100644
index 00000000000..df2df1b09cd
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_interface.hpp
@@ -0,0 +1,428 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+#if 0
+//===========================================================================
+// Placeholder name representing either SIMD4, SIMD256, or SIMD16 structures.
+//===========================================================================
+struct SIMD256 // or SIMD4 or SIMD16
+{
+    //=======================================================================
+    // SIMD Types
+    //
+    // These typedefs are examples. The SIMD4 and SIMD16 implementations will
+    // use different base types with this same naming.
+    using Float     = __m256;  // Packed single-precision float vector
+    using Double    = __m256d; // Packed double-precision float vector
+    using Integer   = __m256i; // Packed integer vector (mutable element widths)
+    using Mask      = uint8_t; // Integer representing mask bits
+
+    //=======================================================================
+    // Standard interface
+    // (available in both SIMD256 and SIMD16 widths)
+    //=======================================================================
+
+    //-----------------------------------------------------------------------
+    // Single precision floating point arithmetic operations
+    //-----------------------------------------------------------------------
+    static Float    add_ps(Float a, Float b);               // return a + b
+    static Float    div_ps(Float a, Float b);               // return a / b
+    static Float    fmadd_ps(Float a, Float b, Float c);    // return (a * b) + c
+    static Float    fmsub_ps(Float a, Float b, Float c);    // return (a * b) - c
+    static Float    max_ps(Float a, Float b);               // return (a > b) ? a : b
+    static Float    min_ps(Float a, Float b);               // return (a < b) ? a : b
+    static Float    mul_ps(Float a, Float b);               // return a * b
+    static Float    rcp_ps(Float a);                        // return 1.0f / a
+    static Float    rsqrt_ps(Float a);                      // return 1.0f / sqrt(a)
+    static Float    sub_ps(Float a, Float b);               // return a - b
+
+    enum class RoundMode
+    {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer (ties round to the nearest even value)
+        TO_NEG_INF      = 0x01, // Round to negative infinity
+        TO_POS_INF      = 0x02, // Round to positive infinity
+        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
+
+        RAISE_EXC       = 0x00, // Raise exception on overflow
+        NO_EXC          = 0x08, // Suppress exceptions
+
+        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC),
+        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
+        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
+        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
+        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
+        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
+        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+    };
+
+    // return round_func(a)
+    //
+    // round_func is chosen on the RMT template parameter.  See the documentation
+    // for the RoundMode enumeration above.
+    template <RoundMode RMT>
+    static Float    round_ps(Float a);                  // return round(a)
+
+
+    //-----------------------------------------------------------------------
+    // Integer (various width) arithmetic operations
+    //-----------------------------------------------------------------------
+    static Integer  abs_epi32(Integer a);               // return absolute_value(a) (int32)
+    static Integer  add_epi32(Integer a, Integer b);    // return a + b (int32)
+    static Integer  add_epi8(Integer a, Integer b);     // return a + b (int8)
+    static Integer  adds_epu8(Integer a, Integer b);    // return ((a + b) > 0xff) ? 0xff : (a + b) (uint8)
+    static Integer  max_epi32(Integer a, Integer b);    // return (a > b) ? a : b (int32)
+    static Integer  max_epu32(Integer a, Integer b);    // return (a > b) ? a : b (uint32)
+    static Integer  min_epi32(Integer a, Integer b);    // return (a < b) ? a : b (int32)
+    static Integer  min_epu32(Integer a, Integer b);    // return (a < b) ? a : b (uint32)
+    static Integer  mul_epi32(Integer a, Integer b);    // return a * b (int32)
+
+    // return (a * b) & 0xFFFFFFFF
+    //
+    // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit integers,
+    // and store the low 32 bits of the intermediate integers in dst.
+    static Integer  mullo_epi32(Integer a, Integer b);
+
+    static Integer  sub_epi32(Integer a, Integer b);    // return a - b (int32)
+    static Integer  sub_epi64(Integer a, Integer b);    // return a - b (int64)
+    static Integer  subs_epu8(Integer a, Integer b);    // return (b > a) ? 0 : (a - b) (uint8)
+
+    //-----------------------------------------------------------------------
+    // Logical operations
+    //-----------------------------------------------------------------------
+    static Float    and_ps(Float a, Float b);           // return a & b       (float treated as int)
+    static Integer  and_si(Integer a, Integer b);       // return a & b       (int)
+    static Float    andnot_ps(Float a, Float b);        // return (~a) & b    (float treated as int)
+    static Integer  andnot_si(Integer a, Integer b);    // return (~a) & b    (int)
+    static Float    or_ps(Float a, Float b);            // return a | b       (float treated as int)
+    static Integer  or_si(Integer a, Integer b);        // return a | b       (int)
+    static Float    xor_ps(Float a, Float b);           // return a ^ b       (float treated as int)
+    static Integer  xor_si(Integer a, Integer b);       // return a ^ b       (int)
+
+    //-----------------------------------------------------------------------
+    // Shift operations
+    //-----------------------------------------------------------------------
+    template<int ImmT>
+    static Integer  slli_epi32(Integer a);              // return a << ImmT
+    static Integer  sllv_epi32(Integer a, Integer b);   // return a << b
+    template<int ImmT>
+    static Integer  srai_epi32(Integer a);              // return a >> ImmT   (int32)
+    template<int ImmT>
+    static Integer  srli_epi32(Integer a);              // return a >> ImmT   (uint32)
+    template<int ImmT>                                  // for each 128-bit lane:
+    static Integer  srli_si(Integer a);                 //  return a >> (ImmT*8) (uint)
+    template<int ImmT>
+    static Float    srlisi_ps(Float a);                 // same as srli_si, but with Float cast to int
+    static Integer  srlv_epi32(Integer a, Integer b);   // return a >> b      (uint32)
+
+    //-----------------------------------------------------------------------
+    // Conversion operations
+    //-----------------------------------------------------------------------
+    static Float    castpd_ps(Double a);                // return *(Float*)(&a)
+    static Integer  castps_si(Float a);                 // return *(Integer*)(&a)
+    static Double   castsi_pd(Integer a);               // return *(Double*)(&a)
+    static Double   castps_pd(Float a);                 // return *(Double*)(&a)
+    static Float    castsi_ps(Integer a);               // return *(Float*)(&a)
+    static Float    cvtepi32_ps(Integer a);             // return (float)a    (int32 --> float)
+    static Integer  cvtepu8_epi16(Integer a);           // return (int16)a    (uint8 --> int16)
+    static Integer  cvtepu8_epi32(Integer a);           // return (int32)a    (uint8 --> int32)
+    static Integer  cvtepu16_epi32(Integer a);          // return (int32)a    (uint16 --> int32)
+    static Integer  cvtepu16_epi64(Integer a);          // return (int64)a    (uint16 --> int64)
+    static Integer  cvtepu32_epi64(Integer a);          // return (int64)a    (uint32 --> int64)
+    static Integer  cvtps_epi32(Float a);               // return (int32)a    (float --> int32)
+    static Integer  cvttps_epi32(Float a);              // return (int32)a    (rnd_to_zero(float) --> int32)
+
+    //-----------------------------------------------------------------------
+    // Comparison operations
+    //-----------------------------------------------------------------------
+
+    // Comparison types used with cmp_ps:
+    //   - ordered comparisons are always false if either operand is NaN
+    //   - unordered comparisons are always true if either operand is NaN
+    //   - signaling comparisons raise an exception if either operand is NaN
+    //   - non-signaling comparisons will never raise an exception
+    //
+    // Ordered:     return (a != NaN) && (b != NaN) && (a cmp b)
+    // Unordered:   return (a == NaN) || (b == NaN) || (a cmp b)
+    enum class CompareType
+    {
+        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS      = 0x01, // Less-than (ordered, signaling)
+        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q    = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q      = 0x07, // Ordered (nonsignaling)
+        EQ_UQ      = 0x08, // Equal (unordered, non-signaling)
+        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ     = 0x0C, // Not-equal (ordered, non-signaling)
+        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ    = 0x0F, // True (unordered, non-signaling)
+        EQ_OS      = 0x10, // Equal (ordered, signaling)
+        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S    = 0x13, // Unordered (signaling)
+        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S      = 0x17, // Ordered (signaling)
+        EQ_US      = 0x18, // Equal (unordered, signaling)
+        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS   = 0x1B, // False (ordered, signaling)
+        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US    = 0x1F, // True (unordered, signaling)
+    };
+
+    // return a (CmpTypeT) b (float)
+    //
+    // See documentation for CompareType above for valid values for CmpTypeT.
+    template<CompareType CmpTypeT>
+    static Float    cmp_ps(Float a, Float b);           // return a (CmpTypeT) b (see above)
+    static Float    cmpgt_ps(Float a, Float b);         // return cmp_ps<CompareType::GT_OQ>(a, b)
+    static Float    cmple_ps(Float a, Float b);         // return cmp_ps<CompareType::LE_OQ>(a, b)
+    static Float    cmplt_ps(Float a, Float b);         // return cmp_ps<CompareType::LT_OQ>(a, b)
+    static Float    cmpneq_ps(Float a, Float b);        // return cmp_ps<CompareType::NEQ_OQ>(a, b)
+    static Float    cmpeq_ps(Float a, Float b);         // return cmp_ps<CompareType::EQ_OQ>(a, b)
+    static Float    cmpge_ps(Float a, Float b);         // return cmp_ps<CompareType::GE_OQ>(a, b)
+    static Integer  cmpeq_epi8(Integer a, Integer b);   // return a == b (int8)
+    static Integer  cmpeq_epi16(Integer a, Integer b);  // return a == b (int16)
+    static Integer  cmpeq_epi32(Integer a, Integer b);  // return a == b (int32)
+    static Integer  cmpeq_epi64(Integer a, Integer b);  // return a == b (int64)
+    static Integer  cmpgt_epi8(Integer a, Integer b);   // return a > b (int8)
+    static Integer  cmpgt_epi16(Integer a, Integer b);  // return a > b (int16)
+    static Integer  cmpgt_epi32(Integer a, Integer b);  // return a > b (int32)
+    static Integer  cmpgt_epi64(Integer a, Integer b);  // return a > b (int64)
+    static Integer  cmplt_epi32(Integer a, Integer b);  // return a < b (int32)
+    static bool     testz_ps(Float a, Float b);         // return all_lanes_zero(a & b) ? 1 : 0 (float)
+    static bool     testz_si(Integer a, Integer b);     // return all_lanes_zero(a & b) ? 1 : 0 (int)
+
+    //-----------------------------------------------------------------------
+    // Blend / shuffle / permute operations
+    //-----------------------------------------------------------------------
+    template<int ImmT>
+    static Float    blend_ps(Float a, Float b);                     // return ImmT ? b : a  (float)
+    static Integer  blendv_epi32(Integer a, Integer b, Float mask); // return mask ? b : a (int)
+    static Float    blendv_ps(Float a, Float b, Float mask);        // return mask ? b : a (float)
+    static Float    broadcast_ss(float const *p);                   // return *p (all elements in vector get same value)
+    static Integer  packs_epi16(Integer a, Integer b);              // See documentation for _mm256_packs_epi16 and _mm512_packs_epi16
+    static Integer  packs_epi32(Integer a, Integer b);              // See documentation for _mm256_packs_epi32 and _mm512_packs_epi32
+    static Integer  packus_epi16(Integer a, Integer b);             // See documentation for _mm256_packus_epi16 and _mm512_packus_epi16
+    static Integer  packus_epi32(Integer a, Integer b);             // See documentation for _mm256_packus_epi32 and _mm512_packus_epi32
+    static Integer  permute_epi32(Integer a, Integer swiz);         // return a[swiz[i]] for each 32-bit lane i (int32)
+    static Float    permute_ps(Float a, Integer swiz);              // return a[swiz[i]] for each 32-bit lane i (float)
+    template<int SwizT>
+    static Integer  shuffle_epi32(Integer a, Integer b);
+    template<int SwizT>
+    static Integer  shuffle_epi64(Integer a, Integer b);
+    static Integer  shuffle_epi8(Integer a, Integer b);
+    template<int SwizT>
+    static Double   shuffle_pd(Double a, Double b);
+    template<int SwizT>
+    static Float    shuffle_ps(Float a, Float b);
+    static Integer  unpackhi_epi16(Integer a, Integer b);
+    static Integer  unpackhi_epi32(Integer a, Integer b);
+    static Integer  unpackhi_epi64(Integer a, Integer b);
+    static Integer  unpackhi_epi8(Integer a, Integer b);
+    static Double   unpackhi_pd(Double a, Double b);
+    static Float    unpackhi_ps(Float a, Float b);
+    static Integer  unpacklo_epi16(Integer a, Integer b);
+    static Integer  unpacklo_epi32(Integer a, Integer b);
+    static Integer  unpacklo_epi64(Integer a, Integer b);
+    static Integer  unpacklo_epi8(Integer a, Integer b);
+    static Double   unpacklo_pd(Double a, Double b);
+    static Float    unpacklo_ps(Float a, Float b);
+
+    //-----------------------------------------------------------------------
+    // Load / store operations
+    //-----------------------------------------------------------------------
+    enum class ScaleFactor
+    {
+        SF_1,   // No scaling
+        SF_2,   // Scale offset by 2
+        SF_4,   // Scale offset by 4
+        SF_8,   // Scale offset by 8
+    };
+
+    template<ScaleFactor ScaleT>
+    static Float    i32gather_ps(float const* p, Integer idx);  // return *(float*)(((int8*)p) + (idx * ScaleT))
+    static Float    load1_ps(float const *p);                   // return *p    (broadcast 1 value to all elements)
+    static Float    load_ps(float const *p);                    // return *p    (loads SIMD width elements from memory)
+    static Integer  load_si(Integer const *p);                  // return *p
+    static Float    loadu_ps(float const *p);                   // return *p    (same as load_ps but allows for unaligned mem)
+    static Integer  loadu_si(Integer const *p);                 // return *p    (same as load_si but allows for unaligned mem)
+
+    // for each element: (mask & (1 << 31)) ? (i32gather_ps<ScaleT>(p, idx), mask = 0) : old
+    template<ScaleFactor ScaleT>
+    static Float    mask_i32gather_ps(Float old, float const* p, Integer idx, Float mask);
+
+    static void     maskstore_ps(float *p, Integer mask, Float src);
+    static int      movemask_epi8(Integer a);
+    static int      movemask_pd(Double a);
+    static int      movemask_ps(Float a);
+    static Integer  set1_epi32(int i);                          // return i (all elements are same value)
+    static Integer  set1_epi8(char i);                          // return i (all elements are same value)
+    static Float    set1_ps(float f);                           // return f (all elements are same value)
+    static Float    setzero_ps();                               // return 0 (float)
+    static Integer  setzero_si();                               // return 0 (integer)
+    static void     store_ps(float *p, Float a);                // *p = a   (stores all elements contiguously in memory)
+    static void     store_si(Integer *p, Integer a);            // *p = a
+    static void     stream_ps(float *p, Float a);               // *p = a   (same as store_ps, but doesn't keep memory in cache)
+
+    //=======================================================================
+    // Legacy interface (available only in SIMD256 width)
+    //=======================================================================
+
+    static Float    broadcast_ps(__m128 const *p);
+    template<int ImmT>
+    static __m128d  extractf128_pd(Double a);
+    template<int ImmT>
+    static __m128   extractf128_ps(Float a);
+    template<int ImmT>
+    static __m128i  extractf128_si(Integer a);
+    template<int ImmT>
+    static Double   insertf128_pd(Double a, __m128d b);
+    template<int ImmT>
+    static Float    insertf128_ps(Float a, __m128 b);
+    template<int ImmT>
+    static Integer  insertf128_si(Integer a, __m128i b);
+    static Integer  loadu2_si(__m128i const* phi, __m128i const* plo);
+    template<int ImmT>
+    static Double   permute2f128_pd(Double a, Double b);
+    template<int ImmT>
+    static Float    permute2f128_ps(Float a, Float b);
+    template<int ImmT>
+    static Integer  permute2f128_si(Integer a, Integer b);
+    static Integer  set_epi32(int i7, int i6, int i5, int i4, int i3, int i2, int i1, int i0);
+    static void     storeu2_si(__m128i *phi, __m128i *plo, Integer src);
+
+    //=======================================================================
+    // Advanced masking interface (currently available only in SIMD16 width)
+    //=======================================================================
+
+
+    //=======================================================================
+    // Extended Utility Functions (common to SIMD256 and SIMD16)
+    //=======================================================================
+
+    //-----------------------------------------------------------------------
+    // Extended Types
+    //-----------------------------------------------------------------------
+
+    // Vec4, an SOA SIMD set of 4-dimensional vectors
+    union Vec4
+    {
+        Vec4() = default;
+        Vec4(Float in)                  // broadcast: x = y = z = w = in
+        {
+            s.x = in;
+            s.y = in;
+            s.z = in;
+            s.w = in;
+        }
+        Vec4(Float x, Float y, Float z, Float w)
+        {
+            s.x = x;
+            s.y = y;
+            s.z = z;
+            s.w = w;
+        }
+
+        Float      v[4];                // components by index; shares storage with s.x .. s.w
+        Integer    vi[4];               // integer view of the same storage
+        struct
+        {
+            Float  x;
+            Float  y;
+            Float  z;
+            Float  w;
+        } s;
+        Float& operator[] (const int i) { return v[i]; }                // indexed component access into v
+        Float const & operator[] (const int i) const { return v[i]; }
+    };
+
+    //-----------------------------------------------------------------------
+    // Extended Functions
+    //-----------------------------------------------------------------------
+    static void     vec4_set1_ps(Vec4& r, const float *p);                  // r[0] = set1(p[0]), r[1] = set1(p[1]), ...
+    static void     vec4_set1_vps(Vec4& r, Float s);                        // r[0] = s, r[1] = s, ...
+    static Float    vec4_dp3_ps(const Vec4& v0, const Vec4& v1);            // return dp3(v0, v1)
+    static Float    vec4_dp4_ps(const Vec4& v0, const Vec4& v1);            // return dp4(v0, v1)
+    static Float    vec4_rcp_length_ps(const Vec4& v);                      // return 1.0f / sqrt(dp4(v, v))
+    static void     vec4_normalize_ps(Vec4& r, const Vec4& v);              // r = v * rcp_length(v)
+    static void     vec4_mul_ps(Vec4& r, const Vec4& v, Float s);           // r = v * set1_vps(s)
+    static void     vec4_mul_ps(Vec4& r, const Vec4& v0, const Vec4& v1);   // r = v0 * v1
+    static void     vec4_add_ps(Vec4& r, const Vec4& v0, const Vec4& v1);   // r = v0 + v1
+    static void     vec4_min_ps(Vec4& r, const Vec4& v0, Float s);          // r = (v0 < s) ? v0 : s
+    static void     vec4_max_ps(Vec4& r, const Vec4& v0, Float s);          // r = (v0 > s) ? v0 : s
+
+    // Matrix4x4 * Vector4
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * v.s.w)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * v.s.w)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * v.s.w)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * v.s.w)
+    static void mat4x4_vec4_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x4 * Vector3 - Direction Vector where w = 0.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 0)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 0)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 0)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 0)
+    static void mat3x3_vec3_w0_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x4 * Vector3 - Position vector where w = 1.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+    //   result.s.w = (m30 * v.s.x) + (m31 * v.s.y) + (m32 * v.s.z) + (m33 * 1)
+    static void mat4x4_vec3_w1_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+
+    // Matrix4x3 * Vector3 - Position vector where w = 1.
+    //   result.s.x = (m00 * v.s.x) + (m01 * v.s.y) + (m02 * v.s.z) + (m03 * 1)
+    //   result.s.y = (m10 * v.s.x) + (m11 * v.s.y) + (m12 * v.s.z) + (m13 * 1)
+    //   result.s.z = (m20 * v.s.x) + (m21 * v.s.y) + (m22 * v.s.z) + (m23 * 1)
+    //   result.s.w = 1
+    static void mat4x3_vec3_w1_multiply(
+            Vec4& result,
+            const float *pMatrix,
+            const Vec4& v);
+};
+#endif // #if 0
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
new file mode 100644
index 00000000000..1964ef47027
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/common/simdlib_types.hpp
@@ -0,0 +1,377 @@
+/****************************************************************************
+* Copyright (C) 2017 Intel Corporation.   All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+****************************************************************************/
+#pragma once
+
+#if !defined(__cplusplus)
+#error C++ compilation required
+#endif
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdint.h>
+
+#define SIMD_ARCH_AVX       0   // Ordered architecture levels; SIMD_ARCH is compared against these with >=
+#define SIMD_ARCH_AVX2      1
+#define SIMD_ARCH_AVX512    2
+
+#if !defined(SIMD_ARCH)         // The includer may pre-define SIMD_ARCH; otherwise default to the AVX level
+#define SIMD_ARCH SIMD_ARCH_AVX
+#endif
+
+#if defined(_MSC_VER)           // MSVC spellings of the calling-convention / inline / alignment helpers
+#define SIMDCALL __vectorcall
+#define SIMDINLINE __forceinline
+#define SIMDALIGN(type_, align_) __declspec(align(align_)) type_
+#else                           // Other compilers: GNU-style aligned attribute, SIMDCALL expands to nothing
+#define SIMDCALL
+#define SIMDINLINE inline
+#define SIMDALIGN(type_, align_) type_ __attribute__((aligned(align_)))
+#endif
+
+// For documentation, please see the following include...
+// #include "simdlib_interface.hpp"
+
+namespace SIMDImpl
+{
+    enum class CompareType // Floating-point comparison predicates (ordered/unordered, signaling/nonsignaling)
+    {   // NOTE(review): the 0x00-0x1F values look like the AVX _CMP_* immediates - confirm before renumbering
+        EQ_OQ      = 0x00, // Equal (ordered, nonsignaling)
+        LT_OS      = 0x01, // Less-than (ordered, signaling)
+        LE_OS      = 0x02, // Less-than-or-equal (ordered, signaling)
+        UNORD_Q    = 0x03, // Unordered (nonsignaling)
+        NEQ_UQ     = 0x04, // Not-equal (unordered, nonsignaling)
+        NLT_US     = 0x05, // Not-less-than (unordered, signaling)
+        NLE_US     = 0x06, // Not-less-than-or-equal (unordered, signaling)
+        ORD_Q      = 0x07, // Ordered (nonsignaling)
+        EQ_UQ      = 0x08, // Equal (unordered, nonsignaling)
+        NGE_US     = 0x09, // Not-greater-than-or-equal (unordered, signaling)
+        NGT_US     = 0x0A, // Not-greater-than (unordered, signaling)
+        FALSE_OQ   = 0x0B, // False (ordered, nonsignaling)
+        NEQ_OQ     = 0x0C, // Not-equal (ordered, nonsignaling)
+        GE_OS      = 0x0D, // Greater-than-or-equal (ordered, signaling)
+        GT_OS      = 0x0E, // Greater-than (ordered, signaling)
+        TRUE_UQ    = 0x0F, // True (unordered, nonsignaling)
+        EQ_OS      = 0x10, // Equal (ordered, signaling)
+        LT_OQ      = 0x11, // Less-than (ordered, nonsignaling)
+        LE_OQ      = 0x12, // Less-than-or-equal (ordered, nonsignaling)
+        UNORD_S    = 0x13, // Unordered (signaling)
+        NEQ_US     = 0x14, // Not-equal (unordered, signaling)
+        NLT_UQ     = 0x15, // Not-less-than (unordered, nonsignaling)
+        NLE_UQ     = 0x16, // Not-less-than-or-equal (unordered, nonsignaling)
+        ORD_S      = 0x17, // Ordered (signaling)
+        EQ_US      = 0x18, // Equal (unordered, signaling)
+        NGE_UQ     = 0x19, // Not-greater-than-or-equal (unordered, nonsignaling)
+        NGT_UQ     = 0x1A, // Not-greater-than (unordered, nonsignaling)
+        FALSE_OS   = 0x1B, // False (ordered, signaling)
+        NEQ_OS     = 0x1C, // Not-equal (ordered, signaling)
+        GE_OQ      = 0x1D, // Greater-than-or-equal (ordered, nonsignaling)
+        GT_OQ      = 0x1E, // Greater-than (ordered, nonsignaling)
+        TRUE_US    = 0x1F, // True (unordered, signaling)
+    };
+
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+    enum class CompareTypeInt // Integer comparison predicates; only declared when building for AVX512
+    {   // Each enumerator takes its value directly from the matching _MM_CMPINT_* constant
+        EQ  = _MM_CMPINT_EQ,    // Equal
+        LT  = _MM_CMPINT_LT,    // Less than
+        LE  = _MM_CMPINT_LE,    // Less than or Equal
+        NE  = _MM_CMPINT_NE,    // Not Equal
+        GE  = _MM_CMPINT_GE,    // Greater than or Equal
+        GT  = _MM_CMPINT_GT,    // Greater than
+    };
+#endif // SIMD_ARCH >= SIMD_ARCH_AVX512
+
+    enum class ScaleFactor // Multiplier applied to an offset; each enumerator's value is the multiplier itself
+    {   // NOTE(review): presumably the address scale passed to gather-style loads - confirm against the users
+        SF_1 = 1,   // No scaling
+        SF_2 = 2,   // Scale offset by 2
+        SF_4 = 4,   // Scale offset by 4
+        SF_8 = 8,   // Scale offset by 8
+    };
+
+    enum class RoundMode // Rounding control: one direction value (0x00-0x04) OR'ed with an exception flag (0x00 / 0x08)
+    {
+        TO_NEAREST_INT  = 0x00, // Round to nearest integer; NOTE(review): x86 nearest mode breaks ties to even, which is not TRUNCATE(value + 0.5) - confirm
+        TO_NEG_INF      = 0x01, // Round to negative infinity
+        TO_POS_INF      = 0x02, // Round to positive infinity
+        TO_ZERO         = 0x03, // Round to 0 a.k.a. truncate
+        CUR_DIRECTION   = 0x04, // Round in direction set in MXCSR register
+        
+        RAISE_EXC       = 0x00, // Exceptions not suppressed (no flag bit set); NOTE(review): x86 ties this bit to the precision exception, not overflow - confirm
+        NO_EXC          = 0x08, // Suppress exceptions
+        
+        NINT            = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(RAISE_EXC), // Named combinations from here on: direction | exception flag
+        NINT_NOEXC      = static_cast<int>(TO_NEAREST_INT)  | static_cast<int>(NO_EXC),
+        FLOOR           = static_cast<int>(TO_NEG_INF)      | static_cast<int>(RAISE_EXC),
+        FLOOR_NOEXC     = static_cast<int>(TO_NEG_INF)      | static_cast<int>(NO_EXC),
+        CEIL            = static_cast<int>(TO_POS_INF)      | static_cast<int>(RAISE_EXC),
+        CEIL_NOEXC      = static_cast<int>(TO_POS_INF)      | static_cast<int>(NO_EXC),
+        TRUNC           = static_cast<int>(TO_ZERO)         | static_cast<int>(RAISE_EXC),
+        TRUNC_NOEXC     = static_cast<int>(TO_ZERO)         | static_cast<int>(NO_EXC),
+        RINT            = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(RAISE_EXC),
+        NEARBYINT       = static_cast<int>(CUR_DIRECTION)   | static_cast<int>(NO_EXC),
+    };
+
+    struct Traits // Re-exports the shared enum types as nested names of a single type
+    {   // NOTE(review): CompareTypeInt (AVX512 builds only) is not aliased here - confirm that is intended
+        using CompareType = SIMDImpl::CompareType;
+        using ScaleFactor = SIMDImpl::ScaleFactor;
+        using RoundMode   = SIMDImpl::RoundMode;
+    };
+
+    // Four-component attribute in SIMD SOA layout: every component (x, y, z, w)
+    // is a full SIMD value.  The same storage can be addressed as Float,
+    // Integer or Double elements through v / vi / vd.
+    template<typename Float, typename Integer, typename Double>
+    union Vec4
+    {
+        Float   v[4];
+        Integer vi[4];
+        Double  vd[4];
+        struct
+        {
+            Float  x;
+            Float  y;
+            Float  z;
+            Float  w;
+        };
+
+        // Indexed access into the Float view; the index is not range-checked.
+        SIMDINLINE Float& operator[] (const int component)
+        {
+            return v[component];
+        }
+
+        SIMDINLINE Float const & operator[] (const int component) const
+        {
+            return v[component];
+        }
+
+        // Element-wise copy of the four components through the Float view.
+        SIMDINLINE Vec4& operator=(Vec4 const & rhs)
+        {
+            for (int component = 0; component < 4; ++component)
+            {
+                v[component] = rhs.v[component];
+            }
+            return *this;
+        }
+    };
+
+    namespace SIMD128Impl
+    {
+        // Wrapper around __m128: implicitly constructible from, assignable
+        // from and convertible back to the raw intrinsic type.
+        union Float
+        {
+            SIMDINLINE Float() = default;
+
+            SIMDINLINE Float(__m128 raw)
+                : v(raw)
+            {
+            }
+
+            SIMDINLINE Float& operator=(__m128 raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Float& operator=(Float const & rhs)
+            {
+                v = rhs.v;
+                return *this;
+            }
+
+            SIMDINLINE operator __m128() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m128, 16) v;
+        };
+
+        // Wrapper around __m128i, same shape as Float.
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+
+            SIMDINLINE Integer(__m128i raw)
+                : v(raw)
+            {
+            }
+
+            SIMDINLINE Integer& operator=(__m128i raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Integer& operator=(Integer const & rhs)
+            {
+                v = rhs.v;
+                return *this;
+            }
+
+            SIMDINLINE operator __m128i() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m128i, 16) v;
+        };
+
+        // Wrapper around __m128d, same shape as Float.
+        union Double
+        {
+            SIMDINLINE Double() = default;
+
+            SIMDINLINE Double(__m128d raw)
+                : v(raw)
+            {
+            }
+
+            SIMDINLINE Double& operator=(__m128d raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Double& operator=(Double const & rhs)
+            {
+                v = rhs.v;
+                return *this;
+            }
+
+            SIMDINLINE operator __m128d() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m128d, 16) v;
+        };
+
+        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+        using Mask = uint8_t;
+
+        static const uint32_t SIMD_WIDTH = 4;
+    } // ns SIMD128Impl
+
+    namespace SIMD256Impl
+    {
+        // Wrapper around __m256 that can also be addressed as two
+        // SIMD128Impl::Float halves through v4.
+        union Float
+        {
+            SIMDINLINE Float() = default;
+
+            SIMDINLINE Float(__m256 raw)
+                : v(raw)
+            {
+            }
+
+            // Build from two 128-bit halves; the second half defaults to zero.
+            SIMDINLINE Float(SIMD128Impl::Float in_lo, SIMD128Impl::Float in_hi = _mm_setzero_ps())
+            {
+                __m256 const widened = _mm256_castps128_ps256(in_lo);
+                v = _mm256_insertf128_ps(widened, in_hi, 0x1);
+            }
+
+            SIMDINLINE Float& operator=(__m256 raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Float& operator=(Float const & rhs)
+            {
+                v = rhs.v;
+                return *this;
+            }
+
+            SIMDINLINE operator __m256() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m256, 32) v;
+            SIMD128Impl::Float v4[2];
+        };
+
+        // Wrapper around __m256i, same shape as Float.
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+
+            SIMDINLINE Integer(__m256i raw)
+                : v(raw)
+            {
+            }
+
+            // Build from two 128-bit halves; the second half defaults to zero.
+            SIMDINLINE Integer(SIMD128Impl::Integer in_lo, SIMD128Impl::Integer in_hi = _mm_setzero_si128())
+            {
+                __m256i const widened = _mm256_castsi128_si256(in_lo);
+                v = _mm256_insertf128_si256(widened, in_hi, 0x1);
+            }
+
+            SIMDINLINE Integer& operator=(__m256i raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Integer& operator=(Integer const & rhs)
+            {
+                v = rhs.v;
+                return *this;
+            }
+
+            SIMDINLINE operator __m256i() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m256i, 32) v;
+            SIMD128Impl::Integer v4[2];
+        };
+
+        // Wrapper around __m256d, same shape as Float.
+        union Double
+        {
+            SIMDINLINE Double() = default;
+
+            SIMDINLINE Double(__m256d raw)
+                : v(raw)
+            {
+            }
+
+            // Build from two 128-bit halves; the second half defaults to zero.
+            SIMDINLINE Double(SIMD128Impl::Double in_lo, SIMD128Impl::Double in_hi = _mm_setzero_pd())
+            {
+                __m256d const widened = _mm256_castpd128_pd256(in_lo);
+                v = _mm256_insertf128_pd(widened, in_hi, 0x1);
+            }
+
+            SIMDINLINE Double& operator=(__m256d raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Double& operator=(Double const & rhs)
+            {
+                v = rhs.v;
+                return *this;
+            }
+
+            SIMDINLINE operator __m256d() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m256d, 32) v;
+            SIMD128Impl::Double v4[2];
+        };
+
+        using Vec4 = SIMDImpl::Vec4<Float, Integer, Double>;
+        using Mask = uint8_t;
+
+        static const uint32_t SIMD_WIDTH = 8;
+    } // ns SIMD256Impl
+
+    namespace SIMD512Impl
+    {
+#if !defined(_MM_K0_REG)
+        // Stand-in AVX512 types, defined here when immintrin.h did not
+        // provide them.  Their data members exist ONLY to be viewed in a
+        // debugger.  Do NOT access them via code!
+        union __m512
+        {
+        private:
+            float m512_f32[16];
+        };
+
+        struct __m512d
+        {
+        private:
+            double m512d_f64[8];
+        };
+
+        union __m512i
+        {
+        private:
+            int8_t              m512i_i8[64];
+            int16_t             m512i_i16[32];
+            int32_t             m512i_i32[16];
+            int64_t             m512i_i64[8];
+            uint8_t             m512i_u8[64];
+            uint16_t            m512i_u16[32];
+            uint32_t            m512i_u32[16];
+            uint64_t            m512i_u64[8];
+        };
+
+        using __mmask16 = uint16_t;
+#endif
+
+        // Member alignment: 64 bytes for AVX512 builds, 32 bytes otherwise.
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+#define SIMD_ALIGNMENT_BYTES 64
+#else
+#define SIMD_ALIGNMENT_BYTES 32
+#endif
+
+        // Wrapper around __m512 that can also be addressed as two
+        // SIMD256Impl::Float halves through v8.
+        union Float
+        {
+            SIMDINLINE Float() = default;
+
+            SIMDINLINE Float(__m512 raw)
+                : v(raw)
+            {
+            }
+
+            // Build from two 256-bit halves; the second half defaults to zero.
+            SIMDINLINE Float(SIMD256Impl::Float in_lo, SIMD256Impl::Float in_hi = _mm256_setzero_ps())
+            {
+                v8[0] = in_lo;
+                v8[1] = in_hi;
+            }
+
+            SIMDINLINE Float& operator=(__m512 raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Float& operator=(Float const & rhs)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = rhs.v;
+#else
+                // Below the AVX512 level the copy goes through the two 256-bit halves.
+                v8[0] = rhs.v8[0];
+                v8[1] = rhs.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m512, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Float v8[2];
+        };
+
+        // Wrapper around __m512i, same shape as Float.
+        union Integer
+        {
+            SIMDINLINE Integer() = default;
+
+            SIMDINLINE Integer(__m512i raw)
+                : v(raw)
+            {
+            }
+
+            // Build from two 256-bit halves; the second half defaults to zero.
+            SIMDINLINE Integer(SIMD256Impl::Integer in_lo, SIMD256Impl::Integer in_hi = _mm256_setzero_si256())
+            {
+                v8[0] = in_lo;
+                v8[1] = in_hi;
+            }
+
+            SIMDINLINE Integer& operator=(__m512i raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Integer& operator=(Integer const & rhs)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = rhs.v;
+#else
+                // Below the AVX512 level the copy goes through the two 256-bit halves.
+                v8[0] = rhs.v8[0];
+                v8[1] = rhs.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512i() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m512i, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Integer v8[2];
+        };
+
+        // Wrapper around __m512d, same shape as Float.
+        union Double
+        {
+            SIMDINLINE Double() = default;
+
+            SIMDINLINE Double(__m512d raw)
+                : v(raw)
+            {
+            }
+
+            // Build from two 256-bit halves; the second half defaults to zero.
+            SIMDINLINE Double(SIMD256Impl::Double in_lo, SIMD256Impl::Double in_hi = _mm256_setzero_pd())
+            {
+                v8[0] = in_lo;
+                v8[1] = in_hi;
+            }
+
+            SIMDINLINE Double& operator=(__m512d raw)
+            {
+                v = raw;
+                return *this;
+            }
+
+            SIMDINLINE Double& operator=(Double const & rhs)
+            {
+#if SIMD_ARCH >= SIMD_ARCH_AVX512
+                v = rhs.v;
+#else
+                // Below the AVX512 level the copy goes through the two 256-bit halves.
+                v8[0] = rhs.v8[0];
+                v8[1] = rhs.v8[1];
+#endif
+                return *this;
+            }
+
+            SIMDINLINE operator __m512d() const
+            {
+                return v;
+            }
+
+            SIMDALIGN(__m512d, SIMD_ALIGNMENT_BYTES) v;
+            SIMD256Impl::Double v8[2];
+        };
+
+        typedef SIMDImpl::Vec4<Float, Integer, Double> SIMDALIGN(Vec4, 64);
+        using Mask = __mmask16;
+
+        static const uint32_t SIMD_WIDTH = 16;
+
+#undef SIMD_ALIGNMENT_BYTES
+    } // ns SIMD512Impl
+} // ns SIMDImpl
author	Tim Rowley <[email protected]>	2017-06-15 15:24:07 -0500
committer	Tim Rowley <[email protected]>	2017-06-30 13:26:19 -0500
commit	fc4f6c44c479a97b9cad5d08f0d9cd71a8e1e5f8 (patch)
tree	a8ea649f549dc856f402b0b5d9323c5cef080e34 /src/gallium/drivers/swr/rasterizer/common
parent	8b66d18a3b4f6d6a4f0ea9d71459dac68e5e0295 (diff)