diff options
author | Tim Rowley <[email protected]> | 2017-03-21 15:32:34 -0500 |
---|---|---|
committer | Tim Rowley <[email protected]> | 2017-03-28 11:24:21 -0500 |
commit | 4cd0b1bb2c284609d2ac3413456b29f1a3e42d10 (patch) | |
tree | 8b00a66edacd577c6ab213babacd620939a7ac3d /src/gallium/drivers | |
parent | ec51e8ecfea9d81313192fcd25f9767f8203a9ca (diff) |
swr: [rasterizer core] Enable SIMD16
Make the AVX512 insert/extract intrinsics KNL-compatible
Reviewed-by: George Kyriazis <[email protected]>
Diffstat (limited to 'src/gallium/drivers')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/common/simd16intrin.h | 14 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/knobs.h | 2 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 12 |
3 files changed, 20 insertions, 8 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index 88814a58aa9..3b43d510e68 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -853,10 +853,10 @@ INLINE simd16scalari _simd16_set_epi32(int e7, int e6, int e5, int e4, int e3, i #define _simd16_broadcast_ps(m) _mm512_extload_ps(m, _MM_UPCONV_PS_NONE, _MM_BROADCAST_4X16, 0) #define _simd16_store_ps _mm512_store_ps #define _simd16_store_si _mm512_store_si512 -#define _simd16_extract_ps _mm512_extractf32x8_ps -#define _simd16_extract_si _mm512_extracti32x8_epi32 -#define _simd16_insert_ps _mm512_insertf32x8 -#define _simd16_insert_si _mm512_inserti32x8 +#define _simd16_extract_ps(a, imm8) _mm256_castsi256_ps(_mm512_extracti64x4_epi64(_mm512_castps_si512(a), imm8)) +#define _simd16_extract_si _mm512_extracti64x4_epi64 +#define _simd16_insert_ps(a, b, imm8) _mm512_castsi512_ps(_mm512_inserti64x4(_mm512_castps_si512(a), _mm256_castps_si256(b), imm8)) +#define _simd16_insert_si _mm512_inserti64x4 INLINE void _simd16_maskstore_ps(float *m, simd16scalari mask, simd16scalar a) { @@ -871,21 +871,21 @@ INLINE simd16scalar _simd16_blendv_ps(simd16scalar a, simd16scalar b, const simd { simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask)); - _mm512_mask_blend_ps(k, a, b); + return _mm512_mask_blend_ps(k, a, b); } INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalar mask) { simd16mask k = _simd16_scalari2mask(_mm512_castps_si512(mask)); - _mm512_mask_blend_epi32(k, a, b); + return _mm512_mask_blend_epi32(k, a, b); } INLINE simd16scalari _simd16_blendv_epi32(simd16scalari a, simd16scalari b, const simd16scalari mask) { simd16mask k = _simd16_scalari2mask(mask); - _mm512_mask_blend_epi32(k, a, b); + return _mm512_mask_blend_epi32(k, a, b); } #define _simd16_mul_ps _mm512_mul_ps diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs.h b/src/gallium/drivers/swr/rasterizer/core/knobs.h index 8e54f90526b..7928f5d6d76 100644 --- a/src/gallium/drivers/swr/rasterizer/core/knobs.h +++ b/src/gallium/drivers/swr/rasterizer/core/knobs.h @@ -38,7 +38,7 @@ // AVX512 Support /////////////////////////////////////////////////////////////////////////////// -#define ENABLE_AVX512_SIMD16 0 +#define ENABLE_AVX512_SIMD16 1 #define USE_8x2_TILE_BACKEND 0 #define USE_SIMD16_FRONTEND 0 diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index 297f23a88ca..511a1fc0df3 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -1297,7 +1297,19 @@ bool PaTriFan0_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) bool PaTriFan1_simd16(PA_STATE_OPT& pa, uint32_t slot, simd16vector verts[]) { +#if USE_SIMD16_FRONTEND const simd16vector &a = pa.leadingVertex.attrib[slot]; +#else + simd16vector a; + + { + for (uint32_t i = 0; i < 4; i += 1) + { + a[i] = _simd16_insert_ps(_simd16_setzero_ps(), pa.leadingVertex.attrib[slot][i], 0); + } + } + +#endif const simd16vector &b = PaGetSimdVector_simd16(pa, pa.prev, slot); const simd16vector &c = PaGetSimdVector_simd16(pa, pa.cur, slot); |