diff options
author | Tim Rowley <[email protected]> | 2016-10-28 15:59:18 -0500 |
---|---|---|
committer | Tim Rowley <[email protected]> | 2016-11-14 09:00:59 -0600 |
commit | 937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017 (patch) | |
tree | 56d840da72f70b93adda6dd12fd20a7eca6f174f /src/gallium/drivers/swr/rasterizer/common | |
parent | f233bcda8930eea1f6fc0b830e4953485361a0e7 (diff) |
swr: [rasterizer core] 16-wide tile store nearly completed
* All format combinations coded
* Fully emulated on AVX2 and AVX
* Known issue: the MSAA sample locations need to be adjusted for 8x2
Set ENABLE_AVX512_SIMD16 and USD_8x2_TILE_BACKEND to 1 in knobs.h to enable
Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/common')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/common/simd16intrin.h | 61 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/common/simdintrin.h | 38 |
2 files changed, 68 insertions, 31 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index 56ecf5bfd3d..cf6a6b6883f 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -459,10 +459,10 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b) #define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b) #define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps) -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps) -SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps) -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps) +SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps) INLINE simd16scalar _simd16_castsi_ps(simd16scalari a) { @@ -509,21 +509,22 @@ INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) #define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32) INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) { @@ -579,13 +580,13 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a) #define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a) -SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps) -SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps) +SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps) +SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8) template <int imm8> INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a) @@ -600,13 +601,13 @@ INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a) #define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a) -SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8) +SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8) INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i) { @@ -631,8 +632,8 @@ INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i) return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i)); } -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32) template <int imm8> INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 10c0955fe40..e777b22ec1c 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -314,7 +314,15 @@ SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8) SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8) SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16) SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16) - +SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8) +SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8) +SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16) +SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16) + +#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8 +#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8 +#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16 +#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))) @@ -490,6 +498,10 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #define _simd_xor_si _mm256_xor_si256 #define _simd_castps_si _mm256_castps_si256 +#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8 +#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8 +#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16 +#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64 @@ -529,6 +541,14 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #endif +#define _simd_unpacklo_ps _mm256_unpacklo_ps +#define _simd_unpacklo_pd _mm256_unpacklo_pd +#define _simd_insertf128_ps _mm256_insertf128_ps +#define _simd_insertf128_pd _mm256_insertf128_pd +#define _simd_insertf128_si _mm256_insertf128_si256 +#define _simd_extractf128_ps _mm256_extractf128_ps +#define _simd_extractf128_pd _mm256_extractf128_pd +#define _simd_extractf128_si _mm256_extractf128_si256 #define _simd_permute2f128_ps _mm256_permute2f128_ps #define _simd_permute2f128_pd _mm256_permute2f128_pd #define _simd_permute2f128_si _mm256_permute2f128_si256 @@ -551,6 +571,22 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #define _simd_xor_ps _mm256_xor_ps INLINE +simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr) +{ + __m128i lo = _mm_loadu_si128(loaddr); + __m128i hi = _mm_loadu_si128(hiaddr); + + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +} + +INLINE +void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a) +{ + _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a)); + _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1)); +} + +INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask) { return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask)); |