aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/swr/rasterizer/common
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2016-10-28 15:59:18 -0500
committerTim Rowley <[email protected]>2016-11-14 09:00:59 -0600
commit937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017 (patch)
tree56d840da72f70b93adda6dd12fd20a7eca6f174f /src/gallium/drivers/swr/rasterizer/common
parentf233bcda8930eea1f6fc0b830e4953485361a0e7 (diff)
swr: [rasterizer core] 16-wide tile store nearly completed
* All format combinations coded * Fully emulated on AVX2 and AVX * Known issue: the MSAA sample locations need to be adjusted for 8x2 Set ENABLE_AVX512_SIMD16 and USD_8x2_TILE_BACKEND to 1 in knobs.h to enable Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/common')
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simd16intrin.h61
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h38
2 files changed, 68 insertions, 31 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index 56ecf5bfd3d..cf6a6b6883f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -459,10 +459,10 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps)
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
+SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
@@ -509,21 +509,22 @@ INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
@@ -579,13 +580,13 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps)
+SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
+SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
template <int imm8>
INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
@@ -600,13 +601,13 @@ INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a)
-SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8)
+SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
@@ -631,8 +632,8 @@ INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 10c0955fe40..e777b22ec1c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -314,7 +314,15 @@ SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
-
+SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
+SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
+SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
+SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
+
+#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
+#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
+#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
+#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
@@ -490,6 +498,10 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_xor_si _mm256_xor_si256
#define _simd_castps_si _mm256_castps_si256
+#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
+#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
+#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
+#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
@@ -529,6 +541,14 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#endif
+#define _simd_unpacklo_ps _mm256_unpacklo_ps
+#define _simd_unpacklo_pd _mm256_unpacklo_pd
+#define _simd_insertf128_ps _mm256_insertf128_ps
+#define _simd_insertf128_pd _mm256_insertf128_pd
+#define _simd_insertf128_si _mm256_insertf128_si256
+#define _simd_extractf128_ps _mm256_extractf128_ps
+#define _simd_extractf128_pd _mm256_extractf128_pd
+#define _simd_extractf128_si _mm256_extractf128_si256
#define _simd_permute2f128_ps _mm256_permute2f128_ps
#define _simd_permute2f128_pd _mm256_permute2f128_pd
#define _simd_permute2f128_si _mm256_permute2f128_si256
@@ -551,6 +571,22 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_xor_ps _mm256_xor_ps
INLINE
+simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
+{
+ __m128i lo = _mm_loadu_si128(loaddr);
+ __m128i hi = _mm_loadu_si128(hiaddr);
+
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+}
+
+INLINE
+void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
+{
+ _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
+ _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
+}
+
+INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{
return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));