aboutsummaryrefslogtreecommitdiffstats
path: root/src/gallium/drivers/swr/rasterizer/common
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2016-12-19 15:25:52 -0600
committerTim Rowley <[email protected]>2017-01-06 10:04:41 -0600
commitbd22c3d41151ce265e61d64f9034928f83d3c959 (patch)
tree08aec9e01e260e11da2222c312edc4e0309b2f22 /src/gallium/drivers/swr/rasterizer/common
parente6eede81afc8890b4ec7bcfb74d9a71b97f3c031 (diff)
swr: [rasterizer core] fix SIMD16 transpose functions
Fixed Transpose_16 methods of following formats: Transpose8_8_8_8 Transpose8_8 Transpose32_32 Transpose16_16_16_16 Transpose16_16_16 Transpose16_16 Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src/gallium/drivers/swr/rasterizer/common')
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simd16intrin.h93
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h18
2 files changed, 99 insertions, 12 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index cf6a6b6883f..94da225c651 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -536,6 +536,15 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
#define _simd16_cmplt_epi32(a, b) _simd16_cmpgt_epi32(b, a)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpacklo_ps, _simd_unpacklo_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_unpackhi_ps, _simd_unpackhi_ps)
+SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpacklo_pd, _simd_unpacklo_pd)
+SIMD16_EMU_AVX512_2(simd16scalard, _simd16_unpackhi_pd, _simd_unpackhi_pd)
+
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi8, _simd_unpacklo_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi8, _simd_unpackhi_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi16, _simd_unpacklo_epi16)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi16, _simd_unpackhi_epi16)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi32, _simd_unpacklo_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpackhi_epi32, _simd_unpackhi_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_unpacklo_epi64, _simd_unpacklo_epi64)
@@ -583,24 +592,38 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
+//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale)
+template <int scale>
+INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index)
+{
+ simd16scalar result;
-template <int imm8>
-INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
+ result.lo = _simd_i32gather_ps(m, index.lo, scale);
+ result.hi = _simd_i32gather_ps(m, index.hi, scale);
+
+ return result;
+}
+
+#define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index)
+
+//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale)
+template <int scale>
+INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask)
{
simd16scalar result;
- result.lo = _mm256_i32gather_ps(m, a.lo, imm8);
- result.hi = _mm256_i32gather_ps(m, a.hi, imm8);
+ result.lo = _simd_mask_i32gather_ps(a.lo, m, index.lo, _simd_castsi_ps(mask.lo), scale);
+ result.hi = _simd_mask_i32gather_ps(a.hi, m, index.hi, _simd_castsi_ps(mask.hi), scale);
return result;
}
-#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, mask, index)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
@@ -716,6 +739,39 @@ INLINE simd16scalari _simd16_shuffle_epi64_temp(simd16scalari a, simd16scalari b
#define _simd16_shuffle_epi64(a, b, imm8) _simd16_shuffle_epi64_temp<imm8>(a, b)
+INLINE simd16scalari _simd16_cvtepu8_epi16(simdscalari a)
+{
+ simd16scalari result;
+
+ result.lo = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 0));
+ result.hi = _simd_cvtepu8_epi16(_mm256_extractf128_si256(a, 1));
+
+ return result;
+}
+
+INLINE simd16scalari _simd16_cvtepu8_epi32(__m128i a)
+{
+ simd16scalari result;
+
+ result.lo = _simd_cvtepu8_epi32(a);
+ result.hi = _simd_cvtepu8_epi32(_mm_srli_si128(a, 8));
+
+ return result;
+}
+
+INLINE simd16scalari _simd16_cvtepu16_epi32(simdscalari a)
+{
+ simd16scalari result;
+
+ result.lo = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 0));
+ result.hi = _simd_cvtepu16_epi32(_mm256_extractf128_si256(a, 1));
+
+ return result;
+}
+
+SIMD16_EMU_AVX512_2(simd16scalari, _simd_packus_epi32, _mm256_packus_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd_packs_epi32, _mm256_packs_epi32)
+
INLINE simd16mask _simd16_int2mask(int mask)
{
return mask;
@@ -939,6 +995,14 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
#endif
+#define _simd16_unpacklo_ps _mm512_unpacklo_ps
+#define _simd16_unpackhi_ps _mm512_unpackhi_ps
+#define _simd16_unpacklo_pd _mm512_unpacklo_pd
+#define _simd16_unpackhi_pd _mm512_unpackhi_pd
+#define _simd16_unpacklo_epi8 _mm512_unpacklo_epi8
+#define _simd16_unpackhi_epi8 _mm512_unpackhi_epi8
+#define _simd16_unpacklo_epi16 _mm512_unpacklo_epi16
+#define _simd16_unpackhi_epi16 _mm512_unpackhi_epi16
#define _simd16_unpacklo_epi32 _mm512_unpacklo_epi32
#define _simd16_unpackhi_epi32 _mm512_unpackhi_epi32
#define _simd16_unpacklo_epi64 _mm512_unpacklo_epi64
@@ -953,7 +1017,11 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
#define _simd16_add_epi8 _mm512_add_epi8
#define _simd16_shuffle_epi8 _mm512_shuffle_epi8
-#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)
+#define _simd16_fmadd_ps _mm512_fmadd_ps
+#define _simd16_fmsub_ps _mm512_fmsub_ps
+
+#define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale)
+#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, m, index, mask, scale)
#define _simd16_abs_epi32 _mm512_abs_epi32
#define _simd16_cmpeq_epi64 _mm512_abs_epi32
@@ -1009,6 +1077,11 @@ INLINE simd16scalari _simd16_cmpgt_epi8(simd16scalari a, simd16scalari b)
#define _simd16_permute2f128_si _mm512_shuffle_i32x4
#define _simd16_shuffle_ps _mm512_shuffle_ps
#define _simd16_shuffle_pd _mm512_shuffle_pd
+#define _simd16_cvtepu8_epi16 _mm512_cvtepu8_epi16
+#define _simd16_cvtepu8_epi32 _mm512_cvtepu8_epi32
+#define _simd16_cvtepu16_epi32 _mm512_cvtepu16_epi32
+#define _simd16_packus_epi32 _mm512_packus_epi32
+#define _simd16_packs_epi32 _mm512_packs_epi32
template <int imm8>
INLINE simd16scalari _simd16_shuffle_epi32_temp(simd16scalari a, simd16scalari b)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index e777b22ec1c..671e3b82a34 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -423,10 +423,21 @@ int _simdemu_movemask_epi8(__m256i a)
}
INLINE
+__m256i _simd_cvtepu8_epi16(__m128i a)
+{
+ __m128i resultlo = _mm_cvtepu8_epi16(a);
+ __m128i resulthi = _mm_cvtepu8_epi16(_mm_srli_si128(a, 8));
+
+ __m256i result = _mm256_castsi128_si256(resultlo);
+
+ return _mm256_insertf128_si256(result, resulthi, 1);
+}
+
+INLINE
__m256i _simd_cvtepu8_epi32(__m128i a)
{
__m128i resultlo = _mm_cvtepu8_epi32(a);
- __m128i resulthi = _mm_shuffle_epi8(a, _mm_set_epi32(0x80808007, 0x80808006, 0x80808005, 0x80808004));
+ __m128i resulthi = _mm_cvtepu8_epi32(_mm_srli_si128(a, 4));
__m256i result = _mm256_castsi128_si256(resultlo);
@@ -437,7 +448,7 @@ INLINE
__m256i _simd_cvtepu16_epi32(__m128i a)
{
__m128i resultlo = _mm_cvtepu16_epi32(a);
- __m128i resulthi = _mm_shuffle_epi8(a, _mm_set_epi32(0x80800F0E, 0x80800D0C, 0x80800B0A, 0x80800908));
+ __m128i resulthi = _mm_cvtepu16_epi32(_mm_srli_si128(a, 8));
__m256i result = _mm256_castsi128_si256(resultlo);
@@ -534,6 +545,7 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_permute_epi32 _mm256_permutevar8x32_epi32
#define _simd_srlv_epi32 _mm256_srlv_epi32
#define _simd_sllv_epi32 _mm256_sllv_epi32
+#define _simd_cvtepu8_epi16 _mm256_cvtepu8_epi16
#define _simd_cvtepu8_epi32 _mm256_cvtepu8_epi32
#define _simd_cvtepu16_epi32 _mm256_cvtepu16_epi32
#define _simd_packus_epi32 _mm256_packus_epi32
@@ -542,7 +554,9 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#endif
#define _simd_unpacklo_ps _mm256_unpacklo_ps
+#define _simd_unpackhi_ps _mm256_unpackhi_ps
#define _simd_unpacklo_pd _mm256_unpacklo_pd
+#define _simd_unpackhi_pd _mm256_unpackhi_pd
#define _simd_insertf128_ps _mm256_insertf128_ps
#define _simd_insertf128_pd _mm256_insertf128_pd
#define _simd_insertf128_si _mm256_insertf128_si256