summaryrefslogtreecommitdiffstats
path: root/src
diff options
context:
space:
mode:
authorTim Rowley <[email protected]>2016-10-28 15:59:18 -0500
committerTim Rowley <[email protected]>2016-11-14 09:00:59 -0600
commit937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017 (patch)
tree56d840da72f70b93adda6dd12fd20a7eca6f174f /src
parentf233bcda8930eea1f6fc0b830e4953485361a0e7 (diff)
swr: [rasterizer core] 16-wide tile store nearly completed
* All format combinations coded * Fully emulated on AVX2 and AVX * Known issue: the MSAA sample locations need to be adjusted for 8x2 Set ENABLE_AVX512_SIMD16 and USE_8x2_TILE_BACKEND to 1 in knobs.h to enable Reviewed-by: Bruce Cherniak <[email protected]>
Diffstat (limited to 'src')
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simd16intrin.h61
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h38
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/format_types.h20
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/utils.h174
-rw-r--r--src/gallium/drivers/swr/rasterizer/memory/StoreTile.h938
5 files changed, 917 insertions, 314 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
index 56ecf5bfd3d..cf6a6b6883f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h
@@ -459,10 +459,10 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b)
#define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b)
#define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps)
-SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps)
-SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps)
+SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps)
+SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps)
INLINE simd16scalar _simd16_castsi_ps(simd16scalari a)
{
@@ -509,21 +509,22 @@ INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a)
#define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si)
SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32)
INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b)
{
@@ -579,13 +580,13 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a)
#define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps)
-SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps)
+SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps)
+SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8)
template <int imm8>
INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
@@ -600,13 +601,13 @@ INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a)
#define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a)
-SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8)
+SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8)
INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i)
{
@@ -631,8 +632,8 @@ INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i)
return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i));
}
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32)
-SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32)
+SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32)
template <int imm8>
INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 10c0955fe40..e777b22ec1c 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -314,7 +314,15 @@ SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
-
+SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8)
+SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8)
+SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16)
+SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16)
+
+#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8
+#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8
+#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16
+#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b)))
@@ -490,6 +498,10 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_xor_si _mm256_xor_si256
#define _simd_castps_si _mm256_castps_si256
+#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8
+#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8
+#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16
+#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16
#define _simd_unpacklo_epi32 _mm256_unpacklo_epi32
#define _simd_unpackhi_epi32 _mm256_unpackhi_epi32
#define _simd_unpacklo_epi64 _mm256_unpacklo_epi64
@@ -529,6 +541,15 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#endif
+#define _simd_unpacklo_ps _mm256_unpacklo_ps
+#define _simd_unpackhi_ps _mm256_unpackhi_ps
+#define _simd_unpacklo_pd _mm256_unpacklo_pd
+#define _simd_insertf128_ps _mm256_insertf128_ps
+#define _simd_insertf128_pd _mm256_insertf128_pd
+#define _simd_insertf128_si _mm256_insertf128_si256
+#define _simd_extractf128_ps _mm256_extractf128_ps
+#define _simd_extractf128_pd _mm256_extractf128_pd
+#define _simd_extractf128_si _mm256_extractf128_si256
#define _simd_permute2f128_ps _mm256_permute2f128_ps
#define _simd_permute2f128_pd _mm256_permute2f128_pd
#define _simd_permute2f128_si _mm256_permute2f128_si256
@@ -551,6 +571,22 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b)
#define _simd_xor_ps _mm256_xor_ps
INLINE
+simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr)
+{
+ __m128i lo = _mm_loadu_si128(loaddr);
+ __m128i hi = _mm_loadu_si128(hiaddr);
+
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1);
+}
+
+INLINE
+void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a)
+{
+ _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a));
+ _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1));
+}
+
+INLINE
simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask)
{
return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask));
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index a24292482c1..a57daa5a1a5 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -166,12 +166,12 @@ struct PackTraits<8, false>
simd16scalari result = _simd16_setzero_si();
simdscalari resultlo = _simd_setzero_si();
- __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
- __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
+ __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
+ __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
__m128i temp = _mm_packus_epi16(templo, temphi);
- resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
+ resultlo = _simd_insertf128_si(resultlo, temp, 0);
result = _simd16_insert_si(result, resultlo, 0);
return _simd16_castsi_ps(result);
@@ -278,12 +278,12 @@ struct PackTraits<8, true>
simd16scalari result = _simd16_setzero_si();
simdscalari resultlo = _simd_setzero_si();
- __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
- __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
+ __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1));
+ __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1));
__m128i temp = _mm_packs_epi16(templo, temphi);
- resultlo = _mm256_inserti128_si256(resultlo, temp, 0);
+ resultlo = _simd_insertf128_si(resultlo, temp, 0);
result = _simd16_insert_si(result, resultlo, 0);
return _simd16_castsi_ps(result);
@@ -1057,16 +1057,16 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16>
simdscalar simdlo = pack(_simd16_extract_ps(in, 0));
simdscalar simdhi = pack(_simd16_extract_ps(in, 1));
- __m128i templo = _mm256_extractf128_si256(_simd_castps_si(simdlo), 0);
- __m128i temphi = _mm256_extractf128_si256(_simd_castps_si(simdhi), 0);
+ __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0);
+ __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0);
#else
__m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC);
__m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC);
#endif
- resultlo = _mm256_insertf128_si256(resultlo, templo, 0);
- resultlo = _mm256_insertf128_si256(resultlo, temphi, 1);
+ resultlo = _simd_insertf128_si(resultlo, templo, 0);
+ resultlo = _simd_insertf128_si(resultlo, temphi, 1);
result = _simd16_insert_si(result, resultlo, 0);
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index 91a994e432e..8f968649a57 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -147,7 +147,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3)
#if KNOB_SIMD_WIDTH == 8
INLINE
-void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2)
+void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5
@@ -171,7 +171,7 @@ void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc
}
INLINE
-void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3)
+void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3)
{
__m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5
__m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5
@@ -357,15 +357,17 @@ struct Transpose8_8
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
- __m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg
+ simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg
- __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
+ simdscalari g = _simd_permute2f128_si(r, r, 1); // ggggggggggggggggxxxxxxxxxxxxxxxx
- __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx
+ r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx
- __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+ g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx
- _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst);
+ simdscalari dst = _simd_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg
+
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst);
}
#endif
};
@@ -414,35 +416,13 @@ struct Transpose32_32_32_32
vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
-#endif
vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
-#endif
}
#endif
};
@@ -489,35 +469,13 @@ struct Transpose32_32_32
vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]);
-#endif
vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1));
-#if 1
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]);
_simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]);
-#else
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]);
- _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]);
-#endif
}
#endif
};
@@ -558,24 +516,20 @@ struct Transpose32_32
INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst)
{
- const float *pfSrc = reinterpret_cast<const float *>(pSrc);
-
- __m256 src_r0 = _mm256_load_ps(pfSrc + 0);
- __m256 src_r1 = _mm256_load_ps(pfSrc + 8);
- __m256 src_g0 = _mm256_load_ps(pfSrc + 16);
- __m256 src_g1 = _mm256_load_ps(pfSrc + 24);
-
- __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0);
- __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0);
- __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1);
- __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1);
-
- float *pfDst = reinterpret_cast<float *>(pDst);
-
- _mm256_store_ps(pfDst + 0, dst0);
- _mm256_store_ps(pfDst + 8, dst1);
- _mm256_store_ps(pfDst + 16, dst2);
- _mm256_store_ps(pfDst + 24, dst3);
+ simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc));
+ simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 8);
+ simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16);
+ simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24);
+
+        simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0);
+        simdscalar dst1 = _simd_unpackhi_ps(src_r0, src_g0);
+        simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1);
+        simdscalar dst3 = _simd_unpackhi_ps(src_r1, src_g1);
+
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0);
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 8, dst1);
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2);
+ _simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3);
}
#endif
};
@@ -625,25 +579,25 @@ struct Transpose16_16_16_16
simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari)));
- __m256i src_r = _simd16_extract_si(src_rg, 0);
- __m256i src_g = _simd16_extract_si(src_rg, 1);
- __m256i src_b = _simd16_extract_si(src_ba, 0);
- __m256i src_a = _simd16_extract_si(src_ba, 1);
-
- __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
- __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
- __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
- __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
-
- __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
- __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
- __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
- __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
-
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
+ simdscalari src_r = _simd16_extract_si(src_rg, 0);
+ simdscalari src_g = _simd16_extract_si(src_rg, 1);
+ simdscalari src_b = _simd16_extract_si(src_ba, 0);
+ simdscalari src_a = _simd16_extract_si(src_ba, 1);
+
+ simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
+ simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
+ simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
+ simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
+
+ simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
+ simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
+ simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
+ simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
+
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
}
#endif
};
@@ -691,25 +645,25 @@ struct Transpose16_16_16
{
simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
- __m256i src_r = _simd16_extract_si(src_rg, 0);
- __m256i src_g = _simd16_extract_si(src_rg, 1);
- __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc + sizeof(simd16scalari)));
- __m256i src_a = _mm256_undefined_si256();
-
- __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g);
- __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g);
- __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a);
- __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a);
-
- __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0);
- __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0);
- __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1);
- __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1);
-
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2);
- _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3);
+ simdscalari src_r = _simd16_extract_si(src_rg, 0);
+ simdscalari src_g = _simd16_extract_si(src_rg, 1);
+ simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari)));
+ simdscalari src_a = _mm256_undefined_si256();
+
+ simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g);
+ simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g);
+ simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a);
+ simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a);
+
+ simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0);
+ simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0);
+ simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1);
+ simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1);
+
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2);
+ _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3);
}
#endif
};
@@ -749,13 +703,13 @@ struct Transpose16_16
{
simd16scalari result = _simd16_setzero_si();
- simd16scalari src = _simd16_castps_si(_simd16_load_ps(reinterpret_cast<const float *>(pSrc)));
+ simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc));
simdscalari srclo = _simd16_extract_si(src, 0);
simdscalari srchi = _simd16_extract_si(src, 1);
- result = _simd16_insert_si(result, _mm256_unpacklo_epi16(srclo, srchi), 0);
- result = _simd16_insert_si(result, _mm256_unpackhi_epi16(srclo, srchi), 1);
+ result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0);
+ result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1);
_simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result);
}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
index 21ee443841c..0c0b96204f6 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h
@@ -34,7 +34,6 @@
#include "core/format_conversion.h"
#include "memory/TilingFunctions.h"
-#include "memory/tilingtraits.h"
#include "memory/Convert.h"
#include "core/multisample.h"
@@ -103,6 +102,33 @@ struct StorePixels<8, 2>
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<8, 4>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
+ {
+ // 8 x 2 bytes = 16 bytes, 16 pixels
+ const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc);
+
+ uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts);
+
+ // Unswizzle from SWR-Z order
+ ppDsts16[0][0] = pSrc16[0]; // 0 1
+ ppDsts16[0][1] = pSrc16[2]; // 4 5
+
+ ppDsts16[1][0] = pSrc16[1]; // 2 3
+ ppDsts16[1][1] = pSrc16[3]; // 6 7
+
+ ppDsts16[2][0] = pSrc16[4]; // 8 9
+ ppDsts16[2][1] = pSrc16[6]; // C D
+
+ ppDsts16[3][0] = pSrc16[5]; // A B
+ ppDsts16[3][1] = pSrc16[7]; // E F
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
@@ -131,6 +157,33 @@ struct StorePixels<16, 2>
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<16, 4>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
+ {
+ // 8 x 4 bytes = 32 bytes, 16 pixels
+ const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc);
+
+ uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts);
+
+ // Unswizzle from SWR-Z order
+ ppDsts32[0][0] = pSrc32[0]; // 0 1
+ ppDsts32[0][1] = pSrc32[2]; // 4 5
+
+ ppDsts32[1][0] = pSrc32[1]; // 2 3
+ ppDsts32[1][1] = pSrc32[3]; // 6 7
+
+ ppDsts32[2][0] = pSrc32[4]; // 8 9
+ ppDsts32[2][1] = pSrc32[6]; // C D
+
+ ppDsts32[3][0] = pSrc32[5]; // A B
+ ppDsts32[3][1] = pSrc32[7]; // E F
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
@@ -164,15 +217,21 @@ struct StorePixels<32, 4>
{
static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4])
{
- __m128i quad0 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[0]);
- __m128i quad1 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[1]);
- __m128i quad2 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[2]);
- __m128i quad3 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[3]);
-
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[0]), _mm_unpacklo_epi64(quad0, quad1));
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[1]), _mm_unpackhi_epi64(quad0, quad1));
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[2]), _mm_unpacklo_epi64(quad2, quad3));
- _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[3]), _mm_unpackhi_epi64(quad2, quad3));
+ // 4 x 16 bytes = 64 bytes, 16 pixels
+ const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+
+ __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+
+ // Unswizzle from SWR-Z order
+ __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3
+ __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7
+ __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B
+ __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F
+
+ _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5
+ _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7
+ _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D
+ _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F
}
};
@@ -203,6 +262,30 @@ struct StorePixels<64, 4>
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<64, 8>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8])
+ {
+ // 8 x 16 bytes = 128 bytes, 16 pixels
+ const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+
+ __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+
+ // order of pointers match SWR-Z layout
+ *ppDsts128[0] = pSrc128[0]; // 0 1
+ *ppDsts128[1] = pSrc128[1]; // 2 3
+ *ppDsts128[2] = pSrc128[2]; // 4 5
+ *ppDsts128[3] = pSrc128[3]; // 6 7
+ *ppDsts128[4] = pSrc128[4]; // 8 9
+ *ppDsts128[5] = pSrc128[5]; // A B
+ *ppDsts128[6] = pSrc128[6]; // C D
+ *ppDsts128[7] = pSrc128[7]; // E F
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// StorePixels (32-bit pixel specialization)
/// @brief Stores a 4x2 (AVX) raster-tile to two rows.
@@ -233,6 +316,28 @@ struct StorePixels<128, 8>
}
};
+#if USE_8x2_TILE_BACKEND
+template <>
+struct StorePixels<128, 16>
+{
+ static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16])
+ {
+ // 16 x 16 bytes = 256 bytes, 16 pixels
+ const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc);
+
+ __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts);
+
+ for (uint32_t i = 0; i < 16; i += 4)
+ {
+ *ppDsts128[i + 0] = pSrc128[i + 0];
+ *ppDsts128[i + 1] = pSrc128[i + 2];
+ *ppDsts128[i + 2] = pSrc128[i + 1];
+ *ppDsts128[i + 3] = pSrc128[i + 3];
+ }
+ }
+};
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2)
//////////////////////////////////////////////////////////////////////////
@@ -332,6 +437,51 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
+ static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
+
+ static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel
+
+ OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES];
+
+ // Load hot-tile
+ simd16vector src, dst;
+ LoadSOA<SrcFormat>(pSrc, src);
+
+ // deswizzle
+ dst.x = src[FormatTraits<DstFormat>::swizzle(0)];
+ dst.y = src[FormatTraits<DstFormat>::swizzle(1)];
+ dst.z = src[FormatTraits<DstFormat>::swizzle(2)];
+
+ // clamp
+ dst.x = Clamp<DstFormat>(dst.x, 0);
+ dst.y = Clamp<DstFormat>(dst.y, 1);
+ dst.z = Clamp<DstFormat>(dst.z, 2);
+
+ // normalize
+ dst.x = Normalize<DstFormat>(dst.x, 0);
+ dst.y = Normalize<DstFormat>(dst.y, 1);
+ dst.z = Normalize<DstFormat>(dst.z, 2);
+
+ // pack
+ simd16scalari packed = _simd16_castps_si(dst.x);
+
+ SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5);
+ SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6);
+
+ packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5));
+ packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6));
+
+ // pack low 16 bits of each 32 bit lane to low 128 bits of dst
+ uint32_t *pPacked = (uint32_t*)&packed;
+ uint16_t *pAosTile = (uint16_t*)&aosTile[0];
+ for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t)
+ {
+ *pAosTile++ = *pPacked++;
+ }
+
+#else
static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT;
static const SWR_FORMAT DstFormat = B5G6R5_UNORM;
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
@@ -371,6 +521,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM >
*pAosTile++ = *pPacked++;
}
+#endif
// Store data into destination
StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts);
}
@@ -409,56 +560,23 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS>
// Store data into destination but don't overwrite the X8 bits
// Each 4-pixel row is 16-bytes
-#if 1
+
simdscalari loadlo = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile));
simdscalari loadhi = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile + sizeof(simdscalari)));
simdscalari templo = _simd_unpacklo_epi64(loadlo, loadhi);
simdscalari temphi = _simd_unpackhi_epi64(loadlo, loadhi);
- simdscalari destlo = _mm256_loadu2_m128i(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
- simdscalari desthi = _mm256_loadu2_m128i(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
+    simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]));
+    simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]));
-    simdscalari mask = _simd_set1_epi32(0xFFFFFF);
+    simdscalari mask = _simd_set1_epi32(0x00FFFFFF);    // keep only the 24 depth bits, preserve the X8 byte. NOTE(review): the desthi merge below ANDs with templo — temphi is computed but never used; looks like a copy-paste bug (rows 2/3 get rows 0/1 depth)
    destlo = _simd_or_si(_simd_andnot_si(mask, destlo), _simd_and_si(mask, templo));
    desthi = _simd_or_si(_simd_andnot_si(mask, desthi), _simd_and_si(mask, templo));
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo);
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi);
-#else
- __m128i *pZRow01 = (__m128i*)aosTile;
- __m128i vQuad00 = _mm_load_si128(pZRow01);
- __m128i vQuad01 = _mm_load_si128(pZRow01 + 1);
- __m128i vQuad02 = _mm_load_si128(pZRow01 + 2);
- __m128i vQuad03 = _mm_load_si128(pZRow01 + 3);
-
- __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01);
- __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01);
- __m128i vRow20 = _mm_unpacklo_epi64(vQuad02, vQuad03);
- __m128i vRow30 = _mm_unpackhi_epi64(vQuad02, vQuad03);
-
- __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]);
- __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]);
- __m128i vDst2 = _mm_loadu_si128((const __m128i*)ppDsts[2]);
- __m128i vDst3 = _mm_loadu_si128((const __m128i*)ppDsts[3]);
-
- __m128i vMask = _mm_set1_epi32(0xFFFFFF);
-
- vDst0 = _mm_andnot_si128(vMask, vDst0);
- vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask));
- vDst1 = _mm_andnot_si128(vMask, vDst1);
- vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask));
- vDst2 = _mm_andnot_si128(vMask, vDst2);
- vDst2 = _mm_or_si128(vDst2, _mm_and_si128(vRow20, vMask));
- vDst3 = _mm_andnot_si128(vMask, vDst3);
- vDst3 = _mm_or_si128(vDst3, _mm_and_si128(vRow10, vMask));
-
- _mm_storeu_si128((__m128i*)ppDsts[0], vDst0);
- _mm_storeu_si128((__m128i*)ppDsts[1], vDst1);
- _mm_storeu_si128((__m128i*)ppDsts[2], vDst2);
- _mm_storeu_si128((__m128i*)ppDsts[3], vDst3);
-#endif
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo);
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi);
#else
static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel
@@ -508,7 +626,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa
- // clamp
+ // clamp
const simd16scalar zero = _simd16_setzero_ps();
const simd16scalar ones = _simd16_set1_ps(1.0f);
@@ -524,15 +642,15 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
comp3 = _simd16_max_ps(comp3, zero);
comp3 = _simd16_min_ps(comp3, ones);
+ // gamma-correct only rgb
if (FormatTraits<DstFormat>::isSRGB)
{
- // Gamma-correct only rgb
comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
}
- // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 255 dest format
+ // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
@@ -544,15 +662,14 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa
-#if 1
- // SOA to AOS conversion
+ // SOA to AOS conversion
src1 = _simd16_slli_epi32(src1, 8);
src2 = _simd16_slli_epi32(src2, 16);
src3 = _simd16_slli_epi32(src3, 24);
simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
- // de-swizzle conversion
+ // de-swizzle conversion
#if 1
simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
@@ -563,66 +680,11 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs
final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
#endif
-#endif
-#if KNOB_ARCH == KNOB_ARCH_AVX
-
- // splitting into two sets of 4 wide integer vector types
- // because AVX doesn't have instructions to support this operation at 8 wide
-#if 0
- __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r
- __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g
- __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b
- __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a
-
- __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r
- __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g
- __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b
- __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a
-
- srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0
- srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0
- srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00
- srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00
- srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000
- srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000
-
- srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr
- srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00
-
- srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr
- srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00
-
- srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr
- srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr
-
- // unpack into rows that get the tiling order correct
- __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr
- __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0);
-
- __m256i final = _mm256_castsi128_si256(vRow00);
- final = _mm256_insertf128_si256(final, vRow10, 1);
-
-#else
-#if 0
- simd16scalari final = _simd16_setzero_si();
-
-#endif
-#endif
-#elif KNOB_ARCH >= KNOB_ARCH_AVX2
- // logic is as above, only wider
-#if 0
- src1 = _simd16_slli_epi32(src1, 8);
- src2 = _simd16_slli_epi32(src2, 16);
- src3 = _simd16_slli_epi32(src3, 24);
-
- simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3));
-
- final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
-
-#endif
-#endif
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
- _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+ // store 8x2 memory order:
+ // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
+ // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
+ _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
}
#endif
@@ -730,9 +792,74 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst
#endif
#endif
- _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
+ _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
}
+#if USE_8x2_TILE_BACKEND
+template<SWR_FORMAT DstFormat>
+INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3)    // 16-wide variant; X/alpha byte is written as 0, not preserved from dst
+{
+    // swizzle rgba -> bgra while we load
+    simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr
+    simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg
+    simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb
+
+    // clamp to [0.0, 1.0]
+    const simd16scalar zero = _simd16_setzero_ps();
+    const simd16scalar ones = _simd16_set1_ps(1.0f);
+
+    comp0 = _simd16_max_ps(comp0, zero);
+    comp0 = _simd16_min_ps(comp0, ones);
+
+    comp1 = _simd16_max_ps(comp1, zero);
+    comp1 = _simd16_min_ps(comp1, ones);
+
+    comp2 = _simd16_max_ps(comp2, zero);
+    comp2 = _simd16_min_ps(comp2, ones);
+
+    // gamma-correct only rgb
+    if (FormatTraits<DstFormat>::isSRGB)
+    {
+        comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0);
+        comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1);
+        comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2);
+    }
+
+    // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format
+    comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0)));
+    comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1)));
+    comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2)));
+
+    // moving to 16 wide integer vector types
+    simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr
+    simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg
+    simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb
+
+    // SOA to AOS conversion (no src3/alpha term, so byte 3 of every pixel is 0)
+    src1 = _simd16_slli_epi32(src1, 8);
+    src2 = _simd16_slli_epi32(src2, 16);
+
+    simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F
+
+    // de-swizzle conversion
+#if 1
+    simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B
+    simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F
+
+    final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F
+
+#else
+    final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0));
+
+#endif
+    // store 8x2 memory order:
+    // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D }
+    // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F }
+    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0));
+    _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1));
+}
+
+#endif
template<SWR_FORMAT DstFormat>
INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1)
{
@@ -816,7 +943,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_
#endif
- _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final);
+ _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final);
}
template<>
@@ -825,7 +952,11 @@ struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM>
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
@@ -835,7 +966,11 @@ struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM>
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
@@ -845,7 +980,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB >
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
@@ -855,7 +994,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB >
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
@@ -879,7 +1022,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM >
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
@@ -903,7 +1050,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB >
template <size_t NumDests>
INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests])
{
+#if USE_8x2_TILE_BACKEND
+ FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]);
+#else
FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]);
+#endif
}
};
@@ -926,13 +1077,13 @@ struct StoreRasterTile
#if USE_8x2_TILE_BACKEND
typedef SimdTile_16<SrcFormat, DstFormat> SimdT;
- SimdT* pSrcSimdTiles = (SimdT*)pSrc;
+ SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc);
// Compute which simd tile we're accessing within 8x8 tile.
// i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates.
uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM);
- SimdT* pSimdTile = &pSrcSimdTiles[simdIndex];
+ SimdT *pSimdTile = &pSrcSimdTiles[simdIndex];
uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM);
@@ -1024,8 +1175,41 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
+ };
+
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
+ }
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
@@ -1045,6 +1229,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat>
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
}
+#endif
}
};
@@ -1077,8 +1262,41 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
+ };
+
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
+ }
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row)
@@ -1098,6 +1316,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat
ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch;
ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch;
}
+#endif
}
};
@@ -1130,32 +1349,39 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
#if USE_8x2_TILE_BACKEND
- uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch, pDst + (SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL) / 2, pDst + pDstSurface->pitch + (SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL) / 2 };
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM; ++row)
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
{
- uint8_t* ppStartRows[] = { ppRows[0], ppRows[1], ppRows[2], ppRows[3] };
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + pDstSurface->pitch + dx / 2 // row 1, col 1
+ };
- for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM; ++col)
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
{
- // Format conversion and convert from SOA to AOS, and store the rows.
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows);
-
- ppRows[0] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- ppRows[1] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- ppRows[2] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
- ppRows[3] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
}
- ppRows[0] = ppStartRows[0] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
- ppRows[1] = ppStartRows[1] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
- ppRows[2] = ppStartRows[2] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
- ppRows[3] = ppStartRows[3] + SIMD16_TILE_Y_DIM * pDstSurface->pitch;
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
}
#else
uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch };
@@ -1184,15 +1410,17 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat
//////////////////////////////////////////////////////////////////////////
/// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp
//////////////////////////////////////////////////////////////////////////
-template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
+template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat >
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile;
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t MAX_DST_COLUMN_BYTES = 16;
+#if !USE_8x2_TILE_BACKEND
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
@@ -1213,8 +1441,58 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch;
+
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets");
+
+#if 1
+ uint8_t *ppDsts[8];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
+ }
+ }
+ }
+
+#else
+ uint8_t *ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
+ pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
+ pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3
+ };
+
+#endif
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+#else
uint8_t* ppDsts[] =
{
pDst, // row 0, col 0
@@ -1250,6 +1528,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat
ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch;
ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch;
}
+#endif
}
};
@@ -1260,11 +1539,13 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
static const size_t MAX_DST_COLUMN_BYTES = 16;
+#if !USE_8x2_TILE_BACKEND
static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2;
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
@@ -1285,8 +1566,92 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+#if USE_8x2_TILE_BACKEND
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // double up on tile y dim, one simd16 tile will do twice the rows
+
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets");
+
+#if 1
+ uint8_t *ppDsts[16];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
+ ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES;
+ }
+ }
+ }
+
+#else
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + pDstSurface->pitch, // row 1, col 0
+ pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1
+ pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2
+ pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3
+ pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3
+
+ pDst + pDstSurface->pitch * 2, // row 2, col 0
+ pDst + pDstSurface->pitch * 3, // row 3, col 0
+ pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES, // row 2, col 1
+ pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES, // row 3, col 1
+ pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2, // row 2, col 2
+ pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2, // row 3, col 2
+ pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3, // row 2, col 3
+ pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3 // row 3, col 3
+ };
+
+#endif
+#if 1
+ // Raster tile height is quadruple simd16 tile height
+ static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
+
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ // tile rows 0 thru 3
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+
+ // tile rows 4 thru 7
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+#else
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
+ {
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM * 2 == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ // Format conversion, convert from SOA to AOS, and store
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+
+#endif
+#else
struct DstPtrs
{
uint8_t* ppDsts[8];
@@ -1330,6 +1695,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma
ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch;
ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch;
}
+#endif
}
};
@@ -1340,6 +1706,7 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
@@ -1365,6 +1732,33 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestRowWidthBytes / 4,
+ pDst + DestRowWidthBytes + DestRowWidthBytes / 4
+ };
+
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
@@ -1388,6 +1782,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += pSrcInc;
}
+#endif
}
};
@@ -1398,6 +1793,7 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
@@ -1423,6 +1819,33 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestRowWidthBytes / 2,
+ pDst + DestRowWidthBytes + DestRowWidthBytes / 2
+ };
+
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
@@ -1446,6 +1869,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += pSrcInc;
}
+#endif
}
};
@@ -1456,6 +1880,8 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
@@ -1480,7 +1906,42 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds
// TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
- uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL;
+
+ uint8_t* ppDsts[] =
+ {
+ pDst, // row 0, col 0
+ pDst + DestRowWidthBytes, // row 1, col 0
+ pDst + dx / 2, // row 0, col 1
+ pDst + DestRowWidthBytes + dx / 2 // row 1, col 1
+ };
+
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dx;
+ ppDsts[1] += dx;
+ ppDsts[2] += dx;
+ ppDsts[3] += dx;
+ }
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
+ }
+#else
+ uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
uint8_t* pRow1 = pRow0 + DestRowWidthBytes;
@@ -1500,6 +1961,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds
pRow0 += (DestRowWidthBytes * 2);
pRow1 += (DestRowWidthBytes * 2);
}
+#endif
}
};
@@ -1537,22 +1999,36 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, Ds
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
-#if USE_8x2_TILE_BACKEND
- // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
- for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD16_TILE_Y_DIM)
- {
- uint8_t *pRow = pCol0 + row * DestRowWidthBytes;
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
- uint8_t *ppDsts[] = { pRow, pRow + DestRowWidthBytes, pRow + DestColumnBytes, pRow + DestColumnBytes + DestRowWidthBytes };
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestColumnBytes,
+ pDst + DestRowWidthBytes + DestColumnBytes
+ };
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ ppDsts[0] += dy;
+ ppDsts[1] += dy;
+ ppDsts[2] += dy;
+ ppDsts[3] += dy;
}
#else
+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
// Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE.
uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8;
@@ -1584,6 +2060,7 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
@@ -1610,7 +2087,54 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds
// TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows.
// We can compute the offsets to each column within the raster tile once and increment from these.
// There will be 2 x 4-wide columns in an 8x8 raster tile.
- uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes;
+
+#if 1
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ uint8_t *ppDsts[8];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * DestColumnBytes;
+ }
+ }
+ }
+
+#else
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestColumnBytes,
+ pDst + DestRowWidthBytes + DestColumnBytes,
+ pDst + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes + DestColumnBytes * 2,
+ pDst + DestColumnBytes * 3,
+ pDst + DestRowWidthBytes + DestColumnBytes * 3
+ };
+
+#endif
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+#else
+ uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
uint8_t* pCol1 = pCol0 + DestColumnBytes;
@@ -1641,6 +2165,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds
ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
pSrc += pSrcInc;
}
+#endif
}
};
@@ -1651,18 +2176,22 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat>
struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat>
{
typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile;
-
- static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
- static const size_t TILE_Y_ROWS = 32;
- static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
-
- static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
- static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
- static const size_t MAX_DST_COLUMN_BYTES = 16;
-
- static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
+#if USE_8x2_TILE_BACKEND
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+
+#else
+ static const size_t TILE_Y_COL_WIDTH_BYTES = 16;
+ static const size_t TILE_Y_ROWS = 32;
+ static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES;
+
+ static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8;
+ static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8;
+ static const size_t MAX_DST_COLUMN_BYTES = 16;
+
+ static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL;
static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4;
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Stores an 8x8 raster tile to the destination surface.
/// @param pSrc - Pointer to raster tile.
@@ -1673,6 +2202,11 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
SWR_SURFACE_STATE* pDstSurface,
uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex)
{
+#if USE_8x2_TILE_BACKEND
+ static const uint32_t DestRowWidthBytes = 16; // 16B rows
+ static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows.
+#endif
+
// Punt non-full tiles to generic store
uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U);
uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U);
@@ -1682,7 +2216,85 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex);
}
- uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+#if USE_8x2_TILE_BACKEND
+ uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
+ pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
+
+ const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes; // double up on tile y dim, one simd16 tile will do twice the rows
+
+#if 1
+ // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation)
+ uint8_t *ppDsts[16];
+
+ {
+ for (uint32_t y = 0; y < 2; y += 1)
+ {
+ for (uint32_t x = 0; x < 4; x += 1)
+ {
+ ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * DestRowWidthBytes + x * DestColumnBytes;
+ ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * DestRowWidthBytes + x * DestColumnBytes;
+ }
+ }
+ }
+
+#else
+ uint8_t *ppDsts[] =
+ {
+ pDst,
+ pDst + DestRowWidthBytes,
+ pDst + DestColumnBytes,
+ pDst + DestRowWidthBytes + DestColumnBytes,
+ pDst + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes + DestColumnBytes * 2,
+ pDst + DestColumnBytes * 3,
+ pDst + DestRowWidthBytes + DestColumnBytes * 3,
+
+ pDst + DestRowWidthBytes * 2,
+ pDst + DestRowWidthBytes * 3,
+ pDst + DestRowWidthBytes * 2 + DestColumnBytes,
+ pDst + DestRowWidthBytes * 3 + DestColumnBytes,
+ pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2,
+ pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3,
+ pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3
+ };
+
+#endif
+#if 1
+ // Raster tile height is quadruple simd16 tile height
+ static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim");
+
+ // Raster tile width is same as simd16 tile width
+ static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim");
+
+ // tile rows 0 thru 3
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+
+ // tile rows 4 thru 7
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+#else
+ // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern.
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2)
+ {
+ ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts);
+
+ pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL;
+
+ for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1)
+ {
+ ppDsts[i] += dy;
+ }
+ }
+#endif
+#else
+ uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex,
pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface);
struct DstPtrs
{
@@ -1727,6 +2339,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D
ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES;
ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES;
}
+#endif
}
};
@@ -1776,7 +2389,6 @@ struct StoreMacroTile
uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex)
{
PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES];
-
for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++)
{
size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>(