author     Tim Rowley <[email protected]>    2016-10-28 15:59:18 -0500
committer  Tim Rowley <[email protected]>    2016-11-14 09:00:59 -0600
commit     937b7d8e5a97d1c3cc5ab7303c03dbdd2fdc8017 (patch)
tree       56d840da72f70b93adda6dd12fd20a7eca6f174f
parent     f233bcda8930eea1f6fc0b830e4953485361a0e7 (diff)
swr: [rasterizer core] 16-wide tile store nearly completed
* All format combinations coded
* Fully emulated on AVX2 and AVX
* Known issue: the MSAA sample locations need to be adjusted for 8x2
Set ENABLE_AVX512_SIMD16 and USE_8x2_TILE_BACKEND to 1 in knobs.h to enable
Reviewed-by: Bruce Cherniak <[email protected]>
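The new path is compile-time gated. A minimal sketch of the two switches named above, as they might appear in knobs.h (the surrounding file is not part of this patch, so the exact form is an assumption):

    // knobs.h (illustrative sketch, not taken from this patch)
    // Both switches must be enabled to exercise the 16-wide (8x2) tile store.
    #define ENABLE_AVX512_SIMD16   1   // build the simd16 types and the AVX2/AVX emulation layer
    #define USE_8x2_TILE_BACKEND   1   // route raster-tile stores through the 8x2 simd16 backend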
-rw-r--r--  src/gallium/drivers/swr/rasterizer/common/simd16intrin.h |  61
-rw-r--r--  src/gallium/drivers/swr/rasterizer/common/simdintrin.h   |  38
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/format_types.h   |  20
-rw-r--r--  src/gallium/drivers/swr/rasterizer/core/utils.h          | 174
-rw-r--r--  src/gallium/drivers/swr/rasterizer/memory/StoreTile.h    | 938
5 files changed, 917 insertions(+), 314 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index 56ecf5bfd3d..cf6a6b6883f 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -459,10 +459,10 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b) #define _simd16_cmpge_ps(a, b) _simd16_cmp_ps<_CMP_GE_OQ>(a, b) #define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _mm256_and_ps) -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _mm256_or_ps) -SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _mm256_rcp_ps) -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _mm256_div_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps) +SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps) INLINE simd16scalar _simd16_castsi_ps(simd16scalari a) { @@ -509,21 +509,22 @@ INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) #define _simd16_round_ps(a, mode) _simd16_round_ps_temp<mode>(a) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _mm256_mul_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _mm256_mullo_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _mm256_sub_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _mm256_sub_epi64) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _mm256_min_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _mm256_max_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _mm256_min_epu32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _mm256_max_epu32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _mm256_add_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mul_epi32, _simd_mul_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_mullo_epi32, _simd_mullo_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi32, _simd_sub_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sub_epi64, _simd_sub_epi64) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epi32, _simd_min_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _mm256_cmpeq_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _mm256_cmpgt_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32) INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) { @@ -579,13 +580,13 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a) #define _simd16_srli_epi32(a, imm8) _simd16_srli_epi32_temp<imm8>(a) -SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _mm256_fmadd_ps) -SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _mm256_fmsub_ps) 
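For context, the SIMD16_EMU_AVX512_{1,2,3} wrappers build each 16-wide operation out of two 8-wide halves. The - / + pairs in this hunk switch their third argument from raw _mm256_* intrinsics to the _simd_* layer, which is what lets the same emulation build on plain AVX (the _simd_* layer supplies SSE fallbacks for the 256-bit integer operations there), matching the "fully emulated on AVX2 and AVX" note in the commit message. A rough sketch of what one such expansion could look like, assuming the emulated simd16 types hold two 8-wide halves named lo and hi (the member names are an assumption; the real macro and type live in simd16intrin.h):

    // Illustrative expansion of SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps).
    // Sketch only; member names lo/hi are assumed.
    INLINE simd16scalar _simd16_and_ps(simd16scalar a, simd16scalar b)
    {
        simd16scalar result;
        result.lo = _simd_and_ps(a.lo, b.lo);   // lanes 0..7
        result.hi = _simd_and_ps(a.hi, b.hi);   // lanes 8..15
        return result;
    }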
+SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps) +SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _mm256_shuffle_epi8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _mm256_adds_epu8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _mm256_subs_epu8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _mm256_add_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_subs_epu8, _simd_subs_epu8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi8, _simd_add_epi8) template <int imm8> INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a) @@ -600,13 +601,13 @@ INLINE simd16scalar _simd16_i32gather_ps_temp(float const *m, simd16scalari a) #define _simd16_i32gather_ps(m, a, imm8) _simd16_i32gather_ps_temp<imm8>(m, a) -SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _mm256_abs_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _mm256_cmpeq_epi64) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _mm256_cmpgt_epi64) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _mm256_cmpeq_epi16) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _mm256_cmpgt_epi16) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _mm256_cmpeq_epi8) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _mm256_cmpgt_epi8) +SIMD16_EMU_AVX512_1(simd16scalari, _simd16_abs_epi32, _simd_abs_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi64, _simd_cmpeq_epi64) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi64, _simd_cmpgt_epi64) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi16, _simd_cmpeq_epi16) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi16, _simd_cmpgt_epi16) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi8, _simd_cmpeq_epi8) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi8, _simd_cmpgt_epi8) INLINE simd16scalar _simd16_permute_ps(simd16scalar a, simd16scalari i) { @@ -631,8 +632,8 @@ INLINE simd16scalari _simd16_permute_epi32(simd16scalari a, simd16scalari i) return _simd16_castps_si(_simd16_permute_ps(_simd16_castsi_ps(a), i)); } -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _mm256_srlv_epi32) -SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _mm256_sllv_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_srlv_epi32, _simd_srlv_epi32) +SIMD16_EMU_AVX512_2(simd16scalari, _simd16_sllv_epi32, _simd_sllv_epi32) template <int imm8> INLINE simd16scalar _simd16_permute2f128_ps_temp(simd16scalar a, simd16scalar b) diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 10c0955fe40..e777b22ec1c 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -314,7 +314,15 @@ SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8) SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8) SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16) SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16) - +SIMD_EMU_EPI(_simdemu_unpacklo_epi8, _mm_unpacklo_epi8) +SIMD_EMU_EPI(_simdemu_unpackhi_epi8, _mm_unpackhi_epi8) +SIMD_EMU_EPI(_simdemu_unpacklo_epi16, _mm_unpacklo_epi16) +SIMD_EMU_EPI(_simdemu_unpackhi_epi16, _mm_unpackhi_epi16) + +#define _simd_unpacklo_epi8 _simdemu_unpacklo_epi8 +#define _simd_unpackhi_epi8 _simdemu_unpackhi_epi8 
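The AVX-only build has no 256-bit integer unpacks, so the new SIMD_EMU_EPI instances above route _simd_unpack*_epi8/16 through the matching 128-bit SSE2 intrinsic, one lane at a time; that reproduces the AVX2 instructions exactly, since they also operate on each 128-bit lane independently. A sketch of what such a per-lane fallback could expand to (illustrative; the real SIMD_EMU_EPI macro defined earlier in simdintrin.h may be structured differently):

    // Illustrative AVX (no AVX2) emulation of a 256-bit byte unpack: two SSE2
    // calls on the lo/hi 128-bit halves, reassembled with vinsertf128.
    INLINE __m256i _simdemu_unpacklo_epi8(__m256i a, __m256i b)
    {
        __m128i lo = _mm_unpacklo_epi8(_mm256_castsi256_si128(a),
                                       _mm256_castsi256_si128(b));
        __m128i hi = _mm_unpacklo_epi8(_mm256_extractf128_si256(a, 1),
                                       _mm256_extractf128_si256(b, 1));
        return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
    }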
+#define _simd_unpacklo_epi16 _simdemu_unpacklo_epi16 +#define _simd_unpackhi_epi16 _simdemu_unpackhi_epi16 #define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))) #define _simd_unpacklo_epi64(a, b) _mm256_castpd_si256(_mm256_unpacklo_pd(_mm256_castsi256_pd(a), _mm256_castsi256_pd(b))) @@ -490,6 +498,10 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #define _simd_xor_si _mm256_xor_si256 #define _simd_castps_si _mm256_castps_si256 +#define _simd_unpacklo_epi8 _mm256_unpacklo_epi8 +#define _simd_unpackhi_epi8 _mm256_unpackhi_epi8 +#define _simd_unpacklo_epi16 _mm256_unpacklo_epi16 +#define _simd_unpackhi_epi16 _mm256_unpackhi_epi16 #define _simd_unpacklo_epi32 _mm256_unpacklo_epi32 #define _simd_unpackhi_epi32 _mm256_unpackhi_epi32 #define _simd_unpacklo_epi64 _mm256_unpacklo_epi64 @@ -529,6 +541,14 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #endif +#define _simd_unpacklo_ps _mm256_unpacklo_ps +#define _simd_unpacklo_pd _mm256_unpacklo_pd +#define _simd_insertf128_ps _mm256_insertf128_ps +#define _simd_insertf128_pd _mm256_insertf128_pd +#define _simd_insertf128_si _mm256_insertf128_si256 +#define _simd_extractf128_ps _mm256_extractf128_ps +#define _simd_extractf128_pd _mm256_extractf128_pd +#define _simd_extractf128_si _mm256_extractf128_si256 #define _simd_permute2f128_ps _mm256_permute2f128_ps #define _simd_permute2f128_pd _mm256_permute2f128_pd #define _simd_permute2f128_si _mm256_permute2f128_si256 @@ -551,6 +571,22 @@ __m256i _simd_packs_epi32(__m256i a, __m256i b) #define _simd_xor_ps _mm256_xor_ps INLINE +simdscalari _simd_loadu2_si(const __m128i *hiaddr, const __m128i *loaddr) +{ + __m128i lo = _mm_loadu_si128(loaddr); + __m128i hi = _mm_loadu_si128(hiaddr); + + return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), (hi), 1); +} + +INLINE +void _simd_storeu2_si(__m128i *hiaddr, __m128i *loaddr, simdscalari a) +{ + _mm_storeu_si128(loaddr, _mm256_castsi256_si128(a)); + _mm_storeu_si128(hiaddr, _mm256_extractf128_si256(a, 1)); +} + +INLINE simdscalari _simd_blendv_epi32(simdscalari a, simdscalari b, simdscalar mask) { return _simd_castps_si(_simd_blendv_ps(_simd_castsi_ps(a), _simd_castsi_ps(b), mask)); diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h index a24292482c1..a57daa5a1a5 100644 --- a/src/gallium/drivers/swr/rasterizer/core/format_types.h +++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h @@ -166,12 +166,12 @@ struct PackTraits<8, false> simd16scalari result = _simd16_setzero_si(); simdscalari resultlo = _simd_setzero_si(); - __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)); - __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1)); + __m128i templo = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)); + __m128i temphi = _mm_packus_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1)); __m128i temp = _mm_packus_epi16(templo, 
temphi); - resultlo = _mm256_inserti128_si256(resultlo, temp, 0); + resultlo = _simd_insertf128_si(resultlo, temp, 0); result = _simd16_insert_si(result, resultlo, 0); return _simd16_castsi_ps(result); @@ -278,12 +278,12 @@ struct PackTraits<8, true> simd16scalari result = _simd16_setzero_si(); simdscalari resultlo = _simd_setzero_si(); - __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)); - __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _mm256_extractf128_si256(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1)); + __m128i templo = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 0))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 0)), 1)); + __m128i temphi = _mm_packs_epi32(_mm256_castsi256_si128(_mm256_castps_si256(_simd16_extract_ps(in, 1))), _simd_extractf128_si(_mm256_castps_si256(_simd16_extract_ps(in, 1)), 1)); __m128i temp = _mm_packs_epi16(templo, temphi); - resultlo = _mm256_inserti128_si256(resultlo, temp, 0); + resultlo = _simd_insertf128_si(resultlo, temp, 0); result = _simd16_insert_si(result, resultlo, 0); return _simd16_castsi_ps(result); @@ -1057,16 +1057,16 @@ template<> struct TypeTraits<SWR_TYPE_FLOAT, 16> : PackTraits<16> simdscalar simdlo = pack(_simd16_extract_ps(in, 0)); simdscalar simdhi = pack(_simd16_extract_ps(in, 1)); - __m128i templo = _mm256_extractf128_si256(_simd_castps_si(simdlo), 0); - __m128i temphi = _mm256_extractf128_si256(_simd_castps_si(simdhi), 0); + __m128i templo = _simd_extractf128_si(_simd_castps_si(simdlo), 0); + __m128i temphi = _simd_extractf128_si(_simd_castps_si(simdhi), 0); #else __m128i templo = _mm256_cvtps_ph(_simd16_extract_ps(in, 0), _MM_FROUND_TRUNC); __m128i temphi = _mm256_cvtps_ph(_simd16_extract_ps(in, 1), _MM_FROUND_TRUNC); #endif - resultlo = _mm256_insertf128_si256(resultlo, templo, 0); - resultlo = _mm256_insertf128_si256(resultlo, temphi, 1); + resultlo = _simd_insertf128_si(resultlo, templo, 0); + resultlo = _simd_insertf128_si(resultlo, temphi, 1); result = _simd16_insert_si(result, resultlo, 0); diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h index 91a994e432e..8f968649a57 100644 --- a/src/gallium/drivers/swr/rasterizer/core/utils.h +++ b/src/gallium/drivers/swr/rasterizer/core/utils.h @@ -147,7 +147,7 @@ void vTranspose(__m128i &row0, __m128i &row1, __m128i &row2, __m128i &row3) #if KNOB_SIMD_WIDTH == 8 INLINE -void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2) +void vTranspose3x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2) { __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, _mm256_undefined_ps()); //y0w0y1w1 y4w4y5w5 @@ -171,7 +171,7 @@ void vTranspose3x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc } INLINE -void vTranspose4x8(__m128 (&vDst)[8], __m256 &vSrc0, __m256 &vSrc1, __m256 &vSrc2, __m256 &vSrc3) +void vTranspose4x8(__m128 (&vDst)[8], const __m256 &vSrc0, const __m256 &vSrc1, const __m256 &vSrc2, const __m256 &vSrc3) { __m256 r0r2 = _mm256_unpacklo_ps(vSrc0, vSrc2); //x0z0x1z1 x4z4x5z5 __m256 r1rx = _mm256_unpacklo_ps(vSrc1, vSrc3); //y0w0y1w1 y4w4y5w5 @@ -357,15 +357,17 @@ struct Transpose8_8 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - 
__m256i src = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg + simdscalari r = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc)); // rrrrrrrrrrrrrrrrgggggggggggggggg - __m256i r = _mm256_permute4x64_epi64(src, 0x50); // 0x50 = 01010000b // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx + simdscalari g = _simd_permute2f128_si(r, r, 1); // ggggggggggggggggxxxxxxxxxxxxxxxx - __m256i g = _mm256_permute4x64_epi64(src, 0xFA); // 0xFA = 11111010b // ggggggggxxxxxxxxggggggggxxxxxxxx + r = _simd_insertf128_si(r, _mm_srli_si128(_simd_extractf128_si(r, 0), 8), 1); // rrrrrrrrxxxxxxxxrrrrrrrrxxxxxxxx - __m256i dst = _mm256_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + g = _simd_insertf128_si(g, _mm_srli_si128(_simd_extractf128_si(g, 0), 8), 1); // ggggggggxxxxxxxxggggggggxxxxxxxx - _mm256_store_si256(reinterpret_cast<__m256i *>(pDst), dst); + simdscalari dst = _simd_unpacklo_epi8(r, g); // rgrgrgrgrgrgrgrgrgrgrgrgrgrgrgrg + + _simd_store_si(reinterpret_cast<simdscalari *>(pDst), dst); } #endif }; @@ -414,35 +416,13 @@ struct Transpose32_32_32_32 vTranspose4x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0), _simd16_extract_ps(src3, 0)); -#if 1 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]); _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]); -#else - _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 20, vDst[5]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]); -#endif vTranspose4x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1), _simd16_extract_ps(src3, 1)); -#if 1 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]); _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]); -#else - _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]); -#endif } #endif }; @@ -489,35 +469,13 @@ struct Transpose32_32_32 vTranspose3x8(vDst, _simd16_extract_ps(src0, 0), _simd16_extract_ps(src1, 0), _simd16_extract_ps(src2, 0)); -#if 1 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 0, reinterpret_cast<simd16scalar *>(vDst)[0]); _simd16_store_ps(reinterpret_cast<float *>(pDst) + 16, reinterpret_cast<simd16scalar *>(vDst)[1]); -#else - _mm_store_ps(reinterpret_cast<float *>(pDst), vDst[0]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 4, vDst[1]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 8, vDst[2]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 12, vDst[3]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 16, vDst[4]); - _mm_store_ps(reinterpret_cast<float *>(pDst) 
+ 20, vDst[5]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 24, vDst[6]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 28, vDst[7]); -#endif vTranspose3x8(vDst, _simd16_extract_ps(src0, 1), _simd16_extract_ps(src1, 1), _simd16_extract_ps(src2, 1)); -#if 1 _simd16_store_ps(reinterpret_cast<float *>(pDst) + 32, reinterpret_cast<simd16scalar *>(vDst)[2]); _simd16_store_ps(reinterpret_cast<float *>(pDst) + 48, reinterpret_cast<simd16scalar *>(vDst)[3]); -#else - _mm_store_ps(reinterpret_cast<float *>(pDst) + 32, vDst[0]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 36, vDst[1]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 40, vDst[2]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 44, vDst[3]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 48, vDst[4]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 52, vDst[5]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 56, vDst[6]); - _mm_store_ps(reinterpret_cast<float *>(pDst) + 60, vDst[7]); -#endif } #endif }; @@ -558,24 +516,20 @@ struct Transpose32_32 INLINE static void Transpose_16(const uint8_t* pSrc, uint8_t* pDst) { - const float *pfSrc = reinterpret_cast<const float *>(pSrc); - - __m256 src_r0 = _mm256_load_ps(pfSrc + 0); - __m256 src_r1 = _mm256_load_ps(pfSrc + 8); - __m256 src_g0 = _mm256_load_ps(pfSrc + 16); - __m256 src_g1 = _mm256_load_ps(pfSrc + 24); - - __m256 dst0 = _mm256_unpacklo_ps(src_r0, src_g0); - __m256 dst1 = _mm256_unpackhi_ps(src_r0, src_g0); - __m256 dst2 = _mm256_unpacklo_ps(src_r1, src_g1); - __m256 dst3 = _mm256_unpackhi_ps(src_r1, src_g1); - - float *pfDst = reinterpret_cast<float *>(pDst); - - _mm256_store_ps(pfDst + 0, dst0); - _mm256_store_ps(pfDst + 8, dst1); - _mm256_store_ps(pfDst + 16, dst2); - _mm256_store_ps(pfDst + 24, dst3); + simdscalar src_r0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc)); + simdscalar src_r1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 8); + simdscalar src_g0 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 16); + simdscalar src_g1 = _simd_load_ps(reinterpret_cast<const float *>(pSrc) + 24); + + simdscalar dst0 = _simd_unpacklo_ps(src_r0, src_g0); + simdscalar dst1 = _simd_unpacklo_ps(src_r0, src_g0); + simdscalar dst2 = _simd_unpacklo_ps(src_r1, src_g1); + simdscalar dst3 = _simd_unpacklo_ps(src_r1, src_g1); + + _simd_store_ps(reinterpret_cast<float *>(pDst) + 0, dst0); + _simd_store_ps(reinterpret_cast<float *>(pDst) + 8, dst1); + _simd_store_ps(reinterpret_cast<float *>(pDst) + 16, dst2); + _simd_store_ps(reinterpret_cast<float *>(pDst) + 24, dst3); } #endif }; @@ -625,25 +579,25 @@ struct Transpose16_16_16_16 simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc)); simd16scalari src_ba = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc + sizeof(simd16scalari))); - __m256i src_r = _simd16_extract_si(src_rg, 0); - __m256i src_g = _simd16_extract_si(src_rg, 1); - __m256i src_b = _simd16_extract_si(src_ba, 0); - __m256i src_a = _simd16_extract_si(src_ba, 1); - - __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); - __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); - __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); - __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); - - __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); - __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); - __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); - __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); - - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, 
dst1); - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); + simdscalari src_r = _simd16_extract_si(src_rg, 0); + simdscalari src_g = _simd16_extract_si(src_rg, 1); + simdscalari src_b = _simd16_extract_si(src_ba, 0); + simdscalari src_a = _simd16_extract_si(src_ba, 1); + + simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g); + simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g); + simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a); + simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a); + + simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0); + simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0); + simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1); + simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1); + + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); } #endif }; @@ -691,25 +645,25 @@ struct Transpose16_16_16 { simd16scalari src_rg = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc)); - __m256i src_r = _simd16_extract_si(src_rg, 0); - __m256i src_g = _simd16_extract_si(src_rg, 1); - __m256i src_b = _mm256_load_si256(reinterpret_cast<const __m256i *>(pSrc + sizeof(simd16scalari))); - __m256i src_a = _mm256_undefined_si256(); - - __m256i rg0 = _mm256_unpacklo_epi16(src_r, src_g); - __m256i rg1 = _mm256_unpackhi_epi16(src_r, src_g); - __m256i ba0 = _mm256_unpacklo_epi16(src_b, src_a); - __m256i ba1 = _mm256_unpackhi_epi16(src_b, src_a); - - __m256i dst0 = _mm256_unpacklo_epi32(rg0, ba0); - __m256i dst1 = _mm256_unpackhi_epi32(rg0, ba0); - __m256i dst2 = _mm256_unpacklo_epi32(rg1, ba1); - __m256i dst3 = _mm256_unpackhi_epi32(rg1, ba1); - - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 0, dst0); - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 1, dst1); - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 2, dst2); - _mm256_store_si256(reinterpret_cast<__m256i*>(pDst) + 3, dst3); + simdscalari src_r = _simd16_extract_si(src_rg, 0); + simdscalari src_g = _simd16_extract_si(src_rg, 1); + simdscalari src_b = _simd_load_si(reinterpret_cast<const simdscalari *>(pSrc + sizeof(simd16scalari))); + simdscalari src_a = _mm256_undefined_si256(); + + simdscalari rg0 = _simd_unpacklo_epi16(src_r, src_g); + simdscalari rg1 = _simd_unpackhi_epi16(src_r, src_g); + simdscalari ba0 = _simd_unpacklo_epi16(src_b, src_a); + simdscalari ba1 = _simd_unpackhi_epi16(src_b, src_a); + + simdscalari dst0 = _simd_unpacklo_epi32(rg0, ba0); + simdscalari dst1 = _simd_unpackhi_epi32(rg0, ba0); + simdscalari dst2 = _simd_unpacklo_epi32(rg1, ba1); + simdscalari dst3 = _simd_unpackhi_epi32(rg1, ba1); + + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 0, dst0); + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 1, dst1); + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 2, dst2); + _simd_store_si(reinterpret_cast<simdscalari *>(pDst) + 3, dst3); } #endif }; @@ -749,13 +703,13 @@ struct Transpose16_16 { simd16scalari result = _simd16_setzero_si(); - simd16scalari src = _simd16_castps_si(_simd16_load_ps(reinterpret_cast<const float *>(pSrc))); + simd16scalari src = _simd16_load_si(reinterpret_cast<const simd16scalari *>(pSrc)); simdscalari srclo = _simd16_extract_si(src, 0); simdscalari srchi = _simd16_extract_si(src, 1); - result = _simd16_insert_si(result, _mm256_unpacklo_epi16(srclo, 
srchi), 0); - result = _simd16_insert_si(result, _mm256_unpackhi_epi16(srclo, srchi), 1); + result = _simd16_insert_si(result, _simd_unpacklo_epi16(srclo, srchi), 0); + result = _simd16_insert_si(result, _simd_unpackhi_epi16(srclo, srchi), 1); _simd16_store_si(reinterpret_cast<simd16scalari *>(pDst), result); } diff --git a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h index 21ee443841c..0c0b96204f6 100644 --- a/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h +++ b/src/gallium/drivers/swr/rasterizer/memory/StoreTile.h @@ -34,7 +34,6 @@ #include "core/format_conversion.h" #include "memory/TilingFunctions.h" -#include "memory/tilingtraits.h" #include "memory/Convert.h" #include "core/multisample.h" @@ -103,6 +102,33 @@ struct StorePixels<8, 2> } }; +#if USE_8x2_TILE_BACKEND +template <> +struct StorePixels<8, 4> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) + { + // 8 x 2 bytes = 16 bytes, 16 pixels + const uint16_t *pSrc16 = reinterpret_cast<const uint16_t *>(pSrc); + + uint16_t **ppDsts16 = reinterpret_cast<uint16_t **>(ppDsts); + + // Unswizzle from SWR-Z order + ppDsts16[0][0] = pSrc16[0]; // 0 1 + ppDsts16[0][1] = pSrc16[2]; // 4 5 + + ppDsts16[1][0] = pSrc16[1]; // 2 3 + ppDsts16[1][1] = pSrc16[3]; // 6 7 + + ppDsts16[2][0] = pSrc16[4]; // 8 9 + ppDsts16[2][1] = pSrc16[6]; // C D + + ppDsts16[3][0] = pSrc16[5]; // A B + ppDsts16[3][1] = pSrc16[7]; // E F + } +}; + +#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. @@ -131,6 +157,33 @@ struct StorePixels<16, 2> } }; +#if USE_8x2_TILE_BACKEND +template <> +struct StorePixels<16, 4> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) + { + // 8 x 4 bytes = 32 bytes, 16 pixels + const uint32_t *pSrc32 = reinterpret_cast<const uint32_t *>(pSrc); + + uint32_t **ppDsts32 = reinterpret_cast<uint32_t **>(ppDsts); + + // Unswizzle from SWR-Z order + ppDsts32[0][0] = pSrc32[0]; // 0 1 + ppDsts32[0][1] = pSrc32[2]; // 4 5 + + ppDsts32[1][0] = pSrc32[1]; // 2 3 + ppDsts32[1][1] = pSrc32[3]; // 6 7 + + ppDsts32[2][0] = pSrc32[4]; // 8 9 + ppDsts32[2][1] = pSrc32[6]; // C D + + ppDsts32[3][0] = pSrc32[5]; // A B + ppDsts32[3][1] = pSrc32[7]; // E F + } +}; + +#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 
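The index comments in the new StorePixels<8, 4> and StorePixels<16, 4> specializations above imply that a simd16 (8x2) raster-tile span holds its 16 pixels as four 2x2 quads in Z-order, and that the store's job is to unswizzle them into two row-major rows, each split across two destination pointers. A scalar sketch of that mapping, inferred from those comments (illustrative only, not code from the patch):

    // Illustrative unswizzle of one 8x2 simd16 span from SWR-Z (quad) order to
    // row-major order, matching the index comments above. In the real store each
    // output row is further split across two destination pointers
    // (ppDsts[0]/[2] for row 0, ppDsts[1]/[3] for row 1).
    template <typename T>
    void UnswizzleSwrZ8x2(const T (&src)[16], T (&row0)[8], T (&row1)[8])
    {
        for (int i = 0; i < 16; ++i)
        {
            int quad = i / 4;             // four 2x2 quads, left to right
            int row  = (i % 4) / 2;       // top or bottom row within the quad
            int col  = quad * 2 + (i % 2);
            (row == 0 ? row0 : row1)[col] = src[i];
        }
    }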
@@ -164,15 +217,21 @@ struct StorePixels<32, 4> { static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[4]) { - __m128i quad0 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[0]); - __m128i quad1 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[1]); - __m128i quad2 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[2]); - __m128i quad3 = _mm_load_si128(&reinterpret_cast<const __m128i *>(pSrc)[3]); - - _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[0]), _mm_unpacklo_epi64(quad0, quad1)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[1]), _mm_unpackhi_epi64(quad0, quad1)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[2]), _mm_unpacklo_epi64(quad2, quad3)); - _mm_storeu_si128(reinterpret_cast<__m128i *>(ppDsts[3]), _mm_unpackhi_epi64(quad2, quad3)); + // 4 x 16 bytes = 64 bytes, 16 pixels + const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); + + __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); + + // Unswizzle from SWR-Z order + __m128i quad0 = _mm_load_si128(&pSrc128[0]); // 0 1 2 3 + __m128i quad1 = _mm_load_si128(&pSrc128[1]); // 4 5 6 7 + __m128i quad2 = _mm_load_si128(&pSrc128[2]); // 8 9 A B + __m128i quad3 = _mm_load_si128(&pSrc128[3]); // C D E F + + _mm_storeu_si128(ppDsts128[0], _mm_unpacklo_epi64(quad0, quad1)); // 0 1 4 5 + _mm_storeu_si128(ppDsts128[1], _mm_unpackhi_epi64(quad0, quad1)); // 2 3 6 7 + _mm_storeu_si128(ppDsts128[2], _mm_unpacklo_epi64(quad2, quad3)); // 8 9 C D + _mm_storeu_si128(ppDsts128[3], _mm_unpackhi_epi64(quad2, quad3)); // A B E F } }; @@ -203,6 +262,30 @@ struct StorePixels<64, 4> } }; +#if USE_8x2_TILE_BACKEND +template <> +struct StorePixels<64, 8> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[8]) + { + // 8 x 16 bytes = 128 bytes, 16 pixels + const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); + + __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); + + // order of pointers match SWR-Z layout + *ppDsts128[0] = pSrc128[0]; // 0 1 + *ppDsts128[1] = pSrc128[1]; // 2 3 + *ppDsts128[2] = pSrc128[2]; // 4 5 + *ppDsts128[3] = pSrc128[3]; // 6 7 + *ppDsts128[4] = pSrc128[4]; // 8 9 + *ppDsts128[5] = pSrc128[5]; // A B + *ppDsts128[6] = pSrc128[6]; // C D + *ppDsts128[7] = pSrc128[7]; // E F + } +}; + +#endif ////////////////////////////////////////////////////////////////////////// /// StorePixels (32-bit pixel specialization) /// @brief Stores a 4x2 (AVX) raster-tile to two rows. 
@@ -233,6 +316,28 @@ struct StorePixels<128, 8> } }; +#if USE_8x2_TILE_BACKEND +template <> +struct StorePixels<128, 16> +{ + static void Store(const uint8_t* pSrc, uint8_t* (&ppDsts)[16]) + { + // 16 x 16 bytes = 256 bytes, 16 pixels + const __m128i *pSrc128 = reinterpret_cast<const __m128i *>(pSrc); + + __m128i **ppDsts128 = reinterpret_cast<__m128i **>(ppDsts); + + for (uint32_t i = 0; i < 16; i += 4) + { + *ppDsts128[i + 0] = pSrc128[i + 0]; + *ppDsts128[i + 1] = pSrc128[i + 2]; + *ppDsts128[i + 2] = pSrc128[i + 1]; + *ppDsts128[i + 3] = pSrc128[i + 3]; + } + } +}; + +#endif ////////////////////////////////////////////////////////////////////////// /// ConvertPixelsSOAtoAOS - Conversion for SIMD pixel (4x2 or 2x2) ////////////////////////////////////////////////////////////////////////// @@ -332,6 +437,51 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; + static const SWR_FORMAT DstFormat = B5G6R5_UNORM; + + static const uint32_t MAX_RASTER_TILE_BYTES = 16 * 16; // 16 pixels * 16 bytes per pixel + + OSALIGNSIMD16(uint8_t) aosTile[MAX_RASTER_TILE_BYTES]; + + // Load hot-tile + simd16vector src, dst; + LoadSOA<SrcFormat>(pSrc, src); + + // deswizzle + dst.x = src[FormatTraits<DstFormat>::swizzle(0)]; + dst.y = src[FormatTraits<DstFormat>::swizzle(1)]; + dst.z = src[FormatTraits<DstFormat>::swizzle(2)]; + + // clamp + dst.x = Clamp<DstFormat>(dst.x, 0); + dst.y = Clamp<DstFormat>(dst.y, 1); + dst.z = Clamp<DstFormat>(dst.z, 2); + + // normalize + dst.x = Normalize<DstFormat>(dst.x, 0); + dst.y = Normalize<DstFormat>(dst.y, 1); + dst.z = Normalize<DstFormat>(dst.z, 2); + + // pack + simd16scalari packed = _simd16_castps_si(dst.x); + + SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(0) == 5); + SWR_ASSERT(FormatTraits<DstFormat>::GetBPC(1) == 6); + + packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.y), 5)); + packed = _simd16_or_si(packed, _simd16_slli_epi32(_simd16_castps_si(dst.z), 5 + 6)); + + // pack low 16 bits of each 32 bit lane to low 128 bits of dst + uint32_t *pPacked = (uint32_t*)&packed; + uint16_t *pAosTile = (uint16_t*)&aosTile[0]; + for (uint32_t t = 0; t < KNOB_SIMD16_WIDTH; ++t) + { + *pAosTile++ = *pPacked++; + } + +#else static const SWR_FORMAT SrcFormat = R32G32B32A32_FLOAT; static const SWR_FORMAT DstFormat = B5G6R5_UNORM; static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel @@ -371,6 +521,7 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B5G6R5_UNORM > *pAosTile++ = *pPacked++; } +#endif // Store data into destination StorePixels<FormatTraits<DstFormat>::bpp, NumDests>::Store(aosTile, ppDsts); } @@ -409,56 +560,23 @@ struct ConvertPixelsSOAtoAOS<R32_FLOAT, R24_UNORM_X8_TYPELESS> // Store data into destination but don't overwrite the X8 bits // Each 4-pixel row is 16-bytes -#if 1 + simdscalari loadlo = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile)); simdscalari loadhi = _simd_load_si(reinterpret_cast<simdscalari *>(aosTile + sizeof(simdscalari))); simdscalari templo = _simd_unpacklo_epi64(loadlo, loadhi); simdscalari temphi = _simd_unpackhi_epi64(loadlo, loadhi); - simdscalari destlo = _mm256_loadu2_m128i(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0])); - simdscalari desthi = _mm256_loadu2_m128i(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i 
*>(ppDsts[2])); + simdscalari destlo = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0])); + simdscalari desthi = _simd_loadu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2])); - simdscalari mask = _simd_set1_epi32(0xFFFFFF); + simdscalari mask = _simd_set1_epi32(0x00FFFFFF); destlo = _simd_or_si(_simd_andnot_si(mask, destlo), _simd_and_si(mask, templo)); desthi = _simd_or_si(_simd_andnot_si(mask, desthi), _simd_and_si(mask, templo)); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi); -#else - __m128i *pZRow01 = (__m128i*)aosTile; - __m128i vQuad00 = _mm_load_si128(pZRow01); - __m128i vQuad01 = _mm_load_si128(pZRow01 + 1); - __m128i vQuad02 = _mm_load_si128(pZRow01 + 2); - __m128i vQuad03 = _mm_load_si128(pZRow01 + 3); - - __m128i vRow00 = _mm_unpacklo_epi64(vQuad00, vQuad01); - __m128i vRow10 = _mm_unpackhi_epi64(vQuad00, vQuad01); - __m128i vRow20 = _mm_unpacklo_epi64(vQuad02, vQuad03); - __m128i vRow30 = _mm_unpackhi_epi64(vQuad02, vQuad03); - - __m128i vDst0 = _mm_loadu_si128((const __m128i*)ppDsts[0]); - __m128i vDst1 = _mm_loadu_si128((const __m128i*)ppDsts[1]); - __m128i vDst2 = _mm_loadu_si128((const __m128i*)ppDsts[2]); - __m128i vDst3 = _mm_loadu_si128((const __m128i*)ppDsts[3]); - - __m128i vMask = _mm_set1_epi32(0xFFFFFF); - - vDst0 = _mm_andnot_si128(vMask, vDst0); - vDst0 = _mm_or_si128(vDst0, _mm_and_si128(vRow00, vMask)); - vDst1 = _mm_andnot_si128(vMask, vDst1); - vDst1 = _mm_or_si128(vDst1, _mm_and_si128(vRow10, vMask)); - vDst2 = _mm_andnot_si128(vMask, vDst2); - vDst2 = _mm_or_si128(vDst2, _mm_and_si128(vRow20, vMask)); - vDst3 = _mm_andnot_si128(vMask, vDst3); - vDst3 = _mm_or_si128(vDst3, _mm_and_si128(vRow10, vMask)); - - _mm_storeu_si128((__m128i*)ppDsts[0], vDst0); - _mm_storeu_si128((__m128i*)ppDsts[1], vDst1); - _mm_storeu_si128((__m128i*)ppDsts[2], vDst2); - _mm_storeu_si128((__m128i*)ppDsts[3], vDst3); -#endif + _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[1]), reinterpret_cast<__m128i *>(ppDsts[0]), destlo); + _simd_storeu2_si(reinterpret_cast<__m128i *>(ppDsts[3]), reinterpret_cast<__m128i *>(ppDsts[2]), desthi); #else static const uint32_t MAX_RASTER_TILE_BYTES = 128; // 8 pixels * 16 bytes per pixel @@ -508,7 +626,7 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb simd16scalar comp3 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(3) * sizeof(simd16scalar))); // float32 aaaaaaaaaaaaaaaa - // clamp + // clamp const simd16scalar zero = _simd16_setzero_ps(); const simd16scalar ones = _simd16_set1_ps(1.0f); @@ -524,15 +642,15 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs comp3 = _simd16_max_ps(comp3, zero); comp3 = _simd16_min_ps(comp3, ones); + // gamma-correct only rgb if (FormatTraits<DstFormat>::isSRGB) { - // Gamma-correct only rgb comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); } - // convert float components from 0.0f .. 1.0f to correct scale for 0 .. 
255 dest format + // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); @@ -544,15 +662,14 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb simd16scalari src3 = _simd16_cvtps_epi32(comp3); // padded byte aaaaaaaaaaaaaaaa -#if 1 - // SOA to AOS conversion + // SOA to AOS conversion src1 = _simd16_slli_epi32(src1, 8); src2 = _simd16_slli_epi32(src2, 16); src3 = _simd16_slli_epi32(src3, 24); simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); // 0 1 2 3 4 5 6 7 8 9 A B C D E F - // de-swizzle conversion + // de-swizzle conversion #if 1 simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) // 4 5 6 7 4 5 6 7 C D E F C D E F @@ -563,66 +680,11 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDs final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); #endif -#endif -#if KNOB_ARCH == KNOB_ARCH_AVX - - // splitting into two sets of 4 wide integer vector types - // because AVX doesn't have instructions to support this operation at 8 wide -#if 0 - __m128i srcLo0 = _mm256_castsi256_si128(src0); // 000r000r000r000r - __m128i srcLo1 = _mm256_castsi256_si128(src1); // 000g000g000g000g - __m128i srcLo2 = _mm256_castsi256_si128(src2); // 000b000b000b000b - __m128i srcLo3 = _mm256_castsi256_si128(src3); // 000a000a000a000a - - __m128i srcHi0 = _mm256_extractf128_si256(src0, 1); // 000r000r000r000r - __m128i srcHi1 = _mm256_extractf128_si256(src1, 1); // 000g000g000g000g - __m128i srcHi2 = _mm256_extractf128_si256(src2, 1); // 000b000b000b000b - __m128i srcHi3 = _mm256_extractf128_si256(src3, 1); // 000a000a000a000a - - srcLo1 = _mm_slli_si128(srcLo1, 1); // 00g000g000g000g0 - srcHi1 = _mm_slli_si128(srcHi1, 1); // 00g000g000g000g0 - srcLo2 = _mm_slli_si128(srcLo2, 2); // 0b000b000b000b00 - srcHi2 = _mm_slli_si128(srcHi2, 2); // 0b000b000b000b00 - srcLo3 = _mm_slli_si128(srcLo3, 3); // a000a000a000a000 - srcHi3 = _mm_slli_si128(srcHi3, 3); // a000a000a000a000 - - srcLo0 = _mm_or_si128(srcLo0, srcLo1); // 00gr00gr00gr00gr - srcLo2 = _mm_or_si128(srcLo2, srcLo3); // ab00ab00ab00ab00 - - srcHi0 = _mm_or_si128(srcHi0, srcHi1); // 00gr00gr00gr00gr - srcHi2 = _mm_or_si128(srcHi2, srcHi3); // ab00ab00ab00ab00 - - srcLo0 = _mm_or_si128(srcLo0, srcLo2); // abgrabgrabgrabgr - srcHi0 = _mm_or_si128(srcHi0, srcHi2); // abgrabgrabgrabgr - - // unpack into rows that get the tiling order correct - __m128i vRow00 = _mm_unpacklo_epi64(srcLo0, srcHi0); // abgrabgrabgrabgrabgrabgrabgrabgr - __m128i vRow10 = _mm_unpackhi_epi64(srcLo0, srcHi0); - - __m256i final = _mm256_castsi128_si256(vRow00); - final = _mm256_insertf128_si256(final, vRow10, 1); - -#else -#if 0 - simd16scalari final = _simd16_setzero_si(); - -#endif -#endif -#elif KNOB_ARCH >= KNOB_ARCH_AVX2 - // logic is as above, only wider -#if 0 - src1 = _simd16_slli_epi32(src1, 8); - src2 = _simd16_slli_epi32(src2, 16); - src3 = _simd16_slli_epi32(src3, 24); - - simd16scalari final = 
_simd16_or_si(_simd16_or_si(src0, src1), _simd16_or_si(src2, src3)); - - final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); - -#endif -#endif - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); - _mm256_storeu2_m128i(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1)); + // store 8x2 memory order: + // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } + // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } + _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); + _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1)); } #endif @@ -730,9 +792,74 @@ INLINE static void FlatConvert(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst #endif #endif - _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); + _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final); } +#if USE_8x2_TILE_BACKEND +template<SWR_FORMAT DstFormat> +INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst0, uint8_t* pDst1, uint8_t* pDst2, uint8_t* pDst3) +{ + // swizzle rgba -> bgra while we load + simd16scalar comp0 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(0) * sizeof(simd16scalar))); // float32 rrrrrrrrrrrrrrrr + simd16scalar comp1 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(1) * sizeof(simd16scalar))); // float32 gggggggggggggggg + simd16scalar comp2 = _simd16_load_ps(reinterpret_cast<const float*>(pSrc + FormatTraits<DstFormat>::swizzle(2) * sizeof(simd16scalar))); // float32 bbbbbbbbbbbbbbbb + + // clamp + const simd16scalar zero = _simd16_setzero_ps(); + const simd16scalar ones = _simd16_set1_ps(1.0f); + + comp0 = _simd16_max_ps(comp0, zero); + comp0 = _simd16_min_ps(comp0, ones); + + comp1 = _simd16_max_ps(comp1, zero); + comp1 = _simd16_min_ps(comp1, ones); + + comp2 = _simd16_max_ps(comp2, zero); + comp2 = _simd16_min_ps(comp2, ones); + + // gamma-correct only rgb + if (FormatTraits<DstFormat>::isSRGB) + { + comp0 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(0, comp0); + comp1 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(1, comp1); + comp2 = FormatTraits<R32G32B32A32_FLOAT>::convertSrgb(2, comp2); + } + + // convert float components from 0.0f..1.0f to correct scale for 0..255 dest format + comp0 = _simd16_mul_ps(comp0, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(0))); + comp1 = _simd16_mul_ps(comp1, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(1))); + comp2 = _simd16_mul_ps(comp2, _simd16_set1_ps(FormatTraits<DstFormat>::fromFloat(2))); + + // moving to 16 wide integer vector types + simd16scalari src0 = _simd16_cvtps_epi32(comp0); // padded byte rrrrrrrrrrrrrrrr + simd16scalari src1 = _simd16_cvtps_epi32(comp1); // padded byte gggggggggggggggg + simd16scalari src2 = _simd16_cvtps_epi32(comp2); // padded byte bbbbbbbbbbbbbbbb + + // SOA to AOS conversion + src1 = _simd16_slli_epi32(src1, 8); + src2 = _simd16_slli_epi32(src2, 16); + + simd16scalari final = _simd16_or_si(_simd16_or_si(src0, src1), src2); // 0 1 2 3 4 5 6 7 8 9 A B C D E F + + // de-swizzle conversion +#if 1 + simd16scalari final0 = _simd16_permute2f128_si(final, final, 0xA0); // (2, 2, 0, 0) // 0 1 2 3 0 1 2 3 8 9 A B 8 9 A B + simd16scalari final1 = _simd16_permute2f128_si(final, final, 0xF5); // (3, 3, 1, 1) 
// 4 5 6 7 4 5 6 7 C D E F C D E F + + final = _simd16_shuffle_epi64(final0, final1, 0xCC); // (1 1 0 0 1 1 0 0) // 0 1 4 5 2 3 6 7 8 9 C D A B E F + +#else + final = _simd16_permute_epi32(final, _simd16_set_epi32(15, 14, 11, 10, 13, 12, 9, 8, 7, 6, 3, 2, 5, 4, 1, 0)); + +#endif + // store 8x2 memory order: + // row0: [ pDst0, pDst2 ] = { 0 1 4 5 }, { 8 9 C D } + // row1: [ pDst1, pDst3 ] = { 2 3 6 7 }, { A B E F } + _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst1), reinterpret_cast<__m128i *>(pDst0), _simd16_extract_si(final, 0)); + _simd_storeu2_si(reinterpret_cast<__m128i *>(pDst3), reinterpret_cast<__m128i *>(pDst2), _simd16_extract_si(final, 1)); +} + +#endif template<SWR_FORMAT DstFormat> INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_t* pDst1) { @@ -816,7 +943,7 @@ INLINE static void FlatConvertNoAlpha(const uint8_t* pSrc, uint8_t* pDst, uint8_ #endif - _mm256_storeu2_m128i((__m128i*)pDst1, (__m128i*)pDst, final); + _simd_storeu2_si((__m128i*)pDst1, (__m128i*)pDst, final); } template<> @@ -825,7 +952,11 @@ struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8A8_UNORM> template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); +#else FlatConvert<B8G8R8A8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); +#endif } }; @@ -835,7 +966,11 @@ struct ConvertPixelsSOAtoAOS<R32G32B32A32_FLOAT, B8G8R8X8_UNORM> template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); +#else FlatConvertNoAlpha<B8G8R8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); +#endif } }; @@ -845,7 +980,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8A8_UNORM_SRGB > template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); +#else FlatConvert<B8G8R8A8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); +#endif } }; @@ -855,7 +994,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, B8G8R8X8_UNORM_SRGB > template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); +#else FlatConvertNoAlpha<B8G8R8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); +#endif } }; @@ -879,7 +1022,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM > template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); +#else FlatConvertNoAlpha<R8G8B8X8_UNORM>(pSrc, ppDsts[0], ppDsts[1]); +#endif } }; @@ -903,7 +1050,11 @@ struct ConvertPixelsSOAtoAOS < R32G32B32A32_FLOAT, R8G8B8X8_UNORM_SRGB > template <size_t NumDests> INLINE static void Convert(const uint8_t* pSrc, uint8_t* (&ppDsts)[NumDests]) { +#if USE_8x2_TILE_BACKEND + FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1], ppDsts[2], ppDsts[3]); +#else FlatConvertNoAlpha<R8G8B8X8_UNORM_SRGB>(pSrc, ppDsts[0], ppDsts[1]); +#endif } }; @@ -926,13 +1077,13 @@ struct StoreRasterTile #if USE_8x2_TILE_BACKEND typedef SimdTile_16<SrcFormat, DstFormat> SimdT; - SimdT* 
pSrcSimdTiles = (SimdT*)pSrc; + SimdT *pSrcSimdTiles = reinterpret_cast<SimdT *>(pSrc); // Compute which simd tile we're accessing within 8x8 tile. // i.e. Compute linear simd tile coordinate given (x, y) in pixel coordinates. uint32_t simdIndex = (y / SIMD16_TILE_Y_DIM) * (KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM) + (x / SIMD16_TILE_X_DIM); - SimdT* pSimdTile = &pSrcSimdTiles[simdIndex]; + SimdT *pSimdTile = &pSrcSimdTiles[simdIndex]; uint32_t simdOffset = (y % SIMD16_TILE_Y_DIM) * SIMD16_TILE_X_DIM + (x % SIMD16_TILE_X_DIM); @@ -1024,8 +1175,41 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); +#if USE_8x2_TILE_BACKEND + + const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; + + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + dx / 2, // row 0, col 1 + pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 + }; + + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dx; + ppDsts[1] += dx; + ppDsts[2] += dx; + ppDsts[3] += dx; + } + + ppDsts[0] += dy; + ppDsts[1] += dy; + ppDsts[2] += dy; + ppDsts[3] += dy; + } +#else uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) @@ -1045,6 +1229,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 8>, SrcFormat, DstFormat> ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; } +#endif } }; @@ -1077,8 +1262,41 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); +#if USE_8x2_TILE_BACKEND + + const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; + + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + dx / 2, // row 0, col 1 + pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 + }; + + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dx; + ppDsts[1] += dx; + ppDsts[2] += dx; + ppDsts[3] += dx; + } + + ppDsts[0] += dy; + ppDsts[1] += dy; + 
ppDsts[2] += dy; + ppDsts[3] += dy; + } +#else uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD_TILE_Y_DIM; ++row) @@ -1098,6 +1316,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 16>, SrcFormat, DstFormat ppRows[0] = ppStartRows[0] + 2 * pDstSurface->pitch; ppRows[1] = ppStartRows[1] + 2 * pDstSurface->pitch; } +#endif } }; @@ -1130,32 +1349,39 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); #if USE_8x2_TILE_BACKEND - uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch, pDst + (SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL) / 2, pDst + pDstSurface->pitch + (SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL) / 2 }; - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM / SIMD16_TILE_Y_DIM; ++row) + const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; + + uint8_t* ppDsts[] = { - uint8_t* ppStartRows[] = { ppRows[0], ppRows[1], ppRows[2], ppRows[3] }; + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + dx / 2, // row 0, col 1 + pDst + pDstSurface->pitch + dx / 2 // row 1, col 1 + }; - for (uint32_t col = 0; col < KNOB_TILE_X_DIM / SIMD16_TILE_X_DIM; ++col) + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) { - // Format conversion and convert from SOA to AOS, and store the rows. 
- ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppRows); - - ppRows[0] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - ppRows[1] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - ppRows[2] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; - ppRows[3] += SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dx; + ppDsts[1] += dx; + ppDsts[2] += dx; + ppDsts[3] += dx; } - ppRows[0] = ppStartRows[0] + SIMD16_TILE_Y_DIM * pDstSurface->pitch; - ppRows[1] = ppStartRows[1] + SIMD16_TILE_Y_DIM * pDstSurface->pitch; - ppRows[2] = ppStartRows[2] + SIMD16_TILE_Y_DIM * pDstSurface->pitch; - ppRows[3] = ppStartRows[3] + SIMD16_TILE_Y_DIM * pDstSurface->pitch; + ppDsts[0] += dy; + ppDsts[1] += dy; + ppDsts[2] += dy; + ppDsts[3] += dy; } #else uint8_t* ppRows[] = { pDst, pDst + pDstSurface->pitch }; @@ -1184,15 +1410,17 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 32>, SrcFormat, DstFormat ////////////////////////////////////////////////////////////////////////// /// OptStoreRasterTile - SWR_TILE_MODE_NONE specialization for 64bpp ////////////////////////////////////////////////////////////////////////// -template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> +template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat > struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; static const size_t MAX_DST_COLUMN_BYTES = 16; +#if !USE_8x2_TILE_BACKEND static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. 
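The pointer stepping in the rewritten loops above is compact enough to misread: dx advances all four destination pointers one simd16 tile to the right, and dy both rewinds the row of dx steps and drops the pointers down SIMD16_TILE_Y_DIM raster rows. A worked example for the 32bpp case, assuming an 8x8 raster tile and an 8x2 simd16 tile (those dimensions are implied by the loop bounds and static_asserts but not spelled out here):

    // Illustrative arithmetic only (assumes KNOB_TILE_X_DIM == 8,
    // SIMD16_TILE_X_DIM == 8, SIMD16_TILE_Y_DIM == 2, 4 bytes per dest pixel).
    //
    //   dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL              =  8 * 4          = 32 bytes
    //   dy = SIMD16_TILE_Y_DIM * pitch - KNOB_TILE_X_DIM * 4      =  2 * pitch - 32
    //
    // Inner loop: one iteration per simd16 tile across the raster tile (exactly
    // one here, since 8 == 8), each adding dx to every pointer.  Outer loop:
    // adding dy then undoes the 32 bytes of dx steps and moves the pointers down
    // two raster rows, so the 8x8 tile is covered in four 8x2 passes.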
@@ -1213,8 +1441,58 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); +#if USE_8x2_TILE_BACKEND + + const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + const uint32_t dy = SIMD16_TILE_Y_DIM * pDstSurface->pitch; + + // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) + static_assert(dx == MAX_DST_COLUMN_BYTES * 4, "Invalid column offsets"); + +#if 1 + uint8_t *ppDsts[8]; + + { + for (uint32_t y = 0; y < 2; y += 1) + { + for (uint32_t x = 0; x < 4; x += 1) + { + ppDsts[x * 2 + y] = pDst + y * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; + } + } + } + +#else + uint8_t *ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 + pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3 // row 1, col 3 + }; + +#endif + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + { + ppDsts[i] += dy; + } + } +#else uint8_t* ppDsts[] = { pDst, // row 0, col 0 @@ -1250,6 +1528,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 64>, SrcFormat, DstFormat ppDsts[2] = ppStartRows[2] + 2 * pDstSurface->pitch; ppDsts[3] = ppStartRows[3] + 2 * pDstSurface->pitch; } +#endif } }; @@ -1260,11 +1539,13 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; static const size_t MAX_DST_COLUMN_BYTES = 16; +#if !USE_8x2_TILE_BACKEND static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; static const size_t DST_COLUMN_BYTES_PER_SRC = KNOB_SIMD_WIDTH * DST_BYTES_PER_PIXEL / 2; +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. 
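For the wide formats the destination row can no longer be written as a single span, so the loop above builds one pointer per 16-byte column and per row; the x * 2 + y indexing interleaves them so that neighbouring ppDsts entries form the row-0/row-1 pairs the SWR-Z-ordered store expects. Spelled out for the 64bpp case (derived from the loop and the equivalent explicit initializer above, with MAX_DST_COLUMN_BYTES == 16):

    // ppDsts[x * 2 + y] = pDst + y * pitch + x * 16   (64bpp case, sketch of the loop above)
    //
    //   ppDsts[0] -> row 0, bytes  0..15      ppDsts[1] -> row 1, bytes  0..15
    //   ppDsts[2] -> row 0, bytes 16..31      ppDsts[3] -> row 1, bytes 16..31
    //   ppDsts[4] -> row 0, bytes 32..47      ppDsts[5] -> row 1, bytes 32..47
    //   ppDsts[6] -> row 0, bytes 48..63      ppDsts[7] -> row 1, bytes 48..63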
@@ -1285,8 +1566,92 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); +#if USE_8x2_TILE_BACKEND + + const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * pDstSurface->pitch; // double up on tile y dim, one simd16 tile will do twice the rows + + // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) + static_assert(dx == MAX_DST_COLUMN_BYTES * 8, "Invalid column offsets"); + +#if 1 + uint8_t *ppDsts[16]; + + { + for (uint32_t y = 0; y < 2; y += 1) + { + for (uint32_t x = 0; x < 4; x += 1) + { + ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; + ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * pDstSurface->pitch + x * MAX_DST_COLUMN_BYTES; + } + } + } + +#else + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + pDstSurface->pitch, // row 1, col 0 + pDst + MAX_DST_COLUMN_BYTES, // row 0, col 1 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES, // row 1, col 1 + pDst + MAX_DST_COLUMN_BYTES * 2, // row 0, col 2 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 2, // row 1, col 2 + pDst + MAX_DST_COLUMN_BYTES * 3, // row 0, col 3 + pDst + pDstSurface->pitch + MAX_DST_COLUMN_BYTES * 3, // row 1, col 3 + + pDst + pDstSurface->pitch * 2, // row 2, col 0 + pDst + pDstSurface->pitch * 3, // row 3, col 0 + pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES, // row 2, col 1 + pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES, // row 3, col 1 + pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 2, // row 2, col 2 + pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 2, // row 3, col 2 + pDst + pDstSurface->pitch * 2 + MAX_DST_COLUMN_BYTES * 3, // row 2, col 3 + pDst + pDstSurface->pitch * 3 + MAX_DST_COLUMN_BYTES * 3 // row 3, col 3 + }; + +#endif +#if 1 + // Raster tile height is quadruple simd16 tile height + static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim"); + + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + + // tile rows 0 thru 3 + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + { + ppDsts[i] += dy; + } + + // tile rows 4 thru 7 + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + +#else + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2) + { + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM * 2 == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + + // Format conversion, convert from SOA to AOS, and store + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + { + ppDsts[i] += dy; + } + } + +#endif +#else struct DstPtrs { uint8_t* ppDsts[8]; @@ -1330,6 +1695,7 @@ struct OptStoreRasterTile< 
TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstForma ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * pDstSurface->pitch; ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * pDstSurface->pitch; } +#endif } }; @@ -1340,6 +1706,7 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. @@ -1365,6 +1732,33 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. // There will be 2 x 4-wide columns in an 8x8 raster tile. +#if USE_8x2_TILE_BACKEND + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; + + uint8_t *ppDsts[] = + { + pDst, + pDst + DestRowWidthBytes, + pDst + DestRowWidthBytes / 4, + pDst + DestRowWidthBytes + DestRowWidthBytes / 4 + }; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dy; + ppDsts[1] += dy; + ppDsts[2] += dy; + ppDsts[3] += dy; + } +#else uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1388,6 +1782,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 8>, SrcFormat, Dst ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); pSrc += pSrcInc; } +#endif } }; @@ -1398,6 +1793,7 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. @@ -1423,6 +1819,33 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. // There will be 2 x 4-wide columns in an 8x8 raster tile. 
+#if USE_8x2_TILE_BACKEND + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; + + uint8_t *ppDsts[] = + { + pDst, + pDst + DestRowWidthBytes, + pDst + DestRowWidthBytes / 2, + pDst + DestRowWidthBytes + DestRowWidthBytes / 2 + }; + + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dy; + ppDsts[1] += dy; + ppDsts[2] += dy; + ppDsts[3] += dy; + } +#else uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); @@ -1446,6 +1869,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 16>, SrcFormat, Ds ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); pSrc += pSrcInc; } +#endif } }; @@ -1456,6 +1880,8 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. @@ -1480,7 +1906,42 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds // TileX is a row-major tiling mode where each 4KB tile consist of 8 x 512B rows. // We can compute the offsets to each column within the raster tile once and increment from these. 
- uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, +#if USE_8x2_TILE_BACKEND + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + const uint32_t dx = SIMD16_TILE_X_DIM * DST_BYTES_PER_PIXEL; + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes - KNOB_TILE_X_DIM * DST_BYTES_PER_PIXEL; + + uint8_t* ppDsts[] = + { + pDst, // row 0, col 0 + pDst + DestRowWidthBytes, // row 1, col 0 + pDst + dx / 2, // row 0, col 1 + pDst + DestRowWidthBytes + dx / 2 // row 1, col 1 + }; + + for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + for (uint32_t xx = 0; xx < KNOB_TILE_X_DIM; xx += SIMD16_TILE_X_DIM) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dx; + ppDsts[1] += dx; + ppDsts[2] += dx; + ppDsts[3] += dx; + } + + ppDsts[0] += dy; + ppDsts[1] += dy; + ppDsts[2] += dy; + ppDsts[3] += dy; + } +#else + uint8_t *pRow0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); uint8_t* pRow1 = pRow0 + DestRowWidthBytes; @@ -1500,6 +1961,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_XMAJOR, 32>, SrcFormat, Ds pRow0 += (DestRowWidthBytes * 2); pRow1 += (DestRowWidthBytes * 2); } +#endif } }; @@ -1537,22 +1999,36 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 32>, SrcFormat, Ds // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. // There will be 2 x 4-wide columns in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, +#if USE_8x2_TILE_BACKEND + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); -#if USE_8x2_TILE_BACKEND - // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. - for (uint32_t row = 0; row < KNOB_TILE_Y_DIM; row += SIMD16_TILE_Y_DIM) - { - uint8_t *pRow = pCol0 + row * DestRowWidthBytes; + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; - uint8_t *ppDsts[] = { pRow, pRow + DestRowWidthBytes, pRow + DestColumnBytes, pRow + DestColumnBytes + DestRowWidthBytes }; + uint8_t *ppDsts[] = + { + pDst, + pDst + DestRowWidthBytes, + pDst + DestColumnBytes, + pDst + DestRowWidthBytes + DestColumnBytes + }; + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + ppDsts[0] += dy; + ppDsts[1] += dy; + ppDsts[2] += dy; + ppDsts[3] += dy; } #else + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + // Increment by a whole SIMD. 4x2 for AVX. 2x2 for SSE. uint32_t pSrcInc = (FormatTraits<SrcFormat>::bpp * KNOB_SIMD_WIDTH) / 8; @@ -1584,6 +2060,7 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, DstFormat> GenericStoreTile; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. @@ -1610,7 +2087,54 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds // TileY is a column-major tiling mode where each 4KB tile consist of 8 columns of 32 x 16B rows. // We can compute the offsets to each column within the raster tile once and increment from these. // There will be 2 x 4-wide columns in an 8x8 raster tile. - uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, +#if USE_8x2_TILE_BACKEND + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + const uint32_t dy = SIMD16_TILE_Y_DIM * DestRowWidthBytes; + +#if 1 + // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) + uint8_t *ppDsts[8]; + + { + for (uint32_t y = 0; y < 2; y += 1) + { + for (uint32_t x = 0; x < 4; x += 1) + { + ppDsts[x * 2 + y] = pDst + y * DestRowWidthBytes + x * DestColumnBytes; + } + } + } + +#else + uint8_t *ppDsts[] = + { + pDst, + pDst + DestRowWidthBytes, + pDst + DestColumnBytes, + pDst + DestRowWidthBytes + DestColumnBytes, + pDst + DestColumnBytes * 2, + pDst + DestRowWidthBytes + DestColumnBytes * 2, + pDst + DestColumnBytes * 3, + pDst + DestRowWidthBytes + DestColumnBytes * 3 + }; + +#endif + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + { + ppDsts[i] += dy; + } + } +#else + uint8_t* pCol0 = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); uint8_t* pCol1 = pCol0 + DestColumnBytes; @@ -1641,6 +2165,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 64>, SrcFormat, Ds ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); pSrc += pSrcInc; } +#endif } }; @@ -1651,18 +2176,22 @@ template<SWR_FORMAT SrcFormat, SWR_FORMAT DstFormat> struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, DstFormat> { typedef StoreRasterTile<TilingTraits<SWR_TILE_NONE, 128>, SrcFormat, DstFormat> GenericStoreTile; - - static const size_t TILE_Y_COL_WIDTH_BYTES = 16; - static const size_t TILE_Y_ROWS = 32; - static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; - - static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; - static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; - static const size_t MAX_DST_COLUMN_BYTES = 16; - - static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; +#if USE_8x2_TILE_BACKEND + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + +#else + static const size_t TILE_Y_COL_WIDTH_BYTES = 16; + static const size_t TILE_Y_ROWS = 32; + static const size_t TILE_Y_COL_BYTES = TILE_Y_ROWS * TILE_Y_COL_WIDTH_BYTES; + + static const size_t DST_BYTES_PER_PIXEL = FormatTraits<DstFormat>::bpp / 8; + static const size_t SRC_BYTES_PER_PIXEL = FormatTraits<SrcFormat>::bpp / 8; + static const size_t MAX_DST_COLUMN_BYTES = 16; + + static const size_t SRC_COLUMN_BYTES = KNOB_SIMD_WIDTH * SRC_BYTES_PER_PIXEL; static const size_t DST_COLUMN_BYTES_PER_SRC = TILE_Y_COL_BYTES * 4; +#endif ////////////////////////////////////////////////////////////////////////// /// @brief Stores an 8x8 raster tile to the destination surface. /// @param pSrc - Pointer to raster tile. @@ -1673,6 +2202,11 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D SWR_SURFACE_STATE* pDstSurface, uint32_t x, uint32_t y, uint32_t sampleNum, uint32_t renderTargetArrayIndex) { +#if USE_8x2_TILE_BACKEND + static const uint32_t DestRowWidthBytes = 16; // 16B rows + static const uint32_t DestColumnBytes = DestRowWidthBytes * 32; // 16B x 32 rows. 
+#endif + // Punt non-full tiles to generic store uint32_t lodWidth = std::max(pDstSurface->width >> pDstSurface->lod, 1U); uint32_t lodHeight = std::max(pDstSurface->height >> pDstSurface->lod, 1U); @@ -1682,7 +2216,85 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D return GenericStoreTile::Store(pSrc, pDstSurface, x, y, sampleNum, renderTargetArrayIndex); } - uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, +#if USE_8x2_TILE_BACKEND + uint8_t *pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, + pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); + + const uint32_t dy = SIMD16_TILE_Y_DIM * 2 * DestRowWidthBytes; // double up on tile y dim, one simd16 tile will do twice the rows + +#if 1 + // we have to break these large spans up, since ConvertPixelsSOAtoAOS() can only work on max 16B spans (a TileY limitation) + uint8_t *ppDsts[16]; + + { + for (uint32_t y = 0; y < 2; y += 1) + { + for (uint32_t x = 0; x < 4; x += 1) + { + ppDsts[x * 2 + (y + 0)] = pDst + (y + 0) * DestRowWidthBytes + x * DestColumnBytes; + ppDsts[x * 2 + (y + 8)] = pDst + (y + 2) * DestRowWidthBytes + x * DestColumnBytes; + } + } + } + +#else + uint8_t *ppDsts[] = + { + pDst, + pDst + DestRowWidthBytes, + pDst + DestColumnBytes, + pDst + DestRowWidthBytes + DestColumnBytes, + pDst + DestColumnBytes * 2, + pDst + DestRowWidthBytes + DestColumnBytes * 2, + pDst + DestColumnBytes * 3, + pDst + DestRowWidthBytes + DestColumnBytes * 3, + + pDst + DestRowWidthBytes * 2, + pDst + DestRowWidthBytes * 3, + pDst + DestRowWidthBytes * 2 + DestColumnBytes, + pDst + DestRowWidthBytes * 3 + DestColumnBytes, + pDst + DestRowWidthBytes * 2 + DestColumnBytes * 2, + pDst + DestRowWidthBytes * 3 + DestColumnBytes * 2, + pDst + DestRowWidthBytes * 2 + DestColumnBytes * 3, + pDst + DestRowWidthBytes * 3 + DestColumnBytes * 3 + }; + +#endif +#if 1 + // Raster tile height is quadruple simd16 tile height + static_assert(KNOB_TILE_Y_DIM == SIMD16_TILE_Y_DIM * 4, "Invalid tile y dim"); + + // Raster tile width is same as simd16 tile width + static_assert(KNOB_TILE_X_DIM == SIMD16_TILE_X_DIM, "Invalid tile x dim"); + + // tile rows 0 thru 3 + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + { + ppDsts[i] += dy; + } + + // tile rows 4 thru 7 + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); +#else + // The Hot Tile uses a row-major tiling mode and has a larger memory footprint. So we iterate in a row-major pattern. 
+ for (uint32_t yy = 0; yy < KNOB_TILE_Y_DIM; yy += SIMD16_TILE_Y_DIM * 2) + { + ConvertPixelsSOAtoAOS<SrcFormat, DstFormat>::Convert(pSrc, ppDsts); + + pSrc += KNOB_SIMD16_WIDTH * SRC_BYTES_PER_PIXEL; + + for (uint32_t i = 0; i < sizeof(ppDsts) / sizeof(ppDsts[0]); i += 1) + { + ppDsts[i] += dy; + } + } +#endif +#else + uint8_t* pDst = (uint8_t*)ComputeSurfaceAddress<false, false>(x, y, pDstSurface->arrayIndex + renderTargetArrayIndex, pDstSurface->arrayIndex + renderTargetArrayIndex, sampleNum, pDstSurface->lod, pDstSurface); struct DstPtrs { @@ -1727,6 +2339,7 @@ struct OptStoreRasterTile< TilingTraits<SWR_TILE_MODE_YMAJOR, 128>, SrcFormat, D ptrs.ppDsts[6] = startPtrs.ppDsts[6] + 2 * TILE_Y_COL_WIDTH_BYTES; ptrs.ppDsts[7] = startPtrs.ppDsts[7] + 2 * TILE_Y_COL_WIDTH_BYTES; } +#endif } }; @@ -1776,7 +2389,6 @@ struct StoreMacroTile uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex) { PFN_STORE_TILES_INTERNAL pfnStore[SWR_MAX_NUM_MULTISAMPLES]; - for (uint32_t sampleNum = 0; sampleNum < pDstSurface->numSamples; sampleNum++) { size_t dstSurfAddress = (size_t)ComputeSurfaceAddress<false, false>( |
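For the 128bpp TileY specialization above, the "double up on tile y dim" comment follows directly from the span arithmetic: one simd16 block carries 16 pixels of 16 bytes each, which is exactly the sixteen 16-byte spans in ppDsts[], laid out as four destination rows of four column blocks, so each Convert() covers SIMD16_TILE_Y_DIM * 2 rows and the 8-row raster tile is finished in two calls. A compile-time restatement of that accounting (constant names and values are illustrative restatements of the patch's knobs):

    #include <cstdint>

    constexpr uint32_t KNOB_SIMD16_WIDTH  = 16;  // pixels per simd16 block
    constexpr uint32_t BPP128_BYTES       = 16;  // bytes per 128bpp pixel
    constexpr uint32_t DEST_ROW_WIDTH     = 16;  // TileY span width in bytes
    constexpr uint32_t SPANS_PER_CONVERT  = 16;  // size of ppDsts[] in the patch
    constexpr uint32_t ROWS_PER_CONVERT   = 4;   // SIMD16_TILE_Y_DIM * 2
    constexpr uint32_t KNOB_TILE_Y_DIM    = 8;   // raster tile height

    // One simd16 block's worth of 128bpp pixels fills the whole span table ...
    static_assert(KNOB_SIMD16_WIDTH * BPP128_BYTES == SPANS_PER_CONVERT * DEST_ROW_WIDTH,
                  "one simd16 block fills exactly sixteen 16B spans");
    // ... and two such blocks cover the eight rows of a raster tile.
    static_assert(KNOB_TILE_Y_DIM == ROWS_PER_CONVERT * 2,
                  "an 8-row raster tile takes two Convert() calls");
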