diff options
Diffstat (limited to 'src/gallium/drivers/swr')
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/common/simd16intrin.h | 26 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/pa.h | 46 | ||||
-rw-r--r-- | src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp | 2 |
3 files changed, 47 insertions, 27 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h index 22a125b05ad..88814a58aa9 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simd16intrin.h @@ -460,7 +460,10 @@ INLINE simd16scalar _simd16_cmp_ps(simd16scalar a, simd16scalar b) #define _simd16_cmple_ps(a, b) _simd16_cmp_ps<_CMP_LE_OQ>(a, b) SIMD16_EMU_AVX512_2(simd16scalar, _simd16_and_ps, _simd_and_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _simd_andnot_ps) SIMD16_EMU_AVX512_2(simd16scalar, _simd16_or_ps, _simd_or_ps) +SIMD16_EMU_AVX512_2(simd16scalar, _simd16_xor_ps, _simd_xor_ps) + SIMD16_EMU_AVX512_1(simd16scalar, _simd16_rcp_ps, _simd_rcp_ps) SIMD16_EMU_AVX512_2(simd16scalar, _simd16_div_ps, _simd_div_ps) @@ -494,8 +497,6 @@ INLINE simd16scalard _simd16_castps_pd(simd16scalar a) return *reinterpret_cast<simd16scalard *>(&a); } -SIMD16_EMU_AVX512_2(simd16scalar, _simd16_andnot_ps, _mm256_andnot_ps) - template <int mode> INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) { @@ -518,10 +519,12 @@ SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epi32, _simd_max_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_min_epu32, _simd_min_epu32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_max_epu32, _simd_max_epu32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_add_epi32, _simd_add_epi32) + SIMD16_EMU_AVX512_2(simd16scalari, _simd16_and_si, _simd_and_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_andnot_si, _simd_andnot_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_or_si, _simd_or_si) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_xor_si, _simd_xor_si) + SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpeq_epi32, _simd_cmpeq_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmpgt_epi32, _simd_cmpgt_epi32) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_cmplt_epi32, _simd_cmplt_epi32) @@ -592,7 +595,6 @@ INLINE simd16scalari _simd16_srli_epi32_temp(simd16scalari a) SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmadd_ps, _simd_fmadd_ps) SIMD16_EMU_AVX512_3(simd16scalar, _simd16_fmsub_ps, _simd_fmsub_ps) -//__m256 _simd_i32gather_ps(const float* pBase, __m256i vOffsets, const int scale) template <int scale> INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari index) { @@ -606,7 +608,6 @@ INLINE simd16scalar _simd16_i32gather_ps_temp(const float *m, simd16scalari inde #define _simd16_i32gather_ps(m, index, scale) _simd16_i32gather_ps_temp<scale>(m, index) -//__m256 _simd_mask_i32gather_ps(__m256 vSrc, const float* pBase, __m256i vOffsets, __m256 vMask, const int scale) template <int scale> INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask) { @@ -618,7 +619,7 @@ INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float * return result; } -#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, mask, index) +#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_shuffle_epi8, _simd_shuffle_epi8) SIMD16_EMU_AVX512_2(simd16scalari, _simd16_adds_epu8, _simd_adds_epu8) @@ -941,7 +942,10 @@ INLINE simd16scalar _simd16_cmp_ps_temp(simd16scalar a, simd16scalar b) #define _simd16_castpd_ps _mm512_castpd_ps #define _simd16_castps_pd _mm512_castps_pd +#define _simd16_and_ps _mm512_and_ps #define _simd16_andnot_ps _mm512_andnot_ps +#define _simd16_or_ps _mm512_or_ps +#define _simd16_xor_ps _mm512_xor_ps template <int mode> INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) @@ -960,6 +964,7 @@ INLINE simd16scalar _simd16_round_ps_temp(simd16scalar a) #define _simd16_min_epu32 _mm512_min_epu32 #define _simd16_max_epu32 _mm512_max_epu32 #define _simd16_add_epi32 _mm512_add_epi32 + #define _simd16_and_si _mm512_and_si512 #define _simd16_andnot_si _mm512_andnot_si512 #define _simd16_or_si _mm512_or_si512 @@ -1023,7 +1028,16 @@ INLINE int _simd16_testz_ps(simd16scalar a, simd16scalar b) #define _simd16_fmsub_ps _mm512_fmsub_ps #define _simd16_i32gather_ps(m, index, scale) _mm512_i32gather_ps(index, m, scale) -#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _mm512_mask_i32gather_ps(a, m, index, mask, scale) + +template <int scale> +INLINE simd16scalar _simd16_mask_i32gather_ps_temp(simd16scalar a, const float *m, simd16scalari index, simd16scalari mask) +{ + __mmask16 k = _mm512_cmpneq_epi32_mask(mask, _mm512_setzero_si512()); + + return _mm512_mask_i32gather_ps(a, k, index, m, scale); +} + +#define _simd16_mask_i32gather_ps(a, m, index, mask, scale) _simd16_mask_i32gather_ps_temp<scale>(a, m, index, mask) #define _simd16_abs_epi32 _mm512_abs_epi32 #define _simd16_cmpeq_epi64 _mm512_abs_epi32 diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h index 1053e0ab7a5..2e159daaaa4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa.h +++ b/src/gallium/drivers/swr/rasterizer/core/pa.h @@ -160,7 +160,7 @@ struct PA_STATE_OPT : public PA_STATE bool nextReset{ false }; bool isStreaming{ false }; - SIMDMASK tmpIndices{ 0 }; // temporary index store for unused virtual function + SIMDMASK junkIndices { 0 }; // temporary index store for unused virtual function PA_STATE_OPT() {} PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts, @@ -257,7 +257,7 @@ struct PA_STATE_OPT : public PA_STATE SIMDMASK& GetNextVsIndices() { // unused in optimized PA, pass tmp buffer back - return tmpIndices; + return junkIndices; } bool GetNextStreamOutput() @@ -390,6 +390,12 @@ struct PA_STATE_CUT : public PA_STATE bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they // are ignored. Fetch shader sends invalid verts on cuts that should be ignored // while the GS sends valid verts for every index + + simdvector junkVector; // junk simdvector for unimplemented API +#if ENABLE_AVX512_SIMD16 + simd16vector junkVector_simd16; // junk simd16vector for unimplemented API +#endif + // Topology state tracking uint32_t vert[MAX_NUM_VERTS_PER_PRIM]; uint32_t curIndex{ 0 }; @@ -471,8 +477,7 @@ struct PA_STATE_CUT : public PA_STATE { // unused SWR_ASSERT(0 && "Not implemented"); - static simdvector junk; - return junk; + return junkVector; } #if ENABLE_AVX512_SIMD16 @@ -480,8 +485,7 @@ struct PA_STATE_CUT : public PA_STATE { // unused SWR_ASSERT(0 && "Not implemented"); - static simd16vector junk; - return junk; + return junkVector_simd16; } #endif @@ -673,7 +677,7 @@ struct PA_STATE_CUT : public PA_STATE #if USE_SIMD16_FRONTEND simd16scalar temp = _simd16_i32gather_ps(pBase, offsets, 1); - verts[v].v[c] = useAlternateOffset ? temp.hi : temp.lo; + verts[v].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); #else verts[v].v[c] = _simd_i32gather_ps(pBase, offsets, 1); #endif @@ -722,8 +726,7 @@ struct PA_STATE_CUT : public PA_STATE #if USE_SIMD16_FRONTEND verts[v].v[c] = _simd16_i32gather_ps(pBase, offsets, 1); #else - verts[v].v[c].lo = _simd_i32gather_ps(pBase, offsets, 1); - verts[v].v[c].hi = _simd_setzero_ps(); + verts[v].v[c] = _simd16_insert_ps(_simd15_setzero_ps(), _simd_i32gather_ps(pBase, offsets, 1)); #endif // move base to next component @@ -1158,16 +1161,14 @@ struct PA_TESS : PA_STATE simdvector& GetSimdVector(uint32_t index, uint32_t slot) { SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__); - static simdvector junk; - return junk; + return junkVector; } #if ENABLE_AVX512_SIMD16 simd16vector& GetSimdVector_simd16(uint32_t index, uint32_t slot) { SWR_INVALID("%s NOT IMPLEMENTED", __FUNCTION__); - static simd16vector junk; - return junk; + return junkVector_simd16; } #endif @@ -1225,7 +1226,7 @@ struct PA_TESS : PA_STATE mask, 4 /* gcc doesn't like sizeof(float) */); - verts[i].v[c] = useAlternateOffset ? temp.hi : temp.lo; + verts[i].v[c] = useAlternateOffset ? _simd16_extract_ps(temp, 1) : _simd16_extract_ps(temp, 0); #else verts[i].v[c] = _simd_mask_i32gather_ps( _simd_setzero_ps(), @@ -1274,13 +1275,13 @@ struct PA_TESS : PA_STATE mask, 4 /* gcc doesn't like sizeof(float) */); #else - verts[i].v[c].lo = _simd_mask_i32gather_ps( + simdscalar temp = _simd_mask_i32gather_ps( _simd_setzero_ps(), pBase, indices, _simd_castsi_ps(mask), 4 /* gcc doesn't like sizeof(float) */); - verts[i].v[c].hi = _simd_setzero_ps(); + verts[i].v[c] = _simd16_insert_ps(_simd16_setzero_ps(), temp, 0); #endif pBase += m_attributeStrideInVectors * SIMD_WIDTH; } @@ -1328,8 +1329,7 @@ struct PA_TESS : PA_STATE SIMDVERTEX& GetNextVsOutput() { SWR_NOT_IMPL; - static SIMDVERTEX junk; - return junk; + return junkVertex; } bool GetNextStreamOutput() @@ -1341,8 +1341,7 @@ struct PA_TESS : PA_STATE SIMDMASK& GetNextVsIndices() { SWR_NOT_IMPL; - static SIMDMASK junk; - return junk; + return junkIndices; } uint32_t NumPrims() @@ -1374,6 +1373,13 @@ private: uint32_t m_numVertsPerPrim = 0; SIMDSCALARI m_vPrimId; + + simdvector junkVector; // junk simdvector for unimplemented API +#if ENABLE_AVX512_SIMD16 + simd16vector junkVector_simd16; // junk simd16vector for unimplemented API +#endif + SIMDVERTEX junkVertex; // junk SIMDVERTEX for unimplemented API + SIMDMASK junkIndices; // temporary index store for unused virtual function }; // Primitive Assembler factory class, responsible for creating and initializing the correct assembler diff --git a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp index 511b3d0aeda..23bf1e0fff4 100644 --- a/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/pa_avx.cpp @@ -2509,7 +2509,7 @@ bool PaRectList2_simd16( uint32_t slot, simd16vector verts[]) { - SWR_INVALID("Is rect list used for anything other then clears?") + SWR_INVALID("Is rect list used for anything other then clears?"); SetNextPaState_simd16(pa, PaRectList0_simd16, PaRectListSingle0, 0, KNOB_SIMD16_WIDTH, true); return true; } |