diff options
4 files changed, 23 insertions, 9 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h index 4a91e95f7ff..3ad37de3d49 100644 --- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h +++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h @@ -114,6 +114,7 @@ OSALIGNSIMD(union) simdvector #define _simd_round_ps _mm256_round_ps #define _simd_castpd_ps _mm256_castpd_ps #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a)) +#define _simd_stream_ps _mm256_stream_ps #define _simd_load_sd _mm256_load_sd #define _simd_movemask_pd _mm256_movemask_pd diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h index dfcc1c0d39a..7a817ef0dd8 100644 --- a/src/gallium/drivers/swr/rasterizer/core/context.h +++ b/src/gallium/drivers/swr/rasterizer/core/context.h @@ -161,7 +161,7 @@ enum WORK_TYPE SHUTDOWN, }; -struct BE_WORK +OSALIGNSIMD(struct) BE_WORK { WORK_TYPE type; PFN_WORK_FUNC pfnWork; diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp index ccf0b70544f..7e07e6aeb2c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp +++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp @@ -55,13 +55,11 @@ struct QUEUE mHead = 0; mTail = 0; mBlocks.clear(); - T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); + T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4); mBlocks.push_back(pNewBlock); mCurBlock = pNewBlock; mCurBlockIdx = 0; - mNumEntries = 0; - _ReadWriteBarrier(); mLock = 0; } @@ -106,7 +104,20 @@ struct QUEUE template <typename ArenaT> bool enqueue_try_nosync(ArenaT& arena, const T* entry) { - memcpy(&mCurBlock[mTail], entry, sizeof(T)); + const float* pSrc = (const float*)entry; + float* pDst = (float*)&mCurBlock[mTail]; + + auto lambda = [&](int32_t i) + { + __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH); + _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc); + }; + + const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4); + static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T), + "FIFO element size should be multiple of SIMD width."); + + UnrollerL<0, numSimdLines, 1>::step(lambda); mTail ++; if (mTail == mBlockSize) @@ -117,7 +128,7 @@ struct QUEUE } else { - T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize); + T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4); SWR_ASSERT(newBlock); mBlocks.push_back(newBlock); diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp index 28cd9298967..08a4d17821c 100644 --- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp +++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp @@ -565,8 +565,6 @@ bool WorkOnFifoBE( /// @brief Called when FE work is complete for this DC. INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) { - _ReadWriteBarrier(); - if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats) { pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE); @@ -584,8 +582,9 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC) } } + // Ensure all streaming writes are globally visible before marking this FE done + _mm_mfence(); pDC->doneFE = true; - InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE); } @@ -673,6 +672,9 @@ void WorkOnCompute( queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer); queue.finishedWork(); } + + // Ensure all streaming writes are globally visible before moving onto the next draw + _mm_mfence(); } } } |