summaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--src/gallium/drivers/swr/rasterizer/common/simdintrin.h1
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/context.h2
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/fifo.hpp21
-rw-r--r--src/gallium/drivers/swr/rasterizer/core/threads.cpp8
4 files changed, 23 insertions, 9 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 4a91e95f7ff..3ad37de3d49 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -114,6 +114,7 @@ OSALIGNSIMD(union) simdvector
#define _simd_round_ps _mm256_round_ps
#define _simd_castpd_ps _mm256_castpd_ps
#define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
+#define _simd_stream_ps _mm256_stream_ps
#define _simd_load_sd _mm256_load_sd
#define _simd_movemask_pd _mm256_movemask_pd
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index dfcc1c0d39a..7a817ef0dd8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -161,7 +161,7 @@ enum WORK_TYPE
SHUTDOWN,
};
-struct BE_WORK
+OSALIGNSIMD(struct) BE_WORK
{
WORK_TYPE type;
PFN_WORK_FUNC pfnWork;
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index ccf0b70544f..7e07e6aeb2c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -55,13 +55,11 @@ struct QUEUE
mHead = 0;
mTail = 0;
mBlocks.clear();
- T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+ T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
mBlocks.push_back(pNewBlock);
mCurBlock = pNewBlock;
mCurBlockIdx = 0;
-
mNumEntries = 0;
- _ReadWriteBarrier();
mLock = 0;
}
@@ -106,7 +104,20 @@ struct QUEUE
template <typename ArenaT>
bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
- memcpy(&mCurBlock[mTail], entry, sizeof(T));
+ const float* pSrc = (const float*)entry;
+ float* pDst = (float*)&mCurBlock[mTail];
+
+ auto lambda = [&](int32_t i)
+ {
+ __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
+ _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
+ };
+
+ const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
+ static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
+ "FIFO element size should be multiple of SIMD width.");
+
+ UnrollerL<0, numSimdLines, 1>::step(lambda);
mTail ++;
if (mTail == mBlockSize)
@@ -117,7 +128,7 @@ struct QUEUE
}
else
{
- T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+ T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
SWR_ASSERT(newBlock);
mBlocks.push_back(newBlock);
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 28cd9298967..08a4d17821c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -565,8 +565,6 @@ bool WorkOnFifoBE(
/// @brief Called when FE work is complete for this DC.
INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
- _ReadWriteBarrier();
-
if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
{
pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
@@ -584,8 +582,9 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
}
}
+ // Ensure all streaming writes are globally visible before marking this FE done
+ _mm_mfence();
pDC->doneFE = true;
-
InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
}
@@ -673,6 +672,9 @@ void WorkOnCompute(
queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
queue.finishedWork();
}
+
+ // Ensure all streaming writes are globally visible before moving onto the next draw
+ _mm_mfence();
}
}
}