swr: [rasterizer core] align Macrotile FIFO memory to SIMD size

Align and use streaming store instructions for BE fifo queues. Provides slightly faster enqueue and doesn't pollute the caches. Add appropriate memory fences to ensure streaming writes are globally visible. Signed-off-by: Tim Rowley <[email protected]>
author: Tim Rowley <[email protected]> 2016-10-04 12:59:30 -0500
committer: Tim Rowley <[email protected]> 2016-10-11 11:22:04 -0500
commit: 2966d9c691fd0cd51d83204cac6b3194b9dcb878 (patch)
tree: 066bd6639329f655ffd19301d500bff2d3ef75f6
parent: 6b3691c8762320df5afc8a7e79b9da09e272695b (diff)
4 files changed, 23 insertions, 9 deletions
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 4a91e95f7ff..3ad37de3d49 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -114,6 +114,7 @@ OSALIGNSIMD(union) simdvector
 #define _simd_round_ps _mm256_round_ps
 #define _simd_castpd_ps _mm256_castpd_ps
 #define _simd_broadcast_ps(a) _mm256_broadcast_ps((const __m128*)(a))
+#define _simd_stream_ps _mm256_stream_ps
 
 #define _simd_load_sd _mm256_load_sd
 #define _simd_movemask_pd _mm256_movemask_pd
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index dfcc1c0d39a..7a817ef0dd8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -161,7 +161,7 @@ enum WORK_TYPE
     SHUTDOWN,
 };
 
-struct BE_WORK
+OSALIGNSIMD(struct) BE_WORK
 {
     WORK_TYPE type;
     PFN_WORK_FUNC pfnWork;
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index ccf0b70544f..7e07e6aeb2c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -55,13 +55,11 @@ struct QUEUE
         mHead = 0;
         mTail = 0;
         mBlocks.clear();
-        T* pNewBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+        T* pNewBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
         mBlocks.push_back(pNewBlock);
         mCurBlock = pNewBlock;
         mCurBlockIdx = 0;
-
         mNumEntries = 0;
-        _ReadWriteBarrier();
         mLock = 0;
     }
 
@@ -106,7 +104,20 @@ struct QUEUE
     template <typename ArenaT>
     bool enqueue_try_nosync(ArenaT& arena, const T* entry)
     {
-        memcpy(&mCurBlock[mTail], entry, sizeof(T));
+        const float* pSrc = (const float*)entry;
+        float* pDst = (float*)&mCurBlock[mTail];
+
+        auto lambda = [&](int32_t i)
+        {
+            __m256 vSrc = _simd_load_ps(pSrc + i*KNOB_SIMD_WIDTH);
+            _simd_stream_ps(pDst + i*KNOB_SIMD_WIDTH, vSrc);
+        };
+            
+        const uint32_t numSimdLines = sizeof(T) / (KNOB_SIMD_WIDTH*4);
+        static_assert(numSimdLines * KNOB_SIMD_WIDTH * 4 == sizeof(T),
+            "FIFO element size should be multiple of SIMD width.");
+
+        UnrollerL<0, numSimdLines, 1>::step(lambda);
 
         mTail ++;
         if (mTail == mBlockSize)
@@ -117,7 +128,7 @@ struct QUEUE
             }
             else
             {
-                T* newBlock = (T*)arena.Alloc(sizeof(T)*mBlockSize);
+                T* newBlock = (T*)arena.AllocAligned(sizeof(T)*mBlockSize, KNOB_SIMD_WIDTH*4);
                 SWR_ASSERT(newBlock);
 
                 mBlocks.push_back(newBlock);
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 28cd9298967..08a4d17821c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -565,8 +565,6 @@ bool WorkOnFifoBE(
 /// @brief Called when FE work is complete for this DC.
 INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
 {
-    _ReadWriteBarrier();
-
     if (pContext->pfnUpdateStatsFE && GetApiState(pDC).enableStats)
     {
         pContext->pfnUpdateStatsFE(GetPrivateState(pDC), &pDC->dynState.statsFE);
@@ -584,8 +582,9 @@ INLINE void CompleteDrawFE(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
         }
     }
 
+    // Ensure all streaming writes are globally visible before marking this FE done
+    _mm_mfence();
     pDC->doneFE = true;
-
     InterlockedDecrement((volatile LONG*)&pContext->drawsOutstandingFE);
 }
 
@@ -673,6 +672,9 @@ void WorkOnCompute(
                 queue.dispatch(pDC, workerId, threadGroupId, pSpillFillBuffer);
                 queue.finishedWork();
             }
+
+            // Ensure all streaming writes are globally visible before moving onto the next draw
+            _mm_mfence();
         }
     }
 }
author	Tim Rowley <[email protected]>	2016-10-04 12:59:30 -0500
committer	Tim Rowley <[email protected]>	2016-10-11 11:22:04 -0500
commit	2966d9c691fd0cd51d83204cac6b3194b9dcb878 (patch)
tree	066bd6639329f655ffd19301d500bff2d3ef75f6
parent	6b3691c8762320df5afc8a7e79b9da09e272695b (diff)